In [1]:
import glob
import re
import pickle

import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

In [2]:
SEED = 8455

In [3]:
cols_to_skip = ['title', 'raw_text', 'language']

In [4]:
df_html_data = pd.concat([pd.read_csv(csv, lineterminator='\n', usecols=lambda x: x not in cols_to_skip)
               for csv in glob.glob('../data/csv/*.csv')], ignore_index=True)
df_file_labels = pd.read_csv('../data/html_targets.csv')
df = df_html_data.merge(df_file_labels, left_on='filename', right_on='file')
df = df.drop(['file', 'filename'], axis=1)

In [5]:
X = df.drop(['sponsored'], axis=1)
y = df['sponsored']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

### RF Model

In [6]:
clf = RandomForestClassifier(class_weight='balanced',
                              criterion='entropy',
                              max_samples=0.9881470567266184,
                              min_samples_split=8,
                              n_estimators=270,
                              random_state=SEED)

In [7]:
clf.fit(X_train, y_train)

In [8]:
y_pred = clf.predict(X_test)

In [9]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97     60946
           1       0.87      0.46      0.61      6459

    accuracy                           0.94     67405
   macro avg       0.91      0.73      0.79     67405
weighted avg       0.94      0.94      0.93     67405



In [10]:
sorted(zip(X.columns, clf.feature_importances_), key=lambda x: x[1] * -1)

[('num_scripts', 0.07224366313436272),
 ('num_links', 0.06394144780921854),
 ('num_digits', 0.0599622041704164),
 ('num_lines', 0.058971234910092804),
 ('num_tags', 0.05851682919896101),
 ('num_characters', 0.05498356033301143),
 ('num_images', 0.05238522845267652),
 ('num_headers', 0.0513175829584554),
 ('num_inputs', 0.051266067926821396),
 ('num_unique_words', 0.0509774571157897),
 ('num_words', 0.050975251686575504),
 ('num_paragraphs', 0.04899064056370643),
 ('num_lists', 0.048916805113322505),
 ('num_unique_characters', 0.038605078386261975),
 ('num_styles', 0.03312779741175026),
 ('num_italic_words', 0.030919384426319255),
 ('num_bold_words', 0.030314337731415203),
 ('num_buttons', 0.029426651769964077),
 ('num_forms', 0.027048981749980015),
 ('num_tables', 0.024625729125975084),
 ('num_iframes', 0.020186474840886405),
 ('has_google_analytics', 0.012149177741311287),
 ('has_universal_pixel', 0.010600431122076578),
 ('has_google_tag_manager', 0.010407511365657986),
 ('has_pubads'

In [11]:
with open('webapp_rf.pkl', 'wb') as file:
    pickle.dump(clf, file)