In [45]:
import glob
import re
import pickle

import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Single random seed, passed as `random_state` to both the train/test split and
# the random forest so that runs are repeatable.
SEED = 8455

In [3]:
# Column names that the `usecols` callable filters out when the CSV files are read.
cols_to_skip = ['raw_text', 'language']

In [49]:
# Load every feature CSV, skipping the columns listed in `cols_to_skip`.
# glob.glob() yields paths in arbitrary, filesystem-dependent order; sorting them
# fixes the row order of the concatenated frame, which the seeded train/test
# split depends on to be reproducible across machines.
csv_paths = sorted(glob.glob('../data/csv/*.csv'))
df_html_data = pd.concat(
    [pd.read_csv(csv_path, lineterminator='\n', usecols=lambda col: col not in cols_to_skip)
     for csv_path in csv_paths],
    ignore_index=True)

# Attach the per-file target labels. This is an inner merge (the pandas default),
# so rows whose `filename` has no match in `html_targets.csv` are dropped.
df_file_labels = pd.read_csv('../data/html_targets.csv')
df = df_html_data.merge(df_file_labels, left_on='filename', right_on='file')

# The join keys are identifiers, not features.
df = df.drop(['file', 'filename'], axis=1)

In [5]:
# Union of the NLTK stop-word lists for the four languages filtered out of titles.
stop_words = {
    word
    for lang in ('english', 'french', 'german', 'spanish')
    for word in stopwords.words(lang)
}

In [6]:
# WordNet lemmatizer applied to each title token during cleaning.
lemmer = WordNetLemmatizer()
# TF-IDF over title unigrams and bigrams, with the vocabulary capped at 250 terms.
text_transformer = TfidfVectorizer(max_features=250, ngram_range=(1, 2))

In [7]:
def _normalise_title(raw_title):
    """Return a cleaned, space-joined version of one page title.

    Steps, in order: lower-case, strip every character that is neither a word
    character nor whitespace, strip digit runs, split on whitespace, discard
    tokens found in `stop_words`, lemmatize the remaining tokens with `lemmer`.
    """
    lowered = raw_title.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    without_digits = re.sub(r'\d+', '', without_punctuation)

    kept_tokens = []
    for token in without_digits.split():
        if token in stop_words:
            continue
        kept_tokens.append(lemmer.lemmatize(token))
    return ' '.join(kept_tokens)


# Missing titles become empty strings so the cleaner always receives a str.
df['title'] = df['title'].fillna('').map(_normalise_title)

In [8]:
# Learn the TF-IDF vocabulary from the cleaned titles and encode them as a sparse matrix.
# NOTE(review): the vectorizer is fitted on every row, before the train/test split
# further down, so the IDF weights also see titles that end up in the test set --
# consider fitting on the training rows only.
df_text = text_transformer.fit_transform(df['title'])

In [9]:
# Densify the sparse TF-IDF matrix into a DataFrame with one column per vocabulary term.
# This rebinds `df_text` from a sparse matrix to a DataFrame, so the cell cannot be
# re-run on its own (a DataFrame has no `.toarray()`).
df_text = pd.DataFrame(df_text.toarray(), columns=text_transformer.get_feature_names_out())

In [10]:
# Append the TF-IDF columns to the feature table. `join` aligns on the index and
# `df_text` carries a default 0..n-1 index, so `df` must have the same positional
# index at this point for the rows to line up.
df = df.join(df_text)

In [11]:
# The raw title text is now represented by its TF-IDF columns; remove the string column.
df = df.drop(columns=['title'])

In [12]:
# Separate the `sponsored` target from the predictors, then hold out 20 % of the
# rows for evaluation using the shared seed.
y = df['sponsored']
X = df.drop(columns=['sponsored'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=SEED,
)

### RF Model

In [13]:
# Random forest configuration.
# NOTE(review): the specific values (289 trees, 0.926... sample fraction, split
# threshold of 9) look like the output of a hyper-parameter search -- confirm source.
clf = RandomForestClassifier(
    # Ensemble size and reproducibility
    n_estimators=289,
    random_state=SEED,
    # Tree-growing rules
    criterion='entropy',
    min_samples_split=9,
    # Fraction of the training rows drawn for each tree
    max_samples=0.9262414952228437,
    # Re-weight classes inversely to their frequency
    class_weight='balanced',
)

In [31]:
# Train the forest on the training split.
clf.fit(X_train, y_train)

In [34]:
# Predicted labels for the held-out test split.
y_pred = clf.predict(X_test)

In [40]:
# Per-class precision / recall / F1 on the held-out test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97     60946
           1       0.87      0.47      0.61      6459

    accuracy                           0.94     67405
   macro avg       0.91      0.73      0.79     67405
weighted avg       0.94      0.94      0.93     67405



In [43]:
# Rank (feature name, importance) pairs from most to least important.
sorted(
    zip(X.columns, clf.feature_importances_),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)

[('num_scripts', 0.05375852587720081),
 ('num_links', 0.048443714125413075),
 ('num_tags', 0.04754793445104348),
 ('num_lines', 0.04704443191353387),
 ('num_digits', 0.046524732031565645),
 ('num_characters', 0.04470769652235222),
 ('num_words', 0.04149311052182512),
 ('num_unique_words', 0.04143942031601846),
 ('num_headers', 0.040975081618243175),
 ('num_images', 0.040516069192487805),
 ('num_paragraphs', 0.03948274003528407),
 ('num_inputs', 0.03938725994849528),
 ('num_lists', 0.0390173546337236),
 ('num_unique_characters', 0.032792321708695824),
 ('num_styles', 0.027024974898678458),
 ('num_bold_words', 0.025064222189281515),
 ('num_italic_words', 0.02495769032685725),
 ('num_buttons', 0.02444263461804975),
 ('num_forms', 0.023672203229566136),
 ('num_tables', 0.020910757671013944),
 ('num_iframes', 0.017565486772287954),
 ('has_google_analytics', 0.010342273312204863),
 ('has_universal_pixel', 0.009531795860325992),
 ('blog', 0.00926018791646937),
 ('has_google_tag_manager', 0.00

In [47]:
# Persist the trained classifier. File name and payload are unchanged, so
# existing loaders of this pickle keep working.
with open('nativead_predict_rf.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

# The classifier was trained on TF-IDF columns produced by the fitted
# `text_transformer`. Without that fitted vectorizer (vocabulary + IDF weights)
# new titles cannot be turned into the same feature columns, so save it alongside.
with open('nativead_title_tfidf.pkl', 'wb') as vectorizer_file:
    pickle.dump(text_transformer, vectorizer_file)