In [1]:
import glob
import re

import pandas as pd
from scipy.stats import randint, uniform

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Seed passed as random_state to the train/test split and the randomized search.
SEED = 8455

In [3]:
# Columns left out when the per-file CSVs are read.
cols_to_skip = [
    'raw_text',
    'language',
]

In [4]:
# Read every CSV under ../data/csv, skipping the columns in cols_to_skip, and
# stack them into one frame. The paths are sorted because glob.glob does not
# guarantee any ordering; without the sort the row order of the concatenated
# frame -- and so the seeded train/test split derived from it -- can differ
# between machines.
csv_paths = sorted(glob.glob('../data/csv/*.csv'))
df_html_data = pd.concat(
    [pd.read_csv(csv, lineterminator='\n', usecols=lambda x: x not in cols_to_skip)
     for csv in csv_paths],
    ignore_index=True,
)

# Attach the per-file targets. merge() defaults to an inner join, so rows whose
# filename has no match in html_targets.csv are dropped.
df_file_labels = pd.read_csv('../data/html_targets.csv')
df = df_html_data.merge(df_file_labels, left_on='filename', right_on='file')

# The two join-key columns are dropped once the merge is done.
df = df.drop(['file', 'filename'], axis=1)

In [5]:
# Union of the NLTK stop-word lists for the four languages below.
stop_words = {
    word
    for language in ('english', 'french', 'german', 'spanish')
    for word in stopwords.words(language)
}

In [6]:
# TF-IDF over unigrams and bigrams, with the vocabulary capped at 250 terms.
text_transformer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=250,
)

# Lemmatizer applied to each title token during cleaning.
lemmer = WordNetLemmatizer()

In [7]:
def _clean_title(title):
    """Normalise one title string for vectorisation.

    Lower-cases the text, removes every character that is neither a word
    character nor whitespace, removes digit runs, drops tokens found in
    ``stop_words``, lemmatizes the remaining tokens and joins them with
    single spaces.
    """
    text = title.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    kept = [lemmer.lemmatize(token) for token in text.split() if token not in stop_words]
    return ' '.join(kept)


# Missing titles become empty strings before cleaning so .lower() never sees NaN.
df['title'] = df['title'].fillna('').map(_clean_title)

In [8]:
# Learn the TF-IDF vocabulary from the title column and vectorise it in one step.
# NOTE(review): the fit uses every row, and the train/test split only happens in
# a later cell, so held-out titles contribute to the vocabulary and IDF weights.
# Confirm that is acceptable for the reported test score.
df_text = text_transformer.fit_transform(raw_documents=df['title'])

In [9]:
# Densify the sparse TF-IDF matrix into a frame with one column per term.
# The index is pinned to df.index because these columns are attached to df
# with an index-aligned join; relying on the default RangeIndex would silently
# misalign rows (or introduce NaNs) if df's index were ever not 0..n-1.
df_text = pd.DataFrame(
    df_text.toarray(),
    index=df.index,
    columns=text_transformer.get_feature_names_out(),
)

In [10]:
# Attach the TF-IDF columns to the main frame, matching rows by index
# (left join, which is DataFrame.join's default).
df = df.join(df_text, how='left')

In [11]:
# The title text is dropped now that its TF-IDF columns have been joined.
df = df.drop(columns=['title'])

In [12]:
# Separate the target column from the features, then hold out 20% of the rows.
# The split is stratified on the target so the train and test sets keep the
# same class proportions; the search below optimises macro-F1, which is
# sensitive to how many minority-class rows land in each side.
X = df.drop(columns=['sponsored'])
y = df['sponsored']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
    stratify=y,
)

### Hyperparameter Tuning

In [13]:
class DummyEstimator(BaseEstimator):
    """Placeholder estimator for a pipeline step that a search replaces.

    It learns nothing. The methods accept the usual ``X`` / ``y`` arguments
    (both optional, so existing zero-argument calls keep working) so that a
    pipeline holding this placeholder does not fail with a ``TypeError`` if
    it is fitted or scored before the step is swapped out.
    """

    def fit(self, X=None, y=None):
        """Do nothing and return ``self``, following the estimator convention."""
        return self

    def score(self, X=None, y=None):
        """Return ``None``; the placeholder has no meaningful score."""
        return None

#### Randomized Search

In [14]:
# Single-step pipeline; the "clf" step holds a placeholder estimator.
pipe = Pipeline(steps=[("clf", DummyEstimator())])

In [19]:
param_distributions = [{
    'clf': [RandomForestClassifier()],
    'clf__n_estimators': randint(100, 300),
    'clf__class_weight': [None, 'balanced'],
    'clf__min_samples_split': randint(5, 10),
    'clf__criterion': ['gini', 'entropy'],
    'clf__max_samples': uniform(0.1, 0.9)
},
{
    'clf': [CatBoostClassifier()],
    'clf__depth': randint(4, 10),
    'clf__learning_rate': uniform(0.1, 0.4),
    'clf__l2_leaf_reg': randint(1, 10),
    'clf__iterations': randint(200, 400)
},
{
    'clf': [XGBClassifier()],
    'clf__class_weight': [None, 'balanced'],
    'clf__max_depth': randint(3, 10),
    'clf__learning_rate': uniform(0.2, 0.1),
    'clf__subsample': uniform(0.75, 0.25),
    'clf__colsample_bytree': uniform(0.75, 0.25),
    'clf__gamma': uniform(0, 0.5),
    'clf__min_child_weight': randint(1, 10)
}
]

random_search = RandomizedSearchCV(pipe,
                                   param_distributions,
                                   n_iter=100,
                                   cv=10,
                                   verbose=2,
                                   random_state=SEED,
                                   scoring='f1_macro',
                                   n_jobs=-1)

%time random_search.fit(X_train, y_train) 
print(random_search.best_params_)
print(random_search.best_score_)
print(random_search.score(X_test, y_test))  

Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV] END clf=XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...), clf__class_weight=balanced, clf__colsample_bytree=1.2557340860902042, clf__gamma=0.14883575458027015, clf__learning_rate=0.030226971886820067, clf__max_depth=3, clf__min_child