In [21]:
import glob
import re

import pandas as pd
from scipy.stats import randint, uniform

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline

In [22]:
SEED = 8455

In [23]:
cols_to_skip = ['title', 'raw_text', 'language']

In [24]:
df_html_data = pd.concat([pd.read_csv(csv, lineterminator='\n', usecols=lambda x: x not in cols_to_skip)
               for csv in glob.glob('../data/csv/*.csv')], ignore_index=True)
df_file_labels = pd.read_csv('../data/html_targets.csv')
df = df_html_data.merge(df_file_labels, left_on='filename', right_on='file')
df = df.drop(['file', 'filename'], axis=1)

In [26]:
X = df.drop(['sponsored'], axis=1)
y = df['sponsored']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

### RF Model

In [40]:
clf = RandomForestClassifier()

In [44]:
param_distributions = {
    'n_estimators': randint(100, 300),
    'class_weight': ['balanced'],
    'min_samples_split': randint(5, 9),
    'criterion': ['gini', 'entropy'],
    'max_samples': uniform(0.75, 0.25)
}

random_search = RandomizedSearchCV(clf,
                                   param_distributions,
                                   n_iter=200,
                                   cv=10,
                                   verbose=2,
                                   scoring='f1_macro',
                                   n_jobs=-1)

%time random_search.fit(X_train, y_train) 
print(random_search.best_params_)
print(random_search.best_score_)
print(random_search.score(X_test, y_test))  

Fitting 10 folds for each of 200 candidates, totalling 2000 fits
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.7658118769573491, min_samples_split=6, n_estimators=246; total time= 4.6min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.8288564849441977, min_samples_split=8, n_estimators=244; total time= 4.2min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.9460205249489678, min_samples_split=5, n_estimators=271; total time= 5.1min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.9460205249489678, min_samples_split=5, n_estimators=271; total time= 5.1min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.9780755199426211, min_samples_split=6, n_estimators=152; total time= 3.3min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.9780755199426211, min_samples_split=6, n_estimators=152; total time= 3.3min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.942076860075183, min_samples_split=6

[CV] END class_weight=balanced, criterion=entropy, max_samples=0.7658118769573491, min_samples_split=6, n_estimators=246; total time= 4.6min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.7658118769573491, min_samples_split=6, n_estimators=246; total time= 4.5min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.9460205249489678, min_samples_split=5, n_estimators=271; total time= 5.1min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.8420218608122425, min_samples_split=8, n_estimators=119; total time= 2.3min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.8420218608122425, min_samples_split=8, n_estimators=119; total time= 2.3min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.8420218608122425, min_samples_split=8, n_estimators=119; total time= 2.4min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.9780755199426211, min_samples_split=6, n_estimators=152; total time= 3.2min
[CV] END class_w

[CV] END class_weight=balanced, criterion=entropy, max_samples=0.7658118769573491, min_samples_split=6, n_estimators=246; total time= 4.6min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.8288564849441977, min_samples_split=8, n_estimators=244; total time= 4.2min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.8288564849441977, min_samples_split=8, n_estimators=244; total time= 4.2min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.9460205249489678, min_samples_split=5, n_estimators=271; total time= 5.1min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.8420218608122425, min_samples_split=8, n_estimators=119; total time= 2.4min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.9780755199426211, min_samples_split=6, n_estimators=152; total time= 3.3min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.942076860075183, min_samples_split=6, n_estimators=285; total time= 5.4min
[CV] END class_weight=bala

[CV] END class_weight=balanced, criterion=entropy, max_samples=0.7658118769573491, min_samples_split=6, n_estimators=246; total time= 4.5min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.7658118769573491, min_samples_split=6, n_estimators=246; total time= 4.5min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.9460205249489678, min_samples_split=5, n_estimators=271; total time= 5.1min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.8420218608122425, min_samples_split=8, n_estimators=119; total time= 2.3min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.8420218608122425, min_samples_split=8, n_estimators=119; total time= 2.3min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.8420218608122425, min_samples_split=8, n_estimators=119; total time= 2.4min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.9780755199426211, min_samples_split=6, n_estimators=152; total time= 3.3min
[CV] END class_w

[CV] END class_weight=balanced, criterion=entropy, max_samples=0.7658118769573491, min_samples_split=6, n_estimators=246; total time= 4.6min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.8288564849441977, min_samples_split=8, n_estimators=244; total time= 4.2min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.9460205249489678, min_samples_split=5, n_estimators=271; total time= 5.0min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.9460205249489678, min_samples_split=5, n_estimators=271; total time= 5.1min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.9780755199426211, min_samples_split=6, n_estimators=152; total time= 3.3min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.9780755199426211, min_samples_split=6, n_estimators=152; total time= 3.3min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.942076860075183, min_samples_split=6, n_estimators=285; total time= 5.4min
[CV] END class_weight=bala

[CV] END class_weight=balanced, criterion=entropy, max_samples=0.7658118769573491, min_samples_split=6, n_estimators=246; total time= 4.6min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.8288564849441977, min_samples_split=8, n_estimators=244; total time= 4.2min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.8288564849441977, min_samples_split=8, n_estimators=244; total time= 4.2min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.9460205249489678, min_samples_split=5, n_estimators=271; total time= 5.0min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.8420218608122425, min_samples_split=8, n_estimators=119; total time= 2.3min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.9780755199426211, min_samples_split=6, n_estimators=152; total time= 3.3min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.942076860075183, min_samples_split=6, n_estimators=285; total time= 5.3min
[CV] END class_weight=bala

[CV] END class_weight=balanced, criterion=entropy, max_samples=0.7658118769573491, min_samples_split=6, n_estimators=246; total time= 4.6min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.8288564849441977, min_samples_split=8, n_estimators=244; total time= 4.2min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.8288564849441977, min_samples_split=8, n_estimators=244; total time= 4.2min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.9460205249489678, min_samples_split=5, n_estimators=271; total time= 5.1min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.8420218608122425, min_samples_split=8, n_estimators=119; total time= 2.4min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.9780755199426211, min_samples_split=6, n_estimators=152; total time= 3.3min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.942076860075183, min_samples_split=6, n_estimators=285; total time= 5.4min
[CV] END class_weight=bala

[CV] END class_weight=balanced, criterion=entropy, max_samples=0.7658118769573491, min_samples_split=6, n_estimators=246; total time= 4.6min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.8288564849441977, min_samples_split=8, n_estimators=244; total time= 4.2min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.8288564849441977, min_samples_split=8, n_estimators=244; total time= 4.2min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.9460205249489678, min_samples_split=5, n_estimators=271; total time= 5.1min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.8420218608122425, min_samples_split=8, n_estimators=119; total time= 2.4min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.9780755199426211, min_samples_split=6, n_estimators=152; total time= 3.3min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.942076860075183, min_samples_split=6, n_estimators=285; total time= 5.4min
[CV] END class_weight=bala

[CV] END class_weight=balanced, criterion=entropy, max_samples=0.9908390881563032, min_samples_split=8, n_estimators=159; total time= 3.5min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.7784889493326362, min_samples_split=7, n_estimators=113; total time= 1.9min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.8413318227628331, min_samples_split=7, n_estimators=167; total time= 3.3min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.8592515517525221, min_samples_split=5, n_estimators=186; total time= 3.3min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.8592515517525221, min_samples_split=5, n_estimators=186; total time= 3.3min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.8138148522565323, min_samples_split=7, n_estimators=147; total time= 2.5min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.9054037224917137, min_samples_split=8, n_estimators=179; total time= 3.3min
[CV] END class_weight=balanc

[CV] END class_weight=balanced, criterion=entropy, max_samples=0.9908390881563032, min_samples_split=8, n_estimators=159; total time= 3.5min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.7784889493326362, min_samples_split=7, n_estimators=113; total time= 1.9min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.7784889493326362, min_samples_split=7, n_estimators=113; total time= 1.8min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.8413318227628331, min_samples_split=7, n_estimators=167; total time= 3.2min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.8592515517525221, min_samples_split=5, n_estimators=186; total time= 3.3min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.8138148522565323, min_samples_split=7, n_estimators=147; total time= 2.5min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.9054037224917137, min_samples_split=8, n_estimators=179; total time= 3.2min
[CV] END class_weight=balanc

[CV] END class_weight=balanced, criterion=gini, max_samples=0.7784889493326362, min_samples_split=7, n_estimators=113; total time= 1.9min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.8413318227628331, min_samples_split=7, n_estimators=167; total time= 3.3min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.8592515517525221, min_samples_split=5, n_estimators=186; total time= 3.3min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.8138148522565323, min_samples_split=7, n_estimators=147; total time= 2.5min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.8138148522565323, min_samples_split=7, n_estimators=147; total time= 2.5min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.9054037224917137, min_samples_split=8, n_estimators=179; total time= 3.2min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.8705154822539748, min_samples_split=7, n_estimators=122; total time= 2.5min
[CV] END class_weight=balanc

[CV] END class_weight=balanced, criterion=entropy, max_samples=0.7500616706651135, min_samples_split=6, n_estimators=160; total time= 2.9min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.9908390881563032, min_samples_split=8, n_estimators=159; total time= 3.5min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.7784889493326362, min_samples_split=7, n_estimators=113; total time= 1.9min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.7784889493326362, min_samples_split=7, n_estimators=113; total time= 1.8min
[CV] END class_weight=balanced, criterion=entropy, max_samples=0.8413318227628331, min_samples_split=7, n_estimators=167; total time= 3.2min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.8592515517525221, min_samples_split=5, n_estimators=186; total time= 3.3min
[CV] END class_weight=balanced, criterion=gini, max_samples=0.8138148522565323, min_samples_split=7, n_estimators=147; total time= 2.5min
[CV] END class_weight=bal

CPU times: user 4min 2s, sys: 14.3 s, total: 4min 16s
Wall time: 15h 36min 7s
{'class_weight': 'balanced', 'criterion': 'entropy', 'max_samples': 0.9881470567266184, 'min_samples_split': 8, 'n_estimators': 270}
0.787732101623682
0.7879615942062024


In [47]:
dff = pd.DataFrame(random_search.cv_results_).sort_values(by='rank_test_score')

In [48]:
dff.to_csv('webapp_hp_2.csv', index=False)

In [41]:
# first run
pd.DataFrame(random_search.cv_results_).sort_values(by='rank_test_score')[:20]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_samples,param_min_samples_split,param_n_estimators,params,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
27,153.997291,0.691942,1.480219,0.015767,balanced,entropy,0.991148,8,123,"{'class_weight': 'balanced', 'criterion': 'ent...",...,0.793141,0.787635,0.785128,0.784347,0.783349,0.793067,0.788598,0.786752,0.003881,1
14,285.855045,1.192394,3.606042,0.237547,balanced,gini,0.985554,8,253,"{'class_weight': 'balanced', 'criterion': 'gin...",...,0.792393,0.786958,0.784776,0.781301,0.782866,0.791673,0.78877,0.78564,0.004197,2
58,271.321293,0.750601,2.810344,0.10093,balanced,entropy,0.863027,8,237,"{'class_weight': 'balanced', 'criterion': 'ent...",...,0.792831,0.786159,0.783334,0.783719,0.78001,0.790837,0.785948,0.784973,0.004111,3
21,199.711382,0.972134,2.190346,0.01128,balanced,gini,0.961515,8,181,"{'class_weight': 'balanced', 'criterion': 'gin...",...,0.791745,0.784577,0.784019,0.782992,0.782766,0.792325,0.786209,0.784903,0.004189,4
15,267.114416,0.726086,3.329233,0.203265,balanced,gini,0.93266,8,245,"{'class_weight': 'balanced', 'criterion': 'gin...",...,0.792863,0.784305,0.785518,0.781829,0.781551,0.790482,0.786368,0.784462,0.004475,5
0,192.664166,5.816981,2.727787,0.134727,balanced,entropy,0.93618,7,148,"{'class_weight': 'balanced', 'criterion': 'ent...",...,0.791429,0.78379,0.78122,0.784349,0.781813,0.791771,0.786885,0.784404,0.004576,6
18,177.215345,1.022797,1.950425,0.031073,balanced,gini,0.980806,7,158,"{'class_weight': 'balanced', 'criterion': 'gin...",...,0.790836,0.78263,0.782731,0.780333,0.782031,0.788201,0.786255,0.783626,0.003736,7
32,208.995564,7.352111,1.995816,0.013213,balanced,entropy,0.977266,6,165,"{'class_weight': 'balanced', 'criterion': 'ent...",...,0.791489,0.78317,0.782334,0.782201,0.781749,0.790475,0.78567,0.783606,0.004409,8
33,216.674786,8.055897,2.398435,0.015085,balanced,gini,0.911332,8,201,"{'class_weight': 'balanced', 'criterion': 'gin...",...,0.792535,0.783676,0.78263,0.782256,0.779229,0.78879,0.7867,0.783564,0.004339,9
34,251.074862,12.642574,2.856391,0.156463,balanced,gini,0.944288,7,221,"{'class_weight': 'balanced', 'criterion': 'gin...",...,0.791235,0.783518,0.782553,0.781203,0.781557,0.788996,0.785788,0.783387,0.004368,10
