In [47]:
import pandas as pd
from numpy.random import seed
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from yellowbrick.classifier.classification_report import classification_report as yb_report
from yellowbrick.classifier.threshold import discrimination_threshold
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from mlxtend.evaluate import mcnemar_tables, mcnemar
from classifier import *

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [2]:
features = pd.read_csv('dataset.csv')
labels = features['label']
features = features.drop(labels=['token', 'length', 'label'], axis=1)
features = features.drop(labels='all.upper', axis=1)
features.head()

Unnamed: 0,has.vowels,has.special,just.letters,roman,english.word,long.char.seq,starts.with.two,pos,prev.pos,prev.pos2,next.pos,next.pos2,prev.dep,prev.dep2,next.dep,next.dep2
0,True,False,True,False,False,False,False,PROPN,,,PROPN,NOUN,,,compound,compound
1,True,False,True,False,False,False,False,PROPN,,,PROPN,NOUN,,,compound,root
2,True,False,True,False,False,False,False,PROPN,,,PROPN,PROPN,,,compound,compound
3,True,False,True,False,False,False,False,PROPN,,,PROPN,,,,root,
4,True,False,True,False,False,False,False,PROPN,,,PROPN,PUNCT,,,flat,punct


In [4]:
seed(SEED)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=SEED, stratify=labels)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(11605, 16) (4974, 16) (11605,) (4974,)


Training dataset distribution

In [5]:
y_train.value_counts()

0    10560
1     1045
Name: label, dtype: int64

Testing dataset distribution

In [6]:
y_test.value_counts()

0    4526
1     448
Name: label, dtype: int64

### Test simple CatBoost classifier

In [7]:
clf, results, feature_names = train_classifier(X_train, y_train, 'classifier-sklearn', X_test, y_test)

In [10]:
print(classification_report(results['predicted'], results['original']))

              precision    recall  f1-score   support

           0       0.98      0.96      0.97      4624
           1       0.63      0.81      0.71       350

    accuracy                           0.95      4974
   macro avg       0.81      0.88      0.84      4974
weighted avg       0.96      0.95      0.96      4974



Confusion matrix

Feature importance

In [9]:
importance = clf.get_feature_importance(prettified=True)
importance['Feature Id'] = importance['Feature Id'].astype(int)
importance = pd.DataFrame(feature_names, columns=['feature']).reset_index()\
    .merge(importance, left_on='index', right_on='Feature Id')\
    .drop(labels='index', axis=1)
importance

Unnamed: 0,feature,Feature Id,Importances
0,encoder__x0_0,0,3.344331
1,encoder__x0_1,1,4.360279
2,encoder__x1_0,2,0.000000
3,encoder__x2_0,3,0.026742
4,encoder__x2_1,4,0.056007
...,...,...,...
265,encoder__x15_39,265,0.092724
266,encoder__x15_40,266,0.111859
267,encoder__x15_41,267,0.279886
268,encoder__x15_42,268,0.012579


### Testing ensembles

In [10]:
clf2, results2, feature_names = train_classifier(X_train, y_train, 'classifier-voting', X_test, y_test, classifier=voting_classifier)

Training classifier for subsample 1
Training classifier for subsample 2
Training classifier for subsample 3
Training classifier for subsample 4
Training classifier for subsample 5
Training classifier for subsample 6
Training classifier for subsample 7
Training classifier for subsample 8
Training classifier for subsample 9
Training classifier for subsample 10


In [11]:
print(classification_report(results2['predicted'], results2['original']))

              precision    recall  f1-score   support

           0       0.98      0.96      0.97      4624
           1       0.63      0.81      0.71       350

    accuracy                           0.95      4974
   macro avg       0.81      0.88      0.84      4974
weighted avg       0.96      0.95      0.96      4974



In [12]:
print(confusion_matrix(results2['predicted'], results2['original']))

[[4458  166]
 [  68  282]]


### Testing weighted CatBoost

In [11]:
clf3, results3, _ = train_classifier(X_train, y_train, None, X_test, y_test, classifier=weighted_catboost_classifier)

In [12]:
print(classification_report(results3['predicted'], results3['original']))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      4454
           1       0.78      0.68      0.73       520

    accuracy                           0.95      4974
   macro avg       0.87      0.83      0.85      4974
weighted avg       0.94      0.95      0.94      4974



In [13]:
print(confusion_matrix(results3['predicted'], results3['original']))

[[4357   97]
 [ 169  351]]


### Random Forest classifier

In [60]:
def random_forest_classifier(train_data, labels):
    clf = RandomForestClassifier(n_estimators=200, class_weight='balanced')
    clf.fit(train_data, labels)
    return clf

In [61]:
rf, results4, _ = train_classifier(X_train, y_train, None, X_test, y_test, classifier=random_forest_classifier)

In [62]:
print(classification_report(results4['predicted'], results4['original']))

              precision    recall  f1-score   support

           0       0.98      0.97      0.97      4577
           1       0.67      0.76      0.71       397

    accuracy                           0.95      4974
   macro avg       0.83      0.86      0.84      4974
weighted avg       0.95      0.95      0.95      4974



In [17]:
print(confusion_matrix(results4['predicted'], results4['original']))

[[4458  168]
 [  68  280]]


### CatBoost classifier with categorical input

In [18]:
cb = plain_catboost_classifier(X_train, y_train)


bestTest = 0.1356332301
bestIteration = 294

0:	loss: 0.1356332	best: 0.1356332 (0)	total: 7.52s	remaining: 3m 37s

bestTest = 0.1358253038
bestIteration = 74

1:	loss: 0.1358253	best: 0.1356332 (0)	total: 15.1s	remaining: 3m 31s

bestTest = 0.1359034238
bestIteration = 373

2:	loss: 0.1359034	best: 0.1356332 (0)	total: 22.1s	remaining: 3m 19s

bestTest = 0.1378345669
bestIteration = 61

3:	loss: 0.1378346	best: 0.1356332 (0)	total: 29.4s	remaining: 3m 11s

bestTest = 0.1353892742
bestIteration = 347

4:	loss: 0.1353893	best: 0.1353893 (4)	total: 36.4s	remaining: 3m 2s

bestTest = 0.1352748354
bestIteration = 196

5:	loss: 0.1352748	best: 0.1352748 (5)	total: 43.8s	remaining: 2m 55s

bestTest = 0.1353923589
bestIteration = 497

6:	loss: 0.1353924	best: 0.1352748 (5)	total: 51.3s	remaining: 2m 48s

bestTest = 0.1356291673
bestIteration = 130

7:	loss: 0.1356292	best: 0.1352748 (5)	total: 59s	remaining: 2m 42s

bestTest = 0.135922902
bestIteration = 687

8:	loss: 0.1359229	best: 0.13527

In [19]:
test_data = X_test.copy()
test_data.fillna(value='NA', inplace=True)
results5 = cb.predict(test_data.astype('category'))

In [20]:
print(classification_report(results5, y_test))

              precision    recall  f1-score   support

           0       0.98      0.97      0.98      4570
           1       0.70      0.77      0.73       404

    accuracy                           0.95      4974
   macro avg       0.84      0.87      0.85      4974
weighted avg       0.96      0.95      0.96      4974



In [21]:
cb.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,pos,13.860154
1,next.dep,13.507289
2,next.pos,11.556329
3,english.word,9.913548
4,prev.dep,8.913898
5,prev.pos,8.666733
6,next.dep2,8.609144
7,next.pos2,8.469728
8,prev.dep2,7.44726
9,prev.pos2,5.286422


### Weighted CatBoost classifier with categorical input 

In [22]:
cb = plain_catboost_classifier(X_train, y_train, weighted=True)


bestTest = 0.3288358029
bestIteration = 180

0:	loss: 0.3288358	best: 0.3288358 (0)	total: 7.56s	remaining: 3m 39s

bestTest = 0.3317518728
bestIteration = 23

1:	loss: 0.3317519	best: 0.3288358 (0)	total: 15.5s	remaining: 3m 36s

bestTest = 0.3316208746
bestIteration = 112

2:	loss: 0.3316209	best: 0.3288358 (0)	total: 23s	remaining: 3m 26s

bestTest = 0.3282740598
bestIteration = 27

3:	loss: 0.3282741	best: 0.3282741 (3)	total: 30.8s	remaining: 3m 19s

bestTest = 0.3274094285
bestIteration = 102

4:	loss: 0.3274094	best: 0.3274094 (4)	total: 38s	remaining: 3m 9s

bestTest = 0.3296085643
bestIteration = 67

5:	loss: 0.3296086	best: 0.3274094 (4)	total: 46s	remaining: 3m 3s

bestTest = 0.3296851484
bestIteration = 95

6:	loss: 0.3296851	best: 0.3274094 (4)	total: 53.4s	remaining: 2m 55s

bestTest = 0.3295217145
bestIteration = 35

7:	loss: 0.3295217	best: 0.3274094 (4)	total: 1m 1s	remaining: 2m 48s

bestTest = 0.3289116161
bestIteration = 108

8:	loss: 0.3289116	best: 0.3274094 (4)	

In [23]:
test_data = X_test.copy()
test_data.fillna(value='NA', inplace=True)
results6 = cb.predict(test_data.astype('category'))
print(classification_report(results6, y_test))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97      4508
           1       0.75      0.73      0.74       466

    accuracy                           0.95      4974
   macro avg       0.86      0.85      0.86      4974
weighted avg       0.95      0.95      0.95      4974



### XGBoost classifier

In [24]:
params = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1.5, 5],
    'subsample': [0.6, 1.0],
    'colsample_bytree': [0.6, 1.0],
    'max_depth': [3, 5]
}

def xgb_classifier(train_data, labels):
    clf = XGBClassifier(learning_rate=0.01, n_estimators=100)
    search = GridSearchCV(clf, param_grid=params, cv=2, n_jobs=1)
    search.fit(train_data, labels)
    return search.best_estimator_

In [25]:
%%capture
xgb, resxgb, _ = train_classifier(X_train, y_train, None, X_test, y_test, classifier=xgb_classifier)

In [27]:
print(classification_report(resxgb['predicted'], resxgb['original']))

              precision    recall  f1-score   support

           0       0.98      0.97      0.97      4570
           1       0.69      0.76      0.73       404

    accuracy                           0.95      4974
   macro avg       0.83      0.87      0.85      4974
weighted avg       0.96      0.95      0.95      4974



### Weighted XGBoost classifier

In [28]:
def xgb_classifier(train_data, labels):
    classes=np.unique(y_train)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    clf = XGBClassifier(learning_rate=0.01, n_estimators=100, scale_pos_weight=np.max(weights/np.min(weights)))
    search = GridSearchCV(clf, param_grid=params, cv=2, n_jobs=1)
    search.fit(train_data, labels)
    return search.best_estimator_

In [29]:
%%capture
xgb, resxgb2, _ = train_classifier(X_train, y_train, None, X_test, y_test, classifier=xgb_classifier)

In [30]:
print(classification_report(resxgb2['predicted'], resxgb2['original']))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      4466
           1       0.77      0.68      0.72       508

    accuracy                           0.95      4974
   macro avg       0.86      0.83      0.84      4974
weighted avg       0.94      0.95      0.94      4974



### McNemara tables test

Statistically compare RandomForest, weighted CatBoost and weighted XGBoost classifier perfromance:

In [63]:
contingency = mcnemar_tables(results['original'], results3['predicted'], results4['predicted'], resxgb2['predicted']) 
model_names = {'model_0': 'RandomForest', 'model_1': 'CatBoost', 'model_2': 'XGBoost'}
keys = list(tbl.keys())
for oldname, newname in model_names.items():
    keys = list(map(lambda x: x.replace(oldname, newname), keys))
contingency = dict(zip(keys, tbl.values()))
pvals = {key: mcnemar(val)[1] for key, val in contingency.items()} 
pvals

{'RandomForest vs CatBoost': 0.028818733250275556,
 'RandomForest vs XGBoost': 0.6830913983096086,
 'CatBoost vs XGBoost': 0.010428224435337065}

## Test GradientBoosting and Random Forest classifiers with optimal threshold identification

In [None]:
train_data, train_labels, encoders, ohe, label_encoder = preprocess_data(X_train, y_train)
test_data, test_labels = preprocess_test_data(X_test, y_test, encoders, ohe, label_encoder)

In [None]:
gbm = GradientBoostingClassifier()
yb_report(gbm, train_data, train_labels, X_test=test_data, y_test=test_labels)

In [None]:
discrimination_threshold(gbm, train_data, train_labels)

In [None]:
rf = RandomForestClassifier(n_estimators=200)
discrimination_threshold(rf, train_data, train_labels)