In [1]:
import os
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from classifier import *

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
X_train, y_train = load_data(os.path.join('features', 'combined', 'train.csv'))
X_test, y_test = load_data(os.path.join('features', 'combined', 'test.csv'))
X_train.head()

Unnamed: 0,word.isupper,word.istitle,word.isdigit,postag,dep,dep.head.pos,+1:word.istitle,+1:word.isupper,+1:word.isdigit,+1:postag,...,-1:word.isdigit,-1:postag,-2:word.istitle,-2:word.isupper,-2:word.isdigit,-2:postag,-3:word.istitle,-3:word.isupper,-3:word.isdigit,-3:postag
0,0,1,0,DET,det,PROPN,1,0,0,PROPN,...,0,,0,0,0,,0,0,0,
1,0,1,0,PROPN,compound,PROPN,1,0,0,PROPN,...,0,DET,0,0,0,,0,0,0,
2,0,1,0,PROPN,nsubj,VERB,0,0,0,VERB,...,0,PROPN,1,0,0,DET,0,0,0,
3,0,0,0,VERB,root,,0,0,0,ADP,...,0,PROPN,1,0,0,PROPN,1,0,0,DET
4,0,0,0,ADP,case,NOUN,0,0,0,DET,...,0,VERB,1,0,0,PROPN,1,0,0,PROPN


In [3]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(52540, 36) (17795, 36) (52540,) (17795,)


Training dataset distribution

In [4]:
y_train.value_counts()

C      19819
E      19012
O       9511
B-E     1051
B-C     1050
E-C     1049
E-E     1048
Name: label, dtype: int64

Testing dataset distribution

In [5]:
y_test.value_counts()

C      6834
E      6636
O      2921
E-E     351
B-E     351
B-C     351
E-C     351
Name: label, dtype: int64

### Test simple CatBoost classifier

In [6]:
clf, results, feature_names = train_classifier(X_train, y_train, 'classifier-sklearn', X_test, y_test)

  return f(*args, **kwargs)


In [7]:
results

Unnamed: 0,predicted,original,Class_0,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6
0,B-C,B-C,0.912650,0.072883,0.009014,0.002331,0.000240,0.000254,0.002628
1,C,C,0.018887,0.005942,0.866880,0.072092,0.008557,0.016650,0.010993
2,C,C,0.013077,0.001759,0.901403,0.061277,0.000649,0.000220,0.021616
3,C,C,0.224557,0.004290,0.673280,0.083002,0.001650,0.000648,0.012575
4,C,C,0.003061,0.001320,0.912399,0.075321,0.000396,0.000079,0.007424
...,...,...,...,...,...,...,...,...,...
17790,C,C,0.000298,0.001273,0.496306,0.314934,0.000053,0.000562,0.186575
17791,C,C,0.001534,0.002882,0.680633,0.187092,0.000043,0.000013,0.127802
17792,C,C,0.000081,0.000025,0.634560,0.262804,0.000171,0.000090,0.102270
17793,C,C,0.000016,0.000069,0.650182,0.235493,0.011198,0.006582,0.096460


In [8]:
print(classification_report(results['predicted'], results['original']))

              precision    recall  f1-score   support

         B-C       0.71      0.69      0.70       361
         B-E       0.47      0.63      0.54       262
           C       0.76      0.62      0.68      8448
           E       0.66      0.63      0.64      6871
         E-C       0.53      0.64      0.58       288
         E-E       0.63      0.66      0.64       336
           O       0.23      0.54      0.32      1229

    accuracy                           0.62     17795
   macro avg       0.57      0.63      0.59     17795
weighted avg       0.67      0.62      0.64     17795



Confusion matrix

In [9]:
print(confusion_matrix(results['predicted'], results['original']))

[[ 250   74    7    2    0    0   28]
 [  23  164    9   12    0    0   54]
 [  49   32 5198 2040   53   37 1039]
 [  23   51 1330 4354   32   33 1048]
 [   0    0   13   10  185   45   35]
 [   0    0    1    3   63  221   48]
 [   6   30  276  215   18   15  669]]


Feature importance

In [10]:
importance = clf.get_feature_importance(prettified=True)
importance['Feature Id'] = importance['Feature Id'].astype(int)
importance = pd.DataFrame(feature_names, columns=['feature']).reset_index()\
    .merge(importance, left_on='index', right_on='Feature Id')\
    .drop(labels='index', axis=1)
importance

Unnamed: 0,feature,Feature Id,Importances
0,encoder__x0_0,0,0.162115
1,encoder__x0_1,1,0.750867
2,encoder__x0_2,2,0.055671
3,encoder__x0_3,3,0.266657
4,encoder__x0_4,4,0.128237
...,...,...,...
381,-2:word.isupper,381,0.243191
382,-2:word.isdigit,382,0.112362
383,-3:word.istitle,383,0.987705
384,-3:word.isupper,384,0.198616


### Testing ensembles

In [11]:
clf2, results2, feature_names = train_classifier(X_train, y_train, 'classifier-voting', X_test, y_test, classifier=voting_classifier)

Training classifier for subsample 1
Training classifier for subsample 2
Training classifier for subsample 3
Training classifier for subsample 4
Training classifier for subsample 5
Training classifier for subsample 6
Training classifier for subsample 7
Training classifier for subsample 8
Training classifier for subsample 9
Training classifier for subsample 10


In [12]:
print(classification_report(results2['predicted'], results2['original']))

              precision    recall  f1-score   support

         B-C       0.71      0.69      0.70       361
         B-E       0.47      0.63      0.54       262
           C       0.76      0.62      0.68      8448
           E       0.66      0.63      0.64      6871
         E-C       0.53      0.64      0.58       288
         E-E       0.63      0.66      0.64       336
           O       0.23      0.54      0.32      1229

    accuracy                           0.62     17795
   macro avg       0.57      0.63      0.59     17795
weighted avg       0.67      0.62      0.64     17795



In [13]:
print(confusion_matrix(results2['predicted'], results2['original']))

[[ 250   74    7    2    0    0   28]
 [  23  164    9   12    0    0   54]
 [  49   32 5198 2040   53   37 1039]
 [  23   51 1330 4354   32   33 1048]
 [   0    0   13   10  185   45   35]
 [   0    0    1    3   63  221   48]
 [   6   30  276  215   18   15  669]]


### Testing weighted CatBoost

In [14]:
clf3, results3, _ = train_classifier(X_train, y_train, None, X_test, y_test, classifier=weighted_catboost_classifier)

  return f(*args, **kwargs)


In [15]:
print(classification_report(results3['predicted'], results3['original']))

              precision    recall  f1-score   support

         B-C       0.77      0.42      0.54       652
         B-E       0.71      0.38      0.49       654
           C       0.62      0.65      0.63      6546
           E       0.55      0.68      0.61      5404
         E-C       0.71      0.35      0.47       713
         E-E       0.73      0.42      0.53       605
           O       0.47      0.42      0.44      3221

    accuracy                           0.58     17795
   macro avg       0.65      0.47      0.53     17795
weighted avg       0.59      0.58      0.57     17795



In [16]:
print(confusion_matrix(results3['predicted'], results3['original']))

[[ 271   75  132   98    1    1   74]
 [  43  248  114  130    0    0  119]
 [  17    9 4227 1622   17   16  638]
 [  14   14 1174 3662    5   10  525]
 [   2    0  164  132  249   63  103]
 [   1    0   87   96   70  255   96]
 [   3    5  936  896    9    6 1366]]


### Random Forest classifier

In [13]:
rf, results4, _ = train_classifier(X_train, y_train, None, X_test, y_test, classifier=random_forest_classifier)

In [14]:
print(classification_report(results4['predicted'], results4['original']))

              precision    recall  f1-score   support

           C       0.74      0.63      0.68      8594
           E       0.63      0.65      0.64      7113
           O       0.37      0.55      0.45      2098

    accuracy                           0.63     17805
   macro avg       0.58      0.61      0.59     17805
weighted avg       0.65      0.63      0.64     17805



In [15]:
print(confusion_matrix(results4['predicted'], results4['original']))

[[5425 2179  990]
 [1506 4646  961]
 [ 404  531 1163]]


### CatBoost classifier with categorical input

In [6]:
cb = plain_catboost_classifier(X_train, y_train)

Learning rate set to 0.096843
0:	learn: 1.8135921	total: 799ms	remaining: 13m 17s
1:	learn: 1.7099359	total: 1.43s	remaining: 11m 55s
2:	learn: 1.6277905	total: 1.96s	remaining: 10m 51s
3:	learn: 1.5591303	total: 2.52s	remaining: 10m 28s
4:	learn: 1.5016072	total: 3.12s	remaining: 10m 20s
5:	learn: 1.4548277	total: 3.76s	remaining: 10m 22s
6:	learn: 1.4132600	total: 4.34s	remaining: 10m 15s
7:	learn: 1.3780887	total: 5.06s	remaining: 10m 27s
8:	learn: 1.3474487	total: 5.78s	remaining: 10m 36s
9:	learn: 1.3193963	total: 6.57s	remaining: 10m 50s
10:	learn: 1.2941938	total: 7.22s	remaining: 10m 49s
11:	learn: 1.2714629	total: 7.96s	remaining: 10m 55s
12:	learn: 1.2520669	total: 8.66s	remaining: 10m 57s
13:	learn: 1.2361493	total: 9.42s	remaining: 11m 3s
14:	learn: 1.2202295	total: 10.1s	remaining: 11m 4s
15:	learn: 1.2057064	total: 10.8s	remaining: 11m 4s
16:	learn: 1.1949110	total: 11.5s	remaining: 11m 6s
17:	learn: 1.1834001	total: 12.3s	remaining: 11m 8s
18:	learn: 1.1732213	total: 13s

In [7]:
import joblib
output_dir = 'classifier-catboost'
if not os.path.isdir(output_dir):
    os.mkdir(output_dir)
joblib.dump(cb, os.path.join(output_dir, 'model.joblib'))

['classifier-catboost/model.joblib']

In [8]:
test_data = X_test.copy()
test_data.fillna(value='NA', inplace=True)
categorical_features = test_data.columns[test_data.dtypes == object]
test_data[categorical_features] = test_data[categorical_features].astype('category')
predicted = cb.predict(test_data)

In [9]:
print(classification_report(predicted, y_test))

              precision    recall  f1-score   support

         B-C       0.66      0.71      0.68       330
         B-E       0.47      0.63      0.54       261
           C       0.75      0.61      0.68      8402
           E       0.65      0.63      0.64      6781
         E-C       0.49      0.67      0.56       258
         E-E       0.60      0.71      0.65       298
           O       0.28      0.57      0.38      1465

    accuracy                           0.62     17795
   macro avg       0.56      0.65      0.59     17795
weighted avg       0.66      0.62      0.63     17795



In [9]:
cb.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,-3:postag,10.963038
1,-1:postag,9.44688
2,-2:postag,8.89617
3,dep,7.906769
4,+3:postag,7.766352
5,-1:dep,7.1051
6,-3:dep,6.785113
7,-2:dep,6.595109
8,postag,6.282959
9,+2:postag,5.807175


### Weighted CatBoost classifier with categorical input 

In [12]:
cb = plain_catboost_classifier(X_train, y_train, weighted=True)


bestTest = 0.8562652585
bestIteration = 999

0:	loss: 0.8562653	best: 0.8562653 (0)	total: 3m 29s	remaining: 3m 29s

bestTest = 0.856993063
bestIteration = 999

1:	loss: 0.8569931	best: 0.8562653 (0)	total: 7m 9s	remaining: 0us
Estimating final quality...


In [13]:
test_data = X_test.copy()
test_data.fillna(value='NA', inplace=True)
categorical_features = test_data.columns[test_data.dtypes == object]
test_data[categorical_features] = test_data[categorical_features].astype('category')
predicted = cb.predict(test_data)
print(classification_report(predicted, y_test))

              precision    recall  f1-score   support

           C       0.69      0.66      0.67     11454
           E       0.56      0.68      0.61      8799
           _       0.57      0.44      0.50      5933

    accuracy                           0.61     26186
   macro avg       0.61      0.59      0.59     26186
weighted avg       0.62      0.61      0.61     26186

