In [160]:
import warnings
import logging
import re
from functools import partial

# External imports
import pandas as pd, numpy as np, seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import joblib
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_validate
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import (
    precision_recall_curve, 
    roc_curve,
    fbeta_score,
    confusion_matrix,
    precision_score,
    recall_score,
    make_scorer
)
from sklearn.exceptions import UndefinedMetricWarning
from xgboost import XGBClassifier

%matplotlib inline
%load_ext autoreload
%autoreload 2

from lib.jupyter_helpers import notebook_setup
notebook_setup()

log = logging.getLogger(__name__)
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
)

# warnings.filterwarnings("ignore", category=DeprecationWarning)
# warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
path = "../title_detector/sample/train_sections_data.csv"
df = pd.read_csv(path, encoding="latin1")
import re
def to_snake(name):
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
df.columns = [to_snake(col) for col in df.columns]
df.head(2)

Unnamed: 0,text,is_bold,is_italic,is_underlined,left,right,top,bottom,font_type,label,unnamed: 10,unnamed: 11,unnamed: 12
0,NOTES TO THE ACCOUNTS FOR THE 52 WEEK PERIOD E...,True,False,False,49.5,544.0,67.3,96.1,New Times Roman,1,,,
1,DERIVATIVE INSTRUMENTS,False,False,False,41.5,140.2,49.0,54.7,New Times Roman,1,,,


In [230]:
df.describe(include="all")

Unnamed: 0,Text,IsBold,IsItalic,IsUnderlined,Left,Right,Top,Bottom,FontType,Label,Unnamed: 10,Unnamed: 11,Unnamed: 12
count,14215,14215,14215,14215,14215.0,14215.0,14215.0,14215.0,14215,14215.0,0.0,0.0,1
unique,11421,2,2,2,,,,,1,,,,1
top,,False,False,False,,,,,New Times Roman,,,,.
freq,632,10849,13159,14095,,,,,14215,,,,1
mean,,,,,71.077418,451.751952,285.437557,317.711894,,0.259937,,,
std,,,,,56.290498,270.240403,196.358206,198.41224,,0.438615,,,
min,,,,,28.3,31.8,19.9,35.5,,0.0,,,
25%,,,,,33.1,177.05,139.3,174.8,,0.0,,,
50%,,,,,51.0,523.5,250.9,286.2,,0.0,,,
75%,,,,,96.3,728.1,375.7,408.9,,1.0,,,


We can drop FontType as it has only one unique value

In [231]:
df.drop("FontType", axis=1, inplace=True)
df = df.iloc[:, :-3]
df.head()

Unnamed: 0,Text,IsBold,IsItalic,IsUnderlined,Left,Right,Top,Bottom,Label
0,NOTES TO THE ACCOUNTS FOR THE 52 WEEK PERIOD E...,True,False,False,49.5,544.0,67.3,96.1,1
1,DERIVATIVE INSTRUMENTS,False,False,False,41.5,140.2,49.0,54.7,1
2,-6,False,False,False,113.5,122.2,409.3,414.9,0
3,The following performance graph and related in...,False,False,False,112.9,706.9,339.0,360.3,0
4,Typical service offerings include supply chain...,False,False,False,33.1,808.5,328.2,369.9,0


In [232]:
def to_snake(name):
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
df.columns = [to_snake(col) for col in df.columns]
df.columns

Index(['text', 'is_bold', 'is_italic', 'is_underlined', 'left', 'right', 'top',
       'bottom', 'label'],
      dtype='object')

sns.set_style("ticks")
sns.pairplot(
    df.drop("text", axis=1),
    hue = 'label',
    diag_kind = "kde",
    kind = "scatter",
    palette = "husl",
    height=4
)
plt.show()

In [233]:
df.head()

Unnamed: 0,text,is_bold,is_italic,is_underlined,left,right,top,bottom,label
0,NOTES TO THE ACCOUNTS FOR THE 52 WEEK PERIOD E...,True,False,False,49.5,544.0,67.3,96.1,1
1,DERIVATIVE INSTRUMENTS,False,False,False,41.5,140.2,49.0,54.7,1
2,-6,False,False,False,113.5,122.2,409.3,414.9,0
3,The following performance graph and related in...,False,False,False,112.9,706.9,339.0,360.3,0
4,Typical service offerings include supply chain...,False,False,False,33.1,808.5,328.2,369.9,0


### Simple preprocessing: remove non-ascii characters

In [234]:
df["text"] = df["text"].str.replace(r'[^\x00-\x7F]+', ' ')

### Get feature-labels

In [235]:
X = df.drop("label", axis=1)
y = df["label"]

In [236]:
y.value_counts()

0    10520
1     3695
Name: label, dtype: int64

## Models

1) Baselines: text classification based on feature engineering from the whole text, and using a classical classifier, e.g. SVM, RandomForest, LogisticRegression
* Len(text)
* Are they all capitals
* Tf-idf
* LDA?

2) LSTM-based: FastAI?

3) BertForTextClassification: BERT model with a linear layer on top of the pooled output.

In [90]:
pipe = Pipeline([
    # the classifier params will be populated by the param_grid
    ('scaling', MinMaxScaler()),
    ('classifier', SVC()),
])
my_cross_validate(pipe, X.drop("text", axis=1), y, scoring = scoring)

precision --> Train: 0.924, Test: 0.922,
recall --> Train: 0.884, Test: 0.88,
fbeta --> Train: 0.903, Test: 0.9,


In [91]:
pipe = Pipeline([
    # the classifier params will be populated by the param_grid
    ('scaling', MinMaxScaler()),
    ('classifier', RandomForestClassifier())
])
my_cross_validate(pipe, X.drop("text", axis=1), y, scoring = scoring)

precision --> Train: 0.998, Test: 0.957,
recall --> Train: 0.994, Test: 0.934,
fbeta --> Train: 0.996, Test: 0.945,


In [114]:
pipe = Pipeline([
    # the reduce_dim stage is populated by the param_grid
    ('scaling', MinMaxScaler()),
    ('classifier', SVC()),
])
param_grid = []
for kernel in ['rbf', 'linear']:
    gamma_dict = {'gamma': [1e-2, 1e-4, 'auto']} if kernel == "rbf" else {}
    param_grid.append({'kernel': [kernel],
                       "random_state": [1],
                       'C': [0.1, 1, 10, 100], 
                       "class_weight": ["balanced"],
                       **gamma_dict,
                      })
for idx, sub_param_grid in enumerate(param_grid):
    param_grid[idx] = {f"classifier__{key}": value for key, value in sub_param_grid.items()}
model = grid_search_algo(
    pipe, param_grid, X.drop("text", axis=1), y, scoring=scoring, refit=False,
)
analyse_grid_search(model)

Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  3.8min
[Parallel(n_jobs=3)]: Done 160 out of 160 | elapsed:  6.8min finished


Unnamed: 0,11,8,5,6,9,12,13,14,15,2,3,10,0,4,7,1
mean_test_recall,0.926114,0.926114,0.926114,0.926114,0.926114,0.926114,0.926114,0.926114,0.926114,0.924488,0.919615,0.919615,0.850331,0.850331,0.850331,1
mean_test_precision,0.907143,0.900718,0.89573,0.884818,0.884818,0.884818,0.884818,0.884818,0.884818,0.884858,0.886645,0.886645,0.933542,0.933542,0.933542,0.259937
mean_test_fbeta,0.916444,0.913147,0.910589,0.904931,0.904931,0.904931,0.904931,0.904931,0.904931,0.904169,0.902757,0.902757,0.889907,0.889907,0.889907,0.412618
mean_train_recall,0.926116,0.926116,0.926116,0.926116,0.926116,0.926116,0.926116,0.926116,0.926116,0.925545,0.919621,0.919621,0.850338,0.850338,0.850338,1
mean_train_precision,0.908473,0.900951,0.895945,0.884696,0.884696,0.884696,0.884696,0.884696,0.884696,0.884941,0.886514,0.886514,0.933453,0.933453,0.933453,0.259937
mean_train_fbeta,0.917208,0.913359,0.910779,0.904932,0.904932,0.904932,0.904932,0.904932,0.904932,0.904785,0.902763,0.902763,0.889958,0.889958,0.889958,0.412619
param_classifier__C,100,10,1,10,100,0.1,1,10,100,0.1,1,100,0.1,1,10,0.1
param_classifier__class_weight,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced
param_classifier__gamma,auto,auto,auto,0.01,0.01,,,,,auto,0.01,0.0001,0.01,0.0001,0.0001,0.0001
param_classifier__kernel,rbf,rbf,rbf,rbf,rbf,linear,linear,linear,linear,rbf,rbf,rbf,rbf,rbf,rbf,rbf


Unnamed: 0,max,min
mean_test_recall,1.0,0.850331
mean_test_precision,0.933542,0.259937
mean_test_fbeta,0.916444,0.412618
mean_train_recall,1.0,0.850338
mean_train_precision,0.933453,0.259937
mean_train_fbeta,0.917208,0.412619


In [115]:
pipe = Pipeline([
    # the reduce_dim stage is populated by the param_grid
    ('scaling', MinMaxScaler()),
    ('classifier', RandomForestClassifier()),
])
param_grid = {
        "n_estimators": [100],
        "min_samples_split": [2, 5, 7],
        "max_features": ["sqrt", None],
        "max_depth": [3, 5, 10],
        "random_state": [1],
        "class_weight": ["balanced"]
    }
param_grid = {f"classifier__{key}": value for key, value in param_grid.items()}
result = grid_search_algo(
    pipe, param_grid, X.drop("text", axis=1), y, scoring=scoring, refit=False,
)
analyse_grid_search(result)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   21.5s
[Parallel(n_jobs=3)]: Done 180 out of 180 | elapsed:  2.2min finished


Unnamed: 0,15,16,17,12,13,14,10,11,9,8,7,6,1,2,0,5,4,3
mean_test_recall,0.959941,0.960754,0.961025,0.958318,0.959129,0.959402,0.949928,0.949928,0.949928,0.938286,0.938016,0.937746,0.937744,0.937744,0.937744,0.917989,0.917989,0.917989
mean_test_precision,0.939201,0.936779,0.935077,0.930451,0.928564,0.927618,0.919225,0.919225,0.919225,0.910085,0.910068,0.910267,0.899015,0.899015,0.899015,0.914648,0.914648,0.914648
mean_test_fbeta,0.949402,0.94855,0.947815,0.944142,0.943557,0.943202,0.934265,0.934265,0.934265,0.92388,0.923741,0.923715,0.917849,0.917849,0.917849,0.916228,0.916228,0.916228
mean_train_recall,0.980815,0.979371,0.977507,0.972515,0.971703,0.971373,0.952398,0.952248,0.952428,0.941182,0.941121,0.941061,0.939047,0.939047,0.939047,0.918328,0.918328,0.918328
mean_train_precision,0.958649,0.955049,0.952281,0.94548,0.941966,0.941041,0.922551,0.922433,0.922581,0.913909,0.913746,0.913843,0.899799,0.899799,0.899799,0.914836,0.914836,0.914836
mean_train_fbeta,0.969604,0.967057,0.964729,0.958806,0.956602,0.955965,0.937235,0.937101,0.937265,0.927337,0.927223,0.927246,0.919,0.919,0.919,0.916578,0.916578,0.916578
param_classifier__class_weight,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced
param_classifier__max_depth,10,10,10,10,10,10,5,5,5,5,5,5,3,3,3,3,3,3
param_classifier__max_features,,,,sqrt,sqrt,sqrt,,,,sqrt,sqrt,sqrt,sqrt,sqrt,sqrt,,,
param_classifier__min_samples_split,2,5,7,2,5,7,5,7,2,7,5,2,5,7,2,7,5,2


Unnamed: 0,max,min
mean_test_recall,0.961025,0.917989
mean_test_precision,0.939201,0.899015
mean_test_fbeta,0.949402,0.916228
mean_train_recall,0.980815,0.918328
mean_train_precision,0.958649,0.899799
mean_train_fbeta,0.969604,0.916578


## Add text lenght info

In [237]:
# TODO: do this more efficient? with .str. in pandas?
df["char_length"] = df.text.map(len)

In [238]:
import spacy
from tqdm import tqdm_notebook as tqdm

In [239]:
# TODO: load lg?
nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger", "ner"])

In [240]:
documents = []
for doc in tqdm(df["text"]):
    documents.append(nlp(doc))

HBox(children=(IntProgress(value=0, max=14215), HTML(value='')))




In [241]:
# TODO: do this more efficient
df["token_length"] = list(map(len, documents))

In [242]:
df.head()

Unnamed: 0,text,is_bold,is_italic,is_underlined,left,right,top,bottom,label,char_length,token_length
0,NOTES TO THE ACCOUNTS FOR THE 52 WEEK PERIOD E...,True,False,False,49.5,544.0,67.3,96.1,1,66,13
1,DERIVATIVE INSTRUMENTS,False,False,False,41.5,140.2,49.0,54.7,1,22,2
2,-6,False,False,False,113.5,122.2,409.3,414.9,0,2,1
3,The following performance graph and related in...,False,False,False,112.9,706.9,339.0,360.3,0,525,99
4,Typical service offerings include supply chain...,False,False,False,33.1,808.5,328.2,369.9,0,1061,164


In [147]:
pipe = Pipeline([
    # the reduce_dim stage is populated by the param_grid
    ('scaling', MinMaxScaler()),
    ('classifier', RandomForestClassifier()),
])
param_grid = {
        "n_estimators": [100, 300],
        "min_samples_split": [2, 5, 7],
        "max_features": ["sqrt", None],
        "max_depth": [3, 5, 10, 25],
        "random_state": [1],
        "class_weight": ["balanced"]
    }
param_grid = {f"classifier__{key}": value for key, value in param_grid.items()}
result = grid_search_algo(
    pipe, param_grid, X.drop("text", axis=1), y, scoring=scoring, refit=False,
)
analyse_grid_search(result)

Fitting 10 folds for each of 48 candidates, totalling 480 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   36.7s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:  3.9min
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed: 12.3min
[Parallel(n_jobs=3)]: Done 480 out of 480 | elapsed: 14.4min finished


Unnamed: 0,45,43,44,42,47,46,38,39,40,36,41,37,31,33,30,32,35,34,25,27,29,28,24,26,19,21,18,23,20,22,16,14,12,13,17,15,4,0,2,3,5,1,9,7,11,6,10,8
mean_test_recall,0.962926,0.956429,0.962383,0.954806,0.965091,0.965092,0.959405,0.959132,0.96238,0.951011,0.96265,0.9502,0.961031,0.961573,0.960759,0.961843,0.962655,0.962655,0.959406,0.960219,0.959947,0.960218,0.957783,0.959407,0.950206,0.950206,0.949665,0.949665,0.949394,0.949394,0.939099,0.939913,0.940182,0.93856,0.937206,0.937478,0.937746,0.937746,0.937746,0.931794,0.931794,0.931794,0.916638,0.916638,0.916638,0.916367,0.916367,0.916367
mean_test_precision,0.949667,0.955204,0.948868,0.955628,0.942976,0.942722,0.948185,0.947909,0.94405,0.954918,0.942824,0.954102,0.938041,0.936351,0.936039,0.934892,0.933496,0.933222,0.932736,0.930574,0.930327,0.929634,0.931672,0.929329,0.91947,0.91947,0.91967,0.919436,0.919409,0.919402,0.912006,0.910318,0.908933,0.910375,0.911492,0.910532,0.900717,0.900717,0.900717,0.905254,0.905254,0.905254,0.914817,0.914817,0.914817,0.914561,0.914561,0.914561
mean_test_fbeta,0.956201,0.955773,0.955539,0.955183,0.953867,0.953737,0.953736,0.953468,0.953107,0.952941,0.952608,0.952136,0.949353,0.948746,0.948195,0.948129,0.9478,0.947665,0.945856,0.945135,0.944877,0.944641,0.94451,0.944091,0.934545,0.934545,0.934385,0.934258,0.934118,0.934116,0.925255,0.924811,0.924213,0.924204,0.924104,0.923753,0.918777,0.918777,0.918777,0.918279,0.918279,0.918279,0.915663,0.915663,0.915663,0.915394,0.915394,0.915394
std_test_fbeta,0.00382169,0.00391429,0.00348275,0.00381343,0.00404722,0.00403222,0.00562283,0.00535055,0.00503966,0.00608773,0.00501931,0.0069093,0.0046257,0.00561403,0.0041245,0.00446333,0.0054576,0.00496695,0.00603134,0.00589709,0.00596707,0.00690321,0.00616846,0.00590572,0.0060448,0.0060448,0.00582127,0.00587175,0.00580523,0.00564097,0.00845343,0.00799346,0.00793962,0.00715578,0.00654983,0.00701933,0.00936433,0.00936433,0.00936433,0.00852239,0.00852239,0.00852239,0.00803728,0.00803728,0.00803728,0.0079135,0.0079135,0.0079135
mean_train_recall,0.998827,1,0.998226,0.99997,0.996031,0.99561,0.998617,0.999158,0.996211,1,0.996752,1,0.980274,0.978469,0.980183,0.97868,0.976695,0.976966,0.973177,0.972155,0.971944,0.972004,0.972966,0.972125,0.952639,0.952518,0.952939,0.952308,0.952849,0.952699,0.940851,0.941663,0.941753,0.940069,0.940099,0.939979,0.938836,0.938836,0.938836,0.932551,0.932551,0.932551,0.918087,0.918087,0.918087,0.918118,0.918118,0.918118
mean_train_precision,0.987104,0.999459,0.98651,0.999459,0.976533,0.976582,0.981414,0.981539,0.970674,0.999459,0.970178,0.999459,0.958377,0.954479,0.957726,0.953793,0.951437,0.951454,0.945817,0.942153,0.940551,0.940117,0.945254,0.941739,0.922542,0.922506,0.922296,0.922383,0.922263,0.922144,0.91412,0.914235,0.913577,0.914096,0.913912,0.914142,0.900448,0.900448,0.900448,0.906126,0.906126,0.906126,0.915313,0.915313,0.915313,0.915012,0.915012,0.915012
mean_train_fbeta,0.99293,0.999729,0.992333,0.999714,0.986185,0.986004,0.98994,0.99027,0.983276,0.999729,0.983285,0.999729,0.9692,0.966324,0.968822,0.966073,0.963899,0.964038,0.959301,0.956918,0.955989,0.955794,0.958909,0.95669,0.937348,0.937271,0.937366,0.937106,0.937305,0.937171,0.927284,0.92774,0.927444,0.926899,0.926819,0.926879,0.919238,0.919238,0.919238,0.919147,0.919147,0.919147,0.916696,0.916696,0.916696,0.916561,0.916561,0.916561
param_classifier__class_weight,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced
param_classifier__max_depth,25,25,25,25,25,25,25,25,25,25,25,25,10,10,10,10,10,10,10,10,10,10,10,10,5,5,5,5,5,5,5,5,5,5,5,5,3,3,3,3,3,3,3,3,3,3,3,3
param_classifier__max_features,,,,,,,sqrt,sqrt,sqrt,sqrt,sqrt,sqrt,,,,,,,sqrt,sqrt,sqrt,sqrt,sqrt,sqrt,,,,,,,sqrt,sqrt,sqrt,sqrt,sqrt,sqrt,sqrt,sqrt,sqrt,sqrt,sqrt,sqrt,,,,,,


Unnamed: 0,max,min
mean_test_recall,0.965092,0.916367
mean_test_precision,0.955628,0.900717
mean_test_fbeta,0.956201,0.915394
std_test_fbeta,0.009364,0.003483
mean_train_recall,1.0,0.918087
mean_train_precision,0.999459,0.900448
mean_train_fbeta,0.999729,0.916561


## Add easy spaCy features

In [243]:
[i for i in documents[19]]

[Item, 4, .]

In [244]:
documents[19].to_array(
            ["IS_STOP", "IS_UPPER", "IS_LOWER", "IS_DIGIT", "IS_PUNCT", "IS_ASCII"]
        ).sum(axis=0) / len(documents[19])

array([0.        , 0.        , 0.        , 0.33333333, 0.33333333,
       1.        ])

In [245]:
# TODO: do it on the dataframe directly?
feature_names = ["IS_STOP", "IS_UPPER", "IS_LOWER", "IS_DIGIT", "IS_PUNCT","IS_ASCII"]
features = np.zeros((len(documents), len(feature_names)))
for idx, doc in enumerate(tqdm(documents)):
    features[idx, :] = doc.to_array(feature_names).sum(axis=0) / len(doc)
for idx, feature in enumerate(feature_names):
    df[feature.lower()] = features[:, idx]

HBox(children=(IntProgress(value=0, max=14215), HTML(value='')))




In [246]:
df.head(15)

Unnamed: 0,text,is_bold,is_italic,is_underlined,left,right,top,bottom,label,char_length,token_length,is_stop,is_upper,is_lower,is_digit,is_punct,is_ascii
0,NOTES TO THE ACCOUNTS FOR THE 52 WEEK PERIOD E...,True,False,False,49.5,544.0,67.3,96.1,1,66,13,0.0,0.769231,0.0,0.230769,0.0,1.0
1,DERIVATIVE INSTRUMENTS,False,False,False,41.5,140.2,49.0,54.7,1,22,2,0.0,1.0,0.0,0.0,0.0,1.0
2,-6,False,False,False,113.5,122.2,409.3,414.9,0,2,1,0.0,0.0,0.0,0.0,0.0,1.0
3,The following performance graph and related in...,False,False,False,112.9,706.9,339.0,360.3,0,525,99,0.424242,0.010101,0.666667,0.030303,0.131313,1.0
4,Typical service offerings include supply chain...,False,False,False,33.1,808.5,328.2,369.9,0,1061,164,0.146341,0.018293,0.72561,0.006098,0.189024,1.0
5,A substantial portion of contract and administ...,False,False,False,33.1,808.6,165.5,189.2,0,533,90,0.388889,0.011111,0.866667,0.011111,0.044444,1.0
6,PMA applications must be supported by valid sc...,False,False,False,112.4,724.9,261.4,336.2,0,1926,335,0.402985,0.068657,0.80597,0.0,0.098507,1.0
7,We are subject to healthcare regulation and en...,False,False,False,33.1,808.3,49.0,64.3,0,326,57,0.315789,0.0,0.807018,0.0,0.157895,1.0
8,"Against the outlined challenges, developing ec...",False,False,False,68.0,541.4,561.0,615.9,0,482,78,0.25641,0.0,0.820513,0.0,0.141026,1.0
9,The results of our operations are dependent on...,False,False,False,43.9,807.9,110.7,125.5,0,433,70,0.285714,0.0,0.814286,0.0,0.142857,1.0


In [157]:
pipe = Pipeline([
    # the reduce_dim stage is populated by the param_grid
    ('scaling', MinMaxScaler()),
    ('classifier', RandomForestClassifier()),
])
param_grid = {
        "n_estimators": [100, 300],
        "min_samples_split": [2, 5, 7],
        "max_features": ["sqrt", None],
        "max_depth": [10, 25, 35],
        "random_state": [1],
        "class_weight": ["balanced"]
    }
param_grid = {f"classifier__{key}": value for key, value in param_grid.items()}
result = grid_search_algo(
    pipe, param_grid, X.drop("text", axis=1), y, scoring=scoring, refit=False,
)
analyse_grid_search(result)

Fitting 10 folds for each of 36 candidates, totalling 360 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  1.1min
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:  6.6min
[Parallel(n_jobs=3)]: Done 360 out of 360 | elapsed: 13.8min finished


Unnamed: 0,24,21,33,19,31,12,18,30,35,23,13,25,22,34,32,20,26,27,14,16,29,17,15,28,6,7,11,9,8,10,0,1,3,2,4,5
mean_test_recall,0.949936,0.960215,0.960215,0.953989,0.953989,0.949397,0.953179,0.953179,0.963193,0.963193,0.950478,0.949936,0.962109,0.962109,0.958318,0.958318,0.95589,0.956432,0.955349,0.959139,0.958599,0.95887,0.955621,0.958328,0.96049,0.960219,0.962384,0.961302,0.961573,0.961302,0.959681,0.95914,0.959411,0.959142,0.958329,0.95914
mean_test_precision,0.956814,0.946632,0.946632,0.952594,0.952594,0.956769,0.952842,0.952584,0.942753,0.942753,0.954761,0.95476,0.941685,0.941685,0.94529,0.94529,0.9471,0.946125,0.946563,0.942751,0.943243,0.942738,0.945579,0.94244,0.939319,0.938791,0.935472,0.936152,0.93545,0.934909,0.932049,0.932495,0.929818,0.929778,0.929495,0.928098
mean_test_fbeta,0.953293,0.953286,0.953286,0.953234,0.953234,0.952998,0.952945,0.952816,0.952781,0.952781,0.952539,0.952257,0.951714,0.951714,0.951666,0.951666,0.951401,0.951179,0.950865,0.9508,0.950777,0.950659,0.950499,0.950247,0.94972,0.949317,0.948674,0.94849,0.948264,0.947858,0.945619,0.945592,0.944339,0.94419,0.943655,0.94332
std_test_fbeta,0.00597357,0.00606831,0.00606831,0.00621346,0.00621346,0.00680619,0.00638036,0.00645512,0.00624036,0.00624036,0.00674059,0.00666633,0.00660139,0.00660139,0.00708831,0.00708831,0.0076748,0.00818944,0.00779449,0.00805927,0.00842411,0.00874865,0.00803775,0.00813505,0.00878523,0.0084191,0.00788064,0.00797419,0.00905522,0.0084119,0.00826713,0.00896235,0.00859575,0.00800257,0.00861499,0.00848844
mean_train_recall,1,0.999008,0.999008,1,1,1,0.99997,0.99997,0.99594,0.99594,1,1,0.99564,0.99564,0.998286,0.998286,0.998557,0.999338,0.998527,0.99561,0.996602,0.996782,0.999369,0.9957,0.980785,0.980665,0.977597,0.979191,0.978951,0.977567,0.972756,0.973087,0.971553,0.971884,0.970771,0.971463
mean_train_precision,0.999459,0.986959,0.986959,0.999459,0.999459,0.999459,0.999459,0.999459,0.976704,0.976675,0.999459,0.999459,0.976409,0.976409,0.986539,0.98651,0.981934,0.982267,0.981905,0.970371,0.970314,0.97012,0.982122,0.970459,0.9591,0.958672,0.951534,0.954677,0.954891,0.951726,0.946073,0.946007,0.942671,0.942113,0.940347,0.94077
mean_train_fbeta,0.999729,0.992946,0.992946,0.999729,0.999729,0.999729,0.999714,0.999714,0.986228,0.986213,0.999729,0.999729,0.98593,0.98593,0.992378,0.992363,0.990175,0.990729,0.990145,0.982828,0.983282,0.98327,0.99067,0.982917,0.96982,0.969542,0.964388,0.966778,0.96677,0.964473,0.959228,0.959355,0.956893,0.956766,0.955316,0.95587
param_classifier__class_weight,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced,balanced
param_classifier__max_depth,35,25,35,25,35,25,25,35,35,25,25,35,25,35,35,25,35,35,25,25,35,25,25,35,10,10,10,10,10,10,10,10,10,10,10,10
param_classifier__max_features,sqrt,,,,,sqrt,,,,,sqrt,sqrt,,,,,sqrt,sqrt,sqrt,sqrt,sqrt,sqrt,sqrt,sqrt,,,,,,,sqrt,sqrt,sqrt,sqrt,sqrt,sqrt


Unnamed: 0,max,min
mean_test_recall,0.963193,0.949397
mean_test_precision,0.956814,0.928098
mean_test_fbeta,0.953293,0.94332
std_test_fbeta,0.009055,0.005974
mean_train_recall,1.0,0.970771
mean_train_precision,0.999459,0.940347
mean_train_fbeta,0.999729,0.955316


TODO:
* More data analyisis and preprocessing: 
  * if only punct probably never title
  * if only numbers? for sure not if negative
  * remove those cases from training and **bypass** mlpipeline if so
  * removing stop words won't help as titles are many times stop words
* Add TF-IDF features (+PCA?)
* Add LDA features
* Nested CV
* add spaCy features
  * NER?
* two-stage classifier: https://www.aclweb.org/anthology/I13-1114