## Logistic Regression with small Bert encodings

In [10]:
import numpy as np
import pandas as pd
# for model:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
# for scoring:
from sklearn import metrics
from sklearn.metrics import f1_score

In [6]:
# requires toxic-tr
x = np.loadtxt("toxic_bert_matrix_small.out", delimiter=",")
df = pd.read_csv('toxic-train-clean-small.csv')
y = df.iloc[:, 2:8]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size= 0.2)

In [8]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((104228, 768), (26057, 768), (104228, 6), (26057, 6))

In [11]:
pipe = make_pipeline(OneVsRestClassifier(LogisticRegression(max_iter=500, class_weight='balanced')))
param_grid = {'onevsrestclassifier__estimator__solver': ['liblinear']} 
grid = GridSearchCV(pipe, param_grid, cv=15, scoring='roc_auc', verbose=3)

grid3 = grid.fit(X_train, y_train)

Fitting 15 folds for each of 1 candidates, totalling 15 fits
[CV] onevsrestclassifier__estimator__solver=liblinear ................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.965928775955896, total= 9.0min
[CV] onevsrestclassifier__estimator__solver=liblinear ................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  9.0min remaining:    0.0s


[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.9640330644403972, total= 9.6min
[CV] onevsrestclassifier__estimator__solver=liblinear ................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 18.6min remaining:    0.0s


[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.972971500795251, total= 9.0min
[CV] onevsrestclassifier__estimator__solver=liblinear ................
[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.9671562314378462, total= 9.5min
[CV] onevsrestclassifier__estimator__solver=liblinear ................
[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.9740035855615091, total= 9.2min
[CV] onevsrestclassifier__estimator__solver=liblinear ................
[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.9715891400171431, total= 9.3min
[CV] onevsrestclassifier__estimator__solver=liblinear ................
[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.9718038045121776, total= 9.1min
[CV] onevsrestclassifier__estimator__solver=liblinear ................
[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.9675409509086439, total= 9.2min
[CV] onevsrestclassifier__estimator__solver=liblinear ................

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 137.9min finished


In [12]:
grid3.best_score_

0.9696622763154467

In [13]:
predicted_y_test = grid3.predict(X_test)
predicted_y_test[:1]

array([[1, 1, 1, 1, 1, 1]])

In [15]:
y_pred_prob = grid3.predict_proba(X_test)
y_pred_prob[:1]

array([[0.99925066, 0.9966995 , 0.998688  , 0.99977871, 0.99852208,
        0.89831415]])

In [16]:
auc_score = metrics.roc_auc_score(y_test, y_pred_prob)
auc_score

0.9705530315442147

In [17]:
f1_score(y_test, predicted_y_test, average='micro')

0.510601273068645