## Logistic Regression with BERT encodings

In [22]:
import numpy as np
import pandas as pd
# for model:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
# for scoring:
from sklearn import metrics
from sklearn.metrics import f1_score

In [14]:
# Labels: read the cleaned training CSV and keep columns 2..7 (six columns)
# as the multi-label target frame.
df = pd.read_csv('toxic-train-clean.csv')
y = df.iloc[:, 2:8]
# Features: comma-separated numeric matrix loaded from disk.
# NOTE(review): presumably one encoding row per CSV row, in the same order --
# confirm the row alignment between this matrix and df.
x = np.loadtxt("toxic_bert_matrix.out", delimiter=",")

KeyboardInterrupt: 

In [7]:
# Hold out 20% of the rows as a test set.
# The split is seeded: without random_state the shuffle differs on every run,
# so the held-out rows (and every score computed on them) would not be
# reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [8]:
# Shapes of the four splits, in order: train features, test features,
# train labels, test labels.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((127656, 768), (31915, 768), (127656, 6), (31915, 6))

In [12]:
# Multi-label model: a class-weight-balanced logistic regression wrapped in
# OneVsRestClassifier, placed in a one-step pipeline.
# liblinear used because of onevsrest.
pipe = make_pipeline(
    OneVsRestClassifier(
        LogisticRegression(class_weight='balanced', max_iter=500)
    )
)
# The grid holds a single candidate, so the search effectively just
# cross-validates this one configuration (15 folds, scored by ROC AUC).
param_grid = {'onevsrestclassifier__estimator__solver': ['liblinear']}
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=15,
    verbose=3,
)

# Keep the value returned by fit() under the name the later cells use.
grid3 = grid.fit(X_train, y_train)

Fitting 15 folds for each of 1 candidates, totalling 15 fits
[CV] onevsrestclassifier__estimator__solver=liblinear ................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.963071802794602, total=10.5min
[CV] onevsrestclassifier__estimator__solver=liblinear ................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 10.5min remaining:    0.0s


[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.9552871345620022, total=10.4min
[CV] onevsrestclassifier__estimator__solver=liblinear ................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 20.9min remaining:    0.0s


[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.9601566273421375, total=10.6min
[CV] onevsrestclassifier__estimator__solver=liblinear ................
[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.9626310232200085, total=10.5min
[CV] onevsrestclassifier__estimator__solver=liblinear ................
[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.9575434781502684, total=11.3min
[CV] onevsrestclassifier__estimator__solver=liblinear ................
[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.9550062767151036, total=10.7min
[CV] onevsrestclassifier__estimator__solver=liblinear ................
[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.9531261541113167, total=10.9min
[CV] onevsrestclassifier__estimator__solver=liblinear ................
[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.964623481215454, total=10.8min
[CV] onevsrestclassifier__estimator__solver=liblinear ................

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 160.9min finished


In [15]:
# Cross-validated score of the best candidate under scoring='roc_auc';
# the grid contains only one candidate, so this is that configuration's score.
grid3.best_score_

0.960278976189342

In [17]:
# Hard label predictions for the held-out rows (one column per label).
predicted_y_test = grid3.predict(X_test)
# Show only the first row rather than dumping the whole array.
predicted_y_test[:1]

array([[1, 0, 0, 0, 0, 0]])

In [19]:
# Per-label predicted probabilities for the held-out rows; these feed the
# ROC AUC computation, which needs scores rather than hard labels.
y_pred_prob = grid3.predict_proba(X_test)
# Show only the first row rather than dumping the whole array.
y_pred_prob[:1]

array([[5.69946396e-01, 7.74700657e-03, 3.07358235e-01, 1.08244501e-04,
        1.74827084e-01, 1.10772028e-01]])

In [20]:
# ROC AUC on the held-out set, computed from predicted probabilities.
# 'macro' is roc_auc_score's default averaging; it is spelled out here so the
# aggregation over the label columns is visible.
auc_score = metrics.roc_auc_score(
    y_true=y_test,
    y_score=y_pred_prob,
    average='macro',
)
auc_score

0.9588967691138731

In [23]:
# Micro-averaged F1 on the held-out set, computed from the hard label
# predictions (not the probabilities used for ROC AUC).
# NOTE(review): this comes out far lower than the ROC AUC; presumably the
# default decision threshold combined with class_weight='balanced' costs
# precision -- confirm by inspecting per-label precision/recall.
f1_score(y_true=y_test, y_pred=predicted_y_test, average='micro')

0.4903430497719611