## Logistic Regression with Basilica encodings

In [1]:
import numpy as np
import pandas as pd
# for model:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
# for scoring:
from sklearn import metrics
from sklearn.metrics import f1_score

In [2]:
# Labels must come from basilica_y.csv rather than toxic-clean-train.csv:
# the Basilica embeddings are one row short of toxic-clean-train.csv.
x = np.loadtxt("basilica_toxic_clean.out", delimiter=",")
y = pd.read_csv('basilica_y.csv')

# Fail fast with a clear message if features and labels are misaligned,
# instead of discovering it later inside the split/fit.
assert x.shape[0] == y.shape[0], (
    f"row mismatch: {x.shape[0]} embedding rows vs {y.shape[0]} label rows"
)

In [3]:
# Hold out 20% of the rows for final evaluation. random_state is fixed so the
# shuffle (and therefore the split) is identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [4]:
# Sanity check: (rows, columns) of each of the four pieces of the split.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((127656, 768), (31914, 768), (127656, 6), (31914, 6))

In [8]:
# One-vs-rest: one binary logistic regression is fitted per label column.
# liblinear is used because sag resulted in convergence errors.
# random_state is fixed because liblinear shuffles the data internally, so
# without a seed the fitted coefficients can vary between runs.
pipe = make_pipeline(
    OneVsRestClassifier(
        LogisticRegression(max_iter=500, class_weight='balanced', random_state=42)
    )
)
# Single-candidate grid: the search effectively serves as 10-fold
# cross-validation scored by ROC AUC (one CV score per fold in the log).
param_grid = {'onevsrestclassifier__estimator__solver': ['liblinear']}
grid = GridSearchCV(pipe, param_grid, cv=10, scoring='roc_auc', verbose=3)

# fit() returns the fitted search object itself, so grid2 is the same object
# as grid; the name is kept because later cells predict through grid2.
grid2 = grid.fit(X_train, y_train)

Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV] onevsrestclassifier__estimator__solver=liblinear ................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.9712276148352349, total=12.3min
[CV] onevsrestclassifier__estimator__solver=liblinear ................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 12.3min remaining:    0.0s


[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.9661021707631203, total=13.4min
[CV] onevsrestclassifier__estimator__solver=liblinear ................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 25.8min remaining:    0.0s


[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.9702816669865174, total=15.9min
[CV] onevsrestclassifier__estimator__solver=liblinear ................
[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.9731819187687122, total=16.0min
[CV] onevsrestclassifier__estimator__solver=liblinear ................
[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.9623188787950818, total=15.7min
[CV] onevsrestclassifier__estimator__solver=liblinear ................
[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.9664180464261439, total=15.5min
[CV] onevsrestclassifier__estimator__solver=liblinear ................
[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.9671031762233154, total=14.7min
[CV] onevsrestclassifier__estimator__solver=liblinear ................
[CV]  onevsrestclassifier__estimator__solver=liblinear, score=0.9782493832741258, total=14.7min
[CV] onevsrestclassifier__estimator__solver=liblinear ...............

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 150.3min finished


In [9]:
# Hard 0/1 predictions on the held-out rows; peek at the first row only.
predicted_y_test = grid2.predict(X=X_test)
predicted_y_test[:1]

array([[0, 0, 0, 0, 0, 0]])

In [10]:
# Per-label probability scores on the held-out rows; peek at the first row only.
y_pred_prob = grid2.predict_proba(X=X_test)
y_pred_prob[:1]

array([[1.12143511e-02, 3.82834865e-06, 4.01889150e-03, 1.91071221e-07,
        2.44666439e-02, 7.75593665e-03]])

In [11]:
# ROC AUC on the held-out set, computed from probability scores rather than
# the thresholded 0/1 predictions.
auc_score = metrics.roc_auc_score(y_true=y_test, y_score=y_pred_prob)
auc_score

0.9727434677199321

In [12]:
# Micro-averaged F1 on the hard 0/1 predictions (pools all label columns
# before computing precision/recall).
f1_score(y_true=y_test, y_pred=predicted_y_test, average='micro')

0.5416098226466576