In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import os
import pandas as pd
import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

from trainevalutils import generate_model_report
from trainevalutils import *

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

In [2]:
SEED = 3742

In [3]:
OUTPUT_NAME = 'results_rescaling.pkl'

OUTPUT_DIR = '../results/'
for dir_ in [OUTPUT_DIR]:
    if not os.path.exists(dir_):
        os.makedirs(dir_)

In [10]:
FILE = "../data/w2v-vectors_goog_pren.parquet.gzip"
data = pd.read_parquet(FILE)

In [11]:
### load train - val indexes
train_idx = pickle.load(open("../data/wip/train_idx.pkl", 'rb'))
val_idx = pickle.load(open("../data/wip/val_idx.pkl", 'rb'))

### split data by indexes
training_data = data.loc[train_idx]
validation_data = data.loc[val_idx]

In [12]:
del data

In [13]:
features = 'google-news_w2v_mean_prenorm'
target = 'target'

X_train_raw = training_data[features]
X_val_raw = validation_data[features]

y_train = training_data[target]
y_val = validation_data[target]

In [14]:
# Stack
X_train_stacked = np.stack(X_train_raw)
X_val_stacked = np.stack(X_val_raw)

# handling negative values
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_stacked)
X_val = scaler.transform(X_val_stacked)

In [15]:
def run_classifier(clf, model_name, X_tr, y_tr):
    results = []
    clf.fit(X_tr, y_tr)
    title = model_name+'_val'
    print("\n"+title+"\n")
    result_val = evaluate(title, clf, X_val, y_val)
    results.append(result_val)
    print("=====================================================\n")
    return results

### Model LRw2vgoogCWbal
Logistic Regression with default class weight (balanced)

In [12]:
print("\nResults of Logistic Regression on the validation set\n")
logreg_CWbal = LogisticRegression(class_weight='balanced')
results_lr_CWbal = run_classifier(logreg_CWbal, 'LRw2vgoogCWbal', X_train, y_train)

with open('tmp/res/LRw2vgoogCWbal.pkl','wb') as f:
    pickle.dump(results_lr_CWbal, f)


Results of Logistic Regression on the validation set


LRw2vgoogCWbal_val

Accuracy: 0.8384686171793596
Precision (macro): 0.7343620130781237
Recall (macro): 0.7978306304374582
F1-score (macro): 0.7508956111806893

Classification report:
              precision    recall  f1-score   support

     Physics       0.97      0.88      0.92    153261
 Mathematics       0.79      0.84      0.82     60818
Computer Sc.       0.81      0.75      0.78     50476
       Other       0.37      0.72      0.49     14722

    accuracy                           0.84    279277
   macro avg       0.73      0.80      0.75    279277
weighted avg       0.87      0.84      0.85    279277




### Model LRw2vgoogCWgscv
Logistic Regression with custom weights calculated using grid search cross validation in the previous Jupyter Notebook

In [13]:
print("\nResults of Logistic Regression on the validation set\n")
grid_result = pickle.load(open("tmp/grid_result.pkl", 'rb'))
logreg_CWgscv = LogisticRegression(**grid_result.best_params_)
results_lr_CWgscv = run_classifier(logreg_CWgscv, 'LRw2vgoogCWgscv', X_train, y_train)

with open('tmp/res/LRw2vgoogCWgscv.pkl','wb') as f:
    pickle.dump(results_lr_CWgscv, f)


Results of Logistic Regression on the validation set


LRw2vgoogCWgscv_val

Accuracy: 0.842181776515789
Precision (macro): 0.735224127740908
Recall (macro): 0.7934784923253753
F1-score (macro): 0.7544930190732568

Classification report:
              precision    recall  f1-score   support

     Physics       0.97      0.88      0.92    153261
 Mathematics       0.78      0.85      0.81     60818
Computer Sc.       0.80      0.78      0.79     50476
       Other       0.39      0.67      0.50     14722

    accuracy                           0.84    279277
   macro avg       0.74      0.79      0.75    279277
weighted avg       0.87      0.84      0.85    279277




### Model LRw2vgoogUnder
Logistic Regression with random undersampling

In [14]:
randomUnderSampler = RandomUnderSampler()
X_undersampled, y_undersampled = randomUnderSampler.fit_resample(X_train, y_train)

In [19]:
y_undersampled.value_counts()

0    83630
1    83630
2    83630
3    83630
Name: target, dtype: int64

In [20]:
print("\nResults of Logistic Regression on the validation set\n")
logreg_under= LogisticRegression()
results_lr_under = run_classifier(logreg_under, 'LRw2vgoogUnder', X_undersampled, y_undersampled)

with open('tmp/res/LRw2vgoogUnder.pkl','wb') as f:
    pickle.dump(results_lr_under, f)


Results of Logistic Regression on the validation set


LRw2vgoogUnder_val

Accuracy: 0.8335487705754502
Precision (macro): 0.7297957568753975
Recall (macro): 0.7955150015151787
F1-score (macro): 0.7460560133537233

Classification report:
              precision    recall  f1-score   support

     Physics       0.97      0.87      0.92    153261
 Mathematics       0.78      0.85      0.81     60818
Computer Sc.       0.81      0.74      0.77     50476
       Other       0.36      0.73      0.48     14722

    accuracy                           0.83    279277
   macro avg       0.73      0.80      0.75    279277
weighted avg       0.87      0.83      0.84    279277




### Model LRw2vgoogOver
Logistic Regression with random random over sampling

In [21]:
randomOverSampler = RandomOverSampler()
X_oversampled, y_oversampled = randomOverSampler.fit_resample(X_train, y_train)

In [22]:
y_oversampled.value_counts()

2    870462
0    870462
1    870462
3    870462
Name: target, dtype: int64

In [23]:
print("\nResults of Logistic Regression on the validation set\n")
logreg_over= LogisticRegression()
results_lr_over = run_classifier(logreg_over, 'LRw2vgoogOver', X_oversampled, y_oversampled)

with open('tmp/res/LRw2vgoogOver.pkl','wb') as f:
    pickle.dump(results_lr_over, f)


Results of Logistic Regression on the validation set


LRw2vgoogOver_val

Accuracy: 0.8386977803399492
Precision (macro): 0.7341777862261765
Recall (macro): 0.7972390781084011
F1-score (macro): 0.7509814788535876

Classification report:
              precision    recall  f1-score   support

     Physics       0.97      0.88      0.92    153261
 Mathematics       0.79      0.84      0.82     60818
Computer Sc.       0.81      0.75      0.78     50476
       Other       0.37      0.72      0.49     14722

    accuracy                           0.84    279277
   macro avg       0.73      0.80      0.75    279277
weighted avg       0.87      0.84      0.85    279277




### Model LRw2vgoogSmote
Logistic Regression with SMOTE

In [18]:
smote = SMOTE(random_state=SEED)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

In [19]:
pickle.dump(X_smote, open("tmp/X_LRw2vgoogSmote.pkl", 'wb'))
pickle.dump(y_smote, open("tmp/y_LRw2vgoogSmote.pkl", 'wb'))

In [4]:
X_smote = pickle.load(open("tmp/X_LRw2vgoogSmote.pkl", 'rb'))

In [5]:
y_smote = pickle.load(open("tmp/y_LRw2vgoogSmote.pkl", 'rb'))

In [6]:
y_smote.value_counts()

2    870462
0    870462
1    870462
3    870462
Name: target, dtype: int64

In [16]:
print("\nResults of Logistic Regression on the validation set\n")
logreg_smote = LogisticRegression()
results_lr_smote = run_classifier(logreg_smote, 'LRw2vgoogSmote', X_smote, y_smote)

with open('tmp/res/LRw2vgoogSmote.pkl','wb') as f:
    pickle.dump(results_lr_smote, f)


Results of Logistic Regression on the validation set


LRw2vgoogSmote_val

Accuracy: 0.8395750455640816
Precision (macro): 0.7347512707237002
Recall (macro): 0.7938723337245261
F1-score (macro): 0.7507743106462288

Classification report:
              precision    recall  f1-score   support

     Physics       0.96      0.88      0.92    153261
 Mathematics       0.79      0.84      0.81     60818
Computer Sc.       0.81      0.75      0.78     50476
       Other       0.38      0.70      0.49     14722

    accuracy                           0.84    279277
   macro avg       0.73      0.79      0.75    279277
weighted avg       0.87      0.84      0.85    279277




In [22]:
saved_res = []
files = os.listdir('tmp/res')
for f in files:
    with open('tmp/res/'+f, 'rb') as f:
        saved_res.append(pd.DataFrame(pickle.load(f)))

In [25]:
df_result = pd.concat(saved_res)
df_result

Unnamed: 0,Description,Accuracy,Precision,Recall,F1-score
0,LRw2vgoogCWbal_val,0.838469,0.734362,0.797831,0.750896
0,LRw2vgoogCWgscv_val,0.842182,0.735224,0.793478,0.754493
0,LRw2vgoogUnder_val,0.833549,0.729796,0.795515,0.746056
0,LRw2vgoogOver_val,0.838698,0.734178,0.797239,0.750981
0,LRw2vgoogSmote_val,0.839575,0.734751,0.793872,0.750774


In [26]:
df_result.to_pickle(OUTPUT_DIR+OUTPUT_NAME)