## W2V google NB and LR train

Naive Bayes and Logistic Regression models are trained using mean W2V vectors as input. Each trained model is then used to obtain predictions on the validation set, and the performance of each classifier is analysed.

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import os
import pandas as pd
import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

from trainevalutils import generate_model_report
from trainevalutils import *

In [2]:
# File name of the pickled results table written by the last cell of the notebook.
OUTPUT_NAME = 'results_w2v-google_NBLR.pkl'

# Directory that receives the results file; created here if it is missing.
OUTPUT_DIR = '../results/'
for dir_ in [OUTPUT_DIR]:
    # exist_ok=True keeps the cell safe to re-run and avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(dir_, exist_ok=True)

In [3]:
# Parquet file read into `data`; later cells use its
# 'google-news_w2v_mean_prenorm' and 'target' columns.
FILE = "../data/w2v-vectors_goog_pren.parquet.gzip"
data = pd.read_parquet(FILE)

In [4]:
### load train - val indexes
# Context managers make sure each file handle is closed right after reading.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load index files that come from a trusted source.
with open("../data/wip/train_idx.pkl", 'rb') as fh:
    train_idx = pickle.load(fh)
with open("../data/wip/val_idx.pkl", 'rb') as fh:
    val_idx = pickle.load(fh)

### split data by indexes
# Label-based selection with .loc; presumably the pickles hold index labels
# of `data` — verify against the notebook that produced them.
training_data = data.loc[train_idx]
validation_data = data.loc[val_idx]

In [5]:
# The train/validation subsets were extracted in the previous cell and the
# full frame is not referenced again in this notebook, so drop the name.
del data

In [6]:
# Column names: `features` is the model input column (mean W2V vectors),
# `target` is the label column.
features = 'google-news_w2v_mean_prenorm'
target = 'target'

# Pull the (input, label) pair out of each split.
X_train_raw, y_train = training_data[features], training_data[target]
X_val_raw, y_val = validation_data[features], validation_data[target]

In [7]:
# Stack the per-row vectors of each split into one array
# (presumably 1-D vectors per row, giving a samples x dimensions matrix).
X_train_stacked, X_val_stacked = (
    np.stack(column) for column in (X_train_raw, X_val_raw)
)

# handling negative values: the scaler is fitted on the training split only
# and then applied to both splits, so training features land in [0, 1]
# while validation values can fall slightly outside that range.
scaler = MinMaxScaler().fit(X_train_stacked)
X_train = scaler.transform(X_train_stacked)
X_val = scaler.transform(X_val_stacked)

In [8]:
def run_classifier(clf, model_name):
    """Fit ``clf`` on the training split and evaluate it on the validation split.

    The data is read from the notebook-level variables ``X_train``,
    ``y_train``, ``X_val`` and ``y_val``; the classifier is fitted in place.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)``
        Model to train.
    model_name : str
        Label for the run; ``'_val'`` is appended to form the title that is
        printed and passed to ``evaluate``.

    Returns
    -------
    list
        One-element list holding whatever ``evaluate`` returned for the
        validation split (presumably a metrics record — confirm in
        ``trainevalutils``).
    """
    clf.fit(X_train, y_train)

    title = model_name + '_val'
    print(f"\n{title}\n")
    validation_result = evaluate(title, clf, X_val, y_val)
    print("=====================================================\n")

    return [validation_result]

In [9]:
print("\nResults of Naive Bayes on the validation set\n")
# MultinomialNB with default hyper-parameters, trained on the min-max scaled vectors.
# NOTE(review): in the recorded output this model assigns essentially every
# validation sample to "Physics" (recall 1.00 there, 0.00 for the other classes),
# so it behaves like a majority-class baseline.
naivebayes = MultinomialNB()
results_nb = run_classifier(naivebayes, 'NBw2vgoog')


Results of Naive Bayes on the validation set


NBw2vgoog_val

Accuracy: 0.5487777367989487
Precision (macro): 0.13719443419973718
Recall (macro): 0.25
F1-score (macro): 0.1771647808978633

Classification report:
              precision    recall  f1-score   support

     Physics       0.55      1.00      0.71    153261
 Mathematics       0.00      0.00      0.00     60818
Computer Sc.       0.00      0.00      0.00     50476
       Other       0.00      0.00      0.00     14722

    accuracy                           0.55    279277
   macro avg       0.14      0.25      0.18    279277
weighted avg       0.30      0.55      0.39    279277




In [10]:
print("\nResults of Logistic Regression on the validation set\n")
# LogisticRegression with default hyper-parameters.
# NOTE(review): warnings are silenced in the import cell, so a
# ConvergenceWarning from the default iteration limit would not be shown —
# confirm the solver actually converged.
logreg = LogisticRegression()
results_lr = run_classifier(logreg, 'LRw2vgoog')


Results of Logistic Regression on the validation set


LRw2vgoog_val

Accuracy: 0.8655707415934717
Precision (macro): 0.7802052803963566
Recall (macro): 0.7469486442263301
F1-score (macro): 0.7582487018677337

Classification report:
              precision    recall  f1-score   support

     Physics       0.93      0.94      0.93    153261
 Mathematics       0.82      0.82      0.82     60818
Computer Sc.       0.78      0.84      0.81     50476
       Other       0.59      0.39      0.47     14722

    accuracy                           0.87    279277
   macro avg       0.78      0.75      0.76    279277
weighted avg       0.86      0.87      0.86    279277




In [11]:
# Build one frame per model from its result list and stack them into a
# single comparison table. Each partial frame keeps its own index, so index
# labels repeat in the combined frame.
per_model_frames = [pd.DataFrame(rows) for rows in (results_nb, results_lr)]
df_result = pd.concat(per_model_frames)

df_result

Unnamed: 0,Description,Accuracy,Precision,Recall,F1-score
0,NBw2vgoog_val,0.548778,0.137194,0.25,0.177165
0,LRw2vgoog_val,0.865571,0.780205,0.746949,0.758249


In [12]:
# Persist the comparison table inside the results directory.
output_path = os.path.join(OUTPUT_DIR, OUTPUT_NAME)
df_result.to_pickle(output_path)