# TFIDF NB LR train

Naive Bayes and Logistic Regression models are trained using TF-IDF vectors as input. Each trained model is then used to obtain predictions on the validation set, which are used to analyse the performance of each classifier.

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import os
import pandas as pd
import pickle
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

from trainevalutils import *

In [2]:
# Load the pickled object stored in class_labels_dic.pkl (presumably the class-label
# dictionary, judging by the file name). The context manager makes sure the file
# handle is closed as soon as loading finishes.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open("../data/class_labels_dic.pkl", 'rb') as labels_file:
    target_names = pickle.load(labels_file)

In [3]:
# File name of the pickle that the results table is written to.
OUTPUT_NAME = 'results_TFIDF_NBLR-google.pkl'

# Directory that receives the results file; it is created up front if missing.
OUTPUT_DIR = '../results/'
for dir_ in [OUTPUT_DIR]:
    # exist_ok=True replaces the separate exists() check, so there is no window
    # between "check" and "create" in which another process could create the
    # directory and make makedirs fail.
    os.makedirs(dir_, exist_ok=True)

In [4]:
def _load_pickle(path):
    """Return the object unpickled from ``path``, closing the file afterwards."""
    # NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Training split: two feature files (suffixes 1_1 and 1_2) and one label file.
# What distinguishes the two variants is not visible here — see the notebook that
# produced the TFIDF_*.pkl files.
X_train_1_1 = _load_pickle('../data/wip/TFIDF/TFIDF_X_train_1_1.pkl')
X_train_1_2 = _load_pickle('../data/wip/TFIDF/TFIDF_X_train_1_2.pkl')
y_train = _load_pickle('../data/wip/TFIDF/TFIDF_y_train.pkl')

# Validation split, laid out the same way as the training split.
X_val_1_1 = _load_pickle('../data/wip/TFIDF/TFIDF_X_val_1_1.pkl')
X_val_1_2 = _load_pickle('../data/wip/TFIDF/TFIDF_X_val_1_2.pkl')
y_val = _load_pickle('../data/wip/TFIDF/TFIDF_y_val.pkl')

In [5]:
def run_classifier(clf, model_name):
    """Fit ``clf`` on each training feature set and evaluate it on the matching validation set.

    The same estimator object is re-fitted for every variant, so after the call
    it holds the fit from the last variant only.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)``
        Classifier to train; it is passed on to ``evaluate`` after fitting.
    model_name : str
        Prefix for the run label; the variant number and ``'_val'`` are appended.

    Returns
    -------
    list
        One entry per variant, in order, holding whatever ``evaluate`` returned.
    """
    # (variant id, training features, validation features) — read from notebook globals.
    variants = (
        ('1', X_train_1_1, X_val_1_1),
        ('2', X_train_1_2, X_val_1_2),
    )

    collected = []
    for version, train_features, val_features in variants:
        clf.fit(train_features, y_train)
        title = model_name + version + '_val'
        print("\n" + title + "\n")
        collected.append(evaluate(title, clf, val_features, y_val))
        print("=====================================================\n")
    return collected

In [6]:
# Banner so the Naive Bayes reports are easy to find in the cell output.
print("\nResults of Naive Bayes on the validation set\n")
# MultinomialNB is created with its default settings (no arguments are passed).
naivebayes = MultinomialNB()
# NOTE(review): the prefix 'NBtfifd' looks like a typo of 'tfidf'. It becomes part of
# the run label handed to evaluate() and shows up in the results table, so confirm
# nothing downstream keys on the current spelling before renaming it.
results_nb = run_classifier(naivebayes, 'NBtfifd')


Results of Naive Bayes on the validation set


NBtfifd1_val

Accuracy: 0.8903239436115398
Precision (macro): 0.805928704517044
Recall (macro): 0.7995244325816661
F1-score (macro): 0.8003980153644861

Classification report:
              precision    recall  f1-score   support

     Physics       0.97      0.93      0.95    153261
 Mathematics       0.85      0.89      0.87     60818
Computer Sc.       0.80      0.89      0.84     50476
       Other       0.61      0.49      0.55     14722

    accuracy                           0.89    279277
   macro avg       0.81      0.80      0.80    279277
weighted avg       0.89      0.89      0.89    279277



NBtfifd2_val

Accuracy: 0.8908180766765612
Precision (macro): 0.8009309571791775
Recall (macro): 0.7984639659822216
F1-score (macro): 0.7978496514726086

Classification report:
              precision    recall  f1-score   support

     Physics       0.97      0.93      0.95    153261
 Mathematics       0.85      0.89      0.87     60818

In [7]:
# Banner so the Logistic Regression reports are easy to find in the cell output.
print("\nResults of Logistic Regression on the validation set\n")
# LogisticRegression is created with its default settings (no arguments are passed).
# NOTE(review): warnings are silenced at the top of the notebook, so a convergence
# warning from the solver would not be shown here — TODO confirm the fit converges.
logreg = LogisticRegression()
# One result entry per feature variant, labelled 'LRtfidf1_val' and 'LRtfidf2_val'.
results_lr = run_classifier(logreg, 'LRtfidf')


Results of Logistic Regression on the validation set


LRtfidf1_val

Accuracy: 0.9165273187552143
Precision (macro): 0.8455097143859243
Recall (macro): 0.8231682215114892
F1-score (macro): 0.8324118381611826

Classification report:
              precision    recall  f1-score   support

     Physics       0.96      0.97      0.96    153261
 Mathematics       0.90      0.91      0.90     60818
Computer Sc.       0.86      0.89      0.87     50476
       Other       0.66      0.53      0.59     14722

    accuracy                           0.92    279277
   macro avg       0.85      0.82      0.83    279277
weighted avg       0.91      0.92      0.91    279277



LRtfidf2_val

Accuracy: 0.9172219695857518
Precision (macro): 0.8456523712730994
Recall (macro): 0.8225431512284617
F1-score (macro): 0.8320273849055693

Classification report:
              precision    recall  f1-score   support

     Physics       0.96      0.97      0.97    153261
 Mathematics       0.90      0.91      0.90 

In [8]:
# Build one frame per classifier and stack them into a single results table.
# Each frame keeps its own row labels, so index values repeat across classifiers.
df_result = pd.concat([pd.DataFrame(rows) for rows in (results_nb, results_lr)])

df_result

Unnamed: 0,Description,Accuracy,Precision,Recall,F1-score
0,NBtfifd1_val,0.890324,0.805929,0.799524,0.800398
1,NBtfifd2_val,0.890818,0.800931,0.798464,0.79785
0,LRtfidf1_val,0.916527,0.84551,0.823168,0.832412
1,LRtfidf2_val,0.917222,0.845652,0.822543,0.832027


In [9]:
# Persist the combined results table. os.path.join yields the same path as plain
# concatenation while OUTPUT_DIR ends with a separator, and still produces a valid
# path if the trailing separator is ever dropped from the configuration.
df_result.to_pickle(os.path.join(OUTPUT_DIR, OUTPUT_NAME))