In [1]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from numpy import load
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics

In [2]:
# Load the X_train, Y_train, X_test and Y_test
X_train = np.load('../data/fnc-1/x_train_doc2vec.npy')
X_test = np.load('../data/fnc-1/x_test_doc2vec.npy')
Y_train = np.load('../data/fnc-1/y_train.npy', allow_pickle=True)
Y_test = load('../data/fnc-1/y_test.npy', allow_pickle=True)

In [3]:
# Converting the labels into int type to prevent unknown type error 
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [4]:
# Random Forest Classifier 
randomForest = RandomForestClassifier(random_state=1)
randomForest.fit(X_train, Y_train)
Y_predictions = randomForest.predict(X_test)
print('Accuracy: ', accuracy_score(Y_test, Y_predictions) * 100, "%")
# Getting the confusion matrix
print('Confusion Matrix:\n', confusion_matrix(Y_test, Y_predictions))
# Precision score
print('Precision Score: ', precision_score(Y_test, Y_predictions, average="micro"))
# Recall score
print('Recall Score: ', recall_score(Y_test, Y_predictions, average="micro"))
print(classification_report(Y_test, Y_predictions))

Accuracy:  72.00251839609648 %
Confusion Matrix:
 [[    2     0   158  1743]
 [    0     0    89   608]
 [    0     0   266  4198]
 [    0     0   319 18030]]
Precision Score:  0.7200251839609648
Recall Score:  0.7200251839609648
              precision    recall  f1-score   support

           0       1.00      0.00      0.00      1903
           1       0.00      0.00      0.00       697
           2       0.32      0.06      0.10      4464
           3       0.73      0.98      0.84     18349

    accuracy                           0.72     25413
   macro avg       0.51      0.26      0.24     25413
weighted avg       0.66      0.72      0.62     25413



  _warn_prf(average, modifier, msg_start, len(result))


In [5]:
# Logitsic Regression Classifier 
logisticRegression = LogisticRegression(solver='saga', multi_class='multinomial')
logisticRegression.fit(X_train, Y_train)
lr_Y_predictions = logisticRegression.predict(X_test)
print('Accuracy: ', accuracy_score(Y_test, lr_Y_predictions) * 100, "%")
# Getting the confusion matrix
print('Confusion Matrix:\n', confusion_matrix(Y_test, lr_Y_predictions))
# Precision score
print('Precision Score: ', precision_score(Y_test, lr_Y_predictions, average="micro"))
# Recall score
print('Recall Score: ', recall_score(Y_test, lr_Y_predictions, average="micro"))
print(classification_report(Y_test, lr_Y_predictions))

Accuracy:  72.05760831070712 %
Confusion Matrix:
 [[   69    16    82  1736]
 [   50     9    40   598]
 [   36     6   209  4213]
 [   37     8   279 18025]]
Precision Score:  0.7205760831070712
Recall Score:  0.7205760831070712
              precision    recall  f1-score   support

           0       0.36      0.04      0.07      1903
           1       0.23      0.01      0.02       697
           2       0.34      0.05      0.08      4464
           3       0.73      0.98      0.84     18349

    accuracy                           0.72     25413
   macro avg       0.42      0.27      0.25     25413
weighted avg       0.62      0.72      0.63     25413





In [6]:
# Gaussian Naive Bayes Classifier 
gaussianNB = GaussianNB()
gaussianNB.fit(X_train, Y_train)
gnb_Y_predictions = gaussianNB.predict(X_test)
print('Accuracy: ', accuracy_score(Y_test, gnb_Y_predictions) * 100, "%")
# Getting the confusion matrix
print('Confusion Matrix:\n', confusion_matrix(Y_test, gnb_Y_predictions))
# Precision score
print('Precision Score: ', precision_score(Y_test, gnb_Y_predictions, average="micro"))
# Recall score
print('Recall Score: ', recall_score(Y_test, gnb_Y_predictions, average="micro"))
print(classification_report(Y_test, gnb_Y_predictions))

Accuracy:  60.48085625467281 %
Confusion Matrix:
 [[  113    75   523  1192]
 [   40    60   214   383]
 [  297   169  1179  2819]
 [  579   737  3015 14018]]
Precision Score:  0.6048085625467281
Recall Score:  0.6048085625467281
              precision    recall  f1-score   support

           0       0.11      0.06      0.08      1903
           1       0.06      0.09      0.07       697
           2       0.24      0.26      0.25      4464
           3       0.76      0.76      0.76     18349

    accuracy                           0.60     25413
   macro avg       0.29      0.29      0.29     25413
weighted avg       0.60      0.60      0.60     25413



In [7]:
# Bernoulli Naive Bayes Classifier 
bernoilliNB = BernoulliNB()
bernoilliNB.fit(X_train, Y_train)
bnb_Y_predictions = bernoilliNB.predict(X_test)
print('Accuracy: ', accuracy_score(Y_test, gnb_Y_predictions) * 100, "%")
# Getting the confusion matrix
print('Confusion Matrix:\n', confusion_matrix(Y_test, bnb_Y_predictions))
# Precision score
print('Precision Score: ', precision_score(Y_test, bnb_Y_predictions, average="micro"))
# Recall score
print('Recall Score: ', recall_score(Y_test, bnb_Y_predictions, average="micro"))
print(classification_report(Y_test, bnb_Y_predictions))

Accuracy:  60.48085625467281 %
Confusion Matrix:
 [[  259    61   606   977]
 [  115    49   222   311]
 [  432    65  1783  2184]
 [ 1009   182  4011 13147]]
Precision Score:  0.5996143705977256
Recall Score:  0.5996143705977256
              precision    recall  f1-score   support

           0       0.14      0.14      0.14      1903
           1       0.14      0.07      0.09       697
           2       0.27      0.40      0.32      4464
           3       0.79      0.72      0.75     18349

    accuracy                           0.60     25413
   macro avg       0.34      0.33      0.33     25413
weighted avg       0.63      0.60      0.61     25413



In [8]:
# Decision Tree Classifier 
decisionTree = DecisionTreeClassifier(random_state=1)
decisionTree.fit(X_train, Y_train )
dt_Y_predictions = decisionTree.predict(X_test)
print('Accuracy: ', accuracy_score(Y_test, dt_Y_predictions) * 100, "%")
# Getting the confusion matrix
print('Confusion Matrix:\n', confusion_matrix(Y_test, dt_Y_predictions))
# Precision score
print('Precision Score: ', precision_score(Y_test, dt_Y_predictions, average="micro"))
# Recall score
print('Recall Score: ', recall_score(Y_test, dt_Y_predictions, average="micro"))
print(classification_report(Y_test, dt_Y_predictions))

Accuracy:  57.10069649392043 %
Confusion Matrix:
 [[  208    53   410  1232]
 [   76    21   190   410]
 [  472   116  1009  2867]
 [ 1491   335  3250 13273]]
Precision Score:  0.5710069649392043
Recall Score:  0.5710069649392043
              precision    recall  f1-score   support

           0       0.09      0.11      0.10      1903
           1       0.04      0.03      0.03       697
           2       0.21      0.23      0.22      4464
           3       0.75      0.72      0.73     18349

    accuracy                           0.57     25413
   macro avg       0.27      0.27      0.27     25413
weighted avg       0.58      0.57      0.58     25413



In [None]:
# # LinearSVC Classifier 
# linearSVC = LinearSVC(multi_class='crammer_singer', C=0.5)
# linearSVC.fit(X_train, Y_train )
# lsvc_Y_predictions = linearSVC.predict(X_test)
# print('Accuracy: ', accuracy_score(Y_test, lsvc_Y_predictions) * 100, "%")
# # Getting the confusion matrix
# print('Confusion Matrix:\n', confusion_matrix(Y_test, lsvc_Y_predictions))
# # Precision score
# print('Precision Score: ', precision_score(Y_test, lsvc_Y_predictions, average="micro"))
# # Recall score
# print('Recall Score: ', recall_score(Y_test, lsvc_Y_predictions, average="micro"))
# print(classification_report(Y_test, lsvc_Y_predictions))

In [9]:
# MLP Classifier(Neural Net)
mlpClassifier = MLPClassifier(random_state=1)
mlpClassifier.fit(X_train, Y_train )
mlp_Y_predictions = mlpClassifier.predict(X_test)
print('Accuracy: ', accuracy_score(Y_test, mlp_Y_predictions) * 100, "%")
# Getting the confusion matrix
print('Confusion Matrix:\n', confusion_matrix(Y_test, mlp_Y_predictions))
# Precision score
print('Precision Score: ', precision_score(Y_test, mlp_Y_predictions, average="micro"))
# Recall score
print('Recall Score: ', recall_score(Y_test, mlp_Y_predictions, average="micro"))
print(classification_report(Y_test, mlp_Y_predictions))

Accuracy:  62.86546255853304 %
Confusion Matrix:
 [[  207    24   330  1342]
 [   89     9   117   482]
 [  355    41   923  3145]
 [  950   122  2440 14837]]
Precision Score:  0.6286546255853304
Recall Score:  0.6286546255853304
              precision    recall  f1-score   support

           0       0.13      0.11      0.12      1903
           1       0.05      0.01      0.02       697
           2       0.24      0.21      0.22      4464
           3       0.75      0.81      0.78     18349

    accuracy                           0.63     25413
   macro avg       0.29      0.28      0.28     25413
weighted avg       0.59      0.63      0.61     25413



In [16]:
models_list = ['Random Forest', 'Logistic Regression', 'Gaussian NB', 'Bernoulli NB', 'Decision Tree', 'MLP']
rf_accuracy = accuracy_score(Y_test, Y_predictions) * 100 
lr_accuracy = accuracy_score(Y_test, lr_Y_predictions) * 100
gnb_accuracy = accuracy_score(Y_test, gnb_Y_predictions) * 100
bnb_accuracy = accuracy_score(Y_test, bnb_Y_predictions) * 100
dt_accuracy = accuracy_score(Y_test, dt_Y_predictions) * 100
# lsvc_accuracy = accuracy_score(Y_test, lsvc_Y_predictions) * 100
mlp_accuracy = accuracy_score(Y_test, mlp_Y_predictions) * 100
models_accuracy = [rf_accuracy, lr_accuracy, gnb_accuracy, bnb_accuracy, dt_accuracy,mlp_accuracy]
summary = {'model': models_list, 'accuracy': models_accuracy}
models_summary = pd.DataFrame(summary)
models_summary.set_index('model', inplace=True)
models_summary.index.name = None

In [17]:
models_summary

Unnamed: 0,accuracy
Random Forest,72.002518
Logistic Regression,72.057608
Gaussian NB,60.480856
Bernoulli NB,59.961437
Decision Tree,57.100696
MLP,62.865463
