In [None]:
import os
import sys
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import numpy as np

# Resolve the parent of the current working directory and expose it on
# sys.path (once), presumably so the project-local `src` package imported
# just below can be found when the notebook is run from its own folder.
module_path = os.path.abspath(os.path.join('..'))
already_registered = any(entry == module_path for entry in sys.path)
if not already_registered:
    sys.path.append(module_path)

from src.dataset import PreprocessData, Word2Vec


In [None]:
# config
# Where the corpus lives and which dataset to use.
data_dir, dataset = "../data", "PubMed_20k_RCT"

# Text-normalisation switches passed to PreprocessData; all disabled here.
lower = stemming = rem_stop_words = lemmatisation = False

# NOTE(review): `preprocess` is not read by any cell visible in this
# notebook section — confirm whether it is still needed.
preprocess = False

In [None]:
# Configure the project's PreprocessData helper from the config-cell values
# and obtain the three splits it returns, in the order dev, train, test.
preprocesser = PreprocessData(
    data_dir=data_dir,
    dataset=dataset,
    lower=lower,
    rem_stop_words=rem_stop_words,
    stemming=stemming,
    lemmatisation=lemmatisation,
)
dev, train, test = preprocesser.createFiles()

In [None]:
# Stack the three splits so the Word2Vec wrapper sees every sentence.
# NOTE(review): the corpus handed to Word2Vec includes the test sentences;
# presumably setup() trains the embeddings on it — confirm this is intended.
df = pd.concat([dev, train, test], ignore_index=True, axis=0)
word2vec = Word2Vec(corpus_file=df["Sentences"],
                    vector_size=1000,
                    window=10,
                    min_count=1,
                    sg=1)

word2vec.setup()

# A list comprehension yields a plain Python list, which has neither
# .tolist() nor .values (those belong to pandas/NumPy objects), so the
# previous `[...].tolist().values` raised AttributeError. Build NumPy arrays
# explicitly instead; scikit-learn estimators accept them directly.
y_train = np.array([word2vec.label_to_vec[label] for label in train["Labels"]])
x_train = np.array([word2vec.sentence_to_vector(sentence) for sentence in train["Sentences"]])

y_test = np.array([word2vec.label_to_vec[label] for label in test["Labels"]])
x_test = np.array([word2vec.sentence_to_vector(sentence) for sentence in test["Sentences"]])

In [None]:
def print_metrics(pred_test, y_test, pred_train, y_train):
    """Print accuracy for both splits plus the test confusion matrix and report.

    Args:
        pred_test: Predicted labels for the test split (array-like).
        y_test: Ground-truth labels for the test split (array-like).
        pred_train: Predicted labels for the training split (array-like).
        y_train: Ground-truth labels for the training split (array-like).

    Returns:
        None. All results are written to stdout.
    """
    # Coerce to arrays so `==` is always element-wise. Comparing two plain
    # Python lists yields a single bool, which would report the accuracy as
    # exactly 0.0 or 1.0 regardless of how many predictions match.
    pred_test, y_test = np.asarray(pred_test), np.asarray(y_test)
    pred_train, y_train = np.asarray(pred_train), np.asarray(y_train)

    print("test accuracy", str(np.mean(pred_test == y_test)))
    print("train accuracy", str(np.mean(pred_train == y_train)))
    print("\n Metrics and Confusion \n")
    print(metrics.confusion_matrix(y_test, pred_test))
    print(metrics.classification_report(y_test, pred_test))

In [None]:

# k-nearest-neighbours baseline with k = 25 on the sentence vectors.
# fit() returns the fitted estimator, so construction and fitting are chained.
model = KNeighborsClassifier(n_neighbors=25).fit(x_train, y_train)
pred_test = model.predict(x_test)
pred_train = model.predict(x_train)
print_metrics(pred_test, y_test, pred_train, y_train)

In [None]:

# Decision-tree baseline, limited to a depth of 25.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split, so equally good splits can be chosen differently from run to
# run; a fixed seed keeps the reported metrics reproducible.
model = DecisionTreeClassifier(max_depth=25, random_state=42)
model.fit(x_train, y_train)
pred_train = model.predict(x_train)
pred_test = model.predict(x_test)
print_metrics(pred_test, y_test, pred_train, y_train)

In [None]:
# Multi-layer perceptron baseline.
# random_state is pinned so weight initialisation and batch shuffling are
# repeatable across kernel restarts.
# NOTE(review): max_iter=1 limits training to a single epoch with the default
# solver, so scikit-learn will warn about non-convergence; presumably a
# quick smoke-test setting — confirm before trusting these metrics.
model = MLPClassifier(max_iter=1, random_state=42)
model.fit(x_train, y_train)
pred_train = model.predict(x_train)
pred_test = model.predict(x_test)
print_metrics(pred_test, y_test, pred_train, y_train)