# Exercise 6 - Part C

### Necessary imports

In [1]:
import pandas as pd
import numpy as np
import os
import json
import multiprocessing
import random
from sklearn import utils
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier

### Dataset loading

In [2]:
# Build the dataset: one row per synset, labelled with the annotators' majority vote.
DATA_DIR = "data/"
BASE_FILE = "1.json"  # the only file whose "dataset" entry (the synsets) is read

# os.listdir returns entries in arbitrary order, so sort them to make the run
# deterministic, and keep only the JSON files so that hidden files or
# sub-directories (e.g. .ipynb_checkpoints) do not break open()/json parsing.
annotation_files = sorted(
    name for name in os.listdir(DATA_DIR) if name.endswith(".json")
)
if BASE_FILE not in annotation_files:
    raise FileNotFoundError("{} not found in {}".format(BASE_FILE, DATA_DIR))

# The base file is always read first: it defines the rows that every other
# annotator's answers are attached to (by position).
with open(os.path.join(DATA_DIR, BASE_FILE), 'r', encoding="utf-8") as f:
    data = json.load(f)

records = []  # one dict per synset; turned into a DataFrame in a single step
votes = []    # votes[i] collects every annotator's label for row i
for index, synset in enumerate(data["dataset"]):
    fields = synset.split(":")
    records.append({
        "synset": fields[0],
        "synonyms": fields[1].split("|")[0].strip(),
        "definition": fields[2].strip(),
    })
    votes.append([data["answers"][index]])

# Every remaining file only contributes one extra vote per row.
for filename in annotation_files:
    if filename == BASE_FILE:
        continue
    with open(os.path.join(DATA_DIR, filename), 'r', encoding="utf-8") as f:
        data = json.load(f)
    for index, answer in enumerate(data["answers"]):
        votes[index].append(answer)

df = pd.DataFrame(records, columns=["synset", "synonyms", "definition"])

# Produce a unique target label per row: the most common vote. Candidates are
# sorted before taking the max so that ties are always resolved the same way
# (iterating a bare set of strings is not stable across interpreter runs).
df["target"] = [
    max(sorted(set(row_votes)), key=row_votes.count) for row_votes in votes
]
df['target'] = df['target'].map({'basic': 0, 'advanced': 1})

## First Method: Using doc2vec to create a vector representation of each definition and then using a RandomForestClassifier to classify the definitions

### Pre-processing of definitions

In [3]:
# Gensim pre-processing: simple_preprocess lowercases each definition and splits
# it into a list of tokens (it does not remove stopwords).
# The isinstance guard keeps this cell idempotent: if it is run again the column
# already holds token lists, which are left untouched instead of raising.
df['definition'] = df['definition'].apply(
    lambda text: simple_preprocess(text) if isinstance(text, str) else text
)

In [33]:
def tagged_documents(df: pd.DataFrame, test_size: float = 0.2, random_state: int = None) -> tuple:
    """Split the dataset and wrap both parts as Doc2Vec ``TaggedDocument`` lists.

    Args:
        df: frame with a 'definition' column (token lists) and a 'target' column.
        test_size: fraction of rows held out for testing (default 0.2).
        random_state: seed for the split. When None (the default) a random seed
            in [0, 1000] is drawn, so every call produces a different split;
            pass an int to make the split reproducible.

    Returns:
        A ``(train_tagged, test_tagged)`` tuple of lists. Each document is tagged
        with its own target label, which is how the label is carried along to
        the classifier later on.
    """
    if random_state is None:
        random_state = random.randint(0, 1000)

    # Splitting the dataset
    X_train, X_test, y_train, y_test = train_test_split(
        df['definition'], df['target'], test_size=test_size, random_state=random_state
    )

    # Creating the tagged documents for the Doc2Vec model; definitions and labels
    # come out of train_test_split in matching order, so they can be paired directly.
    train_tagged = [
        TaggedDocument(words, [label]) for words, label in zip(X_train, y_train.values)
    ]
    test_tagged = [
        TaggedDocument(words, [label]) for words, label in zip(X_test, y_test.values)
    ]

    return train_tagged, test_tagged

### Doc2vec model training

In [34]:
def doc2vec_model(train_tagged: list) -> Doc2Vec:
    """Build and train a Doc2Vec model (``dm=0``) on the tagged training documents.

    The vocabulary is built from the documents in their given order; training
    then runs on a shuffled copy for the number of epochs configured on the model.

    Args:
        train_tagged: list of TaggedDocument objects used for training.

    Returns:
        The trained Doc2Vec model.
    """
    corpus = list(train_tagged)

    # Model configuration: 300-dimensional vectors, negative sampling, one
    # worker thread per available CPU core.
    dbow = Doc2Vec(
        dm=0,
        vector_size=300,
        negative=5,
        hs=0,
        min_count=2,
        sample=0,
        workers=multiprocessing.cpu_count(),
        epochs=30,
    )
    dbow.build_vocab(corpus)

    # utils.shuffle returns a shuffled copy, so `corpus` itself keeps its order.
    dbow.train(
        utils.shuffle(corpus),
        total_examples=len(train_tagged),
        epochs=dbow.epochs,
    )

    return dbow

In [35]:
# Creating the final train and test vectors
def vec_for_classifier(model, tagged_docs, epochs=20):
    """Turn tagged documents into (labels, feature vectors) for a classifier.

    Args:
        model: object exposing ``infer_vector(words, epochs=...)`` (the trained
            Doc2Vec model).
        tagged_docs: iterable of documents with ``words`` and ``tags``
            attributes; the first tag is used as the label.
        epochs: number of inference epochs passed to ``infer_vector``
            (default 20).

    Returns:
        A ``(y, X)`` pair of tuples: the labels and the inferred vectors, in the
        same order as ``tagged_docs``. Both are empty tuples for empty input.
    """
    pairs = [
        (doc.tags[0], model.infer_vector(doc.words, epochs=epochs))
        for doc in tagged_docs
    ]
    if not pairs:
        # zip(*[]) yields nothing, so unpacking it into two names would raise.
        return (), ()
    y, X = zip(*pairs)
    return y, X

### Testing the doc2vec model with RandomForestClassifier

In [38]:
# Single hold-out run: split the data, train Doc2Vec on the training part,
# infer vectors for both parts, then fit and score a random forest.
train_tagged, test_tagged = tagged_documents(df)
model_dbow = doc2vec_model(train_tagged)

# Vectors are inferred for the training documents first, then the test documents.
(y_train, X_train), (y_test, X_test) = (
    vec_for_classifier(model_dbow, docs) for docs in (train_tagged, test_tagged)
)

# fit() returns the estimator itself, so construction and fitting can be chained.
rfc = RandomForestClassifier(max_depth=4).fit(X_train, y_train)
y_pred = rfc.predict(X_test)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Accuracy 0.5841584158415841
F1 score: 0.5776919797242883


### Repeated hold-out validation of the models

In [37]:
# Repeated random hold-out validation (this is not a true k-fold split): every
# run draws a fresh random train/test split via tagged_documents, retrains both
# the doc2vec model and the random forest from scratch, and scores the held-out
# part. The test sets of different runs may therefore overlap.
N_RUNS = 10

accuracy = []
f1 = []

for run_index in range(N_RUNS):
    train_tagged, test_tagged = tagged_documents(df)
    model_dbow = doc2vec_model(train_tagged)
    y_train, X_train = vec_for_classifier(model_dbow, train_tagged)
    y_test, X_test = vec_for_classifier(model_dbow, test_tagged)

    rfc = RandomForestClassifier(max_depth=4)
    rfc.fit(X_train, y_train)
    y_pred = rfc.predict(X_test)
    accuracy.append(accuracy_score(y_test, y_pred))
    f1.append(f1_score(y_test, y_pred, average='weighted'))

# Average the per-run scores over all runs.
print("Mean Accuracy: ", np.mean(accuracy))
print("Mean F1 score: ", np.mean(f1))

Mean Accuracy:  0.6198019801980197
Mean F1 score:  0.620067424332132
