# Exercise 6 - Part C

### Necessary imports

In [2]:
import pandas as pd
import numpy as np
import os
import json
import multiprocessing
from sklearn.model_selection import cross_val_score
from sklearn import utils
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
tqdm.pandas(desc="progress-bar")

### Dataset loading

In [3]:
# Build the dataset.
# "1.json" holds the synset entries ("dataset") together with the first annotator's
# answers; every other JSON file in the folder contributes one more list of answers,
# aligned by position with the entries of "1.json".
DATA_DIR = "data/"
BASE_FILE = "1.json"
LABEL_TO_ID = {'basic': 0, 'advanced': 1}


def _read_json(path):
    """Parse one annotation file, read as UTF-8 so the result is platform independent."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _majority_label(votes):
    """Return the most frequent label in `votes`; ties go to the earliest vote."""
    # Taking max() over the list (not a set) keeps tie-breaking deterministic:
    # set iteration order for strings changes between interpreter runs.
    return max(votes, key=votes.count)


# Load the base file explicitly first: os.listdir() gives no ordering guarantee,
# and the remaining files can only be merged once the rows exist.
base = _read_json(os.path.join(DATA_DIR, BASE_FILE))
base_answers = base["answers"]

records = []        # one [synset, synonyms, definition] row per entry
votes_per_row = []  # one list of annotator labels per entry
for index, entry in enumerate(base["dataset"]):
    # NOTE(review): any text after a third ':' in an entry is ignored, exactly as
    # before - confirm that definitions never contain ':'.
    parts = entry.split(":")
    records.append([
        parts[0],
        parts[1].split("|")[0].strip(),  # keep only the first synonym
        parts[2].strip(),
    ])
    votes_per_row.append([base_answers[index]])

# Merge the other annotators in a stable (sorted) order, skipping anything that is
# not a JSON file (e.g. OS metadata files or checkpoint folders).
other_files = sorted(
    name for name in os.listdir(DATA_DIR)
    if name != BASE_FILE and name.lower().endswith(".json")
)
for name in other_files:
    answers = _read_json(os.path.join(DATA_DIR, name))["answers"]
    if len(answers) > len(votes_per_row):
        raise ValueError(
            "{} has {} answers but the dataset has only {} entries".format(
                name, len(answers), len(votes_per_row)
            )
        )
    for votes, answer in zip(votes_per_row, answers):
        votes.append(answer)

# Build the frame once instead of growing it row by row
df = pd.DataFrame(records, columns=["synset", "synonyms", "definition"])

# Produce a unique target label per entry: the most common one among the annotators.
# Labels that are not keys of LABEL_TO_ID are turned into NaN by map().
df['target'] = [_majority_label(votes) for votes in votes_per_row]
df['target'] = df['target'].map(LABEL_TO_ID)

### Pre-processing of definitions

In [4]:
# Gensim simple_preprocess: lowercase and tokenize each definition, discarding
# tokens that are too short or too long. It does NOT remove stopwords.
# The raw text column is overwritten with the resulting token lists.
df['definition'] = df['definition'].map(simple_preprocess)

In [5]:
# Hold out 20% of the definitions for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    df['definition'],
    df['target'],
    test_size=0.2,
    random_state=42,
)

# Wrap every token list in a TaggedDocument whose single tag is its class label.
# Documents of the same class therefore share one tag.
train_tagged = [
    TaggedDocument(tokens, [y_train.iloc[pos]])
    for pos, tokens in enumerate(X_train)
]

# For the test documents the tag only carries the label to compare against later
test_tagged = [
    TaggedDocument(tokens, [y_test.iloc[pos]])
    for pos, tokens in enumerate(X_test)
]

### Doc2vec model training

In [6]:
# Configure a PV-DBOW Doc2Vec model (dm=0) with 300-dimensional vectors,
# negative sampling (5 noise words, no hierarchical softmax), a minimum word
# count of 2 and no down-sampling of frequent words; one worker per CPU core.
cores = multiprocessing.cpu_count()
model_dbow = Doc2Vec(
    dm=0,
    vector_size=300,
    negative=5,
    hs=0,
    min_count=2,
    sample=0,
    workers=cores,
)

# Build the vocabulary from the training documents only
model_dbow.build_vocab(list(tqdm(train_tagged)))

100%|██████████| 403/403 [00:00<?, ?it/s]


In [7]:
# Training the Doc2Vec model
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged)]), total_examples=len(train_tagged), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 403/403 [00:00<00:00, 118567.94it/s]
100%|██████████| 403/403 [00:00<?, ?it/s]
100%|██████████| 403/403 [00:00<00:00, 49211.15it/s]
100%|██████████| 403/403 [00:00<?, ?it/s]
100%|██████████| 403/403 [00:00<?, ?it/s]
100%|██████████| 403/403 [00:00<?, ?it/s]
100%|██████████| 403/403 [00:00<00:00, 49973.53it/s]
100%|██████████| 403/403 [00:00<?, ?it/s]
100%|██████████| 403/403 [00:00<?, ?it/s]
100%|██████████| 403/403 [00:00<?, ?it/s]
100%|██████████| 403/403 [00:00<?, ?it/s]
100%|██████████| 403/403 [00:00<?, ?it/s]
100%|██████████| 403/403 [00:00<?, ?it/s]
100%|██████████| 403/403 [00:00<?, ?it/s]
100%|██████████| 403/403 [00:00<?, ?it/s]
100%|██████████| 403/403 [00:00<?, ?it/s]
100%|██████████| 403/403 [00:00<?, ?it/s]
100%|██████████| 403/403 [00:00<?, ?it/s]
100%|██████████| 403/403 [00:00<?, ?it/s]
100%|██████████| 403/403 [00:00<?, ?it/s]
100%|██████████| 403/403 [00:00<00:00, 106081.62it/s]
100%|██████████| 403/403 [00:00<00:00, 90671.84it/s]
100%|██████████| 40

In [8]:
def vec_for_learning(model, tagged_docs, epochs=20):
    """Turn tagged documents into (targets, feature vectors) for a classifier.

    Parameters
    ----------
    model : object exposing ``infer_vector(words, epochs=...)``
        The trained document-embedding model used to infer one vector per document.
    tagged_docs : iterable
        Documents exposing ``.words`` (the tokens) and ``.tags``; the first tag of
        each document is used as its target.
    epochs : int, default 20
        Number of inference epochs forwarded to ``model.infer_vector``.

    Returns
    -------
    (targets, regressors) : tuple of two tuples
        ``targets[i]`` is the first tag of document ``i`` and ``regressors[i]`` is
        its inferred vector. Both tuples are empty when ``tagged_docs`` is empty.
    """
    targets = []
    regressors = []
    for doc in tagged_docs:
        targets.append(doc.tags[0])
        regressors.append(model.infer_vector(doc.words, epochs=epochs))
    # Return tuples, matching the shape callers got from the former zip(*...) form;
    # building them explicitly also makes empty input return ((), ()) instead of failing.
    return tuple(targets), tuple(regressors)

In [12]:
# Infer a Doc2Vec vector for every train/test document; from here on X_* hold the
# inferred vectors and y_* the labels read back from the document tags
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

# Fit a logistic regression (large C, i.e. very weak regularization) and predict the test set
logreg = LogisticRegression(n_jobs=1, C=1e5, max_iter=2000)
y_pred = logreg.fit(X_train, y_train).predict(X_test)

# Report held-out accuracy and class-weighted F1
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred, average='weighted')
print('Testing accuracy %s' % test_accuracy)
print('Testing F1 score: {}'.format(test_f1))

Testing accuracy 0.5445544554455446
Testing F1 score: 0.5461746174617462


### K-fold cross-validation

In [13]:
# Score the classifier with 10-fold cross-validation over the training vectors
scores = cross_val_score(estimator=logreg, X=X_train, y=y_train, cv=10)
mean_cv_score = scores.mean()
print("Cross-validation mean score - Training Set: {}".format(mean_cv_score))

Cross-validation mean score - Training Set: 0.5007926829268292
