# Exercise 6 - Part C

### Necessary imports

In [150]:
import pandas as pd
import numpy as np
import os
import json
import multiprocessing
import random
from sklearn import utils
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

### Dataset loading

In [151]:
# Build the annotated dataset as a pandas dataframe
def _majority_vote(votes: list):
    """Return the most frequent vote; ties go to the value that was cast first."""
    return max(votes, key=votes.count)


def load_dataset(data_dir: str = "data/", base_file: str = "1.json", ignored_files: tuple = ("glove.6B.100d.txt",)) -> pd.DataFrame:
    """Load the synsets and merge the annotations of every annotator file.

    ``base_file`` provides the synsets (``data["dataset"]``) together with the
    first annotator's answers. Every other file in ``data_dir`` (except the
    names in ``ignored_files`` and sub-directories) is read as one more
    annotator and contributes its ``answers``, ``timeDiffs`` and ``isHard``
    values position by position.

    Parameters
    ----------
    data_dir : directory that contains the JSON annotation files.
    base_file : name of the file that also carries the synset descriptions.
    ignored_files : file names in ``data_dir`` that are not annotation files.

    Returns
    -------
    pd.DataFrame with the columns ``synset``, ``synonyms``, ``definition``,
    ``target`` (majority answer, 'basic' -> 0 / 'advanced' -> 1),
    ``timeDiffs`` (mean over annotators) and ``isHard`` (majority value,
    'False' -> 0 / 'True' -> 1).

    Raises
    ------
    FileNotFoundError if ``base_file`` is missing.
    IndexError if an annotator file holds more answers than there are synsets.
    """
    def read_json(filename: str) -> dict:
        with open(os.path.join(data_dir, filename), 'r') as f:
            return json.load(f)

    # The base file is always read first: os.listdir gives no ordering guarantee,
    # and the other files can only be merged once the rows exist.
    base = read_json(base_file)
    rows = []
    for index, synset in enumerate(base["dataset"]):
        parts = synset.split(":")
        rows.append({
            "synset": parts[0],
            "synonyms": parts[1].split("|")[0].strip(),
            "definition": parts[2].strip(),
            # Votes are collected in lists so a single annotator works as well
            "target": [base["answers"][index]],
            "timeDiffs": [float(base["timeDiffs"][index])],
            "isHard": [str(base["isHard"][index])],
        })

    # Sorted for a deterministic merge order (it decides how ties are broken)
    for filename in sorted(os.listdir(data_dir)):
        if filename == base_file or filename in ignored_files:
            continue
        if not os.path.isfile(os.path.join(data_dir, filename)):
            continue
        data = read_json(filename)
        for index, answer in enumerate(data["answers"]):
            rows[index]["target"].append(answer)
            rows[index]["timeDiffs"].append(float(data["timeDiffs"][index]))
            rows[index]["isHard"].append(str(data["isHard"][index]))

    # Build the frame once instead of appending row by row
    df = pd.DataFrame(rows, columns=["synset", "synonyms", "definition", "target", "timeDiffs", "isHard"])

    # Produce a unique target label and isHard value: the most common one
    df['target'] = df['target'].apply(_majority_vote).map({'basic': 0, 'advanced': 1})
    df['isHard'] = df['isHard'].apply(_majority_vote).map({'False': 0, 'True': 1})

    # Average timeDiffs for each synset
    df['timeDiffs'] = df['timeDiffs'].apply(lambda values: np.mean(values))

    return df

## First Method: Doc2vec + Random Forest Classifier

- Doc2vec Embeddings for definitions

### Pre-processing of definitions and creating the tagged documents for doc2vec model

In [152]:
# Load the data, then lowercase and tokenize every definition with gensim
df = load_dataset()
df['definition'] = df['definition'].map(simple_preprocess)


def _as_tagged_document(row):
    # Pair the tokenized definition with its target label, used as the document tag
    return TaggedDocument(row['definition'], [row['target']])


# One tagged document per definition
df['tagged_docs'] = df.apply(_as_tagged_document, axis=1)

### Doc2vec model training

In [153]:
def doc2vec_model(train_tagged: list) -> Doc2Vec:
    """Build a Doc2Vec model (dm=0) and train it on the given tagged documents.

    The vocabulary is built from ``train_tagged``; training then runs on a
    shuffled copy of the documents for the number of epochs configured on the
    model. The trained model is returned.
    """
    # One worker thread per available CPU core
    dbow = Doc2Vec(
        dm=0,
        vector_size=300,
        negative=5,
        hs=0,
        min_count=2,
        sample=0,
        workers=multiprocessing.cpu_count(),
        epochs=30,
    )
    dbow.build_vocab(train_tagged)

    # Train on a shuffled copy of the documents
    shuffled_docs = utils.shuffle(train_tagged)
    dbow.train(shuffled_docs, total_examples=len(train_tagged), epochs=dbow.epochs)

    return dbow

In [154]:
# Creating the final feature vectors and labels for the classifier
def vec_for_classifier(model: "Doc2Vec", tagged_docs: list, epochs: int = 20) -> tuple[tuple, tuple]:
    """Infer one feature vector per tagged document and collect its label.

    Parameters
    ----------
    model : trained Doc2Vec model used to infer the vectors.
    tagged_docs : iterable of tagged documents; the first tag of each document
        is taken as its target label.
    epochs : number of inference epochs passed to ``model.infer_vector``.

    Returns
    -------
    (X, y): a tuple of inferred vectors and a tuple of labels, in the order of
    ``tagged_docs``. Both are empty tuples when ``tagged_docs`` is empty
    (unpacking ``zip(*[])`` used to raise a ValueError in that case).
    """
    vectors = []
    labels = []
    for doc in tagged_docs:
        labels.append(doc.tags[0])
        vectors.append(model.infer_vector(doc.words, epochs=epochs))
    return tuple(vectors), tuple(labels)

### Hyper-parameter search and validation on repeated random train/test splits

In [155]:
# Grid search for the Random Forest hyper-parameters.
# The Doc2Vec model is trained on all tagged documents and the inferred vectors feed the search.
model_dbow = doc2vec_model(df['tagged_docs'])
X, y = vec_for_classifier(model_dbow, df['tagged_docs'])

param_grid = dict(
    n_estimators=[200, 500],
    max_features=['sqrt', 'log2'],
    max_depth=[4, 5],
    criterion=['gini', 'entropy'],
)

rfc = RandomForestClassifier()
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=3, refit=True, verbose=1)
CV_rfc.fit(X, y)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [156]:
# Repeated hold-out validation: 10 random 80/20 train/test splits (not a k-fold partition)
accuracy = []
f1 = []

for i in range(10):

    # Split the dataset into train and test. The repetition index is used as the seed so the
    # splits are the same on every run (a fresh random seed per run made the scores unrepeatable).
    X_train, X_test, y_train, y_test = train_test_split(df['tagged_docs'], df['target'], test_size=0.2, random_state=i)

    # Doc2Vec is trained on the training documents only, then used to infer both sets.
    # Its training is still stochastic, so small variations between runs remain.
    model_dbow = doc2vec_model(X_train)
    X_train, y_train = vec_for_classifier(model_dbow, X_train)
    X_test, y_test = vec_for_classifier(model_dbow, X_test)

    # Re-use the hyper-parameters selected by the grid search
    rfc = RandomForestClassifier(**CV_rfc.best_params_, random_state=i)
    rfc.fit(X_train, y_train)

    y_pred = rfc.predict(X_test)
    accuracy.append(accuracy_score(y_test, y_pred))
    f1.append(f1_score(y_test, y_pred, average='weighted'))

print("Mean Accuracy: ", np.mean(accuracy))
print("Mean F1 score: ", np.mean(f1))

Mean Accuracy:  0.5742574257425742
Mean F1 score:  0.5737887176618872


## Second Method: Sentence Bert + Support Vector Classifier

- Sentence Bert Embeddings for definitions

In [157]:
# Reloading the dataset (the previous section replaced the definitions with token lists)
df = load_dataset()

# Obtaining the Bert Embeddings
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Compute one embedding per definition.
# - A plain list of strings is passed (as in the fourth method) rather than a pandas Series,
#   so the result does not depend on how encode() indexes a Series.
# - The embeddings are requested as a NumPy array: they are only consumed by scikit-learn,
#   which cannot implicitly convert a tensor that lives on a GPU.
bert_embeddings = model.encode(df['definition'].to_list(), convert_to_numpy=True)

### Hyper-parameter search and validation on repeated random train/test splits

In [158]:
# Grid search for the SVC hyper-parameters (10-fold cross-validation inside GridSearchCV)
param_grid = dict(
    C=[1, 10, 100, 1000],
    gamma=[1, 0.1, 0.001, 0.0001],
    kernel=['linear', 'rbf'],
)

svc = SVC()
CV_svc = GridSearchCV(estimator=svc, param_grid=param_grid, cv=10, refit=True, verbose=1)
CV_svc.fit(bert_embeddings, df['target'])

Fitting 10 folds for each of 32 candidates, totalling 320 fits


In [159]:
# Repeated hold-out validation: 10 random 80/20 train/test splits (not a k-fold partition)
accuracy = []
f1 = []

for i in range(10):
    # Splitting the dataset in train and test. The repetition index is used as the seed so the
    # splits (and therefore the reported means) are the same on every run.
    X_train, X_test, y_train, y_test = train_test_split(bert_embeddings, df['target'], test_size=0.2, random_state=i)

    # Training the model with the hyper-parameters selected by the grid search
    svc = SVC(**CV_svc.best_params_)
    svc.fit(X_train, y_train)

    # Predicting the test set
    y_pred = svc.predict(X_test)
    accuracy.append(accuracy_score(y_test, y_pred))
    f1.append(f1_score(y_test, y_pred, average='weighted'))

print("Mean Accuracy: ", np.mean(accuracy))
print("Mean F1 score: ", np.mean(f1))

Mean Accuracy:  0.7019801980198019
Mean F1 score:  0.7010795224798815


## Third Method: Glove embeddings + DNN

- Embeddings of the synonyms of each synset (i.e. the lemmas of the synset)

### Preparation of the Tokenizer and the embedding matrix for the DNN model

In [160]:
# Loading the GloVe Word Embeddings
def load_glove_embeddings(path: str) -> dict:
    """Parse a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-empty line is split on whitespace: the first token is the word,
    the remaining tokens are converted to a float32 NumPy array.
    Blank lines are skipped instead of raising an IndexError.
    """
    embeddings = {}
    with open(path, encoding="utf8") as f:
        for line in f:
            values = line.split()
            if not values:
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype="float32")
    return embeddings


embeddings_index = load_glove_embeddings('data/glove.6B.100d.txt')

In [161]:
def get_tokenizer(training_set: pd.Series, test_set: pd.Series, df: pd.DataFrame) -> tuple[np.ndarray, np.ndarray, Tokenizer, int, int]:
    """Fit a Keras Tokenizer on the training texts and integer-encode/pad both sets.

    Parameters
    ----------
    training_set : texts the tokenizer is fitted on.
    test_set : texts encoded with the tokenizer fitted on ``training_set``.
    df : full dataframe; its ``synonyms`` column gives a lower bound for the
        padding length.

    Returns
    -------
    (padded_train, padded_test, tokenizer, vocab_size, max_length) where
    ``vocab_size`` is the number of known words plus one (index 0 is never
    assigned to a word) and ``max_length`` is the common sequence length.
    """
    # Prepare tokenizer: the Tokenizer converts the words into integers
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(training_set)
    vocab_size = len(tokenizer.word_index) + 1

    # Integer encode the documents
    encoded_train = tokenizer.texts_to_sequences(training_set)
    encoded_test = tokenizer.texts_to_sequences(test_set)

    # Pad to a fixed length. The whitespace word count of the longest synonym group is kept as
    # a lower bound, but the Tokenizer's default filters also split on punctuation such as
    # '_' and '-', so an encoded sequence can be longer than that count. Taking the longest
    # encoded sequence into account prevents pad_sequences from silently truncating it.
    whitespace_bound = max((len(s.split()) for s in df['synonyms']), default=0)
    longest_encoded = max((len(seq) for seq in encoded_train + encoded_test), default=0)
    max_length = max(whitespace_bound, longest_encoded, 1)

    padded_train = pad_sequences(encoded_train, maxlen=max_length, padding='post')
    padded_test = pad_sequences(encoded_test, maxlen=max_length, padding='post')

    return padded_train, padded_test, tokenizer, vocab_size, max_length

In [162]:
# Creating Embedding matrix: a matrix of one embedding for each unique word in the training dataset
# The result is a matrix of weights only for words we will see during training, with the embedding
# weights obtained from the GloVe embedding

def get_embedding_matrix(tokenizer: "Tokenizer", vocab_size: int, embedding_dim: int = 100, embeddings=None) -> np.ndarray:
    """Build the weight matrix for the Embedding layer.

    Parameters
    ----------
    tokenizer : fitted tokenizer; its ``word_index`` maps each word to a row index.
    vocab_size : number of rows of the matrix.
    embedding_dim : length of each word vector (100 for the GloVe file used here).
    embeddings : ``{word: vector}`` lookup; defaults to the module-level
        ``embeddings_index``.

    Returns
    -------
    Array of shape ``(vocab_size, embedding_dim)``. Rows of words missing from
    ``embeddings`` are left at zero.
    """
    if embeddings is None:
        embeddings = embeddings_index

    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [163]:
def fit_model(padded_train: np.ndarray, embedding_matrix: np.ndarray, vocab_size: int, target: pd.Series, max_length: int) -> Sequential:
    """Build, compile and train the classification network, then return it.

    The network stacks a frozen Embedding layer initialised with
    ``embedding_matrix``, a Flatten layer and a single sigmoid unit. It is
    compiled with Adam and binary cross-entropy and fitted on
    ``padded_train`` / ``target`` for 50 epochs without progress output.
    """
    layers = [
        # Embedding weights are seeded from the given matrix and not updated during training
        Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length, trainable=False),
        # Collapse the (sequence, embedding) output into one vector per sample
        Flatten(),
        # Single output unit for the binary decision
        Dense(1, activation='sigmoid'),
    ]
    classifier = Sequential(layers)

    classifier.compile(optimizer='adam', loss='binary_crossentropy')
    classifier.fit(padded_train, target, epochs=50, verbose=0)

    return classifier

### Validation of the model on repeated random train/test splits

In [164]:
# Repeated hold-out validation: 10 random 80/20 train/test splits (not a k-fold partition)
accuracy = []
f1 = []

for i in range(10):
    # Splitting the dataset in train and test. The repetition index is used as the seed so the
    # splits are the same on every run; the network training itself remains stochastic.
    X_train, X_test, y_train, y_test = train_test_split(df['synonyms'], df['target'], test_size=0.2, random_state=i)

    # NOTE: despite its name, y_padded holds the padded *test inputs*, not labels
    X_padded, y_padded, tokenizer, vocab_size, max_length = get_tokenizer(X_train, X_test, df)
    embeddings_matrix = get_embedding_matrix(tokenizer, vocab_size)
    model = fit_model(X_padded, embeddings_matrix, vocab_size, y_train, max_length)

    # Predicting the test set: sigmoid outputs are rounded to the 0/1 labels
    y_pred = model.predict(y_padded).round()
    accuracy.append(accuracy_score(y_test, y_pred))
    f1.append(f1_score(y_test, y_pred, average='weighted'))

print("Mean Accuracy: ", np.mean(accuracy))
print("Mean F1 score: ", np.mean(f1))

Mean Accuracy:  0.6594059405940593
Mean F1 score:  0.6579329778365641


## Fourth Method: Sentence Bert + Glove + RF

- Sentence Bert embeddings for the definitions
- Glove embeddings for the synonyms
- RF model for the classification (because it performs better than the SVM or the DNN)

### Loading dataset, GloVe embeddings and defining two support functions for getting the embeddings of the synonyms and obtaining the final embeddings

In [165]:
# Reloading the dataset and GloVe embeddings
df = load_dataset()

# Rebuild the word -> vector lookup from the GloVe text file:
# the first token of each line is the word, the remaining tokens are its float32 coefficients
embeddings_index = {}
with open('data/glove.6B.100d.txt', encoding="utf8") as glove_file:
    for raw_line in glove_file:
        tokens = raw_line.split()
        embeddings_index[tokens[0]] = np.asarray(tokens[1:], dtype="float32")

In [166]:
# For each group of synonyms, compute the sum of the GloVe embeddings of its words

def get_synonyms_embeddings(df: pd.DataFrame, embeddings=None) -> list:
    """Return one summed word-embedding vector per row of ``df['synonyms']``.

    Parameters
    ----------
    df : dataframe whose ``synonyms`` column holds the synonyms of a synset
        separated by ``', '``.
    embeddings : ``{word: vector}`` lookup; defaults to the module-level
        ``embeddings_index``.

    Returns
    -------
    List with one NumPy vector per row: the element-wise sum of the vectors of
    its synonyms. A synonym that is not in the lookup contributes a zero vector.
    """
    if embeddings is None:
        embeddings = embeddings_index

    # Size of the zero vector: taken from the lookup itself, 100 if the lookup is empty
    dim = len(next(iter(embeddings.values()))) if embeddings else 100

    embeddings_synonyms = []
    for synonym_group in df['synonyms']:
        word_vectors = []
        for word in synonym_group.split(', '):
            vector = embeddings.get(word)
            if vector is None:
                # The glove.6B vocabulary is uncased, so a capitalised lemma can only
                # match through its lowercase form
                vector = embeddings.get(word.lower())
            if vector is None:
                vector = np.zeros(dim)
            word_vectors.append(vector)
        embeddings_synonyms.append(np.sum(word_vectors, axis=0))

    return embeddings_synonyms

In [167]:
def _embedding_to_numpy(vector) -> np.ndarray:
    """Convert a tensor-like or array-like embedding to a NumPy array."""
    if hasattr(vector, 'detach'):
        # Tensor input: detach it and move it to the CPU before converting,
        # otherwise .numpy() fails for tensors that live on a GPU
        return vector.detach().cpu().numpy()
    return np.asarray(vector)


def get_final_embeddings(df: pd.DataFrame, bert_embeddings_definition: list, glove_embeddings_synonyms: list) -> list:
    """Concatenate, for every row, all the features into one flat vector.

    Each output vector is: the definition embedding, followed by the synonyms
    embedding, followed by the row's ``timeDiffs`` and ``isHard`` values.

    Parameters
    ----------
    df : dataframe providing the ``timeDiffs`` and ``isHard`` columns.
    bert_embeddings_definition : one definition embedding per row (tensor or
        array-like).
    glove_embeddings_synonyms : one synonyms embedding per row.

    Returns
    -------
    List of ``len(df)`` lists of numbers.

    Raises
    ------
    ValueError if the number of embeddings does not match the number of rows.
    """
    n_rows = len(df)
    if len(bert_embeddings_definition) != n_rows or len(glove_embeddings_synonyms) != n_rows:
        raise ValueError("The number of embeddings must match the number of rows of the dataframe")

    final_embeddings = []
    for i in range(n_rows):
        emb = _embedding_to_numpy(bert_embeddings_definition[i]).tolist()
        emb.extend(glove_embeddings_synonyms[i])
        emb.append(df['timeDiffs'].iloc[i])
        emb.append(df['isHard'].iloc[i])
        final_embeddings.append(emb)

    return final_embeddings

### Computing the GloVe embeddings for the synonyms and the Sentence Bert embeddings for the definitions, then concatenating them into a single matrix together with the "timeDiffs" and "isHard" features

In [168]:
# Sentence-BERT encoder and the embeddings of the definitions
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
definitions = df['definition'].to_list()
bert_embeddings_definition = model.encode(definitions, convert_to_tensor=True)

# GloVe-based embeddings of the synonyms
glove_embeddings_synonyms = get_synonyms_embeddings(df)

# One feature vector per row, stored in a new column of the dataset
final_embeddings = get_final_embeddings(df, bert_embeddings_definition, glove_embeddings_synonyms)
df['embeddings'] = final_embeddings

### Hyper-parameter search and validation on repeated random train/test splits

In [169]:
# Grid search for the Random Forest hyper-parameters on the combined feature vectors
param_grid = dict(
    n_estimators=[200, 500],
    max_features=['sqrt', 'log2'],
    max_depth=[4, 5],
    criterion=['gini', 'entropy'],
)

rfc = RandomForestClassifier()
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=3, refit=True, verbose=1)
CV_rfc.fit(df['embeddings'].tolist(), df['target'].tolist())

Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [170]:
# Repeated hold-out validation: 10 random 80/20 train/test splits (not a k-fold partition)
accuracy = []
f1 = []

for i in range(10):
    # Splitting the dataset in train and test. The repetition index is used as the seed so the
    # splits (and therefore the reported means) are the same on every run.
    X_train, X_test, y_train, y_test = train_test_split(df['embeddings'].tolist(), df['target'].tolist(), test_size=0.2, random_state=i)

    # Training the model with the hyper-parameters selected by the grid search
    rfc = RandomForestClassifier(**CV_rfc.best_params_, random_state=i)
    rfc.fit(X_train, y_train)

    # Predicting the test set
    y_pred = rfc.predict(X_test)
    accuracy.append(accuracy_score(y_test, y_pred))
    f1.append(f1_score(y_test, y_pred, average='weighted'))

print("Mean Accuracy: ", np.mean(accuracy))
print("Mean F1 score: ", np.mean(f1))

Mean Accuracy:  0.8465346534653465
Mean F1 score:  0.8454603245599493
