# Mini Project 1

In [None]:
# Smoke test: confirms the kernel executes cells.
print("Hello MP1!")


### Imported libraries used for the project
1. jupyter
2. scikit-learn
3. gensim
4. nltk
5. numpy
6. pandas
7. matplotlib

`conda install jupyter scikit-learn gensim nltk numpy pandas matplotlib`

## 1. Dataset Preparation & Analysis (5pts)

1.2. Load the dataset. You can use `gzip.open` and `json.load` to do that.

In [None]:
import gzip
import json

# gzip.open defaults to binary mode, which json.load accepts.  The `with`
# block closes the archive once the data is loaded, even if parsing raises.
with gzip.open('goemotions.json.gz') as dataset:
    dataset_json = json.load(dataset)


1.3. (5pts) Extract the posts and the 2 sets of labels (emotion and sentiment), then plot the distribution
of the posts in each category and save the graphic (a histogram or pie chart) in pdf. Do this for both
the emotion and the sentiment categories. You can use `matplotlib.pyplot` and `savefig` to do this.
This pre-analysis of the dataset will allow you to determine if the classes are balanced, and which
metric is more appropriate to use to evaluate the performance of your classifiers.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter

numpy_dataset = np.array(dataset_json)

# Column 1 is used as the emotion label and column 2 as the sentiment label
emotion_dataset_col = numpy_dataset[:, 1]
sentiment_dataset_col = numpy_dataset[:, 2]

# Count the number of posts in each category
emotion_count = Counter(emotion_dataset_col)
sentiment_count = Counter(sentiment_dataset_col)

# The dict views are materialised as lists so that pie() receives plain
# sequences.  The explicit ".pdf" extension makes savefig write a PDF, as the
# task requires; without an extension the default image format is used.
plt.pie(list(emotion_count.values()), labels=list(emotion_count.keys()))
plt.savefig('emotions_pie_chart.pdf')
plt.close()

plt.pie(list(sentiment_count.values()), labels=list(sentiment_count.keys()))
plt.savefig('sentiment_pie_chart.pdf')
plt.close()


## 2. Words as Features (35pts)

In [None]:
# We used this to speed up classification, but it's not necessary.
# import torch

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print("Device: " + str(device))

2.1. □ (5pts) Process the dataset using `feature_extraction.text.CountVectorizer` to extract tokens/words
and their frequencies. Display the number of tokens (the size of the vocabulary) in the dataset.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# The post text is taken from column 0 of the dataset.
phrases = numpy_dataset[:, 0]

# Learn the vocabulary and build the token-count matrix in a single pass.
vectorizer_emotions = CountVectorizer()
X_emotions = vectorizer_emotions.fit_transform(phrases)

# The fitted vocabulary has exactly one entry per feature name.
print("Number of features (tokens in the vocabulary) =",
      len(vectorizer_emotions.vocabulary_))


In [None]:
emotions = numpy_dataset[:, 1]

# Build each "<phrase> <emotion>" string as a Python str first and only then
# create the array.  Writing the longer strings into a copy of `phrases`
# (a fixed-width NumPy unicode array) silently truncates anything that exceeds
# the array's item size, which can cut off the appended emotion label.
emotions_and_phrases = np.array(
    [f"{phrase} {emotion}" for phrase, emotion in zip(phrases, emotions)])

vectorizer_sentiments = CountVectorizer()
X_sentiments = vectorizer_sentiments.fit_transform(emotions_and_phrases)

print("Number of features (tokens in the vocabulary) including emotions =",
      len(vectorizer_sentiments.get_feature_names_out()))


2.2. □ (2pts) Split the dataset into 80% for training and 20% for testing. For this, you can use `train_test_split`.

In [None]:
# Split the dataset
from sklearn.model_selection import train_test_split

# A single call applies the SAME shuffled indices to the raw rows and to both
# feature matrices, so row i of every training/testing matrix still belongs to
# row i of the corresponding label array.  Separate calls shuffle
# independently and pair features with the wrong labels.
# The fixed random_state makes the split reproducible across runs.
(training_dataset, testing_dataset,
 training_X_emotions, testing_X_emotions,
 training_X_sentiments, testing_X_sentiments) = train_test_split(
    numpy_dataset, X_emotions, X_sentiments,
    train_size=0.8, test_size=0.2, random_state=42)

# Print the size of both datasets
print("Size of training set =", training_dataset.shape[0])
print("Size of testing set =", testing_dataset.shape[0])


2.3. Train and test the following classifiers, for both the emotion and the sentiment classification, using
word frequency as features.

* 2.3.1. □ (3pts) **Base-MNB**: a Multinomial Naive Bayes Classifier `(naive_bayes.MultinomialNB)`
with the default parameters.

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Base-MNB for emotions: default parameters, word counts as X and the emotion
# column (index 1) as y.  fit() returns the fitted estimator itself.
emotions_classifier_mb = MultinomialNB().fit(
    training_X_emotions, training_dataset[:, 1])

# Predicted emotion for every row of the test matrix
emotion_prediction_mb = emotions_classifier_mb.predict(testing_X_emotions)
print(emotion_prediction_mb)


In [None]:
# Part 2.4 for Multinomial classification
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

confusion_matrix_output = confusion_matrix(
    y_true=testing_dataset[:, 1], y_pred=emotion_prediction_mb)
class_report = classification_report(
    y_true=testing_dataset[:, 1], y_pred=emotion_prediction_mb, zero_division=0)

# "w" starts a fresh report file; the `with` block closes it even if a write fails.
with open("performance_2.3.1", "w") as performance_file:
    performance_file.write(
        "-----Emotions classification (Multinomial Naive Bayes)-----\n")
    # get_params() lists the estimator's hyper-parameter values
    # (n_features_in_ is only the number of input features).
    performance_file.write(
        f"Emotions hyperparameters = {emotions_classifier_mb.get_params()}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")


In [None]:
# Base-MNB for sentiments: default parameters, the phrase+emotion counts as X
# and the sentiment column (index 2) as y.  fit() returns the estimator itself.
sentiment_classifier_mb = MultinomialNB().fit(
    training_X_sentiments, training_dataset[:, 2])

# Predicted sentiment for every row of the test matrix
sentiment_prediction_mb = sentiment_classifier_mb.predict(testing_X_sentiments)
print(sentiment_prediction_mb)


In [None]:
# Append Sentiments results
confusion_matrix_output = confusion_matrix(
    y_true=testing_dataset[:, 2], y_pred=sentiment_prediction_mb)
class_report = classification_report(
    y_true=testing_dataset[:, 2], y_pred=sentiment_prediction_mb, zero_division=0)

# "a" appends below the emotions section; `with` guarantees the file is closed.
with open("performance_2.3.1", "a") as performance_file:
    performance_file.write(
        "-----Sentiments classification (Multinomial Naive Bayes)-----\n")
    # get_params() lists the estimator's hyper-parameter values
    # (n_features_in_ is only the number of input features).
    performance_file.write(
        f"Sentiments hyperparameters = {sentiment_classifier_mb.get_params()}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")


* 2.3.2. □ (3pts) **Base-DT:** a Decision Tree `(tree.DecisionTreeClassifier)` with the default parameters.

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Base-DT for emotions: default parameters, word counts as X and the emotion
# column (index 1) as y.  fit() returns the fitted estimator itself.
emotions_classifier_dt = DecisionTreeClassifier().fit(
    training_X_emotions, training_dataset[:, 1])

# Predicted emotion for every row of the test matrix
emotion_prediction_dt = emotions_classifier_dt.predict(testing_X_emotions)
print(emotion_prediction_dt)


In [None]:
# Part 2.4 for DecisionTree classification
confusion_matrix_output = confusion_matrix(
    y_true=testing_dataset[:, 1], y_pred=emotion_prediction_dt)
class_report = classification_report(
    y_true=testing_dataset[:, 1], y_pred=emotion_prediction_dt, zero_division=0)

# "w" starts a fresh report file; the `with` block closes it even if a write fails.
with open("performance_2.3.2", "w") as performance_file:
    performance_file.write("-----Emotions classification (Decision Tree)-----\n")
    # get_params() lists the estimator's hyper-parameter values
    # (n_features_in_ is only the number of input features).
    performance_file.write(
        f"Emotions hyperparameters = {emotions_classifier_dt.get_params()}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")


In [None]:
# Base-DT for sentiments: default parameters, the phrase+emotion counts as X
# and the sentiment column (index 2) as y.  fit() returns the estimator itself.
sentiment_classifier_dt = DecisionTreeClassifier().fit(
    training_X_sentiments, training_dataset[:, 2])

# Predicted sentiment for every row of the test matrix
sentiment_prediction_dt = sentiment_classifier_dt.predict(testing_X_sentiments)
print(sentiment_prediction_dt)


In [None]:

# Append Sentiments results
confusion_matrix_output = confusion_matrix(
    y_true=testing_dataset[:, 2], y_pred=sentiment_prediction_dt)
class_report = classification_report(
    y_true=testing_dataset[:, 2], y_pred=sentiment_prediction_dt, zero_division=0)

# "a" appends below the emotions section; `with` guarantees the file is closed.
with open("performance_2.3.2", "a") as performance_file:
    performance_file.write(
        "-----Sentiments classification (Decision Tree)-----\n")
    # get_params() lists the estimator's hyper-parameter values
    # (n_features_in_ is only the number of input features).
    performance_file.write(
        f"Sentiments hyperparameters = {sentiment_classifier_dt.get_params()}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")


* 2.3.3. □ (3pts) **Base-MLP:** a Multi-Layered Perceptron `(neural_network.MLPClassifier)` with the
default parameters.

In [None]:
# MLPClassifier lives in sklearn's neural_network module
from sklearn.neural_network import MLPClassifier

# Base-MLP for emotions, trained on word counts with the emotion column
# (index 1) as target; verbose=True prints training progress.
# NOTE(review): max_iter=1 is not the library default -- presumably chosen to
# keep the run time manageable; confirm before reporting this as "Base-MLP".
emotions_classifier_mlp = MLPClassifier(verbose=True, max_iter=1).fit(
    training_X_emotions, training_dataset[:, 1])

# Predicted emotion for every row of the test matrix
emotion_prediction_mlp = emotions_classifier_mlp.predict(testing_X_emotions)
print(emotion_prediction_mlp)

In [None]:
# Part 2.4 for MLP classification
confusion_matrix_output = confusion_matrix(
    y_true=testing_dataset[:, 1], y_pred=emotion_prediction_mlp)
class_report = classification_report(
    y_true=testing_dataset[:, 1], y_pred=emotion_prediction_mlp, zero_division=0)

# "w" starts a fresh report file; the `with` block closes it even if a write fails.
with open("performance_2.3.3", "w") as performance_file:
    performance_file.write("-----Emotions classification (multi layer perceptron)-----\n")
    # get_params() lists the estimator's hyper-parameter values
    # (n_features_in_ is only the number of input features).
    performance_file.write(
        f"Emotions hyperparameters = {emotions_classifier_mlp.get_params()}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")


In [None]:
# Base-MLP for sentiments, trained on the phrase+emotion counts with the
# sentiment column (index 2) as target; verbose=True prints training progress.
# NOTE(review): max_iter=1 is not the library default -- presumably chosen to
# keep the run time manageable; confirm.
sentiment_classifier_mlp = MLPClassifier(verbose=True, max_iter=1).fit(
    training_X_sentiments, training_dataset[:, 2])

# Predicted sentiment for every row of the test matrix
sentiment_prediction_mlp = sentiment_classifier_mlp.predict(testing_X_sentiments)
print(sentiment_prediction_mlp)

In [None]:
# Append Sentiments results
confusion_matrix_output = confusion_matrix(
    y_true=testing_dataset[:, 2], y_pred=sentiment_prediction_mlp)
class_report = classification_report(
    y_true=testing_dataset[:, 2], y_pred=sentiment_prediction_mlp, zero_division=0)

# "a" appends below the emotions section; `with` guarantees the file is closed.
with open("performance_2.3.3", "a") as performance_file:
    performance_file.write(
        "-----Sentiments classification (Multi layer perceptron)-----\n")
    # get_params() lists the estimator's hyper-parameter values
    # (n_features_in_ is only the number of input features).
    performance_file.write(
        f"Sentiments hyperparameters = {sentiment_classifier_mlp.get_params()}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")

* 2.3.4. □ (3pts) **Top-MNB:** a better performing Multinomial Naive Bayes Classifier found using `GridSearchCV`.
The gridsearch will allow you to find the best combination of hyper-parameters, as determined
by the evaluation function that you have determined in step 1.3. The only hyper-parameter that
you will experiment with is `alpha` (a float) with values 0.5, 0 and 2 other values of your choice.

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate smoothing values explored by the grid search
hyperparam = dict(alpha=[0, 0.5, 1.0, 5.0])

# Top-MNB for emotions: search over alpha starting from the base estimator.
# fit() returns the fitted search object itself.
emo_top_mnb_gridsearch = GridSearchCV(emotions_classifier_mb, hyperparam).fit(
    training_X_emotions, training_dataset[:, 1])

# Predictions on the test matrix from the fitted search
emo_prediction_tmb = emo_top_mnb_gridsearch.predict(testing_X_emotions)
print(emo_prediction_tmb)


In [None]:
# Part 2.4 for Top Multinomial Naive Bayes classification with GridSearchCV (Emotions)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

confusion_matrix_output = confusion_matrix(y_true=testing_dataset[:, 1], y_pred=emo_prediction_tmb)
class_report = classification_report(y_true=testing_dataset[:, 1], y_pred=emo_prediction_tmb, zero_division=0)

# "w" starts a fresh report file; the `with` block closes it even if a write fails.
with open("performance_2.3.4", "w") as performance_file:
    performance_file.write("-----Emotions classification (Top Multinomial Naive Bayes with GridSearchCV)-----\n")
    # best_params_ is the hyper-parameter combination selected by the search
    # (n_features_in_ is only the number of input features).
    performance_file.write(f"Emotions hyperparamenters = {emo_top_mnb_gridsearch.best_params_}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write("----------------------------------------------------------\n\n")

In [None]:
# Top-MNB for sentiments: grid search over `hyperparam` starting from the base
# sentiment estimator.  fit() returns the fitted search object itself.
sen_top_mnb_gridsearch = GridSearchCV(sentiment_classifier_mb, hyperparam).fit(
    training_X_sentiments, training_dataset[:, 2])

# Predictions on the test matrix from the fitted search
sen_prediction_tmb = sen_top_mnb_gridsearch.predict(testing_X_sentiments)
print(sen_prediction_tmb)

In [None]:
# Part 2.4 for Top Multinomial Naive Bayes classification with GridSearchCV (Sentiment)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

confusion_matrix_output = confusion_matrix(y_true=testing_dataset[:, 2], y_pred=sen_prediction_tmb)
class_report = classification_report(y_true=testing_dataset[:, 2], y_pred=sen_prediction_tmb, zero_division=0)

# "a" appends below the emotions section; `with` guarantees the file is closed.
with open("performance_2.3.4", "a") as performance_file:
    performance_file.write("-----Sentiment classification (Top Multinomial Naive Bayes with GridSearchCV)-----\n")
    # best_params_ is the hyper-parameter combination selected by the search
    # (n_features_in_ is only the number of input features).
    performance_file.write(f"Sentiment hyperparamenters = {sen_top_mnb_gridsearch.best_params_}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write("----------------------------------------------------------\n\n")

* 2.3.5. □ (3pts) **Top-DT:** a better performing Decision Tree found using `GridSearchCV.` The hyperparameters
that you will experiment with are:
  * `criterion:` gini or entropy
  * `max_depth:` 2 different values of your choice
  * `min_samples_split:` 3 different values of your choice

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: split criterion, two depth limits, three minimum split sizes
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[2, 4],
    min_samples_split=[2, 5, 10],
)

# Top-DT for emotions: grid search starting from the base tree, fitted on the
# emotion column (index 1).  fit() returns the fitted search object itself.
grid_search_emotion_classifier = GridSearchCV(
    emotions_classifier_dt, param_grid).fit(
        training_X_emotions, training_dataset[:, 1])

# Predictions on the test matrix from the fitted search
grid_search_emotion_predict = grid_search_emotion_classifier.predict(
    testing_X_emotions)
print(grid_search_emotion_predict)

In [None]:
# Part 2.4 for gridsearch Dt classification
confusion_matrix_output = confusion_matrix(
    y_true=testing_dataset[:, 1], y_pred=grid_search_emotion_predict)
class_report = classification_report(
    y_true=testing_dataset[:, 1], y_pred=grid_search_emotion_predict, zero_division=0)

# "w" starts a fresh report file; the `with` block closes it even if a write fails.
with open("performance_2.3.5", "w") as performance_file:
    performance_file.write("-----Emotions classification (Grid Search Decision Tree)-----\n")
    # best_params_ is the hyper-parameter combination selected by the search
    # (n_features_in_ is only the number of input features).
    performance_file.write(
        f"Emotions hyperparameters = {grid_search_emotion_classifier.best_params_}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")

In [None]:
# Top-DT for sentiments: grid search over `param_grid` starting from the base
# sentiment tree, fitted on the sentiment column (index 2).
# fit() returns the fitted search object itself.
grid_search_sentiment_classifier = GridSearchCV(
    sentiment_classifier_dt, param_grid).fit(
        training_X_sentiments, training_dataset[:, 2])

# Predictions on the test matrix from the fitted search
grid_search_sentiment_predict = grid_search_sentiment_classifier.predict(
    testing_X_sentiments)
print(grid_search_sentiment_predict)

In [None]:
# Append Sentiments results
confusion_matrix_output = confusion_matrix(
    y_true=testing_dataset[:, 2], y_pred=grid_search_sentiment_predict)
class_report = classification_report(
    y_true=testing_dataset[:, 2], y_pred=grid_search_sentiment_predict, zero_division=0)

# "a" appends below the emotions section; `with` guarantees the file is closed.
with open("performance_2.3.5", "a") as performance_file:
    performance_file.write(
        "-----Sentiments classification (Grid Search Decision Tree)-----\n")
    # best_params_ is the hyper-parameter combination selected by the search
    # (n_features_in_ is only the number of input features).
    performance_file.write(
        f"Sentiments hyperparameters = {grid_search_sentiment_classifier.best_params_}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")

* 2.3.6. □ (3pts) **Top-MLP:** a better performing Multi-Layered Perceptron found using GridSearchCV.
The hyper-parameters that you will experiment with are:
    * `activation:` sigmoid, tanh, relu and identity
    * 2 network architectures of your choice: for eg, 2 hidden layers with 30+50 nodes and 3 hidden
layers with 10 + 10 + 10
    * `solver:` Adam and stochastic gradient descent

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

# Hyper-parameters explored by the grid search.
#  * MLPClassifier names the sigmoid activation 'logistic', and solver names
#    are lower-case ('adam'); 'sigmoid' / 'Adam' are rejected by the estimator,
#    so every candidate using them would fail to fit.
#  * hidden_layer_sizes takes one tuple per architecture: two hidden layers of
#    30 and 50 nodes, and three hidden layers of 10 nodes each.  A bare int
#    such as 2 means a single hidden layer with 2 nodes.
hyperparam = {'activation': ['logistic', 'tanh', 'relu', 'identity'],
              'hidden_layer_sizes': [(30, 50), (10, 10, 10)],
              'solver': ['adam', 'sgd']}

emo_top_mlp_gridsearch = GridSearchCV(emotions_classifier_mlp, param_grid=hyperparam)
emo_top_mlp_gridsearch.fit(X=training_X_emotions, y=training_dataset[:, 1])
emo_prediction_tmlp = emo_top_mlp_gridsearch.predict(X=testing_X_emotions)
print(emo_prediction_tmlp)


In [None]:
# Part 2.4 for Top Multi-Layered Perceptron classification with GridSearchCV (Emotions)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

confusion_matrix_output = confusion_matrix(y_true=testing_dataset[:, 1], y_pred=emo_prediction_tmlp)
class_report = classification_report(y_true=testing_dataset[:, 1], y_pred=emo_prediction_tmlp, zero_division=0)

# "w" starts a fresh report file; the `with` block closes it even if a write fails.
with open("performance_2.3.6", "w") as performance_file:
    performance_file.write("-----Emotions classification (Top Multi-Layered Percentron with GridSearchCV)-----\n")
    # best_params_ is the hyper-parameter combination selected by the search
    # (n_features_in_ is only the number of input features).
    performance_file.write(f"Emotions hyperparamenters = {emo_top_mlp_gridsearch.best_params_}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write("----------------------------------------------------------\n\n")

In [None]:
# Top-MLP for sentiments.  Like every other sentiment model in this notebook,
# it starts from the sentiment estimator and uses the sentiment feature
# matrices (the emotion estimator and matrices were used here by mistake),
# and it prints the sentiment predictions rather than the emotion ones.
sen_top_mlp_gridsearch = GridSearchCV(sentiment_classifier_mlp, param_grid=hyperparam)
sen_top_mlp_gridsearch.fit(X=training_X_sentiments, y=training_dataset[:, 2])
sen_prediction_tmlp = sen_top_mlp_gridsearch.predict(X=testing_X_sentiments)
print(sen_prediction_tmlp)

In [None]:
# Part 2.4 for Top Multi-Layered Perceptron classification with GridSearchCV (Sentiment)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

confusion_matrix_output = confusion_matrix(y_true=testing_dataset[:, 2], y_pred=sen_prediction_tmlp)
class_report = classification_report(y_true=testing_dataset[:, 2], y_pred=sen_prediction_tmlp, zero_division=0)

# "a" appends below the emotions section; `with` guarantees the file is closed.
with open("performance_2.3.6", "a") as performance_file:
    performance_file.write("-----Sentiment classification (Top Multi-Layered Percentron with GridSearchCV)-----\n")
    # best_params_ is the hyper-parameter combination selected by the search
    # (n_features_in_ is only the number of input features).
    performance_file.write(f"Sentiment hyperparamenters = {sen_top_mlp_gridsearch.best_params_}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write("----------------------------------------------------------\n\n")

2.4. □ (5pts) For each of the 6 classifiers above and each of the classification tasks (emotion or sentiment),
produce and save the following information in a file called `performance`:
* a string clearly describing the model (e.g. the model name + hyper-parameter values) and the
classification task (emotion or sentiment)
* the confusion matrix – use `metrics.confusion_matrix`
* the precision, recall, and F1-measure for each class, and the accuracy, macro-average F1 and weighted-average F1 of the model – use `metrics.classification_report`

2.5. □ (7.5pts) **Do your own exploration:** Do only one of the following, depending on your own interest:
* Use tf-idf instead of word frequencies and redo all substeps of 2.3 above – you can use `TfidfTransformer`
for this. Display the results of this experiment.
* Remove stop words and redo all substeps of 2.3 above – you can use the parameter of `CountVectorizer`
for this. Display the results of this experiment.
* Play with `train_test_split` in order to have different splits of 80% training, 20% test sets and
different sizes of training sets and redo all substeps of 2.3 above. Show and explain how the
performance of your models varies depending on which training/test sets are used.

### Option chosen: play with train_test_split (50%:50%)
Step 1: change the training and test set sizes to 50% <br><br>

<b>Hypothesis:
we expect this to have a net negative effect on performance,
since the training set shrinks from 80% to 50% of the data (30 percentage points less training material).
Size of original training set = 137456 Size of original testing set = 34364
Size of new training set = 85910 Size of new testing set = 85910</b>

In [None]:
# Split the dataset
from sklearn.model_selection import train_test_split

tr_size = 0.5 # training size percentage
te_size = 0.5 # testing size percentage

# A single call applies the SAME shuffled indices to the raw rows and to both
# feature matrices, so features and labels stay paired row by row.  Separate
# calls shuffle independently and pair features with the wrong labels.
# The fixed random_state makes the split reproducible across runs.
(new_training_dataset, new_testing_dataset,
 new_training_X_emotions, new_testing_X_emotions,
 new_training_X_sentiments, new_testing_X_sentiments) = train_test_split(
    numpy_dataset, X_emotions, X_sentiments,
    train_size=tr_size, test_size=te_size, random_state=42)

# Print the size of both datasets
print("Size of training set =", new_training_dataset.shape[0])
print("Size of testing set =", new_testing_dataset.shape[0])


2.5 - 2.3.1 Redo Base-MNB

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Base-MNB for emotions on the 50/50 split: default parameters, emotion
# column (index 1) as target.  fit() returns the fitted estimator itself.
new_emotions_classifier_mb = MultinomialNB().fit(
    new_training_X_emotions, new_training_dataset[:, 1])

# Predicted emotion for every row of the new test matrix
new_emotion_prediction_mb = new_emotions_classifier_mb.predict(
    new_testing_X_emotions)
print(new_emotion_prediction_mb)

In [None]:
# Documenting Multinomial classification (Emotions & Sentiment)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

confusion_matrix_output = confusion_matrix(
    y_true=new_testing_dataset[:, 1], y_pred=new_emotion_prediction_mb)
class_report = classification_report(
    y_true=new_testing_dataset[:, 1], y_pred=new_emotion_prediction_mb, zero_division=0)

# This is the first section of the 2.5 report, so open with "w": appending
# here would stack duplicate reports every time the notebook is re-run.
with open("performance_2.5", "w") as performance_file:
    performance_file.write(
        "-----Emotions classification (Multinomial Naive Bayes)-----\n")
    # get_params() lists the estimator's hyper-parameter values
    # (n_features_in_ is only the number of input features).
    performance_file.write(
        f"Emotions hyperparameters = {new_emotions_classifier_mb.get_params()}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")

In [None]:
# Base-MNB for sentiments on the 50/50 split: default parameters, sentiment
# column (index 2) as target.  fit() returns the fitted estimator itself.
new_sentiment_classifier_mb = MultinomialNB().fit(
    new_training_X_sentiments, new_training_dataset[:, 2])

# Predicted sentiment for every row of the new test matrix
new_sentiment_prediction_mb = new_sentiment_classifier_mb.predict(
    new_testing_X_sentiments)
print(new_sentiment_prediction_mb)

In [None]:
# Append Sentiments results to a new performance document
confusion_matrix_output = confusion_matrix(
    y_true=new_testing_dataset[:, 2], y_pred=new_sentiment_prediction_mb)
class_report = classification_report(
    y_true=new_testing_dataset[:, 2], y_pred=new_sentiment_prediction_mb, zero_division=0)

# "a" appends to the 2.5 report; `with` guarantees the file is closed.
with open("performance_2.5", "a") as performance_file:
    performance_file.write(
        "-----Sentiments classification (Multinomial Naive Bayes)-----\n")
    # get_params() lists the estimator's hyper-parameter values
    # (n_features_in_ is only the number of input features).
    performance_file.write(
        f"Sentiments hyperparameters = {new_sentiment_classifier_mb.get_params()}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")

2.5 - 2.3.2 Redo Base-DT

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Base-DT for emotions on the 50/50 split: default parameters, emotion
# column (index 1) as target.  fit() returns the fitted estimator itself.
new_emotions_classifier_dt = DecisionTreeClassifier().fit(
    new_training_X_emotions, new_training_dataset[:, 1])

# Predicted emotion for every row of the new test matrix
new_emotion_prediction_dt = new_emotions_classifier_dt.predict(
    new_testing_X_emotions)
print(new_emotion_prediction_dt)

In [None]:
# Documenting DecisionTree classification (Emotions & Sentiment)
confusion_matrix_output = confusion_matrix(
    y_true=new_testing_dataset[:, 1], y_pred=new_emotion_prediction_dt)
class_report = classification_report(
    y_true=new_testing_dataset[:, 1], y_pred=new_emotion_prediction_dt, zero_division=0)

# "a" appends to the 2.5 report; `with` guarantees the file is closed.
with open("performance_2.5", "a") as performance_file:
    performance_file.write("-----Emotions classification (Decision Tree)-----\n")
    # get_params() lists the estimator's hyper-parameter values
    # (n_features_in_ is only the number of input features).
    performance_file.write(
        f"Emotions hyperparameters = {new_emotions_classifier_dt.get_params()}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")

In [None]:
# Base-DT for sentiments on the 50/50 split: default parameters, sentiment
# column (index 2) as target.  fit() returns the fitted estimator itself.
new_sentiment_classifier_dt = DecisionTreeClassifier().fit(
    new_training_X_sentiments, new_training_dataset[:, 2])

# Predicted sentiment for every row of the new test matrix
new_sentiment_prediction_dt = new_sentiment_classifier_dt.predict(
    new_testing_X_sentiments)
print(new_sentiment_prediction_dt)

In [None]:
# Append Sentiments results
confusion_matrix_output = confusion_matrix(
    y_true=new_testing_dataset[:, 2], y_pred=new_sentiment_prediction_dt)
class_report = classification_report(
    y_true=new_testing_dataset[:, 2], y_pred=new_sentiment_prediction_dt, zero_division=0)

# "a" appends to the 2.5 report; `with` guarantees the file is closed.
with open("performance_2.5", "a") as performance_file:
    performance_file.write(
        "-----Sentiments classification (Decision Tree)-----\n")
    # get_params() lists the estimator's hyper-parameter values
    # (n_features_in_ is only the number of input features).
    performance_file.write(
        f"Sentiments hyperparameters = {new_sentiment_classifier_dt.get_params()}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")

2.5 - 2.3.3 Redo Base-MLP

In [None]:
# import MLPClassifier
from sklearn.neural_network import MLPClassifier

# Create the object classifiers for emotions
new_emotions_classifier_mlp = MLPClassifier(verbose=True,max_iter=1)

# Fit on the NEW 50/50 training split.  The original 80/20 matrices were used
# here by mistake, which meant this experiment did not test the new split and
# the model had been trained on rows that also appear in the new test set.
new_emotions_classifier_mlp.fit(X=new_training_X_emotions, y=new_training_dataset[:, 1])

# Make predictions with the new testing matrix as X
new_emotion_prediction_mlp = new_emotions_classifier_mlp.predict(X=new_testing_X_emotions)
print(new_emotion_prediction_mlp)

In [None]:
# Documenting MLPClassifier classification (Emotions & Sentiment)
confusion_matrix_output = confusion_matrix(
    y_true=new_testing_dataset[:, 1], y_pred=new_emotion_prediction_mlp)
class_report = classification_report(
    y_true=new_testing_dataset[:, 1], y_pred=new_emotion_prediction_mlp, zero_division=0)

# "a" appends to the 2.5 report; `with` guarantees the file is closed.
with open("performance_2.5", "a") as performance_file:
    performance_file.write("-----Emotions classification (multi layer perceptron)-----\n")
    # get_params() lists the estimator's hyper-parameter values
    # (n_features_in_ is only the number of input features).
    performance_file.write(
        f"Emotions hyperparameters = {new_emotions_classifier_mlp.get_params()}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")

In [None]:
# Base-MLP for sentiments on the 50/50 split, sentiment column (index 2) as
# target; verbose=True prints training progress.
# NOTE(review): max_iter=1 is not the library default -- presumably chosen to
# keep the run time manageable; confirm.
new_sentiment_classifier_mlp = MLPClassifier(verbose=True, max_iter=1).fit(
    new_training_X_sentiments, new_training_dataset[:, 2])

# Predicted sentiment for every row of the new test matrix
new_sentiment_prediction_mlp = new_sentiment_classifier_mlp.predict(
    new_testing_X_sentiments)
print(new_sentiment_prediction_mlp)

In [None]:
# Append Sentiments results
confusion_matrix_output = confusion_matrix(
    y_true=new_testing_dataset[:, 2], y_pred=new_sentiment_prediction_mlp)
class_report = classification_report(
    y_true=new_testing_dataset[:, 2], y_pred=new_sentiment_prediction_mlp, zero_division=0)

# "a" appends to the 2.5 report; `with` guarantees the file is closed.
with open("performance_2.5", "a") as performance_file:
    performance_file.write(
        "-----Sentiments classification (Multi layer perceptron)-----\n")
    # get_params() lists the estimator's hyper-parameter values
    # (n_features_in_ is only the number of input features).
    performance_file.write(
        f"Sentiments hyperparameters = {new_sentiment_classifier_mlp.get_params()}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")

2.5 - 2.3.4 Redo Top-MNB

In [None]:
from sklearn.model_selection import GridSearchCV

# Smoothing values explored by the grid search
hyperparam = {
    'alpha': [0, 0.5, 1.0, 5.0],
}

# Top Multinomial Naive Bayes for emotions (label column 1):
# tune on the training split, then predict the testing split.
new_emo_top_mnb_gridsearch = GridSearchCV(
    new_emotions_classifier_mb, param_grid=hyperparam,
).fit(X=new_training_X_emotions, y=new_training_dataset[:, 1])
new_emo_prediction_tmb = new_emo_top_mnb_gridsearch.predict(
    X=new_testing_X_emotions)
print(new_emo_prediction_tmb)


In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Documenting Top Multinomial Naive Bayes classification with GridSearchCV (Emotions)

# Compute both metrics first so a failure cannot leave a half-written section.
confusion_matrix_output = confusion_matrix(
    y_true=new_testing_dataset[:, 1], y_pred=new_emo_prediction_tmb)
class_report = classification_report(
    y_true=new_testing_dataset[:, 1], y_pred=new_emo_prediction_tmb, zero_division=0)

# Append Emotions results; the context manager closes the file even on error.
with open("performance_2.5", "a") as performance_file:
    performance_file.write(
        "-----Emotions classification (Top Multinomial Naive Bayes with GridSearchCV)-----\n")
    # best_params_ holds the hyperparameters the search selected
    # (n_features_in_ is only the input feature count).
    performance_file.write(
        f"Emotions hyperparamenters = {new_emo_top_mnb_gridsearch.best_params_}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")

In [None]:
from sklearn.model_selection import GridSearchCV

# Smoothing values explored by the grid search
hyperparam = {
    'alpha': [0, 0.5, 1.0, 5.0],
}

# Top Multinomial Naive Bayes for sentiments (label column 2):
# tune on the training split, then predict the testing split.
new_sen_top_mnb_gridsearch = GridSearchCV(
    new_sentiment_classifier_mb, param_grid=hyperparam,
).fit(X=new_training_X_sentiments, y=new_training_dataset[:, 2])
new_sen_prediction_tmb = new_sen_top_mnb_gridsearch.predict(
    X=new_testing_X_sentiments)
print(new_sen_prediction_tmb)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Documenting Top Multinomial Naive Bayes classification with GridSearchCV (Sentiment)

# Compute both metrics first so a failure cannot leave a half-written section.
confusion_matrix_output = confusion_matrix(
    y_true=new_testing_dataset[:, 2], y_pred=new_sen_prediction_tmb)
class_report = classification_report(
    y_true=new_testing_dataset[:, 2], y_pred=new_sen_prediction_tmb, zero_division=0)

# Append Sentiment results; the context manager closes the file even on error.
with open("performance_2.5", "a") as performance_file:
    performance_file.write(
        "-----Sentiment classification (Top Multinomial Naive Bayes with GridSearchCV)-----\n")
    # best_params_ holds the hyperparameters the search selected
    # (n_features_in_ is only the input feature count).
    performance_file.write(
        f"Sentiment hyperparamenters = {new_sen_top_mnb_gridsearch.best_params_}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")

2.5 - 2.3.5 Redo Top-DT

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for the decision tree grid search
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 4],
    'min_samples_split': [2, 5, 10],
}

# Top-DT for emotions (label column 1): fit() returns the search object itself,
# so the grid search is built and trained in one expression.
new_grid_search_emotion_classifier = GridSearchCV(
    new_emotions_classifier_dt, param_grid,
).fit(X=new_training_X_emotions, y=new_training_dataset[:, 1])

# Predict the emotion of every post in the testing split
new_grid_search_emotion_predict = new_grid_search_emotion_classifier.predict(
    X=new_testing_X_emotions)
print(new_grid_search_emotion_predict)

In [None]:
# Documenting the grid-search Decision Tree classification (Emotions)

# Compute both metrics first so a failure cannot leave a half-written section.
confusion_matrix_output = confusion_matrix(
    y_true=new_testing_dataset[:, 1], y_pred=new_grid_search_emotion_predict)
class_report = classification_report(
    y_true=new_testing_dataset[:, 1], y_pred=new_grid_search_emotion_predict, zero_division=0)

# Append Emotions results; the context manager closes the file even on error.
with open("performance_2.5", "a") as performance_file:
    performance_file.write("-----Emotions classification (Grid Search Decision Tree)-----\n")
    # best_params_ holds the hyperparameters the search selected
    # (n_features_in_ is only the input feature count).
    performance_file.write(
        f"Emotions hyperparameters = {new_grid_search_emotion_classifier.best_params_}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")

In [None]:
# Top-DT for sentiments (label column 2), searching the param_grid defined in
# the emotion Top-DT cell. fit() returns the search object itself.
new_grid_search_sentiment_classifier = GridSearchCV(
    new_sentiment_classifier_dt, param_grid,
).fit(X=new_training_X_sentiments, y=new_training_dataset[:, 2])

# Predict the sentiment of every post in the testing split
new_grid_search_sentiment_predict = new_grid_search_sentiment_classifier.predict(
    X=new_testing_X_sentiments)
print(new_grid_search_sentiment_predict)

In [None]:
# Documenting the grid-search Decision Tree classification (Sentiments)

# Compute both metrics first so a failure cannot leave a half-written section.
confusion_matrix_output = confusion_matrix(
    y_true=new_testing_dataset[:, 2], y_pred=new_grid_search_sentiment_predict)
class_report = classification_report(
    y_true=new_testing_dataset[:, 2], y_pred=new_grid_search_sentiment_predict, zero_division=0)

# Append Sentiments results; the context manager closes the file even on error.
with open("performance_2.5", "a") as performance_file:
    performance_file.write(
        "-----Sentiments classification (Grid Search Decision Tree)-----\n")
    # best_params_ holds the hyperparameters the search selected
    # (n_features_in_ is only the input feature count).
    performance_file.write(
        f"Sentiments hyperparameters = {new_grid_search_sentiment_classifier.best_params_}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")

2.5 - 2.3.6 Redo Top-MLP

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

# Hyperparameters explored by the grid search.
# MLPClassifier names the sigmoid activation 'logistic' and its solver names are
# lower-case; 'sigmoid' and 'Adam' are rejected at fit time, so every candidate
# containing them failed and was never really evaluated.
hyperparam = {'activation': ['logistic', 'tanh', 'relu', 'identity'],
              'hidden_layer_sizes': [2, 3],
              'solver': ['adam', 'sgd']}
# Create the grid search for emotions (GridSearchCV clones the estimator it is given)
new_emo_top_mlp_gridsearch = GridSearchCV(emotions_classifier_mlp, param_grid=hyperparam)
# Fit the model with new_training_X as X and the emotion column of new_training_dataset as y
new_emo_top_mlp_gridsearch.fit(X=new_training_X_emotions, y=new_training_dataset[:, 1])
# Make predictions with new_testing_X as X
new_emo_prediction_tmlp = new_emo_top_mlp_gridsearch.predict(X=new_testing_X_emotions)
print(new_emo_prediction_tmlp)



In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Documenting Top Multi-Layered Perceptron classification with GridSearchCV (Emotions)

# Compute both metrics first so a failure cannot leave a half-written section.
confusion_matrix_output = confusion_matrix(
    y_true=new_testing_dataset[:, 1], y_pred=new_emo_prediction_tmlp)
class_report = classification_report(
    y_true=new_testing_dataset[:, 1], y_pred=new_emo_prediction_tmlp, zero_division=0)

# Append Emotions results; the context manager closes the file even on error.
with open("performance_2.5", "a") as performance_file:
    performance_file.write(
        "-----Emotions classification (Top Multi-Layered Percentron with GridSearchCV)-----\n")
    # best_params_ holds the hyperparameters the search selected
    # (n_features_in_ is only the input feature count).
    performance_file.write(
        f"Emotions hyperparamenters = {new_emo_top_mlp_gridsearch.best_params_}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

# Hyperparameters explored by the grid search.
# MLPClassifier names the sigmoid activation 'logistic' and its solver names are
# lower-case; 'sigmoid' and 'Adam' are rejected at fit time, so every candidate
# containing them failed and was never really evaluated.
hyperparam = {'activation': ['logistic', 'tanh', 'relu', 'identity'],
              'hidden_layer_sizes': [2, 3],
              'solver': ['adam', 'sgd']}
# Create the grid search for sentiments. GridSearchCV clones the estimator it is
# given, so the emotion-named base MLP only supplies the non-searched settings.
new_sen_top_mlp_gridsearch = GridSearchCV(emotions_classifier_mlp, param_grid=hyperparam)
# Fit on the SENTIMENT feature matrix with the sentiment column as y
# (the emotion feature matrix was used here before, unlike the other sentiment cells)
new_sen_top_mlp_gridsearch.fit(X=new_training_X_sentiments, y=new_training_dataset[:, 2])
# Make predictions on the matching sentiment test features
new_sen_prediction_tmlp = new_sen_top_mlp_gridsearch.predict(X=new_testing_X_sentiments)
# Show the sentiment predictions (the emotion predictions were printed before)
print(new_sen_prediction_tmlp)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Documenting Top Multi-Layered Perceptron classification with GridSearchCV (Sentiment)

# Compute both metrics first so a failure cannot leave a half-written section.
confusion_matrix_output = confusion_matrix(
    y_true=new_testing_dataset[:, 2], y_pred=new_sen_prediction_tmlp)
class_report = classification_report(
    y_true=new_testing_dataset[:, 2], y_pred=new_sen_prediction_tmlp, zero_division=0)

# Append Sentiment results; the context manager closes the file even on error.
with open("performance_2.5", "a") as performance_file:
    performance_file.write(
        "-----Sentiment classification (Top Multi-Layered Percentron with GridSearchCV)-----\n")
    # best_params_ holds the hyperparameters the search selected
    # (n_features_in_ is only the input feature count).
    performance_file.write(
        f"Sentiment hyperparamenters = {new_sen_top_mlp_gridsearch.best_params_}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")

### 2.5 - Verdict

<hr>
We played with train_test_split using 2 different percentages.
<br><br>
The first split we used was 50% for the training set and 50% for the testing set.<br>
This gave us a split of:<br>
Size of new training set = 85910<br> Size of new testing set = 85910<br>
We expected that the 30% drop in data for the training set would result in a less accurate reading when doing the testing, but this was just not the case. As seen in the "new_performance" file that is produced when running the redone train_test_split, our hypothesis was wrong. The accuracy, precision, recall and f1-score barely changed, if at all. This result raised more questions than it answered. A possible explanation for these unusual results is that the dataset is not well balanced.
<br><br>
The second split we used was 20% for the training set and 80% for the testing set.<br>
This gave us a split of:<br>
Size of new training set = 34364<br> Size of new testing set = 137456<br>
We decided to run the inverse of the original split since the 50:50 split did not yield the expected test results. This new split yielded some interesting results as well. As seen in the "new_performance" file, the precision was overall lower than for the original and 50:50 splits, which is reasonable considering the training set is only 20% of the original dataset size. Other values, such as the f1-score and precision, did not provide substantial evidence that changing the split for this specific dataset would yield predictable results.
<hr>

## 3. Embeddings as Features (20pts)

3.1. □ (0pts) Use `gensim.downloader.load` to load the `word2vec-google-news-300` pretrained embedding model.

In [None]:
# Incase its the first time running
# import nltk

# nltk.download('punkt')


In [None]:
import gensim.downloader
# Load the pretrained "word2vec-google-news-300" embedding model through
# gensim's downloader API; later cells use it for mean vectors and hit rates.
google_model = gensim.downloader.load("word2vec-google-news-300")


3.2. □ (2pts) Use the `tokenizer` from `nltk` to extract words from the Reddit posts. Display the number
of tokens in the training set.

In [None]:
import nltk.tokenize


# Tokenise every entry of `phrases` and of `emotions_and_phrases` with NLTK's
# word tokenizer; each result is a list of token lists in the input order.
phrase_token_vector = list(map(nltk.tokenize.word_tokenize, phrases))
emotions_and_phrases_token_vector = list(
    map(nltk.tokenize.word_tokenize, emotions_and_phrases))


In [None]:
# Total number of tokens across all tokenised phrases
token_count = sum(len(tokens) for tokens in phrase_token_vector)

print("Number of Tokens in phrases=", token_count)


# Total number of tokens across all tokenised "phrases and emotions" entries.
# The label now names this list; it was a copy of the "phrases" label above.
token_count = sum(len(tokens) for tokens in emotions_and_phrases_token_vector)

print("Number of Tokens in phrases and emotions=", token_count)


3.3. □ (5pts) Compute the embedding of a Reddit post as the <u>average</u> of the embeddings of its words. If
a word has no embedding in Word2Vec, skip it.

In [None]:
from statistics import mean
from gensim.models import Word2Vec, KeyedVectors


# One averaged Word2Vec vector per tokenised post. get_mean_vector ignores
# tokens missing from the vocabulary by default (ignore_missing=True).
mean_embedding_list_emotions = [
    google_model.get_mean_vector(tokens) for tokens in phrase_token_vector
]
mean_embedding_list_sentiments = [
    google_model.get_mean_vector(tokens) for tokens in emotions_and_phrases_token_vector
]

# Spot-check: show the tokens and averaged embedding of the entry at index i
i = 0
print(
    f"The mean embedding for phrase: '{phrase_token_vector[i]}'\n = \n{mean_embedding_list_emotions[i]}")
print(
    f"The mean embedding for phrase and emotion: '{emotions_and_phrases_token_vector[i]}'\n = \n{mean_embedding_list_sentiments[i]}")


3.4. □ (3pts) Compute and display the overall hit rates of the training and test sets (i.e. the % of words
in the Reddit posts for which an embedding is found in Word2Vec).


In [None]:
# Split tokens, embeddings AND labels in a single train_test_split call.
# Each unseeded call draws its own shuffle, so splitting the embeddings
# separately left row k of training_X_*_emb pointing at a different post than
# row k of training_dataset, and the classifiers were fitted on mismatched
# (features, label) pairs. One call applies the same shuffle to every input.
# This rebinds training_dataset / testing_dataset so the cells below stay aligned.
# NOTE(review): assumes numpy_dataset rows are in the same order as `phrases`
# and `emotions_and_phrases` - TODO confirm against the cell that builds them.
(training_phrase_token, testing_phrase_token,
 training_emotions_and_phrases_token, testing_emotions_and_phrases_token,
 training_X_emotions_emb, testing_X_emotions_emb,
 training_X_sentiments_emb, testing_X_sentiments_emb,
 training_dataset, testing_dataset) = train_test_split(
    phrase_token_vector,                # tokens, used for the hit rate
    emotions_and_phrases_token_vector,  # tokens, used for the hit rate
    mean_embedding_list_emotions,       # X values for the emotion models
    mean_embedding_list_sentiments,     # X values for the sentiment models
    numpy_dataset,                      # rows carrying the labels (columns 1 and 2)
    train_size=0.8, test_size=0.2)


In [None]:
# Hit rate = percentage of tokens that have an embedding in google_model.
# One loop over the four token collections replaces four copy-pasted blocks.
hit_rate_inputs = (
    ("training set (phrases)", training_phrase_token),
    ("testing set (phrases)", testing_phrase_token),
    ("training set (phrases and emotions)", training_emotions_and_phrases_token),
    ("testing set (phrases and emotions)", testing_emotions_and_phrases_token),
)

for split_label, token_lists in hit_rate_inputs:
    num_of_hits = 0
    total = 0
    for tokens in token_lists:
        total += len(tokens)
        # A vocabulary membership test replaces the try/except lookup, so
        # unrelated errors are no longer silently swallowed.
        num_of_hits += sum(word in google_model.key_to_index for word in tokens)

    # Guard against an empty split instead of raising ZeroDivisionError
    hit_rate = num_of_hits / total * 100 if total else 0.0
    print(f"Hit Rate in {split_label} = {hit_rate}%")


3.5. □ (3pts) **Train a Base-MLP:** a Multi-Layered Perceptron (`neural_network.MLPClassifier`) with
the default parameters.

In [None]:
from sklearn.neural_network import MLPClassifier


# Base-MLP (default parameters) for emotions on the averaged post embeddings.
# fit() returns the estimator itself, so construction and training are chained.
emotions_classifier_mlp_embedding = MLPClassifier().fit(
    X=training_X_emotions_emb,
    y=training_dataset[:, 1],
)

# Predict the emotion of every post in the testing split and show the labels
emotion_prediction_mlp_embedding = emotions_classifier_mlp_embedding.predict(
    X=testing_X_emotions_emb)
print(emotion_prediction_mlp_embedding)

In [None]:
# Write the emotion Base-MLP (word2vec embeddings) results; "w" starts a fresh 3.5 file.

# Compute both metrics first so a failure cannot leave a half-written section.
confusion_matrix_output = confusion_matrix(
    y_true=testing_dataset[:, 1], y_pred=emotion_prediction_mlp_embedding)
class_report = classification_report(
    y_true=testing_dataset[:, 1], y_pred=emotion_prediction_mlp_embedding, zero_division=0)

# The context manager closes the file even if one of the writes raises.
with open("performance_3.5", "w") as performance_file:
    performance_file.write(
        "-----Emotions classification (google model) (Multi Layered Perception with word Embeddings)-----\n")
    # NOTE: n_features_in_ is the number of input features seen during fit.
    performance_file.write(
        f"Emotions hyperparameters = {emotions_classifier_mlp_embedding.n_features_in_}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")

In [None]:
# Base-MLP (default parameters) for sentiments on the averaged embeddings.
# fit() returns the estimator itself, so construction and training are chained.
sentiments_classifier_mlp_embedding = MLPClassifier().fit(
    X=training_X_sentiments_emb,
    y=training_dataset[:, 2],
)

# Predict the sentiment of every post in the testing split and show the labels
sentiments_prediction_mlp_embedding = sentiments_classifier_mlp_embedding.predict(
    X=testing_X_sentiments_emb)
print(sentiments_prediction_mlp_embedding)

In [None]:
# Append the sentiment Base-MLP (word2vec embeddings) results to the 3.5 file.

# Compute both metrics first so a failure cannot leave a half-written section.
confusion_matrix_output = confusion_matrix(
    y_true=testing_dataset[:, 2], y_pred=sentiments_prediction_mlp_embedding)
class_report = classification_report(
    y_true=testing_dataset[:, 2], y_pred=sentiments_prediction_mlp_embedding, zero_division=0)

# The context manager closes the file even if one of the writes raises.
with open("performance_3.5", "a") as performance_file:
    performance_file.write(
        "-----Sentiments classification (google model) (Multi Layered Perception with word Embeddings)-----\n")
    # NOTE: n_features_in_ is the number of input features seen during fit.
    performance_file.write(
        f"Sentiments hyperparameters = {sentiments_classifier_mlp_embedding.n_features_in_}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")


3.6. □ (3pts) **Train a Top-MLP:** a better performing Multi-Layered Perceptron found with whatever
hyperparameters you want.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

# Hyperparameters explored by the grid search (also read by the sentiment Top-MLP cell).
# MLPClassifier solver names are lower-case: 'Adam' is rejected at fit time, so
# every candidate using it failed and only the 'sgd' candidates were evaluated.
hyperparam = {'activation': ['tanh', 'relu', 'identity'],
              'hidden_layer_sizes': [2, 3],
              'solver': ['adam', 'sgd']}

# Top-MLP for emotions: tune on the training embeddings, predict the testing embeddings
emotions_classifier_top_mlp_gridsearch_emb = GridSearchCV(emotions_classifier_mlp_embedding, param_grid=hyperparam)
emotions_classifier_top_mlp_gridsearch_emb.fit(X=training_X_emotions_emb, y=training_dataset[:, 1])
emotions_classifier_prediction_tmlp_emb = emotions_classifier_top_mlp_gridsearch_emb.predict(X=testing_X_emotions_emb)
print(emotions_classifier_prediction_tmlp_emb)


In [None]:
# Write the emotion Top-MLP (word2vec embeddings) results; "w" starts a fresh 3.6 file.

# Compute both metrics first so a failure cannot leave a half-written section.
confusion_matrix_output = confusion_matrix(
    y_true=testing_dataset[:, 1], y_pred=emotions_classifier_prediction_tmlp_emb)
class_report = classification_report(
    y_true=testing_dataset[:, 1], y_pred=emotions_classifier_prediction_tmlp_emb, zero_division=0)

# The context manager closes the file even if one of the writes raises.
with open("performance_3.6", "w") as performance_file:
    performance_file.write(
        "-----Emotions classification (google model) (TOP Multi Layered Perception with word Embeddings)-----\n")
    # best_params_ holds the hyperparameters the search selected
    # (n_features_in_ is only the input feature count).
    performance_file.write(
        f"Emotions hyperparameters = {emotions_classifier_top_mlp_gridsearch_emb.best_params_}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")

In [None]:
# Top-MLP for sentiments, searching the `hyperparam` grid defined in the emotion Top-MLP cell
sentiments_classifier_top_mlp_gridsearch_emb = GridSearchCV(sentiments_classifier_mlp_embedding, param_grid=hyperparam)
sentiments_classifier_top_mlp_gridsearch_emb.fit(X=training_X_sentiments_emb, y=training_dataset[:, 2])
# Predict on the SENTIMENT test embeddings, matching the features used in fit()
# (the emotion test embeddings were passed here before)
sentiments_classifier_prediction_tmlp_emb = sentiments_classifier_top_mlp_gridsearch_emb.predict(X=testing_X_sentiments_emb)
print(sentiments_classifier_prediction_tmlp_emb)

In [None]:
# Append the sentiment Top-MLP (word2vec embeddings) results to the 3.6 file.

# Compute both metrics first so a failure cannot leave a half-written section.
confusion_matrix_output = confusion_matrix(
    y_true=testing_dataset[:, 2], y_pred=sentiments_classifier_prediction_tmlp_emb)
class_report = classification_report(
    y_true=testing_dataset[:, 2], y_pred=sentiments_classifier_prediction_tmlp_emb, zero_division=0)

# The context manager closes the file even if one of the writes raises.
with open("performance_3.6", "a") as performance_file:
    performance_file.write(
        "-----Sentiments classification (google model) (TOP Multi Layered Perception with word Embeddings)-----\n")
    # best_params_ holds the hyperparameters the search selected
    # (n_features_in_ is only the input feature count).
    performance_file.write(
        f"Sentiments hyperparameters = {sentiments_classifier_top_mlp_gridsearch_emb.best_params_}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")

3.7. □ (2pts) Display the performance of your classifiers using `metrics.classification_report` and add
these to your `performance` file.

In [None]:
# See code after generating the classifications for both 3.5 and 3.6

3.8. □ (7.5pts) **Do your own exploration:** Rerun your best performing model but with 2 other English
pretrained embedding models and compare the results. Many pre-trained embeddings are available
on line (including in Gensim or at http://vectors.nlpl.eu/repository).

In [None]:
# Two further pretrained embedding models for the 3.8 comparison, loaded through
# gensim's downloader API. The model names indicate 300- and 100-dimensional vectors.
fast_model = gensim.downloader.load("fasttext-wiki-news-subwords-300") 
twitter_model = gensim.downloader.load("glove-twitter-100")

Using the `fasttext-wiki-news-subwords-300` model from the gensim repo https://github.com/RaRe-Technologies/gensim-data

In [None]:
# Repetition of 3.3 to 3.7 using the fast model
# Tokenization has been done in 3.2 so will reuse that code 
# phrase_token_vector = [nltk.tokenize.word_tokenize(i) for i in phrases]
# emotions_and_phrases_token_vector = [nltk.tokenize.word_tokenize(i) for i in emotions_and_phrases]


In [None]:
# Rebuild the averaged post embeddings with the fasttext model (this replaces
# the lists computed with the previous model). get_mean_vector ignores tokens
# missing from the vocabulary by default (ignore_missing=True).
mean_embedding_list_emotions = [
    fast_model.get_mean_vector(tokens) for tokens in phrase_token_vector
]
mean_embedding_list_sentiments = [
    fast_model.get_mean_vector(tokens) for tokens in emotions_and_phrases_token_vector
]


In [None]:
# Split tokens, embeddings AND labels in a single train_test_split call.
# Each unseeded call draws its own shuffle, so splitting the embeddings
# separately left row k of training_X_*_emb pointing at a different post than
# row k of training_dataset, and the classifiers were fitted on mismatched
# (features, label) pairs. One call applies the same shuffle to every input.
# This rebinds training_dataset / testing_dataset so the cells below stay aligned.
# NOTE(review): assumes numpy_dataset rows are in the same order as `phrases`
# and `emotions_and_phrases` - TODO confirm against the cell that builds them.
(training_phrase_token, testing_phrase_token,
 training_emotions_and_phrases_token, testing_emotions_and_phrases_token,
 training_X_emotions_emb, testing_X_emotions_emb,
 training_X_sentiments_emb, testing_X_sentiments_emb,
 training_dataset, testing_dataset) = train_test_split(
    phrase_token_vector,                # tokens, used for the hit rate
    emotions_and_phrases_token_vector,  # tokens, used for the hit rate
    mean_embedding_list_emotions,       # X values for the emotion models
    mean_embedding_list_sentiments,     # X values for the sentiment models
    numpy_dataset,                      # rows carrying the labels (columns 1 and 2)
    train_size=0.8, test_size=0.2)


In [None]:
# Hit rate = percentage of tokens that have an embedding in fast_model.
# One loop over the four token collections replaces four copy-pasted blocks.
hit_rate_inputs = (
    ("training set (phrases)", training_phrase_token),
    ("testing set (phrases)", testing_phrase_token),
    ("training set (phrases and emotions)", training_emotions_and_phrases_token),
    ("testing set (phrases and emotions)", testing_emotions_and_phrases_token),
)

for split_label, token_lists in hit_rate_inputs:
    num_of_hits = 0
    total = 0
    for tokens in token_lists:
        total += len(tokens)
        # A vocabulary membership test replaces the try/except lookup, so
        # unrelated errors are no longer silently swallowed.
        num_of_hits += sum(word in fast_model.key_to_index for word in tokens)

    # Guard against an empty split instead of raising ZeroDivisionError
    hit_rate = num_of_hits / total * 100 if total else 0.0
    print(f"Hit Rate in {split_label} = {hit_rate}%")


In [None]:
# Base-MLP (default parameters) for emotions on the fasttext post embeddings.
# fit() returns the estimator itself, so construction and training are chained.
emotions_classifier_mlp_embedding = MLPClassifier().fit(
    X=training_X_emotions_emb,
    y=training_dataset[:, 1],
)

# Predict the emotion of every post in the testing split and show the labels
emotion_prediction_mlp_embedding = emotions_classifier_mlp_embedding.predict(
    X=testing_X_emotions_emb)
print(emotion_prediction_mlp_embedding)

In [None]:
# Write the emotion MLP (fasttext embeddings) results; "w" starts a fresh 3.8 file.

# Compute both metrics first so a failure cannot leave a half-written section.
confusion_matrix_output = confusion_matrix(
    y_true=testing_dataset[:, 1], y_pred=emotion_prediction_mlp_embedding)
class_report = classification_report(
    y_true=testing_dataset[:, 1], y_pred=emotion_prediction_mlp_embedding, zero_division=0)

# The context manager closes the file even if one of the writes raises.
with open("performance_3.8", "w") as performance_file:
    performance_file.write(
        "-----Emotions classification (fast model) (Multi Layered Perception with word Embeddings)-----\n")
    # NOTE: n_features_in_ is the number of input features seen during fit.
    performance_file.write(
        f"Emotions hyperparameters = {emotions_classifier_mlp_embedding.n_features_in_}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")

In [None]:
# Base-MLP (default parameters) for sentiments on the fasttext embeddings.
# fit() returns the estimator itself, so construction and training are chained.
sentiments_classifier_mlp_embedding = MLPClassifier().fit(
    X=training_X_sentiments_emb,
    y=training_dataset[:, 2],
)

# Predict the sentiment of every post in the testing split and show the labels
sentiments_prediction_mlp_embedding = sentiments_classifier_mlp_embedding.predict(
    X=testing_X_sentiments_emb)
print(sentiments_prediction_mlp_embedding)

In [None]:
# Append the sentiment MLP (fasttext embeddings) results to the 3.8 file.

# Compute both metrics first so a failure cannot leave a half-written section.
confusion_matrix_output = confusion_matrix(
    y_true=testing_dataset[:, 2], y_pred=sentiments_prediction_mlp_embedding)
class_report = classification_report(
    y_true=testing_dataset[:, 2], y_pred=sentiments_prediction_mlp_embedding, zero_division=0)

# The context manager closes the file even if one of the writes raises.
with open("performance_3.8", "a") as performance_file:
    # Header names the fast model, matching the emotion section of this run
    # (it said "concept model", which is not a model used in this notebook section).
    performance_file.write(
        "-----Sentiments classification (fast model) (Multi Layered Perception with word Embeddings)-----\n")
    # NOTE: n_features_in_ is the number of input features seen during fit.
    performance_file.write(
        f"Sentiments hyperparameters = {sentiments_classifier_mlp_embedding.n_features_in_}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")


Using the Twitter (`glove-twitter-100`) Model from the gensim repo https://github.com/RaRe-Technologies/gensim-data

In [None]:
# Repetition of 3.3 to 3.7 using the twitter model
# Tokenization has been done in 3.2 so will reuse that code 
# phrase_token_vector = [nltk.tokenize.word_tokenize(i) for i in phrases]
# emotions_and_phrases_token_vector = [nltk.tokenize.word_tokenize(i) for i in emotions_and_phrases]


In [None]:
# Rebuild the averaged post embeddings with the twitter GloVe model (this
# replaces the lists computed with the previous model). get_mean_vector ignores
# tokens missing from the vocabulary by default (ignore_missing=True).
mean_embedding_list_emotions = [
    twitter_model.get_mean_vector(tokens) for tokens in phrase_token_vector
]
mean_embedding_list_sentiments = [
    twitter_model.get_mean_vector(tokens) for tokens in emotions_and_phrases_token_vector
]


In [None]:
# Split tokens, embeddings AND labels in a single train_test_split call.
# Each unseeded call draws its own shuffle, so splitting the embeddings
# separately left row k of training_X_*_emb pointing at a different post than
# row k of training_dataset, and the classifiers were fitted on mismatched
# (features, label) pairs. One call applies the same shuffle to every input.
# This rebinds training_dataset / testing_dataset so the cells below stay aligned.
# NOTE(review): assumes numpy_dataset rows are in the same order as `phrases`
# and `emotions_and_phrases` - TODO confirm against the cell that builds them.
(training_phrase_token, testing_phrase_token,
 training_emotions_and_phrases_token, testing_emotions_and_phrases_token,
 training_X_emotions_emb, testing_X_emotions_emb,
 training_X_sentiments_emb, testing_X_sentiments_emb,
 training_dataset, testing_dataset) = train_test_split(
    phrase_token_vector,                # tokens, used for the hit rate
    emotions_and_phrases_token_vector,  # tokens, used for the hit rate
    mean_embedding_list_emotions,       # X values for the emotion models
    mean_embedding_list_sentiments,     # X values for the sentiment models
    numpy_dataset,                      # rows carrying the labels (columns 1 and 2)
    train_size=0.8, test_size=0.2)


In [None]:
# Hit rate = percentage of tokens that have an embedding in twitter_model.
# One loop over the four token collections replaces four copy-pasted blocks.
hit_rate_inputs = (
    ("training set (phrases)", training_phrase_token),
    ("testing set (phrases)", testing_phrase_token),
    ("training set (phrases and emotions)", training_emotions_and_phrases_token),
    ("testing set (phrases and emotions)", testing_emotions_and_phrases_token),
)

for split_label, token_lists in hit_rate_inputs:
    num_of_hits = 0
    total = 0
    for tokens in token_lists:
        total += len(tokens)
        # A vocabulary membership test replaces the try/except lookup, so
        # unrelated errors are no longer silently swallowed.
        num_of_hits += sum(word in twitter_model.key_to_index for word in tokens)

    # Guard against an empty split instead of raising ZeroDivisionError
    hit_rate = num_of_hits / total * 100 if total else 0.0
    print(f"Hit Rate in {split_label} = {hit_rate}%")


In [None]:
# Base-MLP (default parameters) for emotions on the twitter GloVe embeddings.
# fit() returns the estimator itself, so construction and training are chained.
emotions_classifier_mlp_embedding = MLPClassifier().fit(
    X=training_X_emotions_emb,
    y=training_dataset[:, 1],
)

# Predict the emotion of every post in the testing split and show the labels
emotion_prediction_mlp_embedding = emotions_classifier_mlp_embedding.predict(
    X=testing_X_emotions_emb)
print(emotion_prediction_mlp_embedding)

In [None]:
# Append the emotion MLP (twitter GloVe embeddings) results to the 3.8 file.

# Compute both metrics first so a failure cannot leave a half-written section.
confusion_matrix_output = confusion_matrix(
    y_true=testing_dataset[:, 1], y_pred=emotion_prediction_mlp_embedding)
class_report = classification_report(
    y_true=testing_dataset[:, 1], y_pred=emotion_prediction_mlp_embedding, zero_division=0)

# The context manager closes the file even if one of the writes raises.
with open("performance_3.8", "a") as performance_file:
    performance_file.write(
        "-----Emotions classification (twitter model) (Multi Layered Perception with word Embeddings)-----\n")
    # NOTE: n_features_in_ is the number of input features seen during fit.
    performance_file.write(
        f"Emotions hyperparameters = {emotions_classifier_mlp_embedding.n_features_in_}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")

In [None]:
# Base-MLP (default parameters) for sentiments on the twitter GloVe embeddings.
# fit() returns the estimator itself, so construction and training are chained.
sentiments_classifier_mlp_embedding = MLPClassifier().fit(
    X=training_X_sentiments_emb,
    y=training_dataset[:, 2],
)

# Predict the sentiment of every post in the testing split and show the labels
sentiments_prediction_mlp_embedding = sentiments_classifier_mlp_embedding.predict(
    X=testing_X_sentiments_emb)
print(sentiments_prediction_mlp_embedding)

In [None]:
# Append the sentiment MLP (twitter GloVe embeddings) results to the 3.8 file.

# Compute both metrics first so a failure cannot leave a half-written section.
confusion_matrix_output = confusion_matrix(
    y_true=testing_dataset[:, 2], y_pred=sentiments_prediction_mlp_embedding)
class_report = classification_report(
    y_true=testing_dataset[:, 2], y_pred=sentiments_prediction_mlp_embedding, zero_division=0)

# The context manager closes the file even if one of the writes raises.
with open("performance_3.8", "a") as performance_file:
    performance_file.write(
        "-----Sentiments classification (twitter model) (Multi Layered Perception with word Embeddings)-----\n")
    # NOTE: n_features_in_ is the number of input features seen during fit.
    performance_file.write(
        f"Sentiments hyperparameters = {sentiments_classifier_mlp_embedding.n_features_in_}\n")
    performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")
    performance_file.write(f"Classification Report = \n{class_report}\n")
    performance_file.write(
        "----------------------------------------------------------\n\n")
