# Mini Project 1

In [2]:
print('Hello MP1!')


Hello MP1!


### Imported libraries used for the project
1. jupiter
2. scikit-learn
3. gensim
4. nltk
5. numpy
6. pandas
7. matplotlib

`conda install jupiter scikit-learn gensim nltk numpy pandas matplotlib`

## 1. Dataset Preparation & Analysis (5pts)

1.2. Load the dataset. You can use `gzip.open` and `json.load` to do that.

In [3]:
import gzip
import json

dataset = gzip.open('goemotions.json.gz')
dataset_json = json.load(dataset)

# Close the gz dataset once your finished loading the data as a json object
dataset.close()


1.3. (5pts) Extract the posts and the 2 sets of labels (emotion and sentiment), then plot the distribution
of the posts in each category and save the graphic (a histogram or pie chart) in pdf. Do this for both
the emotion and the sentiment categories. You can use `matplotlib.pyplot` and `savefig` to do this.
This pre-analysis of the dataset will allow you to determine if the classes are balanced, and which
metric is more appropriate to use to evaluate the performance of your classifiers.

In [4]:
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter


numpy_dataset = np.array(dataset_json)

# Get column only for emotion and sentiment
emotion_dataset_col = numpy_dataset[:, 1]
sentiment_dataset_col = numpy_dataset[:, 2]

# Count the number of times each value appears
emotion_count = Counter(emotion_dataset_col)
sentiment_count = Counter(sentiment_dataset_col)

# Save the data values as a histogram
plt.hist(emotion_count.values())
plt.savefig('emotions_graph')

plt.close()


plt.hist(sentiment_count.values())
plt.savefig('sentiment_graph')

plt.close()


## 2. Words as Features (35pts)

2.1. □ (5pts) Process the dataset using `feature_extraction.text.CountVectorizer` to extract tokens/words
and their frequencies. Display the number of tokens (the size of the vocabulary) in the dataset.

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd


# Phrases are in the first column of the dataset
phrases = numpy_dataset[:, 0]

# Process the dataset
vectorizer_emotions = CountVectorizer()

# X value is the processed_dataset
X_emotions = vectorizer_emotions.fit_transform(phrases)

print("Number of features (tokens in the vocabulary) =",
      len(vectorizer_emotions.get_feature_names_out()))


Number of features (tokens in the vocabulary) = 30449


In [6]:
emotions = numpy_dataset[:, 1]

emotions_and_phrases = phrases.copy()

for count, i in enumerate(phrases):
    emotions_and_phrases[count] = i + " " + emotions[count]


vectorizer_sentiments = CountVectorizer()
X_sentiments = vectorizer_sentiments.fit_transform(emotions_and_phrases)

print("Number of features (tokens in the vocabulary) including emotions =",
      len(vectorizer_sentiments.get_feature_names_out()))


Number of features (tokens in the vocabulary) including emotions = 30450


2.2. □ (2pts) Split the dataset into 80% for training and 20% for testing. For this, you can use `train_test_split`.

In [7]:
# Split the dataset
from sklearn.model_selection import train_test_split


# Split the dataset
training_dataset, testing_dataset = train_test_split(
    numpy_dataset, train_size=0.8, test_size=0.2)

# Split the feature vector of emotions
training_X_emotions, testing_X_emotions = train_test_split(
    X_emotions, train_size=0.8, test_size=0.2)

# Split the feature vector of sentiments
training_X_sentiments, testing_X_sentiments = train_test_split(
    X_sentiments, train_size=0.8, test_size=0.2)

# Print the size of both datasets
print("Size of training set =", training_dataset.shape[0])
print("Size of testing set =", testing_dataset.shape[0])


Size of training set = 137456
Size of testing set = 34364


2.3. Train and test the following classifiers, for both the emotion and the sentiment classification, using
word frequency as features.

* 2.3.1. □ (3pts) **Base-MNB**: a Multinomial Naive Bayes Classifier `(naive_bayes.MultinomialNB.html)`
with the default parameters.

In [None]:
from sklearn.naive_bayes import MultinomialNB


# Create the object classifiers for emotions
emotions_classifier_mb = MultinomialNB()

# Fit the model with training_X as X and columns of training_dataset as y
emotions_classifier_mb.fit(X=training_X_emotions,
                           y=training_dataset[:, 1])

# Make predictions with testing_X as X
emotion_prediction_mb = emotions_classifier_mb.predict(X=testing_X_emotions)
print(emotion_prediction_mb)


['neutral' 'neutral' 'neutral' ... 'neutral' 'neutral' 'neutral']


In [None]:
# Create the object classifiers for sentiments
sentiment_classifier_mb = MultinomialNB()

# Fit the model with training_X as X and columns of training_dataset as y
sentiment_classifier_mb.fit(X=training_X_sentiments,
                            y=training_dataset[:, 2])

# Make predictions with testing_X as X
sentiment_prediction_mb = sentiment_classifier_mb.predict(
    X=testing_X_sentiments)
print(sentiment_prediction_mb)


['positive' 'neutral' 'positive' ... 'positive' 'positive' 'neutral']


In [None]:
# Part 2.4 for Multinomial classification
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Append Emotions results
performance_file = open("performance", "w")
performance_file.write(
    "-----Emotions classification (Multinomial Naive Bayes)-----\n")

performance_file.write(
    f"Emotions hyperparameters = {emotions_classifier_mb.n_features_in_}\n")

confusion_matrix_output = confusion_matrix(
    y_true=testing_dataset[:, 1], y_pred=emotion_prediction_mb)
performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")

class_report = classification_report(
    y_true=testing_dataset[:, 1], y_pred=emotion_prediction_mb, zero_division=0)
performance_file.write(f"Classification Report = \n{class_report}\n")
performance_file.write(
    f"----------------------------------------------------------\n\n")

performance_file.close()


In [None]:
# Append Sentiments results
performance_file = open("performance", "a")
performance_file.write(
    "-----Sentiments classification (Multinomial Naive Bayes)-----\n")

performance_file.write(
    f"Sentiments hyperparameters = {sentiment_classifier_mb.n_features_in_}\n")

confusion_matrix_output = confusion_matrix(
    y_true=testing_dataset[:, 1], y_pred=sentiment_prediction_mb)
performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")

class_report = classification_report(
    y_true=testing_dataset[:, 1], y_pred=sentiment_prediction_mb, zero_division=0)
performance_file.write(f"Classification Report = \n{class_report}\n")
performance_file.write(
    f"----------------------------------------------------------\n\n")

performance_file.close()


* 2.3.2. □ (3pts) **Base-DT:** a Decision Tree `(tree.DecisionTreeClassifier)` with the default parameters.

In [22]:
from sklearn.tree import DecisionTreeClassifier


# Create the object classifiers for emotions
emotions_classifier_dt = DecisionTreeClassifier()

# Fit the model with training_X as X and columns of training_dataset as y
emotions_classifier_dt.fit(X=training_X_emotions,
                           y=training_dataset[:, 1])

# Make predictions with testing_X as X
emotion_prediction_dt = emotions_classifier_dt.predict(X=testing_X_emotions)
print(emotion_prediction_dt)


['curiosity' 'neutral' 'neutral' ... 'optimism' 'admiration' 'admiration']


In [23]:
# Create the object classifiers for sentiments
sentiment_classifier_dt = DecisionTreeClassifier()

# Fit the model with training_X as X and columns of training_dataset as y
sentiment_classifier_dt.fit(X=training_X_sentiments,
                            y=training_dataset[:, 2])

# Make predictions with testing_X as X
sentiment_prediction_dt = sentiment_classifier_dt.predict(
    X=testing_X_sentiments)
print(sentiment_prediction_dt)


['positive' 'negative' 'positive' ... 'neutral' 'negative' 'negative']


In [24]:
# Part 2.4 for DecisionTree classification


# Append Emotions results
performance_file = open("performance", "a")
performance_file.write("-----Emotions classification (Decision Tree)-----\n")

performance_file.write(
    f"Emotions hyperparameters = {emotions_classifier_dt.n_features_in_}\n")

confusion_matrix_output = confusion_matrix(
    y_true=testing_dataset[:, 1], y_pred=emotion_prediction_dt)
performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")

class_report = classification_report(
    y_true=testing_dataset[:, 1], y_pred=emotion_prediction_dt, zero_division=0)
performance_file.write(f"Classification Report = \n{class_report}\n")
performance_file.write(
    f"----------------------------------------------------------\n\n")
performance_file.close()


In [25]:

# Append Sentiments results
performance_file = open("performance", "a")
performance_file.write(
    "-----Sentiments classification (Decision Tree)-----\n")

performance_file.write(
    f"Sentiments hyperparameters = {sentiment_classifier_dt.n_features_in_}\n")

confusion_matrix_output = confusion_matrix(
    y_true=testing_dataset[:, 1], y_pred=sentiment_prediction_dt)
performance_file.write(f"Confusion Matrix = \n{confusion_matrix_output}\n\n")

class_report = classification_report(
    y_true=testing_dataset[:, 1], y_pred=sentiment_prediction_dt, zero_division=0)
performance_file.write(f"Classification Report = \n{class_report}\n")
performance_file.write(
    f"----------------------------------------------------------\n\n")

performance_file.close()


* 2.3.3. □ (3pts) **Base-MLP:** a Multi-Layered Perceptron `(neural network.MLPClassifier)` with the
default parameters.

* 2.3.4. □ (3pts) **Top-MNB:** a better performing Multinomial Naive Bayes Classifier found using `GridSearchCV`.
The gridsearch will allow you to find the best combination of hyper-parameters, as determined
by the evaluation function that you have determined in step 1.3. The only hyper-parameter that
you will experiment with is `alphafloat` with values 0.5, 0 and 2 other values of your choice.

* 2.3.5. □ (3pts) **Top-DT:** a better performing Decision Tree found using `GridSearchCV.` The hyperparameters
that you will experiment with are:
  * `criterion:` gini or entropy
  * `max depth:` 2 different values of your choice
  * `min samples split:` 3 different values of your choice

* 2.3.6. □ (3pts) **Top-MLP:** a better performing Multi-Layered Perceptron found using GridSearchCV.
The hyper-parameters that you will experiment with are:
    * `activation:` sigmoid, tanh, relu and identity
    * 2 network architectures of your choice: for eg, 2 hidden layers with 30+50 nodes and 3 hidden
layers with 10 + 10 + 10
    * `solver:` Adam and stochastic gradient descent

2.4. □ (5pts) For each of the 6 classifiers above and each of the classification tasks (emotion or sentiment),
produce and save the following information in a file called `performance`:
* a string clearly describing the model (e.g. the model name + hyper-parameter values) and the
classification task (emotion or sentiment)
* the confusion matrix – use `metrics.confusion_matrix`
* the precision, recall, and F1-measure for each class, and the accuracy, macro-

2.5. □ (7.5pts) **Do your own exploration:** Do only one of the following, depending on your own interest:
* Use tf-idf instead of word frequencies and redo all substeps of 2.3 above – you can use `TfidfTransformer`
for this. Display the results of this experiment.
* Remove stop words and redo all substeps of 2.3 above – you can use the parameter of `CountVectorizer`
for this. Display the results of this experiment.
* Play with `train_test_split` in order have different splits of 80% training, 20% test sets and
different sizes of training sets and redo all substeps of 2.3 above. Show and explain how the
performance of your models vary depending on the training/test sets are used.

## 3. Embeddings as Features (20pts)

3.1. □ (0pts) Use `gensim.downloader.load` to load the `word2vec-google-news-300` pretrained embedding model.

In [1]:
# Incase its the first time running
# import nltk

# nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mlope\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [8]:
import gensim.downloader
google_model = gensim.downloader.load("word2vec-google-news-300")


3.2. □ (2pts) Use the `tokenizer` from `nltk` to extract words from the Reddit posts. Display the number
of tokens in the training set.

In [9]:
import nltk.tokenize


reddit_vector = [nltk.tokenize.word_tokenize(i) for i in phrases]


In [10]:
# Get the token count
token_count = 0
for i in reddit_vector:
    token_count += len(i)

print("Number of Tokens =", token_count)


Number of Tokens = 2642128


3.3. □ (5pts) Compute the embedding of a Reddit post as the <u>average</u> of the embeddings of its words. If
a word has no embedding in Word2Vec, skip it.

In [21]:
from statistics import mean
from gensim.models import Word2Vec, KeyedVectors


mean_embedding_list_emotions = []
for i in reddit_vector:
    mean_embedding_list_emotions.append(google_model.get_mean_vector(i))


# Test to get the average embedding for a word with index i
i = 0
print(
    f"The mean embedding for '{reddit_vector[i]}'\n = \n{mean_embedding_list_emotions[i]}")


The mean embedding for '['That', 'game', 'hurt', '.']'
 = 
[-0.00249175  0.03335912  0.04454305  0.02014099 -0.005342    0.03958913
  0.04116163 -0.03550375  0.04966702  0.08322471  0.02130005 -0.07371153
  0.02242474 -0.02079816 -0.03720252  0.01349978  0.06466671  0.02539938
  0.03124769 -0.04660584  0.02208368  0.0440758   0.02905092 -0.03239964
 -0.01410932 -0.01994989 -0.05039005 -0.03178651  0.05876349  0.0290636
  0.00571934  0.00895379  0.03168915 -0.0298906   0.04790528  0.03915513
  0.00097203  0.05557248  0.03150231  0.09819147  0.06256143  0.00659081
  0.07235992 -0.03676999 -0.04593803 -0.01758593  0.00756323 -0.0163933
  0.07148833  0.06899618 -0.03733969  0.0571317   0.0141902  -0.02753571
  0.02445936 -0.00438119  0.03252912 -0.01304839  0.02398836 -0.04210209
 -0.02512687  0.01739748 -0.03247805 -0.03073107 -0.02493709  0.00965649
  0.01314632 -0.01541938  0.035196   -0.00662027  0.00404124 -0.01152261
 -0.00373993  0.00676702 -0.05968076 -0.02077763  0.01960778  0.026

3.4. □ (3pts) Compute and display the overall hit rates of the training and test sets (i.e. the % of words
in the Reddit posts for which an embedding is found in Word2Vec).


In [22]:
# Split the embedding/reddit phrases
training_reddit_vector, testing_reddit_vector = train_test_split(
    reddit_vector, train_size=0.8, test_size=0.2)

training_X_emotions_emb, testing_X_emotions_emb = train_test_split(
    mean_embedding_list_emotions, train_size=0.8, test_size=0.2)


In [25]:
# Hit Rate for training set
num_of_hits = 0
total = 0

for i in training_reddit_vector:
    for word in i:
        try:
            # pdb.set_trace()
            google_model.key_to_index[word]
            num_of_hits += 1
        except Exception:
            pass
        total += 1

# Print the hit rate for training set
print(f"Hit Rate in training set = {num_of_hits/total*100}%")


# Hit Rate for testing set
num_of_hits = 0
total = 0

for i in testing_reddit_vector:
    for word in i:
        try:
            # pdb.set_trace()
            google_model.key_to_index[word]
            num_of_hits += 1
        except Exception:
            pass
        total += 1

# Print the hit rate for testing set
print(f"Hit Rate in testing set = {num_of_hits/total*100}%")


Hit Rate in training set= 77.4334307210221%
Hit Rate in testing set= 77.51949871270635%


3.5. □ (3pts) **Train a Base-MLP:** a Multi-Layered Perceptron (`neural_network.MLPClassifier`) with
the default parameters.

In [26]:
from sklearn.neural_network import MLPClassifier


emotions_classifier_mlp_embedding = MLPClassifier()

# Fit the model with training_X as X and columns of training_dataset as y
emotions_classifier_mlp_embedding.fit(X=training_X_emotions_emb,
                                      y=training_dataset[:, 1])

# Make predictions with testing_X as X
emotion_prediction_mlp_embedding = emotions_classifier_mlp_embedding.predict(
    X=testing_X_emotions_emb)
print(emotion_prediction_mlp_embedding)




['neutral' 'neutral' 'neutral' ... 'neutral' 'neutral' 'neutral']


3.6. □ (3pts) **Train a Top-MLP:** a better performing Multi-Layered Perceptron found with whatever
hyperparameters you want.

3.7. □ (2pts) Display the performance of your classifiers using `metrics.classification_report` and add
these to your `performance` file.

3.8. □ (7.5pts) **Do your own exploration:** Rerun your best performing model but with 2 other English
pretrained embedding models and compare the results. Many pre-trained embeddings are available
on line (including in Gensim or at http://vectors.nlpl.eu/repository).