# Sentiment analysis with an MLP and BOW representation

In [None]:
import ssl
# Swap the default HTTPS context factory for the unverified one, so HTTPS
# requests made through the standard library skip certificate validation.
# NOTE(review): this applies to the whole process and is unsafe outside a
# classroom setting. The dataset URLs loaded further down use plain http://,
# so this override may not be needed at all — confirm before keeping it.
ssl._create_default_https_context = ssl._create_unverified_context

In [None]:
import warnings
# Ignore every warning for the rest of the session (presumably to keep the
# cell outputs short).
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used in the following cells.
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import time

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
import nltk
from nltk import word_tokenize          
from nltk import FreqDist

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, TextVectorization
from tensorflow.keras.callbacks import EarlyStopping

## Sentiment Analysis

Text classification is a machine learning technique that assigns a set of predefined categories to open-ended text. Text classifiers can be used to organize, structure, and categorize pretty much any kind of text – from documents, medical studies and files, and all over the web.

For example, news articles can be organized by topic; support tickets can be organized by urgency; chat conversations can be organized by language; brand mentions can be organized by sentiment; and so on.

Text classification is one of the fundamental tasks in natural language processing with broad applications such as **sentiment analysis**, topic labeling, spam detection, and intent detection.

**Why is Text Classification Important?**

It’s estimated that around 80% of all information is unstructured, with text being one of the most common types of unstructured data. Because of the messy nature of text, analyzing, understanding, organizing, and sorting through text data is hard and time-consuming, so most companies fail to use it to its full potential.

This is where text classification with machine learning comes in. Using text classifiers, companies can automatically structure all manner of relevant text, from emails, legal documents, social media, chatbots, surveys, and more in a fast and cost-effective way. This allows companies to save time analyzing text data, automate business processes, and make data-driven business decisions.

**How Does Text Classification Work?**

Instead of relying on manually crafted rules, machine learning text classification learns to make classifications based on past observations. By using pre-labeled examples as training data, machine learning algorithms can learn the different associations between pieces of text, and that a particular output (i.e., tags) is expected for a particular input (i.e., text). A “tag” is the pre-determined classification or category that any given text could fall into.

The first step towards training a machine learning NLP classifier is feature extraction: a method is used to transform each text into a numerical representation in the form of a vector. One of the most frequently used approaches is bag of words, where each component of the vector represents the frequency of one word from a predefined dictionary of words.

Then, the machine learning algorithm is fed with training data that consists of pairs of feature sets (vectors for each text example) and tags (e.g. sports, politics) to produce a classification model:

![training](https://monkeylearn.com/static/507a7b5d0557f416857a038f553865d1/2ed04/text_process_training.webp)

Once it’s trained with enough training samples, the machine learning model can begin to make accurate predictions. The same feature extractor is used to transform unseen text to feature sets, which can be fed into the classification model to get predictions on tags (e.g., sports, politics):

![prediction](https://monkeylearn.com/static/afa7e0536886ee7152dfa4c628fe59f0/2b924/text_process_prediction.webp)

Text classification with machine learning is usually much more accurate than human-crafted rule systems, especially on complex NLP classification tasks. Also, classifiers with machine learning are easier to maintain and you can always tag new examples to learn new tasks.

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Today-lab" data-toc-modified-id="Today-lab-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Today lab</a></span></li><li><span><a href="#Load-dataset" data-toc-modified-id="Load-dataset-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Load dataset</a></span><ul class="toc-item"><li><span><a href="#About-Train,-validation-and-test-sets" data-toc-modified-id="About-Train,-validation-and-test-sets-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span><a href="https://towardsdatascience.com/train-validation-and-test-sets-72cb40cba9e7" rel="nofollow" target="_blank">About Train, validation and test sets</a></a></span></li><li><span><a href="#Undestand-the-dataset" data-toc-modified-id="Undestand-the-dataset-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Undestand the dataset</a></span></li></ul></li><li><span><a href="#Build-X-(features-vectors)-and-y-(labels)" data-toc-modified-id="Build-X-(features-vectors)-and-y-(labels)-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Build X (features vectors) and y (labels)</a></span></li><li><span><a href="#Our-previous-baseline" data-toc-modified-id="Our-previous-baseline-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Our previous baseline</a></span></li><li><span><a href="#Build-an-MLP-Classifier" data-toc-modified-id="Build-an-MLP-Classifier-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Build an MLP Classifier</a></span></li></ul></div>

## The dataset

In this lab we use part of the 'Amazon_Unlocked_Mobile.csv' dataset published on Kaggle. The dataset contains the following information:
* Product Name
* Brand Name
* Price
* Rating
* Reviews
* Review Votes

We are mainly interested in the 'Reviews' (X) and the 'Rating' (y).

The goal is to predict the 'Rating' from the text of the 'Reviews'. The TRAIN, VAL and TEST sets have been prepared for you. We use the accuracy score to evaluate the model.

### [About Train, validation and test sets](https://towardsdatascience.com/train-validation-and-test-sets-72cb40cba9e7)
![test/train/val](https://miro.medium.com/max/1466/1*aNPC1ifHN2WydKHyEZYENg.png)

* **Training Dataset:** The sample of data used to fit the model.
* **Validation Dataset:** The sample of data used to provide an unbiased evaluation of a model fit on the training dataset while tuning model hyperparameters. The evaluation becomes more biased as skill on the validation dataset is incorporated into the model configuration.
* **Test Dataset:** The sample of data used to provide an unbiased evaluation of a final model fit on the training dataset.

## Load the dataset

In [None]:
# The three splits are gzip-compressed CSV files stored in the same directory.
_DATASET_URL = "http://www.i3s.unice.fr/~riveill/dataset/Amazon_Unlocked_Mobile/"
TRAIN, VAL, TEST = (
    pd.read_csv(_DATASET_URL + split + ".csv.gz")
    for split in ("train", "val", "test")
)

# Preview the first rows of the training split
TRAIN.head()

### Undestand the dataset

To choose certain constants (size of vocabulary, length of a line, etc.), it is good to know the dataset used.

In [None]:
# Tokenize the reviews
# Missing reviews are replaced by empty strings first (as is done when
# X_train is built further down), because word_tokenize only accepts strings.
# NOTE(review): word_tokenize presumably needs the NLTK tokenizer data to be
# downloaded beforehand — confirm the environment provides it.
reviews_tokenized = [word_tokenize(review) for review in TRAIN['Reviews'].fillna("")]
reviews_tokenized[:10]

In [None]:
# Count the vocabulary
# Concatenate the token lists of all reviews into a single flat list
flatten_reviews = []
for tokens in reviews_tokenized:
    flatten_reviews.extend(tokens)

# The vocabulary size is the number of distinct tokens
vocabulary_size = len(set(flatten_reviews))
vocabulary_size

In [None]:
# Plot the distribution of words
from nltk import FreqDist

# Frequency table over every token of the training reviews
freqDist = FreqDist(flatten_reviews)

# Show the ten most frequent tokens with their counts
ten_most_common = freqDist.most_common(10)
print(ten_most_common)

# Non-cumulative frequency plot limited to 25 tokens
freqDist.plot(25, cumulative=False)

In [None]:
# Plot the distribution of token lengths (number of characters per token)
len_words = list(map(len, flatten_reviews))

# Count how many tokens have each length, then plot the non-cumulative counts
freqDist2 = FreqDist(len_words)
freqDist2.plot(cumulative=False)

In [None]:
# Statistics are computed on TRAIN only, because future data is unknown.

# Store the number of tokens of each review in a new 'length' column
TRAIN['length'] = list(map(len, reviews_tokenized))

# Longest review, average review length, and the length under which
# 90% of the reviews fall
for label, value in (
    ("MAX:", TRAIN['length'].max()),
    ("MEAN:", TRAIN['length'].mean()),
    ("90%:", TRAIN['length'].quantile(0.9)),
):
    print(label, value)

In [None]:
# Put all reviews in lower case
# Missing reviews are replaced by empty strings first (as is done when
# X_train is built further down): a NaN value has no .lower() method.
lower_reviews = [r.lower() for r in TRAIN['Reviews'].fillna("")]
lower_reviews[:10]

In [None]:
# Is the dataset balanced?
# The counts are read from TRAIN directly because y_train is only defined in
# a later cell. Column names are set explicitly so the cell does not depend
# on how a given pandas version names the output of value_counts().
data = (
    TRAIN['Rating']
    .value_counts()
    .sort_index()
    .rename_axis("Rating")
    .reset_index(name="count")
)
sns.barplot(x="Rating", y="count", data=data)

# Red reference line: the count every class would have if the dataset were
# perfectly balanced. It spans the bar positions 0 .. n_classes - 1.
n_classes = len(data)
y = len(TRAIN) / n_classes
data = pd.DataFrame({"index": [0, n_classes - 1], "count": [y, y]})
sns.lineplot(x="index", y="count", data=data, color="red")

$$[TODO - Students]$$ 

Is accuracy an adequate metric?

## Build X (features vectors) and y (labels)

In [None]:
# Training features (review text, missing values replaced by "") and labels (ratings)
X_train, y_train = TRAIN['Reviews'].fillna(""), TRAIN['Rating']
X_train.shape, y_train.shape

In [None]:
# Validation features (review text, missing values replaced by "") and labels (ratings)
X_val, y_val = VAL['Reviews'].fillna(""), VAL['Rating']
X_val.shape, y_val.shape

In [None]:
# Test features (review text, missing values replaced by "") and labels (ratings)
X_test, y_test = TEST['Reviews'].fillna(""), TEST['Rating']
X_test.shape, y_test.shape

## Build a baseline

### What does "baseline" mean in the context of machine learning and data science?

A baseline is a method that uses heuristics, simple summary statistics, randomness, or machine learning to create predictions for a dataset. You can use these predictions to measure the baseline's performance (e.g., accuracy)-- this metric will then become what you compare any other machine learning algorithm against.

In more detail:

* A machine learning algorithm tries to learn a function that models the relationship between the input (feature) data and the target variable (or label). When you test it, you will typically measure performance in one way or another. For example, your algorithm may be 75% accurate. But what does this mean? You can infer this meaning by comparing with a baseline's performance.

Typical baselines include those supported by scikit-learn's "dummy" estimators:

* Classification baselines:

    * “stratified”: generates predictions by respecting the training set’s class distribution.
    * “most_frequent”: always predicts the most frequent label in the training set.
    * “prior”: always predicts the class that maximizes the class prior.
    * “uniform”: generates predictions uniformly at random.
    * “constant”: always predicts a constant label that is provided by the user. This is useful for metrics that evaluate a non-majority class.

* Regression baselines:

    * “median”: always predicts the median of the training set
    * “quantile”: always predicts a specified quantile of the training set, provided with the quantile parameter.
    * “constant”: always predicts a constant value that is provided by the user.

In general, you will want your approach to outperform the baselines you have selected. In the example above, you would want your 75% accuracy to be higher than any baseline you have run on the same data.

Finally, if you are dealing with a specific domain of machine learning (such as recommender systems), then you will typically pick baselines that are current state-of-the-art(SoTA) approaches - since you will usually want to demonstrate that your approach does better than these. For example, while you evaluate a new collaborative filtering algorithm, you may want to compare it to matrix factorization -- which itself is a learning algorithm, but is now a popular baseline since it has been so successful in recommender system research.

Using a binary `CountVectorizer` and a `LogisticRegression` classifier, learned in a previous lecture, build a first model.

For this model, you will not pre-process the text and will only use words (not N-grams). Leave all parameters at their default values.

The evaluation metric is accuracy.

$$[TODO - Students]$$ 
Briefly recall what CountVectorizer and LogisticRegression are and how they work.

In [None]:
# Baseline: bag-of-words counts fed to a logistic regression classifier
lr_pipeline = Pipeline([
    ('feature_extraction', CountVectorizer()),
    # multi_class is left out on purpose: 'auto' is already the default and
    # the parameter is deprecated in recent scikit-learn releases.
    ('classification', LogisticRegression(max_iter=400)),
])

# Fit the whole pipeline (vocabulary + classifier) on the training reviews
lr_pipeline.fit(X_train, y_train)

# Predict the ratings of the test reviews
y_pred = lr_pipeline.predict(X_test)

# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall in the report.
print(classification_report(y_test, y_pred))

## Build an MLP Classifier


In [None]:
# Encode the reviews (X) as bag-of-words count vectors with CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder

cv = CountVectorizer()

# The vocabulary is learned on the training reviews only ...
X_train_encoded = cv.fit_transform(X_train).toarray()

# ... and reused unchanged to encode the validation and test reviews.
X_val_encoded, X_test_encoded = (
    cv.transform(reviews).toarray() for reviews in (X_val, X_test)
)

$$[TODO - Students]$$ 

Encode y_train, y_val and y_test using OneHotEncoder. What is the purpose of OneHotEncoding ?

In [None]:
# Your code here

In [None]:
# Constants describing the classification problem

# Number of distinct ratings seen in the training labels
num_classes = np.unique(y_train).size

# Size of one encoded review: the number of columns of the encoded matrix
_, feature_vector_length = X_train_encoded.shape
feature_vector_length, num_classes

$$[TODO - Students]$$ 

Build a simple network to predict the star rating of a review using the functional API. It should have the following characteristics: one hidden layer with 256 nodes and relu activation.



In [None]:
# Your code here

$$[TODO - Students]$$ 

We are now training the model. Using the TensorFlow documentation, explain the purpose of the EarlyStopping callback and detail its arguments.

Complete the following code with the appropriate loss and explain what it does.

Choose the correct loss function and metrics.

In [None]:
# Configure the model and start training
# Stop training with early stopping with patience of 20
# Exercise template: every "XXX" below is a placeholder that must be replaced
# before this cell can run (monitored quantity, minimum improvement, patience,
# direction of improvement, loss function and metric).
callbacks_list = [EarlyStopping(monitor="XXX", min_delta="XXX", patience="XXX", verbose=1, mode="XXX",
                                restore_best_weights=True)
                 ]

# `model` is the network built in the exercise above; y_train_encoded and
# y_val_encoded are the encoded labels produced in the encoding exercise.
model.compile(loss="XXX", optimizer='adam', metrics=['XXX'])
# The validation split is passed separately so the callback can monitor it;
# epochs=1000 is only an upper bound, early stopping ends training sooner.
history = model.fit(X_train_encoded, y_train_encoded,
                    validation_data=(X_val_encoded, y_val_encoded),
                    epochs=1000, batch_size=250,
                    callbacks=callbacks_list, verbose=1)

In [None]:
# Plot the learning curves and analyze them
# One column per quantity recorded during training, one row per epoch
learning_curves = pd.DataFrame(history.history)
learning_curves.plot(figsize=(8, 5))
plt.show()

$$[TODO - Students]$$ 

How do you interpret those learning curves ?


The model is seemingly overfitting the training data. Various strategies could reduce the overfitting, but for this lab we'll settle on changing the number and size of the layers.

In [None]:
# Evaluate the model on the test set
y_pred_encoded = model.predict(X_test_encoded)

# Map the index of the most probable class back to its rating through the
# sorted class labels, instead of assuming that the ratings start at 1.
# NOTE(review): assumes the one-hot columns follow the sorted order of the
# classes seen in y_train (the OneHotEncoder convention) — confirm against
# the encoding written in the exercise above.
class_labels = np.unique(y_train)
y_pred = class_labels[np.argmax(y_pred_encoded, axis=1)]

print(classification_report(y_test, y_pred))

In [None]:
# Print/plot the confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns

# normalize='true' normalises the matrix over the true labels
normalized_cm = confusion_matrix(y_test, y_pred, normalize='true')

# Heatmap with the value of each cell written inside it
sns.heatmap(normalized_cm, annot=True)

## Same example with Keras TextVectorization layer

Do you understand it? Study the differences and choose the approach you think is best for the rest of the Lab.

In [None]:
# Wrap the raw training reviews in a tf.data dataset (text only, no labels)
text_dataset = tf.data.Dataset.from_tensor_slices(X_train)

# Vectorization layer configured to output token counts
vectorize_layer = tf.keras.layers.TextVectorization(output_mode='count')

# Build the vocabulary by calling `adapt` on the text-only dataset.
# Batching is optional, but for large datasets it avoids keeping spare
# copies of the dataset.
batched_text = text_dataset.batch(64)
vectorize_layer.adapt(batched_text)

In [None]:
# Create the model that uses the text vectorization layer

# Input: exactly one raw string per sample, hence shape (1,) and a string dtype
text_input = Input(shape=(1,), dtype=tf.string, name='input')

# The vectorization layer comes first and turns each string into token counts
token_counts = vectorize_layer(text_input)

# Hidden layer: 256 units with relu activation
hidden = Dense(256, activation='relu', name='hidden')(token_counts)

# Output layer: one softmax unit per class
outputs = Dense(num_classes, activation='softmax', name='output')(hidden)

# Assemble the model from its input and output tensors, then describe it
model = Model(inputs=text_input, outputs=outputs)
model.summary()

In [None]:
# Configure the model and start training
# Early stopping: stop when the validation accuracy has not improved by at
# least 0.0005 for 20 consecutive epochs, then restore the best weights.
callbacks_list = [
    EarlyStopping(monitor='val_accuracy', min_delta=0.0005, patience=20,
                  verbose=1, mode='max', restore_best_weights=True),
]

# The sparse loss expects integer class ids starting at 0, so the ratings are
# shifted by the smallest training label. The SAME offset is applied to the
# validation labels: shifting them by their own minimum would map a rating to
# a different class id whenever the validation set lacks the lowest rating.
label_offset = np.min(y_train)

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(X_train, y_train - label_offset,
                    validation_data=(X_val, y_val - label_offset),
                    epochs=1000, batch_size=250,
                    callbacks=callbacks_list, verbose=1)

## Main part of the lab

$$[TODO - Students]$$ 

Improve the model by changing the preprocessing, the number of layers and the number of neurons per layer.
The goal is to increase the model accuracy.