<a href="https://colab.research.google.com/github/maria-gpe-a/implement-nlp-word-embedding/blob/main/module3/Module3_Demo2_Analysing_Sentiment_With_OHE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Analysing Sentiment

Let's first import everything and load the dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob, Word
import nltk
import torch
from torch import nn
import seaborn as sns
nltk.download('punkt')

%matplotlib inline
sns.set(rc={'figure.figsize':(20,20)})
import warnings
warnings.filterwarnings('ignore')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
#%%writefile get_data.sh
#if [ ! -f yelp.csv ]; then
#  wget https://raw.githubusercontent.com/axel-sirota/implement-nlp-word-embedding/main/module3/data/yelp.csv
#fi

Overwriting get_data.sh


In [None]:
#!bash get_data.sh


In [2]:
path = './yelp.csv'
yelp = pd.read_csv(path)
# Create a new DataFrame that only contains the 5-star and 1-star reviews.
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]

# Define X and y.
X = yelp_best_worst.text
y = yelp_best_worst.stars.map({1:0, 5:1})


## Doing the train_test split and defining model

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)
print(X_train)

5307    If I could give it more than 5, I would.  Swee...
5985    We had a fantastic experience here! We went on...
6918    4 stars for the place itself and it's food/cof...
4315    I'm a huge fan of Padre's! What food I've trie...
3356    I love this place. I just tried the other Ethi...
                              ...                        
2737    Let me tell you about my first crush in Phoeni...
3142    My dear love and I went to the museum on a rom...
2065    You like hotdogs?  Motor (thats me) says get i...
8596    Nice facilities, nice AC, but two FATAL flaws:...
7787    The single best ribs I've ever had at any rest...
Name: text, Length: 3268, dtype: object


In [4]:
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [5]:
print(X_train_dtm.shape)
print(X_test_dtm.shape)

(3268, 17181)
(818, 17181)


In [7]:
print(X_train_dtm[1,:].toarray())


[[0 0 0 ... 0 0 0]]


In [8]:
X_train_tensor = torch.Tensor(X_train_dtm.toarray()).to(device)
X_test_tensor = torch.Tensor(X_test_dtm.toarray()).to(device)
y_train = torch.Tensor(y_train.values).type(torch.LongTensor).to(device)
y_test = torch.Tensor(y_test.values).type(torch.LongTensor).to(device)

In [9]:
model = nn.Sequential(
  nn.Linear(X_train_tensor.shape[1], 2),
  nn.LogSoftmax(dim = 1)
).to(device)

In [10]:
def forward(X):
  return model(X).to(device)

def loss(y_pred, y):
  return nn.functional.nll_loss(y_pred, y)

def metric(y_pred, y):  # -> accuracy
  return (1 / len(y)) * ((y_pred.argmax(dim = 1) == y).sum())


## Let's verify the metric makes sense

In [11]:
y_train_pred = model(X_train_tensor).to(device)
y_train_pred.argmax(dim=1)

tensor([0, 0, 0,  ..., 0, 0, 0], device='cuda:0')

In [12]:
(y_train_pred.argmax(dim = 1) == y_train).sum()

tensor(1164, device='cuda:0')

In [13]:
metric(y_train_pred, y_train)

tensor(0.3562, device='cuda:0')

In [14]:
del y_train_pred

## The training routine

In [16]:
optimizer = torch.optim.AdamW(model.parameters())

In [24]:
print(model)
print(model.parameters())

Sequential(
  (0): Linear(in_features=17181, out_features=2, bias=True)
  (1): LogSoftmax(dim=1)
)
<generator object Module.parameters at 0x7a738272b5a0>


In [25]:
epochs = 1000
for i in range(epochs):
  y_pred = forward(X_train_tensor)
  xe = loss(y_pred, y_train)
  #print(xe)
  #print(xe.backward)
  accuracy = metric(y_pred, y_train)
  xe.backward()
  if i % 100 == 0:
   print("Loss: ", xe, " Accuracy ", accuracy.data.item())
  optimizer.step()
  optimizer.zero_grad()

Loss:  tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  1.0
Loss:  tensor(0.0013, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  1.0
Loss:  tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  1.0
Loss:  tensor(0.0012, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  1.0
Loss:  tensor(0.0011, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  1.0
Loss:  tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  1.0
Loss:  tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  1.0
Loss:  tensor(0.0010, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  1.0
Loss:  tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  1.0
Loss:  tensor(0.0009, device='cuda:0', grad_fn=<NllLossBackward0>)  Accuracy  1.0


In [26]:
y_test_pred = forward(X_test_tensor)
print(f'Model accuracy is {metric(y_test_pred, y_test)}')

Model accuracy is 0.9009780287742615


# Some manual validation

In [27]:
review = np.array(["This place was fantastic"])
vectorized_review = torch.Tensor(vect.transform(review).toarray()).to(device)

In [28]:
prediction = forward(vectorized_review)
prediction.argmax(dim = 1)

tensor([1], device='cuda:0')

Therefore, the model predicted correctly that the review was positive!