## Import library

In [None]:
import pandas as pd
import nltk
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
# Stopwords and stemming
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim as optim
import pickle

In [2]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\nbenl\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\nbenl\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     C:\Users\nbenl\AppData\Roaming\nltk_data...
[nltk_data]    |   Package biocreative_ppi is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     C:\Users\nbenl\AppData\Roaming\nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     C:\Users\nbenl\AppData\Roaming\nltk_data...
[nltk_data]    |   Package brown_tei is already up-to-date!
[nltk_data]    | Downloading package cess_cat to
[nltk_data]    |     C:\Users\n

[nltk_data]    |   Package timit is already up-to-date!
[nltk_data]    | Downloading package toolbox to
[nltk_data]    |     C:\Users\nbenl\AppData\Roaming\nltk_data...
[nltk_data]    |   Package toolbox is already up-to-date!
[nltk_data]    | Downloading package treebank to
[nltk_data]    |     C:\Users\nbenl\AppData\Roaming\nltk_data...
[nltk_data]    |   Package treebank is already up-to-date!
[nltk_data]    | Downloading package twitter_samples to
[nltk_data]    |     C:\Users\nbenl\AppData\Roaming\nltk_data...
[nltk_data]    |   Package twitter_samples is already up-to-date!
[nltk_data]    | Downloading package udhr to
[nltk_data]    |     C:\Users\nbenl\AppData\Roaming\nltk_data...
[nltk_data]    |   Package udhr is already up-to-date!
[nltk_data]    | Downloading package udhr2 to
[nltk_data]    |     C:\Users\nbenl\AppData\Roaming\nltk_data...
[nltk_data]    |   Package udhr2 is already up-to-date!
[nltk_data]    | Downloading package unicode_samples to
[nltk_data]    |     C:\U

True

## Data import

In [3]:
dataset = pd.read_csv('Restaurant_Reviews.tsv.txt', delimiter = '\t', quoting = 3)

In [4]:
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


## Natural language process

In [6]:
ps = PorterStemmer()

In [7]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [8]:
corpus = []

for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ',dataset['Review'][i]) # Regular expression
    review = review.lower()   # Convert to lowercase                                  
    review = nltk.word_tokenize(review)   # Seperate words with tokenization
    clean_review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]   # Drop stopwords
    lemma = nltk.WordNetLemmatizer()  # lemmatisation - drop -ing, -ed etc..
    clean_review = [lemma.lemmatize(word) for word in clean_review]
    clean_review = ' '.join(clean_review) # make text from words
    corpus.append(clean_review) # add to corpus list


In [9]:
corpus[12]

'cashier care ever say still end wayyy overpr'

In [10]:
vectorizer = TfidfVectorizer(max_features = 1500, min_df = 3, max_df = 0.6)    # Features

In [11]:
X = vectorizer.fit_transform(corpus).toarray()

In [12]:
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [13]:
y = dataset.iloc[:, 1].values      # Labels, pos or negative classes

In [14]:
y

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,

## Model Training

In [15]:
# Train test split , 20% of data to test , 80% data to train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)   

### Convert X and y to tensor format

In [17]:
Xtrain_ = torch.from_numpy(X_train).float()
Xtest_ = torch.from_numpy(X_test).float()
ytrain_ = torch.from_numpy(y_train)
ytest_ = torch.from_numpy(y_test)

In [19]:
Xtrain_.shape, ytrain_.shape
# 466 features ( vectorized words )

(torch.Size([800, 466]), torch.Size([800]))

In [20]:
Xtest_.shape, ytest_.shape

(torch.Size([200, 466]), torch.Size([200]))

In [21]:
input_size = 466   # Input layer node size (466 different words)
output_size = 2    # Output layer node size is 2 ( negative or positive )
hidden_size = 500  # Hidden layer node size 

In [23]:
# We have 1 input layer, 2 hidden layers and 1 output layer

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = torch.nn.Linear(input_size, hidden_size)     # (input layer => hidden layer)
        self.fc2 = torch.nn.Linear(hidden_size, hidden_size)    # (hidden layer => hidden layer)
        self.fc3 = torch.nn.Linear(hidden_size, output_size)    # (hidden layer => output layer)
                                                                # 3 connection between the layers
        
    def forward(self, X):
        X = torch.relu(self.fc1(X))
        X = torch.relu((self.fc2(X)))
        X = self.fc3(X)
        
        return F.log_softmax(X, dim=1)
        

In [24]:
model = Net()   # Define a model class

### Define optimizer and learning rate

In [28]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.NLLLoss()

In [30]:
epochs = 100

### Train the neural network

In [32]:
for epoch in range(epochs):
    optimizer.zero_grad()
    Ypred = model(Xtrain_)
    loss = loss_fn(Ypred, ytrain_)
    loss.backward()
    optimizer.step()
    print('Epoch', epoch, 'loss', loss.item())

Epoch 0 loss 0.6932188272476196
Epoch 1 loss 0.6624799966812134
Epoch 2 loss 0.49528175592422485
Epoch 3 loss 0.31116509437561035
Epoch 4 loss 0.19999530911445618
Epoch 5 loss 0.1342722475528717
Epoch 6 loss 0.09170883893966675
Epoch 7 loss 0.08128456026315689
Epoch 8 loss 0.061998918652534485
Epoch 9 loss 0.047029245644807816
Epoch 10 loss 0.04822373390197754
Epoch 11 loss 0.04732479527592659
Epoch 12 loss 0.036885570734739304
Epoch 13 loss 0.03483529016375542
Epoch 14 loss 0.03640918806195259
Epoch 15 loss 0.03744332492351532
Epoch 16 loss 0.03369613736867905
Epoch 17 loss 0.03140092268586159
Epoch 18 loss 0.03283485770225525
Epoch 19 loss 0.032439254224300385
Epoch 20 loss 0.030135197564959526
Epoch 21 loss 0.028767814859747887
Epoch 22 loss 0.030737070366740227
Epoch 23 loss 0.03142515569925308
Epoch 24 loss 0.029614362865686417
Epoch 25 loss 0.029397867619991302
Epoch 26 loss 0.02924180030822754
Epoch 27 loss 0.029212377965450287
Epoch 28 loss 0.028857680037617683
Epoch 29 loss 0.

In [33]:
sample = ["It's a good day"]
sample = vectorizer.transform(sample).toarray()

In [34]:
sample

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [35]:
torch.from_numpy(sample).float()

tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.8559, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0

In [36]:
sentiment = model(torch.from_numpy(sample).float())    

In [37]:
sentiment
# Second element is higher then the first, it means we have a positive sentence

tensor([[-24.2465,   0.0000]], grad_fn=<LogSoftmaxBackward>)

In [38]:
sample2 = ["It's a bad day"]
sample2 = vectorizer.transform(sample2).toarray()

In [39]:
sentiment2 = model(torch.from_numpy(sample2).float())
sentiment2

tensor([[  0.0000, -21.5121]], grad_fn=<LogSoftmaxBackward>)

In [40]:
# First element is higher then the second, it means we have a negative sentence

### Creating dictionary from the model

In [41]:
model.state_dict()

OrderedDict([('fc1.weight',
              tensor([[-0.0272,  0.0094,  0.0268,  ...,  0.0237, -0.0694,  0.0063],
                      [ 0.1581, -0.0577,  0.0286,  ..., -0.0413,  0.0972, -0.0279],
                      [ 0.1147, -0.0364, -0.0723,  ...,  0.0757, -0.0037,  0.0828],
                      ...,
                      [-0.0584, -0.0709,  0.0306,  ..., -0.0156,  0.0413, -0.0573],
                      [-0.0162, -0.0014, -0.0106,  ...,  0.0312,  0.0627, -0.0420],
                      [-0.0293, -0.0318,  0.0401,  ...,  0.0027, -0.0393, -0.0174]])),
             ('fc1.bias',
              tensor([-0.0726, -0.0429,  0.0114,  0.0451,  0.0405, -0.0820,  0.0365, -0.0612,
                      -0.0332, -0.0248, -0.0654, -0.0320, -0.0226, -0.0805,  0.0402, -0.0034,
                       0.0495, -0.1060, -0.0322, -0.0069,  0.0426,  0.0120, -0.0159, -0.1163,
                       0.0555,  0.0218, -0.0197, -0.0519, -0.1159, -0.0715,  0.0254,  0.0300,
                       0.0709,  0.00

In [42]:
torch.save(model.state_dict(), 'text_classifier_pytorch')

In [43]:
ls

 Volume in drive C has no label.
 Volume Serial Number is 1687-AAC5

 Directory of C:\Users\nbenl\SentimentAnalysis\Text_classify_PyTorch

12.06.2021  16:11    <DIR>          .
12.06.2021  16:11    <DIR>          ..
12.06.2021  13:38    <DIR>          .ipynb_checkpoints
12.06.2021  11:18            61˙332 Restaurant_Reviews.tsv.txt
12.06.2021  16:11            86˙311 text_classifier.ipynb
12.06.2021  16:11         1˙942˙239 text_classifier_pytorch
               3 File(s)      2˙089˙882 bytes
               3 Dir(s)   4˙620˙165˙120 bytes free


In [44]:
with open('tfidfmodel.pickle', 'wb') as file:
    pickle.dump(vectorizer, file)

In [45]:
ls

 Volume in drive C has no label.
 Volume Serial Number is 1687-AAC5

 Directory of C:\Users\nbenl\SentimentAnalysis\Text_classify_PyTorch

12.06.2021  16:14    <DIR>          .
12.06.2021  16:14    <DIR>          ..
12.06.2021  13:38    <DIR>          .ipynb_checkpoints
12.06.2021  11:18            61˙332 Restaurant_Reviews.tsv.txt
12.06.2021  16:13            87˙460 text_classifier.ipynb
12.06.2021  16:11         1˙942˙239 text_classifier_pytorch
12.06.2021  16:14            46˙235 tfidfmodel.pickle
               4 File(s)      2˙137˙266 bytes
               3 Dir(s)   4˙617˙195˙520 bytes free
