# NLP - sentiment analysis using pytorch with CNN

## Import libraries

In [1]:
import numpy as np
import pandas as pd
import spacy

from numpy import ndarray
from pandas import DataFrame
from spacy.lang.pl import Polish
from spacy.tokens.doc import Doc
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearnex import patch_sklearn

from typing import List
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


## Load dataset

In [2]:
path: str = \
    'polish_sentiment_dataset.csv'

dataset: DataFrame = \
    pd.read_csv(path)
    
print(f'rows: {dataset.shape[0]}, columns: {dataset.shape[1]}')
dataset.head()

rows: 936883, columns: 3


Unnamed: 0,description,length,rate
0,Polecam nie pierwszy i nie ostatni raz!,39.0,1.0
1,Bardzo dobra komunikacja sms i telefoniczna. Z...,121.0,1.0
2,Polecam zakupy w tym sklepie. Są dostępne częś...,87.0,1.0
3,0,0.0,0.0
4,Jestem w pełni zadowolona z przebiegu transakcji,48.0,1.0


## Data exploration

### Checking data types

In [3]:
dataset.dtypes

description     object
length         float64
rate           float64
dtype: object

### Checking NaNs

In [4]:
dataset.apply(lambda row: sum(pd.isna(row)))

description       629
length         174047
rate               66
dtype: int64

### Checking empty strings

In [5]:
(dataset.description == u'').sum()

0

### Checking number of classes

In [6]:
dataset.groupby('rate').size()

rate
-1.0    184020
 0.0     18547
 1.0    734250
dtype: int64

In [7]:
round(dataset.groupby('rate').size() / dataset.shape[0] * 100, 2)

rate
-1.0    19.64
 0.0     1.98
 1.0    78.37
dtype: float64

## Data preparation

### Drop `length` column

In [8]:
clean_dataset: DataFrame = \
    dataset.drop(columns = ['length'])
    
clean_dataset.head()

Unnamed: 0,description,rate
0,Polecam nie pierwszy i nie ostatni raz!,1.0
1,Bardzo dobra komunikacja sms i telefoniczna. Z...,1.0
2,Polecam zakupy w tym sklepie. Są dostępne częś...,1.0
3,0,0.0
4,Jestem w pełni zadowolona z przebiegu transakcji,1.0


### Change `description` column to `string`

In [9]:
clean_dataset.description = \
    clean_dataset.description.astype('string')

clean_dataset.dtypes

description     string
rate           float64
dtype: object

### Drop NaNs

In [10]:
clean_dataset = \
    clean_dataset[clean_dataset.description.notna()]

clean_dataset = \
    clean_dataset[clean_dataset.rate.notna()]

clean_dataset.apply(lambda row: sum(pd.isna(row)))

description    0
rate           0
dtype: int64

### Drop `0` class from `rate` column

In [11]:
clean_dataset = \
    clean_dataset[clean_dataset.rate != 0]

(clean_dataset.rate == 0).sum()

0

In [12]:
clean_dataset.rate.replace(-1, 0, inplace = True)

round(clean_dataset.groupby('rate').size() / clean_dataset.shape[0] * 100, 2)

rate
0.0    19.99
1.0    80.01
dtype: float64

### Sort values to get `50%` of `0` class and `50%` of `1` class when reducing dataset

In [13]:
clean_dataset = clean_dataset.sort_values('rate')
clean_dataset.head()

Unnamed: 0,description,rate
936882,wiesz człowieku że on ją nawet nie uderzył i m...,0.0
814850,jak mój kot to zobaczył to od razu spierdolił ...,0.0
814849,a byłaś u spowiedzi niegrzeczna dziewczynko,0.0
814848,mmmLala bierz mnie,0.0
814847,Tak chujowe że aż mi chromosom wyjebało Sorry ...,0.0


### Reduce dataset to save time and GPU

In [14]:
negative: DataFrame = \
    clean_dataset[:10_000]
    
positive: DataFrame = \
    clean_dataset[-10_000:]
    
clean_dataset = pd.concat([negative, positive], ignore_index = True, sort = False)
round(clean_dataset.groupby('rate').size() / clean_dataset.shape[0] * 100, 2)

rate
0.0    50.0
1.0    50.0
dtype: float64

## Create inputs and labels

### Lemma tokens

In [15]:
nlp: Polish = \
    spacy.load('pl_core_news_lg')

In [16]:
docs: List[Doc] = \
    list(nlp.pipe(clean_dataset.description, disable = "ner"))

In [17]:
lemmas: List[List[str]] = \
    [[sentence.lemma_ for sentence in doc] for doc in docs]

In [18]:
tokenizer: Tokenizer = \
    Tokenizer(filters = '', oov_token = '<OOV>')

tokenizer.fit_on_texts(lemmas)

In [19]:
sequences: List[List[int]] = \
    tokenizer.texts_to_sequences(lemmas)

padded_sequences: ndarray = \
    pad_sequences(sequences, padding = 'post')

padded_sequences.shape, padded_sequences.dtype, len(tokenizer.word_counts)

((20000, 2360), dtype('int32'), 18088)

### Label tokens

In [20]:
rates: ndarray = \
    np.array(clean_dataset.rate, dtype = np.int64)

rates.shape, rates.dtype

((20000,), dtype('int64'))

In [77]:
import torch.nn as nn
import torch.nn.functional as F

class CNN1d(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        self.convs = nn.ModuleList([
                                    nn.Conv1d(in_channels = embedding_dim, 
                                              out_channels = n_filters, 
                                              kernel_size = fs)
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        #text = [batch size, sent len]
        print(text.shape)
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        print(embedded.shape)
        embedded = embedded.permute(0, 2, 1)
        
        #embedded = [batch size, emb dim, sent len]
        
        conved = [F.relu(conv(embedded)) for conv in self.convs]
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))
        
        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [78]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

In [79]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch[0]).squeeze(1)
        
        loss = criterion(predictions, batch[1])
        
        acc = binary_accuracy(predictions, batch[1])
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [80]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch[0]).squeeze(1)
            
            loss = criterion(predictions, batch[1])
            
            acc = binary_accuracy(predictions, batch[1])

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [81]:
INPUT_DIM = len(tokenizer.word_counts) + 10
EMBEDDING_DIM = 200
N_FILTERS = 100
FILTER_SIZES = [3, 4, 5]
OUTPUT_DIM = 1
DROPOUT = 0.5

model = CNN1d(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)

In [82]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

In [83]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = model.to(device)
criterion = criterion.to(device)

device, torch.cuda.current_device(), torch.cuda.get_device_name(0)

(device(type='cuda'), 0, 'NVIDIA GeForce GTX 1650 Ti')

In [84]:
X_train: ndarray
X_test: ndarray
y_train: ndarray
y_test: ndarray

X_train, X_test, y_train, y_test = \
    train_test_split(padded_sequences, rates, test_size = 0.33, random_state = 2021)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((13400, 2360), (6600, 2360), (13400,), (6600,))

In [85]:
from torch.utils.data import Dataset, DataLoader

class DatasetMaper(Dataset):

   def __init__(self, x, y):
      self.x = x
      self.y = y
      
   def __len__(self):
      return len(self.x)
      
   def __getitem__(self, idx):
      return self.x[idx], self.y[idx]

In [86]:
N_EPOCHS = 2
BATCH_SIZE = 32

train_maper: DatasetMaper = \
    DatasetMaper(torch.LongTensor(X_train).cuda(), torch.FloatTensor(y_train).cuda())

train_iterator: DataLoader = \
    DataLoader(train_maper, batch_size = BATCH_SIZE, shuffle = True)


for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')

torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Si

torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Si

torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Si

torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Si

torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Si

torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Size([32, 2360, 200])
torch.Size([32, 2360])
torch.Si

In [62]:
valid_maper: DatasetMaper = \
    DatasetMaper(torch.LongTensor(X_test).cuda(), torch.FloatTensor(y_test).cuda())

valid_iterator: DataLoader = \
    DataLoader(valid_maper, batch_size = BATCH_SIZE, shuffle = True)

valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

	 Val. Loss: 0.029 |  Val. Acc: 99.17%


In [71]:
# def predict_sentiment(model, sentence, min_len = 3):
#     model.eval()
#     tokenized = [word.lemma_ for word in nlp(sentence)]
#     if len(tokenized) < min_len:
#         tokenized += ['<pad>'] * (min_len - len(tokenized))
#     indexed = [TEXT.vocab.stoi[t] for t in tokenized]
# #     
#     return tokenized

# predict_sentiment(model, 'bardzo polecam sprzedawcę')

['bardzo', 'polecać', 'sprzedawca']

In [89]:
text_to_predict: str = \
    'bardzo polecam sprzedawcę'
    
lemmas_to_predict = \
    ' '.join(word.lemma_ for word in nlp(text_to_predict))

sequences_to_predict: List[str] = \
    tokenizer.texts_to_sequences(lemmas_to_predict)
    
padded_sequences_to_predict: ndarray = \
    pad_sequences(sequences_to_predict, padding='post')
    
torch_tensor = \
    torch.LongTensor(padded_sequences_to_predict).to(device)
    
# model(torch_tensor)
# torch_tensor.shape
# prediction = torch.sigmoid(model(torch_tensor))
# prediction.item()