# NLP - sentiment analysis using tensorflow with CNN

## Import libraries

In [1]:
import numpy as np
import pandas as pd
import spacy

from numpy import ndarray
from pandas import DataFrame
from spacy.lang.pl import Polish
from spacy.tokens.doc import Doc
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPool1D, Dense, Dropout
from tensorflow.keras import Model
from tensorflow import concat
from tensorflow.test import is_gpu_available
from sklearnex import patch_sklearn

from typing import List
patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


## Load dataset

In [2]:
path: str = \
    'polish_sentiment_dataset.csv'

dataset: DataFrame = \
    pd.read_csv(path)
    
print(f'rows: {dataset.shape[0]}, columns: {dataset.shape[1]}')
dataset.head()

rows: 936883, columns: 3


Unnamed: 0,description,length,rate
0,Polecam nie pierwszy i nie ostatni raz!,39.0,1.0
1,Bardzo dobra komunikacja sms i telefoniczna. Z...,121.0,1.0
2,Polecam zakupy w tym sklepie. Są dostępne częś...,87.0,1.0
3,0,0.0,0.0
4,Jestem w pełni zadowolona z przebiegu transakcji,48.0,1.0


## Data exploration

### Checking data types

In [3]:
dataset.dtypes

description     object
length         float64
rate           float64
dtype: object

### Checking NaNs

In [4]:
dataset.apply(lambda row: sum(pd.isna(row)))

description       629
length         174047
rate               66
dtype: int64

### Checking empty strings

In [5]:
(dataset.description == u'').sum()

0

### Checking number of classes

In [6]:
dataset.groupby('rate').size()

rate
-1.0    184020
 0.0     18547
 1.0    734250
dtype: int64

In [7]:
round(dataset.groupby('rate').size() / dataset.shape[0] * 100, 2)

rate
-1.0    19.64
 0.0     1.98
 1.0    78.37
dtype: float64

## Data preparation

### Drop `length` column

In [8]:
clean_dataset: DataFrame = \
    dataset.drop(columns = ['length'])
    
clean_dataset.head()

Unnamed: 0,description,rate
0,Polecam nie pierwszy i nie ostatni raz!,1.0
1,Bardzo dobra komunikacja sms i telefoniczna. Z...,1.0
2,Polecam zakupy w tym sklepie. Są dostępne częś...,1.0
3,0,0.0
4,Jestem w pełni zadowolona z przebiegu transakcji,1.0


### Change `description` column to `string`

In [9]:
clean_dataset.description = \
    clean_dataset.description.astype('string')

clean_dataset.dtypes

description     string
rate           float64
dtype: object

### Drop NaNs

In [10]:
clean_dataset = \
    clean_dataset[clean_dataset.description.notna()]

clean_dataset = \
    clean_dataset[clean_dataset.rate.notna()]

clean_dataset.apply(lambda row: sum(pd.isna(row)))

description    0
rate           0
dtype: int64

### Drop `0` class from `rate` column

In [11]:
clean_dataset = \
    clean_dataset[clean_dataset.rate != 0]

(clean_dataset.rate == 0).sum()

0

### Convert `-1` class to `0` class from `rate` column

In [12]:
clean_dataset.rate.replace(-1, 0, inplace = True)

round(clean_dataset.groupby('rate').size() / clean_dataset.shape[0] * 100, 2)

rate
0.0    19.99
1.0    80.01
dtype: float64

### Sort values to get `50%` of `0` class and `50%` of `1` class when reducing dataset

In [13]:
clean_dataset = clean_dataset.sort_values('rate')
clean_dataset.head()

Unnamed: 0,description,rate
936882,wiesz człowieku że on ją nawet nie uderzył i m...,0.0
814850,jak mój kot to zobaczył to od razu spierdolił ...,0.0
814849,a byłaś u spowiedzi niegrzeczna dziewczynko,0.0
814848,mmmLala bierz mnie,0.0
814847,Tak chujowe że aż mi chromosom wyjebało Sorry ...,0.0


### Reduce dataset to save time and GPU

In [14]:
negative: DataFrame = \
    clean_dataset[:10_000]
    
positive: DataFrame = \
    clean_dataset[-10_000:]
    
clean_dataset = pd.concat([negative, positive], ignore_index = True, sort = False)
round(clean_dataset.groupby('rate').size() / clean_dataset.shape[0] * 100, 2)

rate
0.0    50.0
1.0    50.0
dtype: float64

## Create inputs and labels

### Word2Vec tokens

In [15]:
nlp: Polish = \
    spacy.load('pl_core_news_lg')

In [16]:
docs: List[Doc] = \
    list(nlp.pipe(clean_dataset.description, disable="ner"))

In [17]:
vector_sequences: ndarray = \
    np.array([sentence.vector for sentence in docs])
    
vector_sequences.shape, vector_sequences.dtype, len(nlp.vocab)

((20000, 300), dtype('float32'), 28790)

### Lemma tokens

In [18]:
lemmas: List[List[str]] = \
    [[sentence.lemma_ for sentence in doc] for doc in docs]

In [19]:
tokenizer: Tokenizer = \
    Tokenizer(filters = '', oov_token = '<OOV>')

tokenizer.fit_on_texts(lemmas)

In [20]:
sequences: List[List[int]] = \
    tokenizer.texts_to_sequences(lemmas)

padded_sequences: ndarray = \
    pad_sequences(sequences, padding='post')

padded_sequences.shape, padded_sequences.dtype, len(tokenizer.word_counts)

((20000, 2360), dtype('int32'), 18088)

### Label tokens

In [21]:
rates: ndarray = \
    np.array(clean_dataset.rate, dtype=np.int64)

rates.shape, rates.dtype

((20000,), dtype('int64'))

## Create model

In [162]:
class DCNN(Model):
    def __init__(self, vocabulary_size: int,
                 embedding_size: int, filters_number: int,
                 output_size: int, classes_number: int,
                 dropout_rate: float, name = 'dcnn'):

        super(DCNN, self).__init__(name = name)

        self.embedding = Embedding(vocabulary_size, embedding_size)
        
        self.bigram = Conv1D(filters = filters_number, kernel_size = 2, padding = 'valid', activation = 'relu')
        self.trigram = Conv1D(filters = filters_number, kernel_size = 3, padding = 'valid', activation = 'relu')
        self.fourgram = Conv1D(filters = filters_number, kernel_size = 4, padding = 'valid', activation = 'relu')
        self.fifthgram = Conv1D(filters = filters_number, kernel_size = 5, padding = 'valid', activation = 'relu')
        
        self.pool = GlobalMaxPool1D()

        self.dense_1 = Dense(units = output_size, activation = 'relu')
        self.dropout_1 = Dropout(rate = dropout_rate)
        
        if classes_number == 2:
            self.last_dense = Dense(units = 1, activation = 'sigmoid')
        else:
            self.last_dense = Dense(units = classes_number, activation = 'softmax')

    def call(self, inputs: ndarray, training: bool = True):
        embeddings = self.embedding(inputs)
        
        bigram = self.bigram(embeddings)
        bigram = self.pool(bigram)

        trigram = self.trigram(embeddings)
        trigram = self.pool(trigram)
        
        fourgram = self.fourgram(embeddings)
        fourgram = self.pool(fourgram)
        
        fifthgram = self.fifthgram(embeddings)
        fifthgram = self.pool(fifthgram)

        output = concat([bigram, trigram, fourgram, fifthgram], axis = -1)
        
        output = self.dense_1(output)
        output = self.dropout_1(output, training)
        
        output = self.last_dense(output)

        return output

## Set hyperparameters

In [151]:
# VOCABULARY_SIZE: int = len(tokenizer.word_counts)
VOCABULARY_SIZE: int = len(nlp.vocab)
EMBEDDING_SIZE: int = 300
FILTERS_NUMBER: int = 128
OUTPUT_SIZE: int = 256
CLASSES_NUMBER: int = len(set(clean_dataset.rate))
DROPOUT_RATE: float = 0.25
BATCH_SIZE: int = 128
NB_EPOCHS: int = 10

## Create model instance

In [152]:
dcnn: DCNN = \
    DCNN(VOCABULARY_SIZE, EMBEDDING_SIZE, FILTERS_NUMBER, 
         OUTPUT_SIZE, CLASSES_NUMBER, DROPOUT_RATE)

In [153]:
if CLASSES_NUMBER == 2:
    dcnn.compile(loss = "binary_crossentropy", optimizer = "adam", metrics = ["accuracy"])
else:
    dcnn.compile(loss = "sparse_categorical_crossentropy", optimizer = "adam", metrics = ["sparse_categorical_accuracy"])

## Split on train test set

In [154]:
X_train: ndarray
X_test: ndarray
y_train: ndarray
y_test: ndarray

# X_train, X_test, y_train, y_test = \
#     train_test_split(padded_sequences, rates, test_size = 0.33, random_state = 2021)

X_train, X_test, y_train, y_test = \
    train_test_split(vector_sequences, rates, test_size = 0.33, random_state = 2021)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((13400, 300), (6600, 300), (13400,), (6600,))

## Check if GPU with CUDA enable

In [155]:
is_gpu_available(cuda_only=True)

True

## Train model

In [156]:
# 0.8715
dcnn.fit(X_train, y_train, batch_size = BATCH_SIZE, epochs = NB_EPOCHS)

Epoch 1/10
(None, 300)
(None, 300)
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x29d6f325e50>

## Evaluate

In [157]:
results: List[float] = \
    dcnn.evaluate(X_test, y_test, batch_size = BATCH_SIZE)

print(f'loss: {results[0]}, accuracy: {results[1]}')

(None, 300)
loss: 0.3185167610645294, accuracy: 0.8621212244033813


## Predict results

### Negative sample

In [165]:
text_to_predict: str = \
    'strata czasu.'
    
vector: ndarray = \
    nlp(text_to_predict, disable="ner tagger parser").vector
    
vector_input: ndarray = \
    np.transpose(np.expand_dims(vector, axis = 1))
# vector_input.shape
np.squeeze(dcnn(vector_input, training=False).numpy())

(1, 300)


array(0.13956702, dtype=float32)

### Positive sample

In [166]:
text_to_predict: str = \
    'gorąco polecam'
    
vector: ndarray = \
    nlp(text_to_predict, disable="ner tagger parser").vector
    
vector_input: ndarray = \
    np.transpose(np.expand_dims(vector, axis = 1))
    
dcnn(vector_input, training=False).numpy()

(1, 300)


array([[0.7726752]], dtype=float32)

### Negative sample (tokenizer)

In [160]:
text_to_predict: str = \
    'strata czasu.'
    
lemmas_to_predict = \
    ' '.join(word.lemma_ for word in nlp(text_to_predict))

sequences_to_predict: List[str] = \
    tokenizer.texts_to_sequences(lemmas_to_predict)
    
padded_sequences_to_predict: ndarray = \
    pad_sequences(sequences_to_predict, padding='post')
    
dcnn(np.transpose(padded_sequences_to_predict), training=False).numpy()

(1, 13)


array([[0.9322992]], dtype=float32)

In [161]:
text_to_predict: str = \
    'bardzo polecam sprzedawcę'
    
lemmas_to_predict = \
    ' '.join(word.lemma_ for word in nlp(text_to_predict))

sequences_to_predict: List[str] = \
    tokenizer.texts_to_sequences(lemmas_to_predict)
    
padded_sequences_to_predict: ndarray = \
    pad_sequences(sequences_to_predict, padding='post')
    
dcnn(np.transpose(padded_sequences_to_predict), training=False).numpy()

(1, 25)


array([[0.9749672]], dtype=float32)