<a href="https://colab.research.google.com/github/maryamyazdi/snappfood-sentiment-analysis/blob/master/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import hazm as hz
from finglish import f2p
import itertools
import re
import pickle
import os
import torch
import gensim
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
gpu_enable = torch.cuda.is_available()
device = torch.device('cuda') if gpu_enable else torch.device('cpu')

In [None]:
# Read the tab-separated training file (first column is the row index), then keep
# only the first 300 rows and discard the 'label' column.
df = pd.read_csv('./data/train.csv', sep='\t', index_col=0)
train_data = df.head(300).drop(columns='label')

#Preprocessing
def fixup(x):
    """Strip or replace a fixed set of troublesome characters in ``x``.

    Zero-width non-joiners (U+200C) and non-breaking spaces (U+00A0) are removed;
    CRLF line breaks and pipe characters are each replaced by a single space.
    The substitutions are applied in that order and the new string is returned.
    """
    substitutions = (
        ('\u200c', ''),
        ('\xa0', ''),
        ('\r\n', ' '),
        ('|', ' '),
    )
    for needle, replacement in substitutions:
        x = x.replace(needle, replacement)
    return x

# Shared hazm normalizer, built once and reused for every comment.
normalizer = hz.Normalizer()

# Characters whose consecutive repetitions are squeezed to a single occurrence
# (e.g. "عااالی" -> "عالی"). One precompiled back-reference pattern replaces the
# per-character re.sub loop and yields the same result.
_SQUEEZABLE_CHARS = "..آابپتثجچحخدذرزژسشصضطظعغفقکگلمنوهیئ"
_REPEATED_CHAR_RE = re.compile("([" + _SQUEEZABLE_CHARS + r"])\1+")

def my_tokenizer(text):
  """Clean one raw comment and split it into word tokens.

  Steps, in order:
    1. replace brackets, arithmetic symbols, the Persian semicolon and newlines
       with spaces, then collapse runs of spaces;
    2. collapse repeated '!', '؟' and '?'; remove '.' and '،' entirely;
    3. if the text starts with a Latin letter, pass it through ``f2p``;
    4. squeeze repeated letters, normalize with hazm and apply ``fixup``;
    5. tokenize with ``hz.word_tokenize``.

  Args:
    text: the raw comment; any object is accepted and converted with ``str``.

  Returns:
    A one-element list whose only item is the list of tokens. The extra nesting
    is kept on purpose: callers flatten it with ``itertools.chain``.
  """
  # Raw strings throughout: "\!" and "\?" are invalid escape sequences in normal
  # string literals and trigger warnings on modern Python versions.
  text = re.sub(r"[\{\}\؛\*\=\-\+\/\n\(\)]", " ", str(text))
  text = re.sub(r"[ ]+", " ", text)
  text = re.sub(r"\!+", "!", text)
  text = re.sub(r"[؟]+", "؟", text)
  text = re.sub(r"\?+", "?", text)
  text = re.sub(r"[.]+", "", text)
  text = re.sub(r"[،]+", "", text)
  if re.match(r'^[a-zA-Z]', text):
    text = f2p(text)
  text = _REPEATED_CHAR_RE.sub(r"\1", text)
  text = fixup(normalizer.normalize(text))
  return [hz.word_tokenize(text)]

# my_tokenizer returns each token list wrapped in a one-element list; chaining the
# per-row results together unwraps them into one token list per row.
nested_train_data = train_data['comment'].apply(my_tokenizer)
train_data['comment'] = list(itertools.chain.from_iterable(nested_train_data))

In [None]:
# Display the tokenized training frame ('comment' now holds a list of tokens per row).
train_data

Unnamed: 0,comment,label_id
0,"[واقعا, حیف, وقت, که, بنویسم, سرویس, دهیتون, ش...",1
1,"[قرار, بود, ۱, ساعته, برسه, ولی, نیم, ساعت, زو...",0
2,"[قیمت, این, مدل, اصلا, با, کیفیتش, سازگاری, ند...",1
3,"[عالی, بود, همه, چه, درست, و, به, اندازه, و, ک...",0
4,"[شیرینی, وانیلی, فقط, یک, مدل, بود]",0
...,...,...
295,"[فقط, ساندویچ, سرد, شده_بود, با, اینکه, ده, دق...",0
296,"[برگش, خیلی, عالی, بود, ولی, کباب, لقمهاش, بد,...",1
297,"[قبلا, خیلی, خوب, بود, ولی, نمیدونم, چرا, اینق...",1
298,"[زرشک, پلو, با, مرغ, با, بوی, بسیار, بد]",1


In [None]:
# Distribution of comment lengths (in tokens); used to pick a sensible max_len.
words_count = train_data['comment'].apply(len)
length_mean = words_count.mean()
length_std = words_count.std()

print('Min length =', words_count.min())
print('Max length =', words_count.max())
print('Mean = {:.2f}'.format(length_mean))
print('Std  = {:.2f}'.format(length_std))
print('mean + 2 * sigma = {:.2f}'.format(length_mean + 2.0 * length_std))

Min length = 4
Max length = 124
Mean = 16.78
Std  = 15.29
mean + 2 * sigma = 47.35


In [None]:
max_len = 32
PAD = '<pad>'

def padding_and_trimming(tokens):
  """Return ``tokens`` adjusted to exactly ``max_len`` items.

  Shorter lists are left-padded with ``PAD``; longer lists keep only their first
  ``max_len`` items; a list that already has the right length is returned as is.
  """
  shortfall = max_len - len(tokens)
  if shortfall > 0:
    return [PAD] * shortfall + tokens
  if shortfall < 0:
    return tokens[:max_len]
  return tokens

In [None]:
# Left-pad or truncate every token list so all comments have exactly max_len tokens.
train_data['comment'] = train_data['comment'].apply(padding_and_trimming)

In [None]:
!wget 'http://vectors.nlpl.eu/repository/20/61.zip' -O './w2vec.zip'
!unzip ./w2vec.zip -d ./w2vec

w2v_model = gensim.models.KeyedVectors.load_word2vec_format('./w2vec/model.txt', binary=False, unicode_errors='replace')
w2v_weights = torch.FloatTensor(w2v_model.vectors)

In [None]:
class LSTMClassifier(nn.Module):
  """Bidirectional LSTM classifier on top of the pretrained word embeddings.

  The network embeds token indices with ``w2v_weights``, runs a bidirectional
  LSTM over the sequence and feeds the output of the last time step through a
  two-layer classifier that produces two logits per example.

  Args:
    hidden_size: number of features in each LSTM direction.
    batch_size: default batch size used by ``init_hidden()`` when it is called
      without an argument. ``forward`` does not depend on it and accepts
      batches of any size.
    layers_count: number of stacked LSTM layers.
  """

  def __init__(self, hidden_size, batch_size, layers_count):
    super(LSTMClassifier, self).__init__()
    self.hidden_size = hidden_size
    self.batch_size = batch_size
    self.layers_count = layers_count

    self.embedding = nn.Embedding.from_pretrained(w2v_weights)
    # Take the LSTM input width from the embedding matrix instead of hard-coding it.
    self.lstm = nn.LSTM(self.embedding.embedding_dim, hidden_size, layers_count, bidirectional=True, batch_first=True)
    self.classifier_layer = nn.Sequential(
        nn.Linear(2*hidden_size, 100),
        nn.ReLU(),
        nn.Linear(100, 2)
    )
    self.hidden = self.init_hidden()

  def init_hidden(self, batch_size=None):
    """Return a fresh zero ``(h, c)`` state pair for the LSTM.

    Args:
      batch_size: batch dimension of the state; defaults to ``self.batch_size``.

    Returns:
      Two tensors of shape ``(2 * layers_count, batch_size, hidden_size)`` on
      the same device and with the same dtype as the LSTM weights.
    """
    if batch_size is None:
      batch_size = self.batch_size
    reference = self.lstm.weight_ih_l0
    shape = (2 * self.layers_count, batch_size, self.hidden_size)
    h = torch.zeros(shape, dtype=reference.dtype, device=reference.device)
    c = torch.zeros(shape, dtype=reference.dtype, device=reference.device)
    return h, c

  def forward(self, x):
    """Compute class logits for a batch of token-index sequences.

    Args:
      x: integer tensor of shape ``(batch, seq_len)`` holding embedding indices.

    Returns:
      Tensor of shape ``(batch, 2)`` with unnormalized class scores.
    """
    x = self.embedding(x)
    # Start every batch from a zero state sized for THIS batch. Reusing a state
    # allocated for a fixed batch size fails on the final, smaller batch and
    # leaks context between unrelated (shuffled) examples.
    x, hidden = self.lstm(x, self.init_hidden(x.size(0)))
    # Keep the final state available for inspection, detached so that no
    # autograd graph is retained between iterations.
    self.hidden = tuple(state.detach() for state in hidden)
    # Use the last time step's output. It is deliberately NOT detached, so the
    # loss gradient reaches the LSTM weights.
    return self.classifier_layer(x[:, -1, :])

In [None]:
# Model, loss and optimisation set-up.
BATCH_SIZE = 128
lstm_model = LSTMClassifier(hidden_size=512, batch_size=BATCH_SIZE, layers_count=1)
criterion = nn.CrossEntropyLoss()

# Move both the network and the loss module to the GPU when one is available.
if gpu_enable:
  lstm_model, criterion = lstm_model.cuda(), criterion.cuda()

# Adam with a StepLR schedule that multiplies the learning rate by 0.975 on every scheduler step.
optimizer = torch.optim.Adam(lstm_model.parameters(), lr=0.01, betas=(0.7, 0.99))
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.975)

In [None]:
PAD = '<pad>'
UNK = '<unk>'

# Embedding rows used for padding and for out-of-vocabulary tokens.
# NOTE(review): these are ordinary rows of the pretrained matrix, so they
# presumably collide with two real vocabulary entries — confirm whether dedicated
# pad/unk rows should be appended to the embedding instead.
PAD_INDEX = 1
UNK_INDEX = 2

class TDataset(torch.utils.data.Dataset):
    """Map-style dataset that turns tokenized comments into index tensors.

    Args:
        dataset: table-like object with a 'comment' column (one list of tokens
            per row) and a 'label_id' column (one integer label per row).
        word_vectors: optional vocabulary object; defaults to the module-level
            ``w2v_model``. It must expose either ``key_to_index`` (gensim >= 4)
            or ``vocab`` whose entries carry an ``.index`` attribute (gensim < 4).
    """

    def __init__(self, dataset, word_vectors=None):
        # Materialize both columns as plain lists so __getitem__ is strictly
        # positional; indexing the Series directly is label-based and fails when
        # the frame's index is not 0..n-1.
        self.X_train = list(dataset['comment'])
        self.y_train = list(dataset['label_id'])
        self.word_vectors = w2v_model if word_vectors is None else word_vectors

    def __len__(self):
        return len(self.X_train)

    def _token_index(self, token):
        """Return the embedding row for ``token`` (PAD_INDEX / UNK_INDEX for pad / unknown)."""
        if token == PAD:
            return PAD_INDEX
        key_to_index = getattr(self.word_vectors, 'key_to_index', None)
        if key_to_index is not None:
            return key_to_index.get(token, UNK_INDEX)
        # Older gensim releases expose the vocabulary as `vocab` with `.index` entries.
        entry = self.word_vectors.vocab.get(token)
        return UNK_INDEX if entry is None else entry.index

    def __getitem__(self, index):
        """Return ``(token_indices, label)`` tensors for the row at position ``index``."""
        vectors = [self._token_index(token) for token in self.X_train[index]]
        return torch.tensor(vectors), torch.tensor(self.y_train[index])


# Wrap the preprocessed training frame so it can be batched by a DataLoader.
dataset = TDataset(train_data)

In [None]:
train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
# Number of batches per epoch, including a final partial batch;
# len(dataset) // BATCH_SIZE would undercount it.
steps_per_epoch = len(train_dataloader)
num_epochs = 5

lstm_model.train()

# NOTE(review): `losses` and `total_loss` are never consumed in the visible code —
# confirm whether a per-epoch summary was intended.
losses = []
for epoch in range(num_epochs):
  total_loss = 0
  for i, (inputs, targets) in enumerate(train_dataloader):
    optimizer.zero_grad()

    inputs = inputs.to(device)
    targets = targets.to(device)
    outputs = lstm_model(inputs)

    loss = criterion(outputs, targets)
    loss.backward()
    # Apply the gradients. Without this call no parameter is ever updated and
    # only the learning rate decays.
    optimizer.step()
    # The scheduler must step after the optimizer; it is kept per batch, as in
    # the original schedule.
    scheduler.step()
    total_loss += loss.item()

    print(f'Epoch {epoch + 1}/{num_epochs} : step {i + 1}/{steps_per_epoch}, loss: {loss.item()}')