In [141]:
import spacy
import numpy as np
from torch.nn import functional as F
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix, accuracy_score, f1_score

def collate_pad(batch):    
  in_ = []
  out_ = []
  seq_len = []
  for x,y in batch:
    in_.append(x)
    out_.append(y)
    seq_len.append(len(x))

  return torch.nn.utils.rnn.pad_sequence(in_).cuda(), torch.tensor(out_).cuda(), seq_len 


def evaluate(model, test_set):
  test_dataloader = torch.utils.data.DataLoader(test_set, batch_size=128, collate_fn=collate_pad)

  y_pred = []
  y_test = []
  with torch.no_grad():
    model.eval()
    valid_loss = 0
    loss_counter = 0
    for batch, (x,y, seq_len_x) in enumerate(test_dataloader):
      x = x.to(device)
      y = y.to(device)

      y_pred.extend(F.softmax(model(x, seq_len_x)).argmax(-1).cpu().numpy())
      y_test.extend(y.cpu().numpy())

  print(confusion_matrix(y_test, y_pred))
  precision, recall, fscore, support  = precision_recall_fscore_support(y_test, y_pred)
  print("Accuracy: {}".format(accuracy_score(y_test, y_pred)))
  print("f1: {}".format(f1_score(y_test, y_pred, average='weighted')))
  print('precision: {}'.format(precision))
  print('recall: {}'.format(recall))
  print('fscore: {}'.format(fscore))
  print('support: {}'.format(support))

def evaluate_svm(model, test_set):
  X_test, y_test = test_set['emb_mean'].to_list(), test_set['label'].to_list()
  y_pred = model.predict(X_test)

  print(confusion_matrix(y_test, y_pred))
  precision, recall, fscore, support  = precision_recall_fscore_support(y_test, y_pred)
  print("Accuracy: {}".format(accuracy_score(y_test, y_pred)))
  print("f1: {}".format(f1_score(y_test, y_pred, average='weighted')))
  print('precision: {}'.format(precision))
  print('recall: {}'.format(recall))
  print('fscore: {}'.format(fscore))
  print('support: {}'.format(support))

def calculate_embeddings(df):
  embeddings = []
  sents = df['sentence']
  print(len(sents))
  for i,sent in enumerate(sents):
    if i%1000 == 0:
      print(f"Processing {i} of {len(sents)} ({int(i/len(sents)*100)}%)")

    doc = nlp(sent)
    embedding = []
    for d in doc:
      emb = d.vector
      emb.shape = (300,)
      embedding.append(emb)

    embeddings.append(embedding)

  return embeddings

def factorize(value):
  if value < 0.2:
    return 0
  elif 0.2 <= value < 0.4:
    return 1
  elif 0.4<= value < 0.6:
    return 2
  elif 0.6<= value < 0.8:
    return 3
  elif 0.8<= value:
    return 4

labels = {0: 'very negative', 1: 'negative', 2: 'neutral', 3: 'positive', 4: 'very positive'}

def predict_sentence(model, sentence):
  doc = nlp(sentence)
  emb = doc.vector
  emb.shape = (300,)
  y_pred = model.predict([emb])[0]
  return labels[y_pred]

def predict(model, dataset, sentence):
    model.eval()
    input = dataset.embedding(sentence)

    with torch.no_grad():
      values = []

      # Prepare input
      length = len(input)
      x = torch.tensor(input)
      x = torch.nn.utils.rnn.pad_sequence([x])

      y_pred = model.forward(x.to(device), [length])

      return labels[F.softmax(y_pred).argmax(-1).cpu().numpy()[0]]

In [None]:
import pandas as pd
import numpy as np
import spacy
import os

base_directory = 'nlp/stanfordSentimentTreebank/'
sentences = pd.read_csv('nlp/stanfordSentimentTreebank/datasetSentences.txt', index_col="sentence_index",
                                sep="\t")
splits = pd.read_csv(os.path.join(base_directory, "datasetSplit.txt"), index_col="sentence_index")
sentences = sentences.join(splits)
sentences = sentences.sort_values(by=['splitset_label', 'sentence_index'])
sentences.reset_index(inplace=True)

sents_df = pd.DataFrame()

for i,data in enumerate(['nlp/trees/train.txt', 'nlp/trees/test.txt', 'nlp/trees/dev.txt']):
  new = pd.read_csv(data, sep="\t", header=None)
  new = new.apply(lambda it: str(it).split()[1][1], axis=1).to_frame().rename(columns={0: 'label'})
  new['set'] = i+1
  sents_df = sents_df.append(new)

sents_df.reset_index(inplace=True, drop=True)
sents_df = pd.concat([sentences, sents_df], axis=1)
sents_df.label = sents_df.label.astype(float)
sents_df['emb'] = calculate_embeddings(sents_df)
sents_df.to_pickle('nlp/sents.pkl')


In [None]:
import pandas as pd

phrases = pd.read_csv('nlp/stanfordSentimentTreebank/dictionary.txt', header=None,
                                sep="|").rename(columns={0: 'sentence', 1: 'phrase_id'})
phrase_sentiments = pd.read_csv('nlp/stanfordSentimentTreebank/sentiment_labels.txt',
                                sep="|").rename(columns={'phrase ids': 'phrase_id', 'sentiment values': 'label'})
phrase_df = pd.merge(phrases, phrase_sentiments, on='phrase_id')
phrase_df['emb'] = calculate_embeddings(phrase_df)
phrase_df.to_pickle('nlp/phrases.pkl')

In [3]:
import pandas as pd

phrase_df = pd.read_pickle('nlp/phrases.pkl')
sents_df = pd.read_pickle('nlp/sents.pkl')

phrase_df['emb_mean'] = phrase_df['emb'].apply(lambda it: np.mean(it, axis=0))
sents_df['emb_mean'] = sents_df['emb'].apply(lambda it: np.mean(it, axis=0))

def factorize(value):
  if value < 0.2:
    return 0
  elif 0.2 <= value < 0.4:
    return 1
  elif 0.4<= value < 0.6:
    return 2
  elif 0.6<= value < 0.8:
    return 3
  elif 0.8<= value:
    return 4

phrase_df['label'] = phrase_df['label'].apply(factorize)

In [52]:
from sklearn import svm
from sklearn.svm import SVC
from sklearn.metrics import f1_score, accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.base import clone
import numpy as np
from sklearn.model_selection import train_test_split
import time

svm_data = {
    'n_samples': [],
    'acc': [],
    'f1': [],
    'precision': [],
    'recall': [],
    'fscore': [],
    'support': [],
    'confusion': [],
    'time': []
}

df, svm_phrase_test_dataset = train_test_split(phrase_df, test_size=2000, shuffle=True, stratify=phrase_df['label'])
X_test, y_test = svm_phrase_test_dataset['emb_mean'].to_list(), svm_phrase_test_dataset['label'].to_list()

for frac in [0.001, 0.005, 0.01, 0.02, 0.05]:
  svm_phrase_train_dataset = df.sample(frac=frac)

  n_samples = len(svm_phrase_train_dataset)
  print("Train size: ", n_samples)

  X_train, y_train = svm_phrase_train_dataset['emb_mean'].to_list(), svm_phrase_train_dataset['label'].to_list()
  
  svm_model = SVC()
  start_time = time.time()
  svm_model.fit(X_train, y_train)
  print("Training complete")
  svm_data['time'].append(time.time()-start_time)

  print("Evaluating on test set")
  y_pred = svm_model.predict(X_test)
  precision, recall, fscore, support  = precision_recall_fscore_support(y_test, y_pred)

  svm_data['n_samples'].append(n_samples)
  svm_data['acc'].append(accuracy_score(y_test, y_pred))
  svm_data['f1'].append(f1_score(y_test, y_pred, average='weighted'))
  svm_data['precision'].append(precision)
  svm_data['recall'].append(recall)
  svm_data['fscore'].append(fscore)  
  svm_data['support'].append(support)
  svm_data['confusion'].append(confusion_matrix(y_test, y_pred))

Train size:  11862
Training complete
Evaluating on test set


  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
pd.DataFrame(svm_data)[:-1][['n_samples', 'acc', 'f1', 'time']]

Unnamed: 0,n_samples,acc,f1,time
0,237,0.5095,0.357441,0.022918
1,1186,0.5535,0.474195,0.313549
2,2372,0.5535,0.466368,1.151901
3,4745,0.5695,0.502472,4.28018
4,11862,0.578,0.517135,24.954232


In [46]:
import torch
import pandas as pd
import spacy
from torch import nn
from torch.nn import functional as F
import torch

nlp = spacy.load('en_core_web_lg')

class SentimentDataset(torch.utils.data.Dataset):
  def __init__(self, dataset) -> None:
    super().__init__()
    
    labels = dataset['label']
    sents = dataset['sentence']
    emb = dataset['emb']

    self.dataset = pd.DataFrame({'emb': emb, 'label': labels})
    self.dataset.reset_index(inplace=True)

  def embedding(self, sentence):
    doc = nlp(sentence)
    embeddings = []
    for d in doc:
      emb = d.vector
      emb.shape = (300,)
      embeddings.append(emb)
    return embeddings
  
  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, index):
    return (
        torch.tensor(self.dataset.loc[index, 'emb']).float(),
        torch.tensor(self.dataset.loc[index, 'label'])
    )


class LSTM_classifier(nn.Module):
  def __init__(self, hidden_size, num_layers, output_size, bidirectional) -> None:
      super(LSTM_classifier, self).__init__()
      self.embedding_dim = 300
      self.num_layers = num_layers
      self.hidden_size = hidden_size
      self.bidirectional = bidirectional
      self.lstm = nn.LSTM(input_size = self.embedding_dim, hidden_size=self.hidden_size, num_layers=self.num_layers, bidirectional=bidirectional, dropout=0.2)
      self.output_size = output_size
      self.fc1 = nn.Linear(in_features=(2 if bidirectional else 1)*self.hidden_size, out_features=128)
      self.fc2 = nn.Linear(128, out_features=output_size)

  def forward(self, x, seq_len):
    x_packed = torch.nn.utils.rnn.pack_padded_sequence(x, seq_len, enforce_sorted=False)
    out_packed, (h, c) = self.lstm(x_packed)
    x_unpacked, x_lengths = torch.nn.utils.rnn.pad_packed_sequence(out_packed)
    cat = torch.cat((h[-1, :, :], h[-2, :, :]), dim=1) if self.bidirectional else h[-1, :, :]
    x = self.fc1(cat)
    x = F.relu(x)
    x = self.fc2(x)
    return x.view(-1, self.output_size)

In [27]:
import torch
import numpy as np
import time

def train_model(model, train_set, validation_set, max_epochs, batch_size):

  evaluation_data = {
      'train_loss': [],
      'validation_loss': [],
  }

  # Define model and loss functions
  model = model.to(device)
  criterion = torch.nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=0.002)


  # Define dataloaders
  dataloader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, collate_fn=collate_pad)
  validation_dataloader = torch.utils.data.DataLoader(validation_set, batch_size=128, collate_fn=collate_pad)

  # Train loop
  for epoch in range(max_epochs):
    start_time = time.time()
    model.train()

    ## Train in batches
    for batch, (x,y, seq_len_x) in enumerate(dataloader):
      if batch%100 == 0:
        print(f'Epoch {epoch} - Batch {batch} of {int(len(train_set)/batch_size)}')
      optimizer.zero_grad()
      x = x.to(device)
      y = y.to(device)

      y_pred = model(x, seq_len_x)

      loss = criterion(y_pred, y)
      loss.backward()
      optimizer.step()
    
    
    ## Evaluate performance on train set
    print(f'Epoch {epoch} - Evaluating train set')
    with torch.no_grad():
      model.eval()
      epoch_loss = 0
      loss_counter = 0
      for batch, (x,y, seq_len_x) in enumerate(dataloader):
        x = x.to(device)
        y = y.to(device)

        y_pred = model(x, seq_len_x)

        loss = criterion(y_pred, y)

        epoch_loss += loss.item()
        loss_counter +=1

      epoch_loss /= loss_counter

    ## Evaluate performance on validation set
    print(f'Epoch {epoch} - Evaluating validation set')
    with torch.no_grad():
      model.eval()
      valid_loss = 0
      loss_counter = 0
      for batch, (x,y, seq_len_x) in enumerate(validation_dataloader):
        x = x.to(device)
        y = y.to(device)

        y_pred = model(x, seq_len_x)

        loss = criterion(y_pred, y)

        valid_loss += loss.item()
        loss_counter +=1

      valid_loss /= loss_counter

    ## update evaluation data
    evaluation_data['train_loss'].append(epoch_loss)
    evaluation_data['validation_loss'].append(valid_loss)

    print("Train Epoch {}: Time {}s |  Loss - {} | Validation loss - {}".format(epoch, int(time.time() - start_time), epoch_loss, valid_loss))
    print("----------------------------------------")
  return evaluation_data


In [57]:
device = "cuda"

from sklearn.model_selection import train_test_split

nn_phrase_train_dataset, nn_phrase_test_dataset = train_test_split(phrase_df, test_size=0.2, shuffle=True)
nn_phrase_validation_dataset, nn_phrase_test_dataset = train_test_split(nn_phrase_test_dataset, test_size=0.5)

nn_phrase_train_dataset = SentimentDataset(nn_phrase_train_dataset)
nn_phrase_validation_dataset = SentimentDataset(nn_phrase_validation_dataset)
nn_phrase_test_dataset = SentimentDataset(nn_phrase_test_dataset)

args = {
    'max_epochs': 3,
    'batch_size': 128,
}

phase_nn_model_bi = LSTM_classifier(hidden_size=128, num_layers=1, output_size=5, bidirectional=True)
phase_nn_model_bi_eval = train_model(phase_nn_model_bi, nn_phrase_train_dataset, nn_phrase_validation_dataset, **args)

  "num_layers={}".format(dropout, num_layers))


Epoch 0 - Batch 0 of 1495
Epoch 0 - Batch 100 of 1495
Epoch 0 - Batch 200 of 1495
Epoch 0 - Batch 300 of 1495
Epoch 0 - Batch 400 of 1495
Epoch 0 - Batch 500 of 1495
Epoch 0 - Batch 600 of 1495
Epoch 0 - Batch 700 of 1495
Epoch 0 - Batch 800 of 1495
Epoch 0 - Batch 900 of 1495
Epoch 0 - Batch 1000 of 1495
Epoch 0 - Batch 1100 of 1495
Epoch 0 - Batch 1200 of 1495
Epoch 0 - Batch 1300 of 1495
Epoch 0 - Batch 1400 of 1495
Epoch 0 - Evaluating train set
Epoch 0 - Evaluating validation set
Train Epoch 0: Time 157s |  Loss - 0.7840078087812439 | Validation loss - 0.8220239625895087
----------------------------------------
Epoch 1 - Batch 0 of 1495
Epoch 1 - Batch 100 of 1495
Epoch 1 - Batch 200 of 1495
Epoch 1 - Batch 300 of 1495
Epoch 1 - Batch 400 of 1495
Epoch 1 - Batch 500 of 1495
Epoch 1 - Batch 600 of 1495
Epoch 1 - Batch 700 of 1495
Epoch 1 - Batch 800 of 1495
Epoch 1 - Batch 900 of 1495
Epoch 1 - Batch 1000 of 1495
Epoch 1 - Batch 1100 of 1495
Epoch 1 - Batch 1200 of 1495
Epoch 1 - B

In [53]:
test_set = sents_df[sents_df.splitset_label == 2]
print('Evaluate on phrases')
print(evaluate_svm(svm_model, svm_phrase_test_dataset))
print('Evaluate on sents')
print(evaluate_svm(svm_model, test_set))

Evaluate on phrases
[[  0  40  54   1   0]
 [  1  97 243  19   0]
 [  0  47 897  55   0]
 [  0   8 226 185   0]
 [  0   0  31  96   0]]
Accuracy: 0.5895
f1: 0.5290365509345394
precision: [0.         0.50520833 0.61819435 0.51966292 0.        ]
recall: [0.         0.26944444 0.8978979  0.44152745 0.        ]
fscore: [0.         0.35144928 0.7322449  0.47741935 0.        ]
support: [ 95 360 999 419 127]
None
Evaluate on sents


  _warn_prf(average, modifier, msg_start, len(result))


[[  0  98 167  14   0]
 [  0 151 429  53   0]
 [  0  31 285  73   0]
 [  0  16 243 251   0]
 [  0   3 121 275   0]]
Accuracy: 0.31085972850678734
f1: 0.25272194115693986
precision: [0.         0.50501672 0.22891566 0.37687688 0.        ]
recall: [0.         0.2385466  0.73264781 0.49215686 0.        ]
fscore: [0.         0.32403433 0.34883721 0.42687075 0.        ]
support: [279 633 389 510 399]
None


  _warn_prf(average, modifier, msg_start, len(result))


In [59]:
print("Evaluation on sentences")
test = sents_df[sents_df.splitset_label == 2]
sents_test_dataset = SentimentDataset(test)

evaluate(phase_nn_model_bi, sents_test_dataset)

Evaluation on sentences




[[ 58 212   9   0   0]
 [ 26 467 129  10   1]
 [  1 122 189  75   2]
 [  0  10  75 379  46]
 [  0   0  12 187 200]]
Accuracy: 0.5850678733031675
f1: 0.5704657439513451
precision: [0.68235294 0.57583231 0.45652174 0.58218126 0.80321285]
recall: [0.2078853  0.73775671 0.48586118 0.74313725 0.50125313]
fscore: [0.31868132 0.6468144  0.47073474 0.65288544 0.61728395]
support: [279 633 389 510 399]


In [60]:
print("Evaluation on phrases")
evaluate(phase_nn_model_bi, nn_phrase_test_dataset)

Evaluation on phrases




[[  171   881    81     2     0]
 [   72  2672  1558    47     0]
 [   19  1086 10031   780     6]
 [    1   108  1947  2732   239]
 [    0     5   100   881   505]]
Accuracy: 0.6734241765591038
f1: 0.6576171457861136
precision: [0.65019011 0.56228956 0.73128235 0.61503827 0.67333333]
recall: [0.15066079 0.61439411 0.84138567 0.54346529 0.33869886]
fscore: [0.24463519 0.58718822 0.78247982 0.57704087 0.45069166]
support: [ 1135  4349 11922  5027  1491]


In [184]:
print("BiLSTM predictions")
print("'Bad product, not good at all':", predict(phase_nn_model_bi, nn_phrase_train_dataset, "Bad product, not good at all"))
print("'Good product, not bad at all':", predict(phase_nn_model_bi, nn_phrase_train_dataset, "Good product, not bad at all"))
print("'I don't think that the movie was good':", predict(phase_nn_model_bi, nn_phrase_train_dataset, "I don't think that the movie was good"))
print("'I think that the movie was worth watching':", predict(phase_nn_model_bi, nn_phrase_train_dataset, "I think that the movie was worth watching"))
print("'I think that the movie, which my father told me about, was worth watching':", predict(phase_nn_model_bi, nn_phrase_train_dataset, "I think that the movie, which my father told me about, was worth watching"))
print("'Not good, not bad':", predict(phase_nn_model_bi, nn_phrase_train_dataset, "Not good, not bad"))
print("'It's the exact opposite of great!':", predict(phase_nn_model_bi, nn_phrase_train_dataset, "It's the exact opposite of great!"))
print("'The movie seemed bad in the beginning, but I was mistaken. It turned out to be amazing in the end.':", predict(phase_nn_model_bi, nn_phrase_train_dataset, "The movie seemed bad in the beginning, but I was mistaken. It turned out to be amazing in the end."))

BiLSTM predictions
'Bad product, not good at all': negative
'Good product, not bad at all': positive
'I don't think that the movie was good': negative
'I think that the movie was worth watching': positive
'I think that the movie, which my father told me about, was worth watching': neutral
'Not good, not bad': neutral
'It's the exact opposite of great!': positive
'The movie seemed bad in the beginning, but I was mistaken. It turned out to be amazing in the end.': neutral




In [143]:
print("SVM predictions")
print(predict_sentence(svm_model, "Amazing product!"))
print(predict_sentence(svm_model, "Not bad"))
print(predict_sentence(svm_model, "Good product, not bad at all"))

SVM predictions
positive
negative
negative
