<a href="https://colab.research.google.com/github/nicolaerosca/colab_notebooks/blob/master/IMDB_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Text sentiment analysis can be seen as a classification task. Here I am trying to determine sentiment analysis of a text. 

In [1]:
import numpy as np
import torch
from torch.nn import Parameter
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import os
import tarfile
!pip install wget
import wget
import io
import re
from torch.utils.data import Dataset, DataLoader

Collecting wget
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/40/15/30/7d8f7cea2902b4db79e3fea550d7d7b85ecb27ef992b618f3f
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [2]:
# downloading IMDB data set and creating CSV manifest file 
data_set_url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
target_filename = data_set_url.split("/")[-1]
regex_pattern = '(\d+)_(\d+)\.txt'

def create_manifest(data_path, output_path):
  with io.FileIO(output_path, "w") as file:
    file.write("path,id,score\n".encode('utf-8'))
    for dirpath, dirnames, files in os.walk(data_path + "pos"):
      for file_name in files:
        m = re.search('(\d+)_(\d+)\.txt', file_name)
        review_id = m.group(1)
        score = m.group(2)
        sample = data_path + "pos/" + file_name + ',' + str(review_id) + ',' + str(score) + '\n'
        file.write(sample.encode('utf-8'))
    for dirpath, dirnames, files in os.walk(data_path + "neg"):
      for file_name in files:
        m = re.search('(\d+)_(\d+)\.txt', file_name)
        review_id = m.group(1)
        score = m.group(2)
        sample = data_path + "neg/" + file_name + ',' + str(review_id) + ',' + str(score) + '\n'
        file.write(sample.encode('utf-8'))

if not os.path.exists(target_filename) and not os.path.exists('aclImdb'):
  print("Downloading {}...".format(target_filename))
  wget.download(data_set_url)
if not os.path.exists('aclImdb'):
  print("Unpacking {}...".format(target_filename))
  tar = tarfile.open(target_filename)
  tar.extractall()
  tar.close()
  os.remove(target_filename)
  
create_manifest("aclImdb/test/", 'imdb_test_manifest.csv')
create_manifest("aclImdb/train/", 'imdb_train_manifest.csv')
# os.remove('imdb_test_manifest.csv')

Downloading aclImdb_v1.tar.gz...
Unpacking aclImdb_v1.tar.gz...


In [3]:
import pandas as pd
df = pd.read_csv('imdb_train_manifest.csv')
df.shape

(25000, 3)

In [7]:
df.query("score <= 5").head()

Unnamed: 0,path,id,score
12500,aclImdb/train/neg/5808_4.txt,5808,4
12501,aclImdb/train/neg/6654_1.txt,6654,1
12502,aclImdb/train/neg/5731_1.txt,5731,1
12503,aclImdb/train/neg/301_1.txt,301,1
12504,aclImdb/train/neg/903_3.txt,903,3


# Logistic regression
Here  we will have two classes, possitive and negative (1 and 0)

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# read all 
reviews_train = []
for i, row in pd.read_csv('imdb_train_manifest.csv').iterrows():
   reviews_train.append(open(row['path'], "r").read())

reviews_test = []
for i, row in pd.read_csv('imdb_test_manifest.csv').iterrows():
    reviews_test.append(open(row['path'], "r").read())

    
# clean 
REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_reviews(reviews):
    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]
    
    return reviews

reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)

# vectorize BoW, https://towardsdatascience.com/hacking-scikit-learns-vectorizers-9ef26a7170af
cv = CountVectorizer(binary=True)
cv.fit(reviews_train_clean)
X = cv.transform(reviews_train_clean)
X_test = cv.transform(reviews_test_clean)


target = [1 if i < 12500 else 0 for i in range(25000)]

X_train, X_val, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

final_model = LogisticRegression(C=0.05)
final_model.fit(X, target)
print ("Final Accuracy: %s" % accuracy_score(target, final_model.predict(X_test)))



Final Accuracy: 0.88152


In [25]:
# Let's try TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfIdf = TfidfVectorizer()
tfIdf.fit_transform(reviews_train_clean)
X_tfidf = tfIdf.transform(reviews_train_clean)
X_test_tfidf = tfIdf.transform(reviews_test_clean)

target = [1 if i < 12500 else 0 for i in range(25000)]

X_train, X_val, y_train, y_val = train_test_split(
    X_tfidf, target, train_size = 0.75
)

# different_c = [0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.85, 1.9, 1.95, 2.1, 2.5, 3]
# for c in different_c:
#   model = LogisticRegression(C=c)
#   model.fit(X, target)
#   print("Accuracy: %s, %s" % (c, accuracy_score(target, model.predict(X_test))))
  
final_model = LogisticRegression(C=1.85)
final_model.fit(X, target)
print ("Final Accuracy : %s" % accuracy_score(target, final_model.predict(X_test)))



Final Accuracy : 0.86728


In [26]:
# let't do some inference
x = tfIdf.transform(["It was not a nice movie", "It was a nice movie", "It was a terrible movie", "It was a not terrible movie"])
print(final_model.predict(x))


[1 1 0 0]


As we can see negation make things complicated for sentiment analysis. Also let's try a Multiclass classifier and add  all scores from reviews

In [47]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier

train_multiclass = []
for i, row in pd.read_csv('imdb_train_manifest.csv').iterrows():
    train_multiclass.append(row['score'])

target_multiclass = []
for i, row in pd.read_csv('imdb_test_manifest.csv').iterrows():
    target_multiclass.append(row['score'])
    
X_train, X_val, y_train, y_val = train_test_split(X_tfidf, train_multiclass, train_size = 0.75)

multiclass_model = OneVsRestClassifier(LogisticRegression(penalty='l2', C=6))
multiclass_model.fit(X_train, y_train)
print ("Multiclass Accuracy : %s" % accuracy_score(target_multiclass, multiclass_model.predict(X_test)))
print ("F1 %s " % (f1_score(y_val, multiclass_model.predict(X_val), average="weighted")))



Multiclass Accuracy : 0.35928
F1 0.3880064735063461 


In [33]:
x = tfIdf.transform(["It was not a nice movie", "It was a nice movie", "It was a terrible movie", "It was a not terrible movie"])
print(multiclass_model.predict(x))

[7 7 1 1]


In [49]:
from sklearn.metrics import classification_report
print(classification_report(y_val, multiclass_model.predict(X_val)))

              precision    recall  f1-score   support

           1       0.54      0.80      0.65      1211
           2       0.24      0.11      0.15       591
           3       0.26      0.19      0.22       577
           4       0.38      0.31      0.35       727
           7       0.31      0.26      0.28       623
           8       0.29      0.28      0.28       733
           9       0.22      0.10      0.14       570
          10       0.51      0.70      0.59      1218

   micro avg       0.42      0.42      0.42      6250
   macro avg       0.34      0.34      0.33      6250
weighted avg       0.38      0.42      0.39      6250



In [42]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_val, multiclass_model.predict(X_val))


array([[1067,   15,   27,   42,    6,    9,    0,   54],
       [ 383,   25,   27,   53,    5,   12,    1,   40],
       [ 300,   24,   56,  130,   25,   32,    2,   54],
       [ 240,   18,   60,  197,   54,   58,    1,   78],
       [  44,    3,   19,   54,  151,  157,   14,  212],
       [  43,    2,    9,   22,   90,  200,   21,  340],
       [  27,    1,    5,   10,   32,  128,   17,  373],
       [  74,    1,    5,   14,   40,  105,    9,  933]])

# Pytorch Datasets

In [0]:
class ReviewDataset(Dataset):
  """A pandas wrapper for CSV file """

  def __init__(self, csv_file, root_dir, transform=None):
      """
      Args:
          csv_file (string): Path to the csv file with review score.
          root_dir (string): Directory with all the text files. Depends where files where un-archived.
          transform (callable, optional): Optional transform to be applied
              on a sample.
      """
      self.df = pd.read_csv(csv_file)
      self.root_dir = root_dir
      self.transform = transform

  def __len__(self):
      return len(self.df)

  def __getitem__(self, idx):
      file_name = os.path.join(self.root_dir,
                              self.df.iloc[idx, 0])
      f = open(file_name, "r")
      text = f.read()
      id = self.df.iloc[idx, 1]
      score =  self.df.iloc[idx, 2]
      sample = {'id': id, 'text': text, 'score': score}

      if self.transform:
          sample = self.transform(sample)

      return sample

In [0]:
dataSet = ReviewDataset("imdb_train_manifest.csv", "/content/")
dataSet.__len__()

In [64]:
dataSet[1]

{'id': 4093,
 'score': 8,
 'text': 'Regardless of what personal opinion one may have of Walerian Borowczyk grotesque yet beautiful gem "La bête" of 1975, one has to admit that this bizarre gem is an absolutely unique cinematic experience. Borowczyk erotic fairy tale was banned in several countries for a long time, and it is quite obvious why this controversial gem fell victim to stuporous film censors. "La bête" is a fascinating blend of intense and beautiful fairy-tale-like atmosphere, quite explicit eroticism and genuine weirdness that bravely refuses to take any compromise. The fact that beastiality (of sorts) is one of the film\'s central themes did certainly not help it with the censors, but it made it highly controversial and therefore known to a wider audience.<br /><br />Pierre de l\'Esperance (Guy Tréjan), the head of a French aristocratic family, has arranged for his somewhat demented son Mathurin (Pierre Benedetti) to marry Lucy Broadhurst (Lisbeth Hummel), the young and bea

# Transformations/Vectorization

In [0]:
from torchvision import transforms

# todo: create Transform classes
REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

# stack/chain multiple transformations, lowercase -> REPLACE_NO_SPACE -> REPLACE_TAGS -> 
transform = transforms.Compose([
    transforms.Lambda(lambda item: [REPLACE_NO_SPACE.sub("", item['text'].lower()), item['score']] ),
    transforms.Lambda(lambda item: [REPLACE_WITH_SPACE.sub(" ", item[0]), item[1]])
])

train = ReviewDataset("imdb_train_manifest.csv", "/content/",  transform=transform)
test = ReviewDataset("imdb_test_manifest.csv", "/content/",  transform=transform)

# for now vectorization is happening in train.

In [0]:
train_loader = DataLoader(train, batch_size=25, shuffle=True)
test_loader = DataLoader(test, batch_size=25)

In [0]:
for sentence, score in train_loader:
  print(sentence[0])
  print(score.data)
  print(torch.ones(len(sentence[0]), dtype=torch.long) * score[0])
  break

For now I will try simple LSTM recurrent network but output sequence I will create and array of score of that sentence. 

In [8]:
word_to_ix = {}
for sentence in reviews_train_clean:
    for word in sentence.split():
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
for sentence in reviews_test_clean:
    for word in sentence.split():
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
len(word_to_ix)            

NameError: ignored

In [0]:
class LSTMTagger(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores
      


In [147]:
EMBEDDING_DIM = 32
HIDDEN_DIM = 32

model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), 11)
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)
  
def prepare_sequence_batch(batch_seq, to_ix):
    idxs = [prepare_sequence(seq, to_ix) for seq in batch_seq]
    return torch.tensor(idxs, dtype=torch.long)
    
for epoch in range(20):
  for sentence, score in train_loader: 
    # Step 1. Remember that Pytorch accumulates gradients.
    # We need to clear them out before each instance
    model.zero_grad()
    # Step 2. Get our inputs ready for the network, that is, turn them into
    # Tensors of word indices.
    seq = sentence[0].split()
    sentence_in = prepare_sequence(seq, word_to_ix)
    targets = torch.ones(len(seq), dtype=torch.long) * score[0]
    
    # Step 3. Run our forward pass.
    tag_scores = model(sentence_in)
#     print("Score: %s, Target len: %s, scores len: %s" % (score[0], len(targets), len(tag_scores)))
    # last score should be the classification class, TODO

    # Step 4. Compute the loss, gradients, and update the parameters by
    #  calling optimizer.step()
    loss = loss_function(tag_scores, targets)
    loss.backward()
    optimizer.step()


  # validation loss
  with torch.no_grad():
    pred = []
    real_score = []
    losses = []
    for sentence, score in test_loader:
      seq = sentence[0].split()
      inputs = prepare_sequence(seq, word_to_ix)
      tag_scores = model(inputs)
      pred.append(tag_scores[-1])
      real_score.append(score)
      targets = torch.ones(len(seq), dtype=torch.long) * score[0]
      loss = loss_function(tag_scores, targets)
    print("Test accuracy: %s\nAverage loss %s" % (accuracy_score(real_score, pred), np.average(losses)))

ValueError: ignored

In [0]:
# prepare_sequence("nice work", word_to_ix)
# X_train[0]
word_to_ix["wasn"] = 0

torch

In [146]:
np.average([4343.4, 3232.6])

3788.0