In [41]:
import numpy as np
import pandas as pd
import io
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer 
import os, re, csv, math, codecs
from sklearn import model_selection
from sklearn import metrics
import torch
import torch.nn as nn
import tensorflow as tf 
import fasttext

torch.manual_seed(1024);

In [42]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# LSTM-sentiment
This notebooks uses pretrained word embeddings (Fasttext) and an LSTM architecture to predict the sentiments (positive or negative) of IMDB comments.
Heavily inspired by [this](https://www.kaggle.com/code/tientd95/deep-learning-for-sentiment-analysis#2.-Pretrained-word-embedding-) Kaggle notebook.

In [43]:
# Load the dataset into a list of tuples
df = pd.read_csv('imdb-dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


### Data preprocessing


In [44]:
# First adjust all the labels
df.sentiment = df.sentiment.apply(lambda x: 1 if x == 'positive' else 0)
y = df.sentiment.values

In [45]:
#load fasttext embeddings
print('loading word embeddings...')
fasttext_embedding = {}
f = codecs.open('./wiki.simple.vec', encoding='utf-8')
for line in tqdm(f):
    values = line.rstrip().rsplit(' ')
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    fasttext_embedding[word] = coefs
f.close()

loading word embeddings...


111052it [00:08, 13479.67it/s]


In [46]:
fasttext_embedding['hello'].shape

(300,)

In [47]:
class IMDBDataset:
    def __init__(self, reviews, targets):
        """
        Argument:
        reviews: a numpy array
        targets: a vector array
        
        Return xtrain and ylabel in torch tensor datatype, stored in dictionary format
        """
        self.reviews = reviews
        self.target = targets
    
    def __len__(self):
        # return length of dataset
        return len(self.reviews)
    
    def __getitem__(self, index):
        # given an idex (item), return review and target of that index in torch tensor
        review = torch.tensor(self.reviews[index,:], dtype = torch.long)
        target = torch.tensor(self.target[index], dtype = torch.float)
        
        return {'review': review,
                'target': target}

In [48]:
class LSTM(nn.Module):
    def __init__(self, embedding_matrix):
        """
        Given embedding_matrix: numpy array with vector for all words
        return prediction ( in torch tensor format)
        """
        super(LSTM, self).__init__()
        # Number of words = number of rows in embedding matrix
        num_words = embedding_matrix.shape[0]
        # Dimension of embedding is num of columns in the matrix
        embedding_dim = embedding_matrix.shape[1]
        # Define an input embedding layer
        self.embedding = nn.Embedding(
                                      num_embeddings=num_words,
                                      embedding_dim=embedding_dim)
        # Embedding matrix actually is collection of parameter
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype = torch.float32))
        # Because we use pretrained embedding (GLove, Fastext,etc) so we turn off requires_grad-meaning we do not train gradient on embedding weight
        self.embedding.weight.requires_grad = False
        # LSTM with hidden_size = 128
        self.lstm = nn.LSTM(
                            embedding_dim, 
                            128,
                            bidirectional=True,
                            batch_first=True,
                             )
        # Input(512) because we use bi-directional LSTM ==> hidden_size*2 + maxpooling **2  = 128*4 = 512, will be explained more on forward method
        self.out = nn.Linear(512, 1)

    def forward(self, x):
        # pass input (tokens) through embedding layer
        x = self.embedding(x)
        # fit embedding to LSTM
        hidden, _ = self.lstm(x)
        # apply mean and max pooling on lstm output
        avg_pool = torch.mean(hidden, 1)
        max_pool, _ = torch.max(hidden, 1)
        # concat avg_pool and max_pool ( so we have 256 size, also because this is bidirectional ==> 256*2 = 512)
        out = torch.cat((avg_pool, max_pool), 1)
        # fit out to self.out to conduct dimensionality reduction from 512 to 1
        out = self.out(out)
        # return output
        return out

In [49]:
def train(data_loader, model, optimizer):
    """
    this is model training for one epoch
    data_loader:  this is torch dataloader, just like dataset but in torch and devide into batches
    model : lstm
    optimizer : torch optimizer
    """
    # set model to training mode
    model.train()
    # go through batches of data in data loader
    for data in data_loader:
        reviews = data['review']
        targets = data['target']
        # move the data to device that we want to use
        reviews = reviews.to(device, dtype = torch.long)
        targets = targets.to(device, dtype = torch.float)
        # clear the gradient
        optimizer.zero_grad()
        # make prediction from model
        predictions = model(reviews)
        # caculate the losses
        loss = nn.BCEWithLogitsLoss()(predictions, targets.view(-1,1))
        # backprop
        loss.backward()
        # single optimization step
        optimizer.step()

In [50]:
def evaluate(data_loader, model):
    final_predictions = []
    final_targets = []
    model.eval()
    # turn off gradient calculation
    with torch.no_grad():
        for data in data_loader:
            reviews = data['review']
            targets = data['target']
            reviews = reviews.to(device, dtype = torch.long)
            targets = targets.to(device, dtype=torch.float)
            # make prediction
            predictions = model(reviews)
            # move prediction and target to cpu
            predictions = predictions.cpu().numpy().tolist()
            targets = data['target'].cpu().numpy().tolist()
            # add predictions to final_prediction
            final_predictions.extend(predictions)
            final_targets.extend(targets)
    return final_predictions, final_targets

In [51]:
MAX_LEN = 128
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8
EPOCHS = 5

In [52]:
def create_embedding_matrix(word_index, embedding_dict=None, d_model=100):
    """
     this function create the embedding matrix save in numpy array
    :param word_index: a dictionary with word: index_value
    :param embedding_dict: a dict with word embedding
    :d_model: the dimension of word pretrained embedding, here I just set to 100, we will define again
    :return a numpy array with embedding vectors for all known words
    """
    embedding_matrix = np.zeros((len(word_index) + 1, d_model))
    ## loop over all the words
    for word, index in word_index.items():
        if word in embedding_dict:
            embedding_matrix[index] = embedding_dict[word]
    return embedding_matrix

In [53]:
# use tf.keras for tokenization,  
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df.review.values.tolist())

In [54]:
print('Load fasttext embedding')
embedding_matrix = create_embedding_matrix(tokenizer.word_index, embedding_dict=fasttext_embedding, d_model=300)

# STEP 2: cross validation
n = int(len(df) * 0.8)
train_df = df[:n].reset_index(drop=True)
valid_df = df[n:].reset_index(drop=True)

# STEP 3: pad sequence
xtrain = tokenizer.texts_to_sequences(train_df.review.values)
xtest = tokenizer.texts_to_sequences(valid_df.review.values)

# zero padding
xtrain = tf.keras.preprocessing.sequence.pad_sequences(xtrain, maxlen=MAX_LEN)
xtest = tf.keras.preprocessing.sequence.pad_sequences(xtest, maxlen=MAX_LEN)

# STEP 4: initialize dataset class for training
train_dataset = IMDBDataset(reviews=xtrain, targets=train_df.sentiment.values)

# STEP 5: Load dataset to Pytorch DataLoader
# after we have train_dataset, we create a torch dataloader to load train_dataset class based on specified batch_size
train_data_loader = torch.utils.data.DataLoader(train_dataset, batch_size = TRAIN_BATCH_SIZE, num_workers=2)
# initialize dataset class for validation
valid_dataset = IMDBDataset(reviews=xtest, targets=valid_df.sentiment.values)
valid_data_loader = torch.utils.data.DataLoader(valid_dataset, batch_size = VALID_BATCH_SIZE, num_workers=1)
# STEP 6: Running 
# feed embedding matrix to lstm
model_fasttext = LSTM(embedding_matrix)
# set model to cuda device
model_fasttext.to(device)
# initialize Adam optimizer
optimizer = torch.optim.Adam(model_fasttext.parameters(), lr=1e-3)

print('training model')

for epoch in range(EPOCHS):
    #train one epoch
    train(train_data_loader, model_fasttext, optimizer)
    #validate
    outputs, targets = evaluate(valid_data_loader, model_fasttext)
    # threshold
    outputs = np.array(outputs) >= 0.5
    # calculate accuracy
    accuracy = metrics.accuracy_score(targets, outputs)
    print(f'epoch: {epoch}, accuracy_score: {accuracy}')

Load fasttext embedding
training model


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/mertkipcak/.pyenv/versions/3.11.1/lib/python3.11/multiprocessing/spawn.py", line 120, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/mertkipcak/.pyenv/versions/3.11.1/lib/python3.11/multiprocessing/spawn.py", line 130, in _main
    self = reduction.pickle.load(from_parent)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'IMDBDataset' on <module '__main__' (built-in)>


KeyboardInterrupt: 

In [None]:
# Well... This was a bad way to learn that my laptop is not good enough to run this model.