In [1]:
import numpy as np 
import pandas as pd 
import torch.nn as nn 
import torch.nn.functional as F

In [2]:
# Read only the two needed columns and rename them in the same expression to
# the review/sentiment names that the rest of the notebook works with.
df = pd.read_csv(
    'data/cleaned_data/testing/random_names_eval.csv',
    usecols=['name','label'],
).rename(columns={'name': 'review', 'label': 'sentiment'})
df.head()

Unnamed: 0,review,sentiment
0,lamberti raffaella,0
1,ciobanu gabriela oana,0
2,okeke chikelu,0
3,jamshidi roshanak,0
4,hristov bisera,0


In [3]:
# Shuffle every row once. The fixed random_state makes the resulting order
# identical on each "Restart Kernel -> Run All" instead of changing per run.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [4]:
import torch

class IMDBDataset:
    """Map-style dataset pairing token-id rows with their labels.

    ``reviews`` is indexed as ``reviews[i, :]`` and ``sentiments`` as
    ``sentiments[i]``; both results are converted to ``torch.long`` tensors.
    """

    def __init__(self, reviews, sentiments):
        self.reviews = reviews
        self.sentiments = sentiments

    def __len__(self):
        # The number of samples is the number of review rows.
        return len(self.reviews)

    def __getitem__(self, item):
        token_row = self.reviews[item, :]
        label = self.sentiments[item]
        return dict(
            review=torch.tensor(token_row, dtype=torch.long),
            target=torch.tensor(label, dtype=torch.long),
        )

In [5]:
class LSTM(nn.Module):
    """Three stacked bidirectional LSTMs over frozen pretrained embeddings,
    followed by a small dense head that emits one probability per sequence.

    :param embedding_matrix: 2-D array of shape (num_embeddings, embedding_dim)
        used to initialise the (non-trainable) embedding layer
    """

    def __init__(self,embedding_matrix):
        super(LSTM,self).__init__()

        num_embeddings, embedding_dim = embedding_matrix.shape
        self.embedding = nn.Embedding(num_embeddings, embedding_dim)
        # Load the pretrained vectors and freeze them so the optimizer
        # never updates the embedding table.
        self.embedding.weight = nn.Parameter(
            torch.tensor(embedding_matrix, dtype=torch.float32),
            requires_grad=False,
        )

        self.lstm1 = nn.LSTM(embedding_dim, 32, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(64, 16, bidirectional=True, batch_first=True)  # 64 = 2 directions * 32
        self.lstm3 = nn.LSTM(32, 8, bidirectional=True, batch_first=True)  # 32 = 2 directions * 16

        self.dropout1 = nn.Dropout(0.3)
        self.fc1 = nn.Linear(16, 16)  # 16 = 2 directions * 8

        self.dropout2 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(16, 8)

        self.dropout3 = nn.Dropout(0.3)
        self.fc3 = nn.Linear(8, 1)


    def forward(self,x):
        """
        :param x: integer token ids, indexed as (batch, sequence)
        :return: tensor of shape (batch, 1) with sigmoid probabilities
        """
        x = self.embedding(x)
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        # Summarise each sequence with the final hidden state of both
        # directions of the last LSTM. Passing the full per-timestep output
        # through the dense head would yield (batch, seq, 1), which cannot be
        # compared against one label per sample.
        _, (h_n, _) = self.lstm3(x)
        x = torch.cat((h_n[-2], h_n[-1]), dim=1)  # (batch, 16): forward + backward
        x = self.dropout1(x)
        x = torch.relu(self.fc1(x))
        x = self.dropout2(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout3(x)
        x = torch.sigmoid(self.fc3(x))
        return x
        

In [6]:
import torch
import torch.nn as nn

def train(data_loader,model,optimizer,device):
    """
    This is the main training function that trains model
    for one epoch
    :param data_loader: this is the torchdataloader; every batch is a dict
        with "review" and "target" tensors
    :param model: model(lstm model); its output is treated as probabilities,
        one row per sample, matching targets reshaped to (-1, 1)
    :param optimizer: optimizer Adam, SGD etc
    :param device: this can be "cuda" or "cpu"
    """
    # set the model to training mode
    model.train()

    # The LSTM model defined in this notebook already ends with a sigmoid, so
    # plain BCELoss is the matching criterion. BCEWithLogitsLoss would apply
    # a second sigmoid on top of the probabilities. Built once per epoch
    # rather than once per batch.
    criterion = nn.BCELoss()

    # go through the batches of data in data_loader:
    for data in tqdm(data_loader):
        # move the data to the device that we want to use
        reviews = data["review"].to(device,dtype=torch.long)
        targets = data["target"].to(device,dtype=torch.float)
        # clear the gradients
        optimizer.zero_grad()
        # make predictions from the models
        predictions = model(reviews)
        # loss against targets reshaped to a column, one label per row
        loss = criterion(predictions,targets.view(-1,1))
        loss.backward()
        optimizer.step()
        
def evaluate(data_loader,model,device):
    """
    Run the model over every batch of data_loader without updating it.
    :param data_loader: this is the torchdataloader; every batch is a dict
        with "review" and "target" tensors
    :param model: model to evaluate (switched to eval mode here)
    :param device: this can be "cuda" or "cpu"
    :return: (final_predictions, final_targets) as Python lists; predictions
        keep whatever per-sample nesting the model output has
    """
    final_predictions = []
    final_targets = []

    model.eval()
    # No gradients are needed for evaluation; disabling autograd avoids
    # building a graph for every batch.
    with torch.no_grad():
        for data in tqdm(data_loader):
            # move the inputs to the device that we want to use
            reviews = data["review"].to(device,dtype=torch.long)
            predictions = model(reviews)
            predictions = predictions.detach().cpu().numpy().tolist()
            # targets are only collected, never fed to the model, so they
            # are read straight from the batch without a device transfer
            targets = data["target"].detach().cpu().numpy().tolist()
            final_predictions.extend(predictions)
            final_targets.extend(targets)

    return  final_predictions,final_targets


In [7]:
# Length every token sequence is padded/truncated to (passed as maxlen to pad_sequences in run).
MAX_LEN = 8
# Batch size of the training DataLoader built in run.
TRAIN_BATCH_SIZE = 128
# Batch size of the validation DataLoader built in run.
VALID_BATCH_SIZE = 128
# Upper bound on training epochs; run may stop earlier via its early-stopping counter.
EPOCHS = 10

In [8]:
import io
import torch

import numpy as np
import pandas as pd

import tensorflow as tf

from sklearn import metrics
from sklearn.model_selection import train_test_split

from tqdm import tqdm

In [9]:
def load_vectors(fname):
    """
    Load whitespace-separated word vectors from a text file.

    Each non-empty line is split on whitespace: the first token is the word
    and the remaining tokens are parsed as floats.
    :param fname: path to the UTF-8 encoded vectors file
    :return: dict mapping word -> numpy array of its vector
    """
    data = {}
    # The context manager guarantees the file is closed even if parsing fails.
    with open(fname,'r',encoding='utf-8') as fin:
        for line in fin:
            tokens = line.split()
            if not tokens:
                # blank line: nothing to parse (tokens[0] would raise IndexError)
                continue
            data[tokens[0]] = np.array([float(value) for value in tokens[1:]])

    return data

In [10]:
def create_embedding_matrix(word_index,embedding_dict,embedding_dim=None):
    """
    This function creates the embedding matrix
    :param word_index: a dictionary of word: index_value
    :param embedding_dict: a dictionary of word: embedding vector
    :param embedding_dim: width of the matrix; when None it is taken from the
        first vector in embedding_dict, or 100 if embedding_dict is empty
    :return a numpy array of shape (len(word_index)+1, embedding_dim) with
        embedding vectors for all known words; rows of unknown words stay zero
    """
    if embedding_dim is None:
        # Infer the width from the data instead of assuming 100-d vectors.
        first_vector = next(iter(embedding_dict.values()), None)
        embedding_dim = len(first_vector) if first_vector is not None else 100

    # initialize the embedding matrix; one extra row so that the largest
    # index in word_index is addressable when indices start at 1
    embedding_matrix = np.zeros((len(word_index)+1,embedding_dim))
    for word, i in word_index.items():
        vector = embedding_dict.get(word)
        if vector is not None:
            embedding_matrix[i] = vector
    return embedding_matrix
            

In [11]:
def run(df):
    """
    Train and validate the LSTM classifier on df.

    Splits df 80/20 (stratified on the label), tokenizes and pads the text,
    loads the GloVe vectors, then trains for up to EPOCHS epochs while
    printing validation accuracy after each one.
    :param df: DataFrame with a text column "review" and a label column "sentiment"
    """
    y = df.sentiment.values
    train_df,valid_df = train_test_split(df,test_size = 0.2, stratify = y)

    print('Fitting tokenizer')
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    # NOTE: the vocabulary is fitted on the full frame (train + validation)
    tokenizer.fit_on_texts(df.review.values.tolist())

    # texts -> integer sequences -> fixed-length arrays of MAX_LEN tokens
    xtrain = tokenizer.texts_to_sequences(train_df.review.values)
    xtest = tokenizer.texts_to_sequences(valid_df.review.values)
    xtrain = tf.keras.preprocessing.sequence.pad_sequences(xtrain,maxlen = MAX_LEN)
    xtest = tf.keras.preprocessing.sequence.pad_sequences(xtest,maxlen = MAX_LEN)
    train_dataset = IMDBDataset(reviews=xtrain,sentiments=train_df.sentiment.values)
    train_loader = torch.utils.data.DataLoader(train_dataset,batch_size=TRAIN_BATCH_SIZE,num_workers=2)
    valid_dataset = IMDBDataset(reviews=xtest,sentiments=valid_df.sentiment.values)
    valid_loader = torch.utils.data.DataLoader(valid_dataset,batch_size=VALID_BATCH_SIZE,num_workers=2)

    print("Load embeddings")
    embedding_dict = load_vectors("data/glove.6B.100d.txt")
    embedding_matrix = create_embedding_matrix(tokenizer.word_index,embedding_dict)
    # use the GPU when one is available, otherwise fall back to the CPU so the
    # notebook still runs on machines without CUDA
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    model = LSTM(embedding_matrix)
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(),lr = 1e-3)
    print("Traning model")
    best_accuracy = 0
    early_stopping_counter = 0
    for epoch in range(1,EPOCHS+1):
        train(train_loader,model,optimizer,device)
        outputs,targets = evaluate(valid_loader,model,device)
        # threshold the model outputs at 0.5 to obtain class predictions
        outputs = np.array(outputs)>0.5
        accuracy = metrics.accuracy_score(targets,outputs)
        print(f"EPOCH:{epoch}, Accuracy Score: {accuracy}")
        if accuracy>best_accuracy:
            best_accuracy = accuracy
        else:
            # counts every non-improving epoch; the counter is never reset
            early_stopping_counter +=1
        # stop once three epochs in total have failed to beat the best accuracy
        if early_stopping_counter>2:
            break
            
    
    

In [12]:
# Preview the first rows of df after the earlier shuffle.
df.head()

Unnamed: 0,review,sentiment
0,beridze tamaz ramaz,0
1,vasylyshyn ruslana myroslava,0
2,chai rong yi,0
3,marino rosa,0
4,tremblay louna claudie,0


In [13]:
# Kick off the full pipeline: split, tokenize, load embeddings, train and validate.
run(df)

Fitting tokenizer
Load embeddings
cuda


  from .autonotebook import tqdm as notebook_tqdm


Traning model


  0%|          | 0/999 [00:00<?, ?it/s]