In [None]:
from IPython.display import HTML, display

def set_css():
  """Display a <style> element that makes <pre> blocks wrap long lines."""
  wrap_style = HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  ''')
  display(wrap_style)

# Have IPython call set_css on every 'pre_run_cell' event.
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the paths used in later cells live under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Show the working directory before the change (IPython shell escape).
!pwd
import os
# Move into the assignment folder; relative file names are resolved against the
# current working directory.
# NOTE(review): other cells spell this folder with a trailing space
# ('Assignment 2 & 3 ') - confirm which spelling actually exists on Drive.
os.chdir('/content/drive/MyDrive/Panos/Εργασία DeepLearning/Assignment 2 & 3')
# Show the working directory after the change.
!pwd

/content
/content/drive/MyDrive/Panos/Εργασία DeepLearning/Assignment 2 & 3


In [None]:
def load_glove_embeddings(glove_file, word_to_index, embedding_dim):
    """Build an embedding matrix for `word_to_index` from a GloVe text file.

    Every non-empty line of `glove_file` is expected to hold a token followed
    by whitespace-separated floats. Only lines whose token occurs in
    `word_to_index` are parsed and kept, so the full GloVe table is never held
    in memory. If a token appears more than once, the last occurrence wins.

    Args:
        glove_file: Path to a UTF-8 encoded GloVe text file.
        word_to_index: Mapping token -> row index. Indices must be valid rows
            of a matrix with ``len(word_to_index)`` rows.
        embedding_dim: Number of floats expected per token.

    Returns:
        A float32 tensor of shape ``(len(word_to_index), embedding_dim)``.
        Rows of tokens that have no vector in the file stay all-zero.

    Raises:
        ValueError: If the vector of a vocabulary token is not numeric or does
            not contain exactly `embedding_dim` values.
    """
    embedding_matrix = np.zeros((len(word_to_index), embedding_dim), dtype="float32")

    with open(glove_file, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank (e.g. trailing) lines carry no embedding.
                continue
            index = word_to_index.get(values[0])
            if index is None:
                # Token is not part of our vocabulary: skip without parsing.
                continue
            vector = np.asarray(values[1:], dtype="float32")
            if vector.shape[0] != embedding_dim:
                # Fail loudly instead of letting NumPy broadcast a short vector.
                raise ValueError(
                    f"Vector for {values[0]!r} has {vector.shape[0]} values, "
                    f"expected {embedding_dim}"
                )
            embedding_matrix[index] = vector

    return torch.tensor(embedding_matrix, dtype=torch.float)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

class SentimentRNN(nn.Module):
    """LSTM sentiment classifier on top of trainable pre-trained word embeddings.

    Pipeline: embedding lookup -> multi-layer LSTM (batch first) -> dropout on
    the output of the last time step -> linear layer -> sigmoid.

    NOTE(review): only the LSTM output at the final sequence position is
    classified. If inputs are padded at the end, that position is a padding
    token; consider packing the sequences or padding on the left.
    """

    def __init__(self, no_layers, vocab_size, hidden_dim, embedding_dim,
                 pretrained_embeddings, drop_prob=0.5, output_dim=1):
        """Create the layers.

        Args:
            no_layers: Number of stacked LSTM layers.
            vocab_size: Vocabulary size; stored on the instance. The embedding
                table itself takes its number of rows from
                `pretrained_embeddings`.
            hidden_dim: Size of the LSTM hidden state.
            embedding_dim: Width of each embedding vector (LSTM input size).
            pretrained_embeddings: 2-D float tensor ``(rows, embedding_dim)``
                used to initialise the embedding layer; it is fine-tuned.
            drop_prob: Dropout probability applied before the linear layer.
            output_dim: Number of units of the linear layer. Defaults to 1;
                previously this was read from a global variable.

        Raises:
            ValueError: If `pretrained_embeddings` is not 2-D or its width
                differs from `embedding_dim`.
        """
        super().__init__()
        if pretrained_embeddings.dim() != 2 or pretrained_embeddings.size(1) != embedding_dim:
            raise ValueError(
                "pretrained_embeddings must have shape (rows, embedding_dim); "
                f"got {tuple(pretrained_embeddings.shape)} with embedding_dim={embedding_dim}"
            )
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.no_layers = no_layers
        self.vocab_size = vocab_size
        # Embedding layer initialised from the pre-trained vectors and kept
        # trainable (freeze=False); avoids allocating a random table first.
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False)
        # lstm
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=self.hidden_dim,
                            num_layers=no_layers, batch_first=True)
        # dropout layer (honours drop_prob instead of a hard-coded rate)
        self.dropout = nn.Dropout(drop_prob)
        # linear and sigmoid layer
        self.fc = nn.Linear(self.hidden_dim, output_dim)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden=None):
        """Score a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, batch first (the LSTM is built
                with ``batch_first=True``).
            hidden: ``(h0, c0)`` tuple as produced by `init_hidden`, or None
                to let the LSTM start from zeros.

        Returns:
            ``(sig_out, hidden)`` where `sig_out` has one sigmoid value per
            batch row (the last output unit at the last time step) and
            `hidden` is the LSTM's final ``(h_n, c_n)``.
        """
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        # Only the last time step is used, so run dropout / fc / sigmoid on
        # that slice instead of on every position of every sequence.
        last_step = lstm_out[:, -1, :]
        out = self.fc(self.dropout(last_step))
        sig_out = self.sig(out)
        # Keep the original 1-D result: last output unit per batch row.
        sig_out = sig_out[:, -1]
        return sig_out, hidden

    def init_hidden(self, batch_size):
        """Return zero-initialised ``(h0, c0)`` for a batch of `batch_size`.

        Both tensors have shape ``(no_layers, batch_size, hidden_dim)`` and are
        created with the dtype and device of the model's own parameters, so no
        global `device` variable is required.
        """
        reference = next(self.parameters())
        state_shape = (self.no_layers, batch_size, self.hidden_dim)
        h0 = reference.new_zeros(state_shape)
        c0 = reference.new_zeros(state_shape)
        return (h0, c0)

In [None]:
# Preprocess the data
import os
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
from torchtext.data import get_tokenizer

def read_txt_files(folder_path, label):
    """Read every regular file in `folder_path` as a labelled review.

    Entries are visited in sorted file-name order so the result does not
    depend on the arbitrary order returned by `os.listdir`. Sub-directories
    and other non-file entries are skipped instead of crashing `open`.

    Args:
        folder_path: Directory containing one UTF-8 text file per review.
        label: Value attached to every review read from this directory.

    Returns:
        A list of ``(review_text, label)`` tuples.
    """
    data = []
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            data.append((file.read(), label))
    return data

# Paths to the folders containing positive and negative reviews (aclImdb "train" split).
# NOTE(review): these paths contain 'Assignment 2 & 3 ' WITH a trailing space before
# '/IMDB_reviews', while some other cells use the folder name without it - confirm
# which spelling exists on Drive.
positive_folder = '/content/drive/MyDrive/Panos/Εργασία DeepLearning/Assignment 2 & 3 /IMDB_reviews/aclImdb/train/pos'
negative_folder = '/content/drive/MyDrive/Panos/Εργασία DeepLearning/Assignment 2 & 3 /IMDB_reviews/aclImdb/train/neg'

# Same layout for the aclImdb "test" split.
positive_folder_2 = '/content/drive/MyDrive/Panos/Εργασία DeepLearning/Assignment 2 & 3 /IMDB_reviews/aclImdb/test/pos'
negative_folder_2 = '/content/drive/MyDrive/Panos/Εργασία DeepLearning/Assignment 2 & 3 /IMDB_reviews/aclImdb/test/neg'

# Read positive (label=1) and negative (label=0) reviews from both splits
positive_data = read_txt_files(positive_folder, label=1)
negative_data = read_txt_files(negative_folder, label=0)
positive_data_2 = read_txt_files(positive_folder_2, label=1)
negative_data_2 = read_txt_files(negative_folder_2, label=0)

# Combine everything into one pool; the original train/test boundary is not kept
all_data = positive_data + negative_data + positive_data_2 + negative_data_2

In [None]:
import json
# Change into the assignment folder so the relative file name below lands there.
# NOTE(review): this path ends with a space ('Assignment 2 & 3 '); the cell that
# reloads the JSON uses the folder name without it - confirm both point to the
# same directory.
os.chdir('/content/drive/MyDrive/Panos/Εργασία DeepLearning/Assignment 2 & 3 ')

# Cache the combined corpus as JSON. JSON has no tuple type, so each
# (review, label) pair is written as a two-element array.
with open("all_data_IMDB.json", "w") as outfile:
    json.dump(all_data, outfile)

In [None]:
import json

# Reload the cached corpus. The `with` block closes the file even if parsing
# fails; entries come back as [review, label] lists.
with open('/content/drive/MyDrive/Panos/Εργασία DeepLearning/Assignment 2 & 3/all_data_IMDB.json', encoding='utf-8') as f:
    all_data = json.load(f)

In [None]:
# Class balance check: label 1 counts as positive, any other label as negative.
count1 = sum(1 for entry in all_data if entry[1] == 1)
count2 = len(all_data) - count1

print("Positive reviews:", count1)
print("Negative reviews:", count2)

Positive reviews: 25000
Negative reviews: 25000


In [None]:
# Check the Python type of the first entry's label.
type(all_data[0][1])

int

In [None]:
# Container type, number of entries and the second entry, one per line.
print(type(all_data), len(all_data), all_data[1], sep="\n")

<class 'list'>
50000
["Being the prototype of the classical Errol Flynn adventure movie and having a good story as well as two more brilliant co-stars in Maureen O'Hara (what an exquisite beauty!) and Anthony Quinn, I can only recommend this movie to all those having even the slightest liking for romance and adventure.<br /><br />Hollywood at its best!", 1]


In [None]:
#Function to clean html tags from a sentence
import re
def clean_html(sentence):
    """Return `sentence` with every ``<...>`` tag replaced by a single space."""
    return re.sub('<.*?>', ' ', sentence)

# Keep only the ASCII letters A-Z and a-z; every other character (digits,
# punctuation, whitespace, special characters) becomes a single space.
def rem_pun(sentence):
    """Replace each character that is not an ASCII letter with a space."""
    non_letter = re.compile('[^a-zA-Z]')
    return non_letter.sub(' ', sentence)

# Remove URLs from sentences.
def rem_url(sen):
    """Replace URLs in `sen` with a single space.

    Two kinds of tokens are removed: anything starting with ``http`` and
    anything starting with ``www.`` at a word boundary. The dot is escaped and
    the boundary is required so that ordinary text such as "awww so cute" is
    no longer treated as a URL.
    """
    txt = re.sub(r"http\S+", " ", sen)
    return re.sub(r"\bwww\.\S+", " ", txt)

# Remove words such as 'ddddddddd', 'funnnnnn', 'coolllllll', i.e. any word that
# contains the same character three or more times in a row. Words like 'goods',
# 'cool' and 'best' are preserved.
def remove_extra(sen):
    """Replace every word containing a 3+ character run (and the whitespace before it) with one space.

    The pattern must be a raw string: in a normal string literal ``\\b`` is a
    backspace character and ``\\1`` is chr(1), so the regex would never match.
    """
    cleaned_text = re.sub(r"\s*\b(?=\w*(\w)\1{2,})\w*\b", ' ', sen)
    return cleaned_text


In [None]:
import random
# Shuffle in place with a fixed seed so the train/val/test split derived from
# this order is reproducible across runs.
random.Random(42).shuffle(all_data)

In [None]:
# Spot-check one entry of the shuffled corpus.
print(all_data[50])

["Honestly, when I went to see this movie at the Rave theater in Plainfield Indiana, I did not expect much. I went to this movie only because I figured hey, it's a WWE movie it'll be good for a laugh. Then I sat down and watched it and saw why they chose Glen Jacobs (Kane) to play Jacob Goodnight. He is probably one of the freakiest guys on the big screen (much worse in my opinion than Freddy or Jason) and has one big advantage to other movies that attracts me to a horror movie. It shows Jacob Goodnight as someone who is human. He has a heart, no matter how twisted and creepy it is. He feels pain, something that Jason never does or appears to show. He feels sorrow and pleasure, though again both of them insane which you will notice if you see the movie. All in all, a different experience in my opinion than many slashers, and it surprised me in a few ways, as in who lived in the end.", 1]


In [None]:
# Print the first four entries, each followed by a blank line.
for position in range(4):
    print(all_data[position])
    print()

["The excruciatingly slow pace of this film was probably the director's express intention, in order to convey what life was like growing up as a village teen in China. However, I found the combination of the glacially slow 'plot' and the general filming style so impersonal as to be totally alienating, particularly to a western audience. At times I actually had trouble telling some characters apart, as they were filmed from such a distance. Two hours in and I was totally past caring. As someone who is not only interested in music but is also very into the history and culture of China (and is by the way no stranger to Chinese cinema), I couldn't engage with a single character and found nothing to get my teeth into. It begs the question: If I disliked it, who on earth would like it? Give me Zhang Yimou, give me Chen Kaige. Give me the work of just about any other Chinese director I've ever seen. This sorry effort just doesn't measure up at all. I'd be sorry to see Chinese cinema judged ag

In [None]:
# Strip HTML tags and URLs from every review. The corpus is rebuilt as
# [review, label] lists rather than assigned item by item, so this works for
# both tuple entries (fresh from read_txt_files) and list entries (reloaded
# from JSON).
all_data = [[rem_url(clean_html(review)), label] for review, label in all_data]

In [None]:
# Print the first four entries again (after cleaning), each followed by a blank line.
for position in range(4):
    print(all_data[position])
    print()

["The excruciatingly slow pace of this film was probably the director's express intention, in order to convey what life was like growing up as a village teen in China. However, I found the combination of the glacially slow 'plot' and the general filming style so impersonal as to be totally alienating, particularly to a western audience. At times I actually had trouble telling some characters apart, as they were filmed from such a distance. Two hours in and I was totally past caring. As someone who is not only interested in music but is also very into the history and culture of China (and is by the way no stranger to Chinese cinema), I couldn't engage with a single character and found nothing to get my teeth into. It begs the question: If I disliked it, who on earth would like it? Give me Zhang Yimou, give me Chen Kaige. Give me the work of just about any other Chinese director I've ever seen. This sorry effort just doesn't measure up at all. I'd be sorry to see Chinese cinema judged ag

In [None]:
#Convert all the words to lower case
#Source https://github.com/saugatapaul1010/Amazon-Fine-Food-Reviews-Analysis
import re

def lower_case(x):
    """Lower-case `x` and expand contractions, currency/percent signs and round numbers.

    The substitutions are plain substring replacements applied in the listed
    order, so specific patterns must come before generic ones that are
    contained in them:
      * "she's" before "he's" (otherwise "she's" became "s he is"),
      * "how's", "let's", "everyone's" before the generic "'s",
      * "daresn't", "dasn't" before the generic "n't".
    The last rule uses "'cause" (the original pattern had a stray trailing
    quote).

    Args:
        x: Any object; it is converted with ``str(x)`` first.

    Returns:
        The normalised lower-case string. Afterwards digit runs ending in
        "000000" / "000" are shortened to "<digits>m" / "<digits>k".
    """
    replacements = (
        (",000,000", " m"), (",000", " k"),
        ("′", "'"), ("’", "'"),
        ("won't", " will not"), ("cannot", " can not"), ("can't", " can not"),
        ("daresn't", " dare not"), ("dasn't", " dare not"),
        ("n't", " not"),
        ("what's", " what is"), ("it's", " it is"),
        ("'ve", " have"), ("'m", " am"), ("'re", " are"),
        ("she's", " she is"), ("he's", " he is"),
        ("how's", " how has"), ("let's", " let us"), ("everyone's", " everyone is"),
        ("'s", " own"),
        ("%", " percent "), ("₹", " rupee "), ("$", " dollar "), ("€", " euro "),
        ("'ll", " will"), ("y'all", " you all"),
        ("o'clock", " of the clock"), ("ne'er", " never"),
        ("finna", " fixing to"), ("gonna", " going to"),
        ("gimme", " give me"), ("gotta", " got to"),
        ("'d", " would"),
        ("e'er", " ever"),
        ("'cause", " because"),
    )

    x = str(x).lower()
    for old, new in replacements:
        x = x.replace(old, new)

    x = re.sub(r"([0-9]+)000000", r"\1m", x)
    x = re.sub(r"([0-9]+)000", r"\1k", x)
    return x

In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence

# Tokenizer (torchtext's 'basic_english' normaliser)
tokenizer = get_tokenizer('basic_english')

# Tokenize the data: every review is first normalised with lower_case().
# Note that this covers ALL reviews, i.e. also those that later end up in the
# validation and test splits.
tok_reviews = [tokenizer(lower_case(review)) for review, _ in all_data]

# Build vocabulary; "<unk>" is the only special token
vocab = build_vocab_from_iterator(tok_reviews, specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])  # Handle unknown tokens
print(len(vocab))

# Get the stoi (string-to-index) mapping from the vocab
stoi = vocab.get_stoi()

# Create word_to_index mapping, shifting indices by 1 and reserving 0 for <PAD>
# (so word_to_index[w] == vocab[w] + 1 for every vocabulary word w)
word_to_index = {word: (index + 1) for word, index in stoi.items()}
word_to_index['<PAD>'] = 0

def preprocess(data, vocab, max_sequence_length=512):
    """Turn ``(review, label)`` pairs into fixed-length id tensors.

    Each review is normalised with `lower_case` (the same normalisation the
    vocabulary was built with), tokenized with the module-level `tokenizer`,
    mapped to ids, then truncated or right-padded to `max_sequence_length`.

    Ids are ``vocab[token] + 1`` so that they agree with `word_to_index`
    (which shifts every vocabulary index by one and reserves 0 for padding)
    and therefore with the rows of an embedding matrix built from it. Without
    the shift every token would look up its neighbour's embedding row and the
    unknown-token id would collide with the padding id.

    Args:
        data: Sequence of ``(review_text, label)`` pairs.
        vocab: Object supporting ``vocab[token] -> int`` (unknown tokens are
            expected to resolve to a default index).
        max_sequence_length: Length of every output row.

    Returns:
        ``(padded_sequences, labels)``: an int64 tensor of shape
        ``(len(data), max_sequence_length)`` and a tensor of the labels.
        For empty `data` two empty tensors are returned.
    """
    if not data:
        return (torch.empty((0, max_sequence_length), dtype=torch.long),
                torch.empty(0, dtype=torch.long))

    pad_id = 0
    reviews, labels = zip(*data)
    tok_reviews = [tokenizer(lower_case(review)) for review in reviews]

    # Numericalize the sentences (+1 keeps id 0 free for padding)
    numericalized_data = [[vocab[token] + 1 for token in sentence] for sentence in tok_reviews]

    # Truncate / right-pad every sequence to exactly max_sequence_length ids
    padded_rows = [
        seq[:max_sequence_length] + [pad_id] * max(0, max_sequence_length - len(seq))
        for seq in numericalized_data
    ]
    padded_sequences = torch.tensor(padded_rows, dtype=torch.long)

    print("Sample original text:", data[0][0])  # Print sample original text
    print("Sample tokenized text:", tok_reviews[0])  # Print sample tokenized text
    print("Sample numericalized text:", numericalized_data[0])  # Print sample numericalized text
    print("Sample label:", labels[0])  # Print sample label

    return padded_sequences, torch.tensor(labels)

# Split the (already shuffled) corpus before processing. test_ratio is only
# informational: the test set is whatever remains after val_split.
train_ratio, val_ratio, test_ratio = 0.8, 0.1, 0.1
train_split = int(len(all_data) * train_ratio)
val_split = int(len(all_data) * (train_ratio + val_ratio))

train_data, val_data, test_data = (
    all_data[:train_split],
    all_data[train_split:val_split],
    all_data[val_split:],
)

# Process the datasets in the order train, validation, test
(x_train_padded, y_train_tensor), (x_val_padded, y_val_tensor), (x_test_padded, y_test_tensor) = [
    preprocess(subset, vocab) for subset in (train_data, val_data, test_data)
]


147682
Sample original text: The excruciatingly slow pace of this film was probably the director's express intention, in order to convey what life was like growing up as a village teen in China. However, I found the combination of the glacially slow 'plot' and the general filming style so impersonal as to be totally alienating, particularly to a western audience. At times I actually had trouble telling some characters apart, as they were filmed from such a distance. Two hours in and I was totally past caring. As someone who is not only interested in music but is also very into the history and culture of China (and is by the way no stranger to Chinese cinema), I couldn't engage with a single character and found nothing to get my teeth into. It begs the question: If I disliked it, who on earth would like it? Give me Zhang Yimou, give me Chen Kaige. Give me the work of just about any other Chinese director I've ever seen. This sorry effort just doesn't measure up at all. I'd be sorry to s

In [None]:
# First training row with an extra dimension inserted at position 1.
print(x_train_padded.unsqueeze(1)[0])

# Type of the whole padded tensor, its first row, and the type of that row.
print(type(x_train_padded))
print(x_train_padded[0])
print(type(x_train_padded[0]))

print()

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import numpy as np

# Labels as column vectors. reshape(-1, 1) gives the same result as
# unsqueeze(1) on the first run but, unlike unsqueeze, does not add another
# dimension every time this cell is re-executed.
y_train_tensor = y_train_tensor.reshape(-1, 1)
y_val_tensor = y_val_tensor.reshape(-1, 1)
y_test_tensor = y_test_tensor.reshape(-1, 1)

# One batch size shared by all three loaders
batch_size = 32

# Create DataLoaders directly with padded data; only the training set is shuffled
train_dataset = TensorDataset(x_train_padded, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)

val_dataset = TensorDataset(x_val_padded, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

test_dataset = TensorDataset(x_test_padded, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

# Use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Network hyper-parameters
no_layers = 2
vocab_size = len(vocab) + 1  # one extra id for padding
print(len(vocab))
output_dim = 1
hidden_dim = 256
embedding_dim = 200  # must match the dimensionality of the GloVe file below

# Pre-trained GloVe vectors, arranged according to word_to_index
glove_file = 'glove.6B.200d.txt'
pretrained_embeddings = load_glove_embeddings(glove_file, word_to_index, embedding_dim)

# Build the classifier and move it to the selected device
model = SentimentRNN(no_layers, vocab_size, hidden_dim, embedding_dim, pretrained_embeddings, drop_prob=0.5).to(device)

# Binary cross-entropy on the sigmoid outputs, optimised with Adam
lr = 0.001
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

147682


In [None]:
# Shape of the embedding matrix: (rows, embedding_dim).
# NOTE(review): the recorded output shows 99343 rows while the recorded
# len(vocab) is 147682; the row count should equal len(word_to_index).
# These outputs probably stem from different runs - re-run and confirm.
pretrained_embeddings.shape

torch.Size([99343, 200])

In [None]:
# Inspect a few batches of the validation DataLoader (val_loader)
for i, (inputs, labels) in enumerate(val_loader):
    print(f"Batch {i+1}")

    # Print shapes
    print("Inputs shape:", inputs.shape)
    print("Labels shape:", labels.shape)

    # Print actual data
    # Depending on your data, you might need to adjust how you print it
    print("Inputs data:", inputs)
    print("Labels data:", labels)

    if i == 10:  # Stop after the first 11 batches (i runs from 0 to 10)
        break

In [None]:
# Input/label shapes for the train, test and validation splits (in that order),
# with a blank line between splits.
print("\n\n".join(
    f"{features.shape}\n{targets.shape}"
    for features, targets in (
        (x_train_padded, y_train_tensor),
        (x_test_padded, y_test_tensor),
        (x_val_padded, y_val_tensor),
    )
))

torch.Size([40000, 512])
torch.Size([40000, 1])

torch.Size([5000, 512])
torch.Size([5000, 1])

torch.Size([5000, 512])
torch.Size([5000, 1])


In [None]:
import numpy as np

# function to predict accuracy
def acc(pred, label):
    """Return how many predictions, after rounding, equal their labels.

    Both tensors are squeezed first, so column vectors and flat vectors are
    accepted. The result is a plain Python number (a count, not a ratio).
    """
    hard_predictions = pred.squeeze().round()
    matches = hard_predictions.eq(label.squeeze())
    return matches.sum().item()

batch_size = 32

clip = 5      # max gradient norm
epochs = 5
valid_loss_min = np.inf  # lowest validation loss so far (np.Inf no longer exists in NumPy 2.0)
# per-epoch history
epoch_tr_loss, epoch_vl_loss = [], []
epoch_tr_acc, epoch_vl_acc = [], []

for epoch in range(epochs):
    # ---- training pass ----
    train_losses = []
    train_acc = 0.0
    model.train()
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Every batch holds unrelated, shuffled reviews, so start each one from
        # a fresh zero state sized to the actual batch. This also keeps a
        # shorter final batch from breaking the LSTM.
        h = model.init_hidden(inputs.size(0))

        model.zero_grad()
        output, h = model(inputs, h)

        output = output.view(-1, 1)  # Reshape to [batch_size, 1]

        # calculate the loss and perform backprop
        loss = criterion(output, labels.float())
        loss.backward()
        train_losses.append(loss.item())
        # calculating accuracy (number of correct predictions in the batch)
        accuracy = acc(output, labels.view(-1, 1))
        train_acc += accuracy
        # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

    # ---- validation pass ----
    # Previously commented out, which left val_loss at nan and val_acc at 0.
    val_losses = []
    val_acc = 0.0
    model.eval()
    with torch.no_grad():  # no gradients needed for evaluation
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            val_h = model.init_hidden(inputs.size(0))

            output, val_h = model(inputs, val_h)

            output = output.view(-1, 1)
            val_loss = criterion(output, labels.float())

            val_losses.append(val_loss.item())

            accuracy = acc(output, labels.view(-1, 1))
            val_acc += accuracy

    # ---- epoch summary ----
    epoch_train_loss = np.mean(train_losses)
    epoch_val_loss = np.mean(val_losses)
    epoch_train_acc = train_acc/len(train_loader.dataset)
    epoch_val_acc = val_acc/len(val_loader.dataset)
    epoch_tr_loss.append(epoch_train_loss)
    epoch_vl_loss.append(epoch_val_loss)
    epoch_tr_acc.append(epoch_train_acc)
    epoch_vl_acc.append(epoch_val_acc)
    valid_loss_min = min(valid_loss_min, epoch_val_loss)
    print(f'Epoch {epoch+1}')
    print(f'train_loss : {epoch_train_loss} val_loss : {epoch_val_loss}')
    print(f'train_accuracy : {epoch_train_acc*100} val_accuracy : {epoch_val_acc*100}')

Epoch 1
train_loss : 0.6934163057804108 val_loss : nan
train_accuracy : 50.677499999999995 val_accuracy : 0.0


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Epoch 2
train_loss : 0.6932401895999909 val_loss : nan
train_accuracy : 50.56 val_accuracy : 0.0
Epoch 3
train_loss : 0.6895505448818207 val_loss : nan
train_accuracy : 51.77 val_accuracy : 0.0
Epoch 4
train_loss : 0.6452429441213607 val_loss : nan
train_accuracy : 56.75750000000001 val_accuracy : 0.0
Epoch 5
train_loss : 0.33548453809022905 val_loss : nan
train_accuracy : 86.2375 val_accuracy : 0.0


In [None]:
# Function for accuracy calculation
def acc(pred, label):
    """Count the predictions that match their labels after rounding.

    `pred` and `label` are squeezed before comparison; the return value is the
    number of matching positions as a plain Python number.
    """
    rounded = torch.round(torch.squeeze(pred))
    is_correct = torch.eq(rounded, torch.squeeze(label))
    return is_correct.sum().item()

# Testing loop
test_losses = []  # per-batch losses
test_acc = 0.0    # running count of correct predictions

model.eval()  # turn off dropout for testing

# Evaluation needs no gradients: torch.no_grad() avoids building autograd
# graphs, which saves memory and time without changing the results.
with torch.no_grad():
    for inputs, labels in test_loader:
        # Size the hidden state to the current batch (the last one may be shorter)
        current_batch_size = inputs.size(0)
        test_h = model.init_hidden(current_batch_size)

        inputs, labels = inputs.to(device), labels.to(device)

        output, test_h = model(inputs, test_h)
        output = output.view(-1, 1)

        test_loss = criterion(output, labels.float())
        test_losses.append(test_loss.item())

        accuracy = acc(output, labels.view(-1, 1))
        test_acc += accuracy

# Average of the per-batch losses, and accuracy over all test samples
avg_test_loss = np.mean(test_losses)
avg_test_acc = test_acc / len(test_loader.dataset)

# Print the test results
print(f'Test loss: {avg_test_loss}')
print(f'Test accuracy: {avg_test_acc * 100}%')

Test loss: 0.32323430293494726
Test accuracy: 87.03999999999999%
