In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import nltk
# Data needed further down: the English stop-word list and the Punkt tokenizer
# models used by word_tokenize. NLTK >= 3.8.2 loads the tokenizer from the
# 'punkt_tab' package instead of the pickled 'punkt' one, so both are requested
# to keep the notebook working on old and new NLTK releases.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import string
import re
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import sys
import os
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE
from wordcloud import WordCloud

# Pick the compute device once: use the GPU when CUDA is usable, otherwise the CPU
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")
print("Using GPU" if is_cuda else "Using CPU")

In [None]:
# Read the IMDB review file and build stratified train / validation / test splits
DATA_PATH = "IMDB Dataset.csv"
SPLIT_SEED = 42  # fixed seed so the splits are identical after Restart & Run All

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as exc:
    # Fail here with instructions instead of swallowing the error and hitting a
    # NameError on `df` one line later. The archive can be fetched and unpacked with:
    # !wget https://github.com/SalvatoreRa/tutorial/blob/main/datasets/IMDB.zip?raw=true
    # !unzip IMDB.zip?raw=true
    raise FileNotFoundError(
        f"'{DATA_PATH}' was not found in the working directory. Download and unzip "
        "https://github.com/SalvatoreRa/tutorial/blob/main/datasets/IMDB.zip?raw=true first."
    ) from exc

# Label encoding: 'positive' -> 0, anything else (i.e. 'negative') -> 1
df['sentiment_encoded'] = np.where(df['sentiment'] == 'positive', 0, 1)
X, y = df['review'].values, df['sentiment_encoded'].values

# 20% of the data is held out for testing, then 10% of the remainder for validation;
# both splits are stratified so the class balance is preserved
x_train, x_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=SPLIT_SEED
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, stratify=y_train, test_size=0.1, random_state=SPLIT_SEED
)
y_train, y_val, y_test = np.array(y_train), np.array(y_val), np.array(y_test)

In [None]:
def generate_wordclouds(df, max_words=200, output_path='word_clouds.jpg'):
    '''
    Draw side-by-side word clouds of the most frequent words in the positive
    and in the negative reviews.

    Reviews are lower-cased before filtering: the NLTK English stop-word list
    is lower-case, so without this capitalised stop words ("The", "This", ...)
    would pass the filter and dominate both clouds.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'review' column (text) and a 'sentiment' column with
        the values 'positive' / 'negative'.
    max_words : int, default 200
        Maximum number of words drawn in each cloud.
    output_path : str, default 'word_clouds.jpg'
        File the combined figure is saved to (JPEG format).

    Returns
    -------
    None
        The figure is saved to `output_path` and displayed.
    '''

    stop_words = set(stopwords.words('english'))

    def count_words(reviews):
        # Frequency table of the non-stop-word tokens (2+ characters) in `reviews`
        counts = Counter()
        for review in reviews:
            review = re.sub(r"[^\w\s]", '', review.lower())  # strip punctuation
            review = re.sub(r"\d", '', review)               # strip digits
            counts.update(
                word for word in review.split()
                if word not in stop_words and len(word) > 1
            )
        return counts

    def make_cloud(counts, colour):
        # Single-colour cloud built from a word -> frequency mapping
        return WordCloud(
            width=400,
            height=400,
            max_words=max_words,
            max_font_size=100,
            background_color='white',
            color_func=lambda *args, **kwargs: colour
        ).generate_from_frequencies(counts)

    # Separating reviews by sentiment
    positive_reviews = df[df['sentiment'] == 'positive']['review']
    negative_reviews = df[df['sentiment'] == 'negative']['review']

    positive_wordcloud = make_cloud(count_words(positive_reviews), "green")
    negative_wordcloud = make_cloud(count_words(negative_reviews), "red")

    plt.figure(figsize=(12, 6))
    panels = (
        ("Positive Reviews", positive_wordcloud),
        ("Negative Reviews", negative_wordcloud),
    )
    for position, (title, cloud) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.imshow(cloud, interpolation='bilinear')
        plt.title(title)
        plt.axis("off")

    plt.savefig(output_path, format='jpeg', bbox_inches='tight')
    plt.show()

# Draw the word clouds for the full dataset (the figure is also written to word_clouds.jpg)
generate_wordclouds(df)

In [None]:
# TODO: plot review length by sentiment (not implemented yet)

In [None]:
def preprocess_review(review):
    '''
    Clean a single review so it can be tokenized.

    Steps, in order:
      1. every character that is neither a word character nor whitespace is
         replaced by a space (underscores count as word characters and are kept);
      2. digits are deleted;
      3. runs of whitespace are collapsed to a single space;
      4. the result is stripped and lower-cased.

    Digits are removed *before* whitespace is collapsed; doing it afterwards
    would leave double spaces behind (e.g. "it 10 stars" -> "it  stars").

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        Lower-case text containing only word characters separated by single spaces.
    '''

    review = re.sub(r"[^\w\s]", ' ', review)  # Replace non-word characters with a space
    review = re.sub(r"\d", '', review)        # Remove digits
    review = re.sub(r"\s+", ' ', review)      # Collapse whitespace last so no gaps remain
    return review.strip().lower()

def tokenize_reviews(x_train, x_val, x_test, vocab_size=1000):
    '''
    Convert raw review texts into sequences of vocabulary indices.

    The vocabulary is built from the training split only: each review is
    cleaned with `preprocess_review`, tokenized with `word_tokenize`, stripped
    of English stop words and single-character tokens, and the `vocab_size`
    most frequent remaining words are numbered starting at 1. Index 0 is never
    assigned (presumably left free for padding; confirm against the padding code).

    Parameters
    ----------
    x_train, x_val, x_test : iterable of str
        Raw review texts of the three splits.
    vocab_size : int, default 1000
        Number of most frequent training words kept in the vocabulary.

    Returns
    -------
    tuple
        `(train_seqs, val_seqs, test_seqs, vocab)`. Each `*_seqs` is a list
        holding one list of int indices per review; words outside the
        vocabulary are dropped, so the inner lists vary in length and may be
        empty. `vocab` maps word -> index.
    '''
    stop_words = set(stopwords.words("english"))

    def tokenize(review):
        # clean one review and split it into tokens
        return word_tokenize(preprocess_review(review))

    # count the usable words of the training split, one review at a time
    corpus = Counter()
    for review in x_train:
        corpus.update(
            word for word in tokenize(review)
            if word not in stop_words and len(word) > 1
        )

    # keep the `vocab_size` most common words, numbered from 1
    vocab = {
        word: index
        for index, (word, _) in enumerate(corpus.most_common(vocab_size), start=1)
    }

    # convert reviews into sequences of indices
    def vectorize_reviews(reviews):
        return [
            [vocab[word] for word in tokenize(review) if word in vocab]
            for review in reviews
        ]

    _x_train = vectorize_reviews(x_train)
    _x_val = vectorize_reviews(x_val)
    _x_test = vectorize_reviews(x_test)

    return _x_train, _x_val, _x_test, vocab

# Vectorize the three text splits.
# NOTE(review): the training result is bound to `X_train` (capital X) while `x_val` / `x_test`
# overwrite the raw text splits, so re-running this cell passes index lists back into
# tokenize_reviews instead of strings. Confirm which names later cells expect before renaming.
X_train, x_val, x_test, vocab = tokenize_reviews(x_train, x_val, x_test)

In [None]:
# plot the length distribution of the tokenized reviews