# Installing libraries

In [None]:
'''
python version: 3.12.8
beautifulsoup4==4.12.2
bs4==0.0.2
gensim==4.3.3
joblib==1.4.2
nltk==3.9.1
numpy==1.26.4
pandas==2.2.3
regex==2024.11.6
scikit-learn==1.6.1
scipy==1.13.1
soupsieve==2.5
threadpoolctl==3.5.0
torch==2.6.0
torchaudio==2.6.0
torchvision==0.21.0
tqdm==4.67.1
'''

import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec, KeyedVectors
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import gensim
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')
import re
from bs4 import BeautifulSoup
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/nirajdalavi/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nirajdalavi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nirajdalavi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Reading from dataset

In [3]:
pd_frame = pd.read_csv("data.tsv", sep="\t", on_bad_lines='skip', low_memory=False)


# Dataset generation

In [4]:
pd_frame = pd_frame[['review_body', 'star_rating']] 
pd_frame=pd_frame.dropna(subset=['star_rating','review_body'], how='any')
pd_frame['star_rating'] = pd.to_numeric(pd_frame['star_rating'], errors='coerce')

balanced_df = pd_frame.groupby("star_rating").apply(lambda x: x.sample(n=50000, random_state=42)).reset_index(drop=True)
balanced_df['sentiment'] = balanced_df['star_rating'].apply(
    lambda x: 1 if x > 3 else (2 if x <= 2 else 3)
)

balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

  balanced_df = pd_frame.groupby("star_rating").apply(lambda x: x.sample(n=50000, random_state=42)).reset_index(drop=True)


# Word Embedding a

In [5]:
import gensim.downloader as api

pre_model = api.load('word2vec-google-news-300')
result = pre_model.most_similar(positive=["doctor", "court"], negative=["hospital"], topn=1)
print("Doctor - Hospital + Court =", result)

similarity = pre_model.similarity("happy", "satisfied")
print("Similarity between 'happy' and 'satisfied':", similarity)

Doctor - Hospital + Court = [('judge', 0.6122931838035583)]
Similarity between 'happy' and 'satisfied': 0.6437949


# Word Embedding b

In [6]:
custom_mod = Word2Vec(sentences=balanced_df['review_body'].apply(word_tokenize), vector_size=300, window=11, min_count=10, workers=1, seed=43)

result_custom = custom_mod.wv.most_similar(positive=["doctor", "court"], negative=["hospital"], topn=1)
print("Doctor - Hospital + Court =", result_custom)

similarity_custom = custom_mod.wv.similarity("happy", "satisfied")
print("Similarity between 'happy' and 'satisfied' (Custom Model):", similarity_custom)


Doctor - Hospital + Court = [('documenting', 0.4553646147251129)]
Similarity between 'happy' and 'satisfied' (Custom Model): 0.9003303


# Data preprocessing from HW1

In [7]:

contract_dict = {
    "he'll": "he will","he'll've": "he will have","isn't": "is not","it'd": "it would",
    "it'd've": "it would have","it'll": "it will","it'll've": "it will have","it's": "it is",
    "let's": "let us","ma'am": "madam","mayn't": "may not","mightn't": "might not","might've": "might have",
    "must've": "must have","mustn't": "must not","needn't": "need not","ain't": "is not",
    "aren't": "are not","can't": "cannot","couldn't": "could not","could've": "could have",
    "couldn't've": "could not have","didn't": "did not","doesn't": "does not","don't": "do not",
    "hadn't": "had not","hadn't've": "had not have","hasn't": "has not","haven't": "have not",
    "haven't've": "have not have","he'd": "he would","he'd've": "he would have","needn't've": "need not have",
    "o'clock": "of the clock","shalln't": "shall not","shan't": "shall not","she'd": "she would",
    "she'd've": "she would have","he's": "he is","how'd": "how did","how'd'y": "how do you",
    "how'll": "how will","how's": "how is","I'd": "I would","I'd've": "I would have",
    "I'll": "I will","I'll've": "I will have","I'm": "I am","I've": "I have","I'd": "I had",
    "I'd've": "I had have","I'm": "I am","I've": "I have","she'll": "she will","she'll've": "she will have",
    "she's": "she is","should've": "should have","shouldn't": "should not","shouldn't've": "should not have",
    "so've": "so have","so's": "so is","that'd": "that would","that'd've": "that would have","that's": "that is",
    "there'd": "there would","there'd've": "there would have","there's": "there is","they'd": "they would",
    "they'd've": "they would have","they'll": "they will","they'll've": "they will have","they're": "they are",
    "they've": "they have","to've": "to have","wasn't": "was not","we'd": "we would","we'd've": "we would have",
    "we'll": "we will","we'll've": "we will have","we're": "we are","we've": "we have","weren't": "were not",
    "what'd": "what did","what'd'y": "what do you","what'll": "what will","what'll've": "what will have",
    "what're": "what are","what's": "what is","where've": "where have","who'd": "who would",
    "who'd've": "who would have","who'll": "who will","who'll've": "who will have","who's": "who is",
    "who've": "who have","why'd": "why did","why'll": "why will","why's": "why is",
    "why've": "why have","you'd": "you would","you'd've": "you would have","you'll": "you will",
    "you'll've": "you will have","you're": "you are","you've": "you have","what've": "what have",
    "when'd": "when did","when'll": "when will","when's": "when is","when've": "when have",
    "where'd": "where did","where'll": "where will","where's": "where is",
}

def review_cleaning(review):
    review = review.lower()
    if "<" in review and ">" in review: 
        review = BeautifulSoup(review, "html.parser").get_text()
    review = re.sub(r'http\S+|www\S+', '', review)
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in contract_dict.keys()) + r')\b')
    review = pattern.sub(lambda x: contract_dict[x.group()], review)
    review = re.sub(r'[^a-zA-Z\s]', '', review)
    review = re.sub(r'\s+', ' ', review).strip()
    return review

balanced_df['review_body'] = balanced_df['review_body'].astype(str)
balanced_df['review_body'] = balanced_df['review_body'].apply(review_cleaning)


stop_words = set(stopwords.words('english'))

def stopwords_removal(review):
    words = review.split()
    words = [word for word in words if word not in stop_words]
    review = ' '.join(words)
    return review

balanced_df['review_body'] = balanced_df['review_body'].apply(stopwords_removal)


lemma = WordNetLemmatizer()

def review_lemmatization(review):
    words = review.split()
    words = [lemma.lemmatize(word) for word in words]
    review = ' '.join(words)
    return review



balanced_df['review_body'] = balanced_df['review_body'].apply(review_lemmatization)



# Data preparation for binary classification

In [8]:
binary_df = balanced_df[balanced_df['sentiment'] != 3]

X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(binary_df['review_body'], binary_df['sentiment'], test_size=0.2, random_state=42)

X_train_token = [word_tokenize(text) for text in X_train_bin]
X_test_token = [word_tokenize(text) for text in X_test_bin]

# Simple models

In [14]:

tf_idf = TfidfVectorizer()
tf_idf.fit(binary_df['review_body'])
X_train_tfidf = tf_idf.transform(X_train_bin)
X_test_tfidf = tf_idf.transform(X_test_bin)     

def avg_vector(review, model, vector_size=300):
    vectors = [model[word] for word in review if word in model]
    return np.mean(vectors, axis=0) if vectors else np.zeros(vector_size)  


X_train_bin_avg_pre = np.array([avg_vector(review, pre_model) for review in X_train_token])
X_test_bin_avg_pre = np.array([avg_vector(review, pre_model) for review in X_test_token])

X_train_bin_avg_cus = np.array([avg_vector(review, custom_mod.wv) for review in X_train_token])
X_test_bin_avg_cus = np.array([avg_vector(review, custom_mod.wv) for review in X_test_token])

def train_eval(X_train, X_test, y_train, y_test, f_type):

    perceptron = Perceptron(max_iter=1000,eta0=0.01, random_state=42)
    perceptron.fit(X_train, y_train)
    y_pred_perceptron = perceptron.predict(X_test)
    perceptron_acc = accuracy_score(y_test, y_pred_perceptron)
 
    svm = LinearSVC()
    svm.fit(X_train, y_train)
    y_pred_svm = svm.predict(X_test)
    svm_acc = accuracy_score(y_test, y_pred_svm)

    print(f"Feature: {f_type}")
    print(f"Perceptron Accuracy: {perceptron_acc:.4f}")
    print(f"SVM Accuracy: {svm_acc:.4f}")
    print()

In [15]:
train_eval(X_train_tfidf, X_test_tfidf, y_train_bin, y_test_bin, "TF-IDF")
train_eval(X_train_bin_avg_pre, X_test_bin_avg_pre, y_train_bin, y_test_bin, "Word2Vec-Google-News-300")
train_eval(X_train_bin_avg_cus, X_test_bin_avg_cus, y_train_bin, y_test_bin, "Custom Word2Vec")

Feature: TF-IDF
Perceptron Accuracy: 0.8150
SVM Accuracy: 0.8637

Feature: Word2Vec-Google-News-300
Perceptron Accuracy: 0.7544
SVM Accuracy: 0.8161

Feature: Custom Word2Vec
Perceptron Accuracy: 0.7825
SVM Accuracy: 0.8435



# Feedforward Neural Networks

# Setting device

In [16]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

print(f"Using device: {device}")

Using device: mps


# MLP

In [17]:
class MLP(nn.Module):
    def __init__(self, input_size, output_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, 50)
        self.fc2 = nn.Linear(50, 10)
        self.fc3 = nn.Linear(10, output_size)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        return self.fc3(x)  

def train_mlp(X_train, y_train, X_test, y_test, input_size, output_size, model_name):
    mod = MLP(input_size, output_size).to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(mod.parameters(), lr=0.001)
    train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)

    for epoch in range(10):
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = mod(batch_X)
            loss = loss_fn(outputs, batch_y)
            loss.backward()
            optimizer.step()
    
    with torch.no_grad():
        test_outputs = mod(X_test)
        predictions = torch.argmax(test_outputs, dim=1)
        accuracy = (predictions == y_test).float().mean().item()
    
    print(f"{model_name}: {accuracy:.4f}")

def prep_tensors(X, y, shift_labels=True):
    y_tensor = torch.tensor(y.values, dtype=torch.long)
    if shift_labels:  
        y_tensor -= 1  
    return torch.tensor(X, dtype=torch.float32).to(device), y_tensor.to(device)



# Data prep for ternary classification

In [18]:
X_train_ter, X_test_ter, y_train_ter, y_test_ter = train_test_split(balanced_df['review_body'], balanced_df['sentiment'], test_size=0.2, random_state=42)

X_train_ter_token = [word_tokenize(text) for text in X_train_ter]
X_test_ter_token = [word_tokenize(text) for text in X_test_ter]

X_train_ter_avg_pre = np.array([avg_vector(review, pre_model) for review in X_train_ter_token])
X_test_ter_avg_pre = np.array([avg_vector(review, pre_model) for review in X_test_ter_token])

X_train_ter_avg_cus = np.array([avg_vector(review, custom_mod.wv) for review in X_train_ter_token])
X_test_ter_avg_cus = np.array([avg_vector(review, custom_mod.wv) for review in X_test_ter_token])

# Average w2v (part a)

In [19]:
#AVG custom-binary
X_train_avg_bin_tensor, y_train_bin_tensor = prep_tensors(X_train_bin_avg_cus, y_train_bin)
X_test_avg_bin_tensor, y_test_bin_tensor = prep_tensors(X_test_bin_avg_cus, y_test_bin)

#AVG pretrained-binary
X_train_avg_google_tensor, y_train_google_tensor = prep_tensors(X_train_bin_avg_pre, y_train_bin)
X_test_avg_google_tensor, y_test_google_tensor = prep_tensors(X_test_bin_avg_pre, y_test_bin)

#AVG custom-ternary
X_train_avg_ter_tensor, y_train_ter_tensor = prep_tensors(X_train_ter_avg_cus, y_train_ter)
X_test_avg_ter_tensor, y_test_ter_tensor = prep_tensors(X_test_ter_avg_cus, y_test_ter)

#AVG pretrained-ternary
X_train_avg_google_ter_tensor, y_train_google_ter_tensor = prep_tensors(X_train_ter_avg_pre, y_train_ter)
X_test_avg_google_ter_tensor, y_test_google_ter_tensor = prep_tensors(X_test_ter_avg_pre, y_test_ter)


In [20]:
train_mlp(X_train_avg_google_tensor, y_train_google_tensor, X_test_avg_google_tensor, y_test_google_tensor, 300, 2, "Binary - Avg W2V (Google)")


Binary - Avg W2V (Google): 0.8434


In [21]:
train_mlp(X_train_avg_bin_tensor, y_train_bin_tensor, X_test_avg_bin_tensor, y_test_bin_tensor, 300, 2, "Binary - Avg Custom W2V")


Binary - Avg Custom W2V: 0.8625


In [22]:
train_mlp(X_train_avg_google_ter_tensor, y_train_google_ter_tensor, X_test_avg_google_ter_tensor, y_test_google_ter_tensor, 300, 3, "Ternary - Avg W2V (Google)")

Ternary - Avg W2V (Google): 0.6860


In [23]:
train_mlp(X_train_avg_ter_tensor, y_train_ter_tensor, X_test_avg_ter_tensor, y_test_ter_tensor, 300, 3, "Ternary - Avg Custom W2V")


Ternary - Avg Custom W2V: 0.7031


# Concatenated w2v (part b)

In [24]:
def concat_vector(review, model, vector_size=300, max_words=10):
    vectors = [model[word] for word in review if word in model][:max_words]  
    while len(vectors) < max_words:  
        vectors.append(np.zeros(vector_size))
    return np.concatenate(vectors)

X_train_bin_concat_pre = np.array([concat_vector(review, pre_model) for review in X_train_token])
X_test_bin_concat_pre = np.array([concat_vector(review, pre_model) for review in X_test_token])

X_train_bin_concat_cus = np.array([concat_vector(review, custom_mod.wv) for review in X_train_token])
X_test_bin_concat_cus = np.array([concat_vector(review, custom_mod.wv) for review in X_test_token])

X_train_ter_concat_pre = np.array([concat_vector(review, pre_model) for review in X_train_ter_token])
X_test_ter_concat_pre = np.array([concat_vector(review, pre_model) for review in X_test_ter_token])

X_train_ter_concat_cus = np.array([concat_vector(review, custom_mod.wv) for review in X_train_ter_token])
X_test_ter_concat_cus = np.array([concat_vector(review, custom_mod.wv) for review in X_test_ter_token])

#CONCAT custom-binary
X_train_concat_bin_tensor, y_train_bin_tensor = prep_tensors(X_train_bin_concat_cus, y_train_bin)
X_test_concat_bin_tensor, y_test_bin_tensor = prep_tensors(X_test_bin_concat_cus, y_test_bin)

#CONCAT pretrained-binary
X_train_concat_google_tensor, y_train_google_tensor = prep_tensors(X_train_bin_concat_pre, y_train_bin)
X_test_concat_google_tensor, y_test_google_tensor = prep_tensors(X_test_bin_concat_pre, y_test_bin)

#CONCAT custom-ternary
X_train_concat_ter_tensor, y_train_ter_tensor = prep_tensors(X_train_ter_concat_cus, y_train_ter)
X_test_concat_ter_tensor, y_test_ter_tensor = prep_tensors(X_test_ter_concat_cus, y_test_ter)

#CONCAT pretrained-ternary
X_train_concat_google_ter_tensor, y_train_google_ter_tensor = prep_tensors(X_train_ter_concat_pre, y_train_ter)
X_test_concat_google_ter_tensor, y_test_google_ter_tensor = prep_tensors(X_test_ter_concat_pre, y_test_ter)



In [25]:
train_mlp(X_train_concat_google_tensor, y_train_google_tensor, X_test_concat_google_tensor, y_test_google_tensor, 3000, 2, "Binary - concat W2V (Google)")

Binary - concat W2V (Google): 0.7531


In [26]:
train_mlp(X_train_concat_bin_tensor, y_train_bin_tensor, X_test_concat_bin_tensor, y_test_bin_tensor, 3000, 2, "Binary - concat Custom W2V")

Binary - concat Custom W2V: 0.7735


In [27]:
train_mlp(X_train_concat_google_ter_tensor, y_train_google_ter_tensor, X_test_concat_google_ter_tensor, y_test_google_ter_tensor, 3000, 3, "Ternary - concat W2V (Google)")

Ternary - concat W2V (Google): 0.5983


In [28]:
train_mlp(X_train_concat_ter_tensor, y_train_ter_tensor, X_test_concat_ter_tensor, y_test_ter_tensor, 3000, 3, "Ternary - concat Custom W2V")

Ternary - concat Custom W2V: 0.6183


# Convolutional Neural Networks

In [29]:
# Extracting Word2Vec embeddings
def w2c_embedding(tokens, model, max_len=50, vector_size=300):
    f_vectors = [model[word] if word in model else np.zeros(vector_size, dtype=np.float32) for word in tokens]
    if len(f_vectors) < max_len:
        f_vectors.extend([np.zeros(vector_size, dtype=np.float32)] * (max_len - len(f_vectors)))
    return np.array(f_vectors[:max_len], dtype=np.float32) 

# PyTorch Dataset Class
class SADataset(Dataset):
    def __init__(self, review_tokens, labels, model):
        self.review_tokens = review_tokens  
        self.labels = [label - 1 for label in labels]  # zero based indexing
        self.model = model.wv if hasattr(model, "wv") else model  # Word2Vec Model

    def __len__(self):
        return len(self.review_tokens)

    def __getitem__(self, idx):
        tokens = self.review_tokens[idx] 
        embedding = w2c_embedding(tokens, self.model) 
        label = self.labels[idx]  
        return torch.tensor(embedding, dtype=torch.float32), torch.tensor(label, dtype=torch.long)

train_bin_pre = SADataset(X_train_token, y_train_bin, pre_model)
test_bin_pre = SADataset(X_test_token, y_test_bin, pre_model)
train_bin_cus = SADataset(X_train_token, y_train_bin, custom_mod.wv)
test_bin_cus = SADataset(X_test_token, y_test_bin, custom_mod.wv)
train_ter_pre = SADataset(X_train_ter_token, y_train_ter, pre_model)
test_ter_pre = SADataset(X_test_ter_token, y_test_ter, pre_model)
train_ter_cus = SADataset(X_train_ter_token, y_train_ter, custom_mod.wv)
test_ter_cus = SADataset(X_test_ter_token, y_test_ter, custom_mod.wv)


In [30]:

class CNN(nn.Module):
    def __init__(self, input_channels=300, op_size=3):  # 3 classes for ternary 
        super(CNN, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=input_channels, out_channels=50, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=50, out_channels=10, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2)
        
        sam_ip = torch.zeros(1, input_channels, 50)  
        sam_op = self._get_conv_output(sam_ip)
        self.fc1 = nn.Linear(sam_op, op_size)  

    def _get_conv_output(self, x):
        x = torch.relu(self.conv1(x))
        x = self.pool(x)
        x = torch.relu(self.conv2(x))
        x = self.pool(x)
        return x.view(x.shape[0], -1).shape[1]  # Flatten size

    def forward(self, x):
        x = x.permute(0, 2, 1)  #(batch, channel, sequence)
        x = torch.relu(self.conv1(x))
        x = self.pool(x)
        x = torch.relu(self.conv2(x))
        x = self.pool(x)
        x = x.view(x.shape[0], -1)  # Flatten before fc1
        x = self.fc1(x)
        return x
    
#Train CNN Model with DataLoader 
def train_cnn(train_loader, test_loader, num_classes, epochs=10, lr=0.001):
    model = CNN(op_size=num_classes).to(device)  
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)  

            optimizer.zero_grad()
            outputs = model(batch_X)  
            loss = loss_fn(outputs, batch_y)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)  
            outputs = model(batch_X)
            _, predicted = torch.max(outputs, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()

    accuracy = correct / total
    print(f"{accuracy:.4f}")
    return accuracy

# Dataloaders for binary and ternary classification
train_bin_cus_loader = DataLoader(train_bin_cus, batch_size=64, shuffle=True)
test_bin_cus_loader = DataLoader(test_bin_cus, batch_size=64, shuffle=False)
train_bin_pre_loader = DataLoader(train_bin_pre, batch_size=64, shuffle=True)
test_bin_pre_loader = DataLoader(test_bin_pre, batch_size=64, shuffle=False)
train_ter_cus_loader = DataLoader(train_ter_cus, batch_size=64, shuffle=True)
test_ter_cus_loader = DataLoader(test_ter_cus, batch_size=64, shuffle=False)
train_ter_pre_loader = DataLoader(train_ter_pre, batch_size=64, shuffle=True)
test_ter_pre_loader = DataLoader(test_ter_pre, batch_size=64, shuffle=False)


In [31]:
print("\nTraining CNN for Binary Classification")
print("\nAccuracy(pretrained model with CNN): ")
accuracy_cnn_bin_pre = train_cnn(train_bin_pre_loader, test_bin_pre_loader, num_classes=2)
print("\nAccuracy(custom model with CNN):")
accuracy_cnn_bin_cus = train_cnn(train_bin_cus_loader, test_bin_cus_loader, num_classes=2)




Training CNN for Binary Classification

Accuracy(pretrained model with CNN): 
0.8610

Accuracy(custom model with CNN):
0.8642


In [32]:
print("\nTraining CNN for Ternary Classification")
print("\nAccuracy(pretrained model with CNN):")
accuracy_cnn_tern_pre = train_cnn(train_ter_pre_loader, test_ter_pre_loader, num_classes=3)



Training CNN for Ternary Classification

Accuracy(pretrained model with CNN):
0.7005


In [33]:
print("\nAccuracy(custom model with CNN):")
accuracy_cnn_tern_cus = train_cnn(train_ter_cus_loader, test_ter_cus_loader, num_classes=3)


Accuracy(custom model with CNN):
0.7069
