In [9]:
# Import necessary libraries
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import ast
import re
import time

# Read the train and test splits (train first, then test) into DataFrames.
train_df, test_df = map(pd.read_csv, ('../Data/train.csv', '../Data/test.csv'))


# Load GloVe model
def load_glove_model(glove_file):
    """Load word vectors from a whitespace-separated GloVe text file.

    Each non-blank line is expected to hold a token followed by its vector
    components, e.g. ``the 0.418 0.24968 ...``. Blank lines are skipped.

    Args:
        glove_file: Path to the GloVe text file (read as UTF-8).

    Returns:
        dict mapping each word (str) to a 1-D numpy float array built from
        the remaining fields of its line.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    print("Loading Glove Model")
    model = {}
    # The context manager closes the handle even if parsing fails part-way.
    with open(glove_file, 'r', encoding='utf8') as f:
        for line in f:
            split_line = line.split()
            if not split_line:
                # Blank line (e.g. a stray trailing newline): nothing to parse.
                continue
            word = split_line[0]
            embedding = np.array([float(val) for val in split_line[1:]])
            model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

# Parse the GloVe text file into an in-memory word -> vector dict.
# The file name indicates 100-d vectors, matching the 100-wide arrays that
# feature_vector / get_avg_feature_vectors allocate; the recorded cell output
# reports 400000 words loaded.
glove_model = load_glove_model('../Data/glove.6B.100d.txt')

# Preprocess text
def clean_text(text):
    """Normalise a raw review string into a list of content tokens.

    Steps, in order: lowercase, remove digit runs, remove ASCII punctuation
    (``string.punctuation``), tokenize with ``word_tokenize``, then drop
    English stopwords.

    Args:
        text: Raw review text (``str``).

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    # Build the stopword set once, outside the filter loop: calling
    # stopwords.words() for every token and scanning the resulting list is
    # needlessly slow on long reviews.
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

# Clean/tokenize the 'review' column of both splits, overwriting it in place.
# NOTE: afterwards the column holds token lists, so re-running this cell
# without reloading the CSVs fails (clean_text calls .lower() on each value).
train_df['review'], test_df['review'] = (
    frame['review'].apply(clean_text) for frame in (train_df, test_df)
)

Loading Glove Model
Done. 400000  words loaded!


In [10]:
# Function to create averaged word vectors using GloVe
def feature_vector(words, model, num_features=100):
    """Average the embeddings of the in-vocabulary tokens of one document.

    Args:
        words: Iterable of tokens.
        model: Mapping from token to a 1-D vector of length ``num_features``
            (supports ``in`` and ``[]``).
        num_features: Dimensionality of the embeddings. Defaults to 100, the
            width previously hard-coded here.

    Returns:
        1-D numpy array of length ``num_features``: the element-wise mean of
        the vectors of every token found in ``model`` (a repeated token
        counts once per occurrence), or all zeros when no token is found.
    """
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    for word in words:
        if word in model:
            nwords += 1
            # np.add (rather than +=) lets the running sum take the wider
            # dtype of the embeddings instead of being forced into float32.
            featureVec = np.add(featureVec, model[word])
    if nwords > 0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec

def get_avg_feature_vectors(reviews, model):
    """Build a (len(reviews), 100) float32 matrix of averaged word vectors.

    Row ``i`` is ``feature_vector(review_i, model)`` for the i-th item
    yielded by iterating ``reviews``.

    Args:
        reviews: Sized iterable of token sequences, one per document.
        model: Word -> vector mapping handed through to ``feature_vector``.

    Returns:
        numpy float32 array with one row per review.
    """
    n_reviews = len(reviews)
    matrix = np.zeros((n_reviews, 100), dtype="float32")
    per_review = (feature_vector(tokens, model) for tokens in reviews)
    for row_idx, averaged in enumerate(per_review):
        # Assigning into the preallocated matrix casts each row to float32.
        matrix[row_idx] = averaged
    return matrix

# Average the GloVe vectors of every review, train split first, then test.
trainDataVecs, testDataVecs = (
    get_avg_feature_vectors(frame['review'], glove_model)
    for frame in (train_df, test_df)
)

# Persist both matrices as .npy files in the current working directory.
np.save('trainDataVecs_glove.npy', trainDataVecs)
np.save('testDataVecs_glove.npy', testDataVecs)

In [11]:
# Encode sentiment labels.
# The encoder is fitted on the training labels only and then reused, unchanged,
# to transform the test labels.
le = LabelEncoder()
y_train = le.fit_transform(train_df['sentiment'])
y_test = le.transform(test_df['sentiment'])

# Save labels as .npy files (relative paths, i.e. the current working directory)
np.save('y_train.npy', y_train)
np.save('y_test.npy', y_test)

# Load feature vectors and labels for training.
# The feature files are the ones written by the GloVe feature-vector cell; the
# label files were written just above, so y_train / y_test are simply
# round-tripped through disk here.
X_train = np.load('trainDataVecs_glove.npy')
X_test = np.load('testDataVecs_glove.npy')
y_train = np.load('y_train.npy')
y_test = np.load('y_test.npy')

# Train and evaluate models (Random Forest, Naive Bayes, k-NN) using the same function as before
def train_evaluate_model(model, X_train, y_train, X_test, y_test, model_name='Model'):
    """Fit ``model``, score it on the test split, and print accuracy and fit time.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels that the predictions are compared against.
        model_name: Label used as the prefix of both printed lines.

    Returns:
        None. Results are printed only.

    Note:
        The reported time covers ``fit`` only; ``predict`` runs after the
        timer has stopped and is not included.
    """
    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot be distorted by system clock adjustments as time.time() can.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    end = time.perf_counter()
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    print(f"{model_name} Time: {end-start:.2f} seconds")


In [12]:
# Train and evaluate Random Forest.
# random_state is fixed so the forest, and therefore the reported accuracy,
# is reproducible across Restart & Run All; without it each run differs.
train_evaluate_model(RandomForestClassifier(n_estimators=100, random_state=42), X_train, y_train, X_test, y_test, 'Random Forest')

Random Forest Accuracy: 0.7656
Random Forest Time: 25.22 seconds


In [13]:
# Fit and score a Gaussian Naive Bayes classifier on the GloVe features.
train_evaluate_model(
    model=GaussianNB(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name='Naive Bayes',
)

Naive Bayes Accuracy: 0.7149
Naive Bayes Time: 0.02 seconds


In [14]:
# Fit and score a 5-nearest-neighbours classifier on the GloVe features.
# The printed time comes from train_evaluate_model, which times fit() only.
train_evaluate_model(
    model=KNeighborsClassifier(n_neighbors=5),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name='k-NN',
)

k-NN Accuracy: 0.7298
k-NN Time: 0.00 seconds
