In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import ast
import re
import time

# Read the training and test splits from CSV into DataFrames.
# Later cells read the 'review' and 'sentiment' columns of both frames.
train_df, test_df = (
    pd.read_csv('../Data/train.csv'),
    pd.read_csv('../Data/test.csv'),
)

# Preprocess text
# Translation table that deletes every ASCII punctuation character; it never
# changes, so it is built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache for the NLTK stop-word list (see _english_stopwords).
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the English stop words as a frozenset, loading them from NLTK once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def clean_text(text):
    """Normalise a raw review string and split it into content tokens.

    Steps, in order: lowercase, drop every run of digits, delete ASCII
    punctuation, tokenize with NLTK's ``word_tokenize``, and discard English
    stop words.

    Args:
        text: The review as a ``str``.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = text.translate(_PUNCTUATION_TABLE)
    tokens = word_tokenize(text)
    # The stop-word list used to be re-read from the corpus for every single
    # token and scanned linearly; a cached set makes each lookup O(1).
    stop_words = _english_stopwords()
    return [word for word in tokens if word not in stop_words]

# Replace each raw review string with its cleaned token list, for both splits.
# The 'review' column is overwritten, so re-running this cell without
# reloading the CSVs would feed token lists (not strings) into clean_text.
train_df['review'], test_df['review'] = (
    frame['review'].apply(clean_text) for frame in (train_df, test_df)
)

In [3]:
# Load pre-trained GloVe embeddings
def load_glove_embeddings(glove_file):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Every non-blank line is split on whitespace: the first field is the token
    and the remaining fields are its vector components.

    Args:
        glove_file: Path of the embeddings text file (opened as UTF-8).

    Returns:
        dict: Maps each token to a 1-D ``float32`` NumPy array. If a token
        appears more than once, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # A blank or whitespace-only line has no token; indexing values[0]
            # on it would raise IndexError, so such lines are skipped.
            if not values:
                continue
            word, *components = values
            embeddings[word] = np.asarray(components, dtype='float32')
    return embeddings

# Parse the GloVe text file into the word -> vector lookup used for features.
glove_embeddings = load_glove_embeddings(
    glove_file='../Data/glove.6B.100d.txt',
)

# Function to create averaged word vectors
def feature_vector(words, embeddings, vector_size=100):
    """Average the embedding vectors of the words found in ``embeddings``.

    Words that are not keys of ``embeddings`` are ignored.

    Args:
        words: Iterable of tokens.
        embeddings: Mapping from token to its vector.
        vector_size: Length of the zero vector the sum starts from.

    Returns:
        The element-wise mean of the matched vectors, or the all-zero
        ``float32`` vector of length ``vector_size`` when no word matched.
    """
    matched = [embeddings[token] for token in words if token in embeddings]

    # Accumulate one vector at a time, starting from float32 zeros.
    total = np.zeros(vector_size, dtype="float32")
    for vec in matched:
        total = total + vec

    if not matched:
        return total
    return total / len(matched)

def get_avg_feature_vectors(reviews, embeddings, vector_size=100):
    """Stack the averaged embedding of every review into one matrix.

    Args:
        reviews: Sized iterable of token sequences, one per review.
        embeddings: Mapping from token to its vector.
        vector_size: Number of columns of the result.

    Returns:
        ``float32`` array of shape ``(len(reviews), vector_size)`` whose i-th
        row is ``feature_vector`` of the i-th review.
    """
    matrix = np.zeros((len(reviews), vector_size), dtype="float32")
    # Iterating a 2-D array yields views of its rows, so assigning through
    # `row[:]` writes straight into `matrix`.
    for row, tokens in zip(matrix, reviews):
        row[:] = feature_vector(tokens, embeddings, vector_size)
    return matrix

# Turn the token lists of each split into one averaged-GloVe row per review.
train_data_vecs, test_data_vecs = (
    get_avg_feature_vectors(frame['review'], glove_embeddings)
    for frame in (train_df, test_df)
)

# Persist both matrices as .npy files in the working directory; a later cell
# loads them back from these same paths.
np.save(file='trainDataVecs_glove.npy', arr=train_data_vecs)
np.save(file='testDataVecs_glove.npy', arr=test_data_vecs)

In [4]:
# Learn the label -> integer mapping from the training labels only, then use
# that same mapping to encode both splits.
le = LabelEncoder().fit(train_df['sentiment'])
y_train, y_test = (
    le.transform(frame['sentiment']) for frame in (train_df, test_df)
)

# Write the encoded labels next to the saved feature matrices.
np.save(file='y_train.npy', arr=y_train)
np.save(file='y_test.npy', arr=y_test)

# Read features and labels back from disk; the models below are trained on
# these reloaded arrays.
X_train, X_test = (
    np.load('trainDataVecs_glove.npy'),
    np.load('testDataVecs_glove.npy'),
)
y_train, y_test = (
    np.load('y_train.npy'),
    np.load('y_test.npy'),
)

In [5]:
# Function to train and evaluate a model
def train_evaluate_model(model, X_train, y_train, X_test, y_test, model_name='Model'):
    """Fit ``model``, then print its test accuracy and how long fitting took.

    Only the ``fit`` call is timed; prediction time is not included in the
    reported figure.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels compared against the predictions.
        model_name: Label used as the prefix of both printed lines.

    Returns:
        None. The accuracy and the fit time are printed.
    """
    # perf_counter is a monotonic, high-resolution clock intended for measuring
    # intervals; time.time() can jump when the system clock is adjusted.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    elapsed = time.perf_counter() - start

    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    print(f"{model_name} Time: {elapsed} seconds")

In [6]:
# Train and evaluate Random Forest
# A fixed random_state pins the forest's internal randomness, so re-running the
# notebook reports the same accuracy instead of a slightly different one each
# time. The value 42 is arbitrary.
train_evaluate_model(
    RandomForestClassifier(n_estimators=100, random_state=42),
    X_train,
    y_train,
    X_test,
    y_test,
    'Random Forest',
)

Random Forest Accuracy: 0.7634
Random Forest Time: 21.638729095458984 seconds


In [7]:
# Fit a Gaussian Naive Bayes classifier on the averaged-GloVe features and
# report its accuracy on the test split.
train_evaluate_model(
    model=GaussianNB(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name='Naive Bayes',
)

Naive Bayes Accuracy: 0.7149
Naive Bayes Time: 0.012629985809326172 seconds


In [8]:
# Fit a 5-nearest-neighbours classifier on the averaged-GloVe features and
# report its accuracy on the test split.
train_evaluate_model(
    model=KNeighborsClassifier(n_neighbors=5),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name='k-NN',
)

k-NN Accuracy: 0.7298
k-NN Time: 0.005006074905395508 seconds
