In [13]:
# Import necessary libraries
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import ast
import re
import time

# Read the raw train and test splits from the shared data directory
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../Data/train.csv', '../Data/test.csv')
)

# Preprocess text
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, building it only once.

    The set is cached on the function object after the first call, so
    ``stopwords.words('english')`` is invoked a single time and every later
    membership test is a constant-time set lookup.
    """
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def clean_text(text):
    """Normalize one raw review string into a list of content tokens.

    Steps, in order: lowercase, remove digit runs, remove ASCII punctuation,
    tokenize with ``word_tokenize``, drop English stop words.

    Args:
        text: The review text as a ``str``.

    Returns:
        list: The remaining tokens, in their original order.
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Fetch the stop-word set once per call instead of once per token.
    stop_words = _english_stopwords()
    return [word for word in word_tokenize(text) if word not in stop_words]

In [14]:
# Tokenize into new frames and leave the raw ones untouched so this cell can be
# re-run: clean_text expects strings, and overwriting the 'review' column in
# place would leave token lists behind for the next run to choke on.
train_clean_df = train_df.assign(review=train_df['review'].apply(clean_text))
test_clean_df = test_df.assign(review=test_df['review'].apply(clean_text))

# Save preprocessed data
train_clean_df.to_csv('train_preprocessed.csv', index=False)
test_clean_df.to_csv('test_preprocessed.csv', index=False)

In [15]:
# Reload the tokenized training set; each token list was written to CSV as its
# string form, so parse it back into a real list
train_df = pd.read_csv('train_preprocessed.csv').assign(
    review=lambda frame: frame['review'].apply(ast.literal_eval)
)

# Fit the embedding model on the list of token lists
sentences = list(train_df['review'])
word2vec_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)

# Persist the trained model to disk
word2vec_model.save("custom_word2vec.model")

In [16]:
# Reload the Word2Vec model from the file written by word2vec_model.save
word2vec_model = Word2Vec.load("custom_word2vec.model")

# Function to create averaged word vectors
def feature_vector(words, model):
    """Average the embeddings of the in-vocabulary tokens of one review.

    Args:
        words: Iterable of tokens.
        model: Trained model exposing ``vector_size`` and a ``wv`` store that
            offers a ``key_to_index`` mapping and ``wv[word]`` lookup.

    Returns:
        numpy.ndarray: 1-D array of length ``model.vector_size`` holding the
        mean vector of the tokens found in the vocabulary, or all zeros when
        none of them is found.
    """
    featureVec = np.zeros((model.vector_size,), dtype="float32")
    # key_to_index is the vocabulary mapping itself, so membership is a dict
    # lookup and no set of the whole vocabulary is rebuilt on every call.
    vocab = model.wv.key_to_index
    nwords = 0
    for word in words:
        if word in vocab:
            nwords += 1
            featureVec = np.add(featureVec, model.wv[word])
    # Guard the division so a review with no known token stays a zero vector.
    if nwords > 0:
        featureVec = np.divide(featureVec, nwords)
    return featureVec

def get_avg_feature_vectors(reviews, model):
    """Stack the averaged embedding of every review into one float32 matrix.

    Args:
        reviews: Sized iterable of token lists.
        model: Model passed through to ``feature_vector``; its ``vector_size``
            sets the number of columns.

    Returns:
        numpy.ndarray: Array of shape ``(len(reviews), model.vector_size)``
        whose rows follow the iteration order of ``reviews``.
    """
    matrix = np.zeros((len(reviews), model.vector_size), dtype="float32")
    # Iterating the matrix yields row views, so writing through them fills it.
    for row, tokens in zip(matrix, reviews):
        row[:] = feature_vector(tokens, model)
    return matrix

# Reload the tokenized test set and parse each stored list string back into a list
test_df = pd.read_csv('test_preprocessed.csv').assign(
    review=lambda frame: frame['review'].apply(ast.literal_eval)
)

# Build the averaged-embedding matrix for each split and persist it right away
trainDataVecs = get_avg_feature_vectors(train_df['review'], word2vec_model)
np.save('trainDataVecs.npy', trainDataVecs)

testDataVecs = get_avg_feature_vectors(test_df['review'], word2vec_model)
np.save('testDataVecs.npy', testDataVecs)

In [17]:
# Encode sentiment labels
# The encoder is fitted on the training labels only; the test labels are then
# mapped through that same fitted encoder.
le = LabelEncoder()
y_train = le.fit_transform(train_df['sentiment'])
y_test = le.transform(test_df['sentiment'])

# Save labels
# Written to disk so the modelling cells can np.load them back.
np.save('y_train.npy', y_train)
np.save('y_test.npy', y_test)

In [18]:
# Function to train and evaluate a model
def train_evaluate_model(model, X_train, y_train, X_test, y_test, model_name='Model'):
    """Fit ``model``, score it on the test split, and print accuracy and fit time.

    Only the ``fit`` call is timed; the time spent in ``predict`` is not part
    of the reported figure.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: True labels for the test split.
        model_name: Prefix used on both printed lines.

    Returns:
        None. The accuracy and the elapsed fit time are printed.
    """
    # perf_counter is a monotonic, high-resolution clock, so the measured
    # interval cannot be skewed by system clock adjustments.
    start = time.perf_counter()
    model.fit(X_train, y_train)
    elapsed = time.perf_counter() - start
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    print(f"{model_name} Time: {elapsed:.2f} seconds")

# Reload the persisted feature matrices and label arrays for training
X_train, X_test, y_train, y_test = (
    np.load(npy_path)
    for npy_path in ('trainDataVecs.npy', 'testDataVecs.npy', 'y_train.npy', 'y_test.npy')
)

In [19]:
# Train and evaluate Random Forest
# random_state fixes the forest's random sampling so the reported accuracy is
# the same on every run instead of drifting between kernel restarts.
train_evaluate_model(RandomForestClassifier(n_estimators=100, random_state=42), X_train, y_train, X_test, y_test, 'Random Forest')

Random Forest Accuracy: 0.8225
Random Forest Time: 21.19 seconds


In [20]:
# Gaussian Naive Bayes on the same train/test feature matrices
train_evaluate_model(
    model=GaussianNB(),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name='Naive Bayes',
)

Naive Bayes Accuracy: 0.7579
Naive Bayes Time: 0.02 seconds


In [21]:
# k-nearest neighbours (k = 5) on the same train/test feature matrices
train_evaluate_model(
    model=KNeighborsClassifier(n_neighbors=5),
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name='k-NN',
)

k-NN Accuracy: 0.7842
k-NN Time: 0.00 seconds
