In [2]:
import nltk
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.data.path.append('/Users/moiz/nltk_data')

# Load the dataset
train_df = pd.read_csv('../Data/train.csv')
test_df = pd.read_csv('../Data/test.csv')

In [3]:
used = []

In [4]:
# Function to preprocess text
used.append("Stemmization")
def preprocess_text(text, method='stemming'):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Stemming/Lemmatization
    if method == 'stemming':
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]
    else:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

# Apply preprocessing
train_df['processed_reviews'] = train_df['review'].apply(preprocess_text)
test_df['processed_reviews'] = test_df['review'].apply(preprocess_text)

In [5]:
# Vectorizing text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['processed_reviews'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['processed_reviews'])

In [6]:
# count_vectorizer = CountVectorizer(max_features=5000)
# X_train_count = count_vectorizer.fit_transform(train_df['processed_reviews'])
# X_test_count = count_vectorizer.transform(test_df['processed_reviews'])
# used.append("Count Vectorizer")

In [7]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Label encoding
le = LabelEncoder()
y_train = le.fit_transform(train_df['sentiment'])
y_test = le.transform(test_df['sentiment'])

In [8]:
for i in used:
    print(i)

Stemmization


In [9]:
# Naive Bayes
nb_model = MultinomialNB()
start_time = time.time()
nb_model.fit(X_train_tfidf, y_train)
end_time = time.time()
nb_predictions = nb_model.predict(X_test_tfidf)
used.append("Naive Bayes")
print("Naive Bayes Accuracy:", accuracy_score(y_test, nb_predictions))
print(f"Training time (Naive): {end_time - start_time} seconds")

Naive Bayes Accuracy: 0.85155
Training time (Naive): 0.011884212493896484 seconds


In [10]:
# Random Forest
rf_model = RandomForestClassifier(n_estimators=100)
start_time = time.time()
rf_model.fit(X_train_tfidf, y_train)
end_time = time.time()
rf_predictions = rf_model.predict(X_test_tfidf)
print("Random Forest Accuracy:", accuracy_score(y_test, rf_predictions))
print(f"Training time (RandomForest): {end_time - start_time} seconds")

Random Forest Accuracy: 0.8496
Training time (RandomForest): 39.69712996482849 seconds


In [11]:
# k-NN
knn_model = KNeighborsClassifier(n_neighbors=5)
start_time = time.time()
knn_model.fit(X_train_tfidf, y_train)
end_time = time.time()
knn_predictions = knn_model.predict(X_test_tfidf)
print("k-NN Accuracy:", accuracy_score(y_test, knn_predictions))
print(f"Training time (KNN): {end_time - start_time} seconds")

k-NN Accuracy: 0.7415
Training time (KNN): 0.019961118698120117 seconds
