In [1]:
import pandas as pd
import numpy as np
import time
import re
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

nltk.data.path.append('/Users/moiz/nltk_data')

# Load your dataset
train_df = pd.read_csv('../Data/train.csv')
test_df = pd.read_csv('../Data/test.csv')

def preprocess_text(text, method='Lemmatization'):
    """Enhanced text preprocessing with lowercasing, punctuation removal."""
    text = text.lower()  # Lowercase text
    text = re.sub(r'[\d\W]+', ' ', text)  # Remove punctuation and numbers
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    
    if method == 'stemming':
        stemmer = PorterStemmer()
        tokens = [stemmer.stem(word) for word in tokens]
    else:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    return ' '.join(tokens)

# Apply preprocessing
train_df['processed_reviews'] = train_df['review'].apply(lambda x: preprocess_text(x))
test_df['processed_reviews'] = test_df['review'].apply(lambda x: preprocess_text(x))

In [2]:
# Feature Extraction - Example with TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train = vectorizer.fit_transform(train_df['processed_reviews'])
X_test = vectorizer.transform(test_df['processed_reviews'])

# Label encoding
le = LabelEncoder()
y_train = le.fit_transform(train_df['sentiment'])
y_test = le.transform(test_df['sentiment'])

# Model Training and Evaluation
# Example with MultinomialNB and RandomForestClassifier
nb_model = MultinomialNB()
rf_model = RandomForestClassifier(n_estimators=100)
knn_model = KNeighborsClassifier(n_neighbors=5)

# Training and evaluating each model
for model in [nb_model, rf_model, knn_model]:
    start_time = time.time()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    end_time = time.time()
    print(f"{model.__class__.__name__} Accuracy:", accuracy_score(y_test, predictions))
    print(f"Training time: {end_time - start_time} seconds")

MultinomialNB Accuracy: 0.86325
Training time: 0.012458086013793945 seconds
RandomForestClassifier Accuracy: 0.84925
Training time: 22.505813121795654 seconds
KNeighborsClassifier Accuracy: 0.72185
Training time: 734.6023411750793 seconds


In [3]:
# Ensemble Method - Basic Voting Classifier
ensemble = VotingClassifier(estimators=[
    ('nb', nb_model),
    ('rf', rf_model),
    ('knn', knn_model)
], voting='hard')

start_time = time.time()
ensemble.fit(X_train, y_train)
end_time = time.time()
ensemble_predictions = ensemble.predict(X_test)
print("Ensemble Accuracy:", accuracy_score(y_test, ensemble_predictions))

# Cross-validation Example for MultinomialNB
scores = cross_val_score(nb_model, X_train, y_train, cv=5)
print("Average cross-validation score for MultinomialNB: {:.2f}".format(scores.mean()))
print(f"Training time: {end_time - start_time} seconds")

Ensemble Accuracy: 0.86715
Average cross-validation score for MultinomialNB: 0.86
Training time: 55.61999988555908 seconds
