Use test.csv from https://www.kaggle.com/datasets/kritanjalijain/amazon-reviews

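Rename test.csv -> amazon_reviews.csv (the file name the notebook loads), and make sure it has a header row with the columns `reviewText` and `polarity`, which the code below reads.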

In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load the dataset.
# The rest of the notebook reads two columns from 'amazon_reviews.csv':
# 'reviewText' (the raw review string) and 'polarity' (the class label).
data = pd.read_csv('amazon_reviews.csv')

# Fail fast with a clear message if the header does not match: otherwise a
# missing 'polarity' column only surfaces after the slow preprocessing and
# embedding steps have already run.
REQUIRED_COLUMNS = ('reviewText', 'polarity')
missing_columns = [col for col in REQUIRED_COLUMNS if col not in data.columns]
if missing_columns:
    raise KeyError(
        f"amazon_reviews.csv is missing expected column(s) {missing_columns}; "
        f"found {list(data.columns)}"
    )

# English stop words, held in a set for fast membership tests during preprocessing
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize one review into a space-joined string of content words.

    The text is lowercased, every character that is neither a word character
    nor whitespace is stripped, the result is tokenized with ``word_tokenize``
    and tokens found in ``stop_words`` are dropped.

    Args:
        text: Raw review text. Missing values (``None``/NaN, which pandas
            yields for empty CSV cells) are treated as an empty review; any
            other non-string value is converted with ``str``.

    Returns:
        str: The remaining tokens joined by single spaces ('' if none remain).
    """
    if not isinstance(text, str):
        # Empty CSV cells arrive as float NaN, which has no .lower().
        if pd.isna(text):
            return ''
        text = str(text)
    # Lowercase, remove punctuation and stop words
    text = re.sub(r'[^\w\s]', '', text.lower())
    tokens = word_tokenize(text)
    return ' '.join([word for word in tokens if word not in stop_words])

# Clean every review and store the result alongside the raw text
raw_reviews = data['reviewText']
data['cleaned_review'] = raw_reviews.map(preprocess_text)


In [2]:
import gensim.downloader as api
import numpy as np

# Pre-trained Word2Vec vectors (Google News), fetched through gensim's downloader
W2V_MODEL_NAME = "word2vec-google-news-300"
w2v_model = api.load(W2V_MODEL_NAME)

# Function to convert review text to an embedding
def get_review_embedding(text):
    """Average the Word2Vec vectors of the in-vocabulary words of a review.

    Args:
        text: Whitespace-separated review text (e.g. a 'cleaned_review' value).

    Returns:
        numpy.ndarray: 1-D vector of length ``w2v_model.vector_size``. When no
        token is in the model's vocabulary, an all-zero vector with the same
        dtype as the model's vectors is returned, so stacking the embeddings
        does not upcast the whole feature matrix.
    """
    tokens = text.split()
    word_vectors = [w2v_model[word] for word in tokens if word in w2v_model]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    # No token matched: take size and dtype from the model instead of
    # hard-coding 300 / float64.
    return np.zeros(w2v_model.vector_size, dtype=w2v_model.vectors.dtype)

# Embed every cleaned review, then assemble the feature matrix and label vector
review_embeddings = data['cleaned_review'].map(get_review_embedding)
data['embedding'] = review_embeddings
X = np.vstack(review_embeddings.to_numpy())
y = data['polarity'].to_numpy()


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Hold out 20% of the reviews for evaluation (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the logistic-regression classifier on the training embeddings
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict labels for the held-out reviews
y_pred = model.predict(X_test)

# Score the predictions.
# f1_score is binary here and scores label 1 only (its default pos_label,
# spelled out below so the choice is visible).
# NOTE(review): confirm label 1 is the class the headline F1 should describe;
# if the other label is the class of interest, pass pos_label=2 instead.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, pos_label=1)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Classification Report:\n", report)


Accuracy: 0.8153375
F1 Score: 0.8161121276622229
Classification Report:
               precision    recall  f1-score   support

           1       0.81      0.82      0.82     39896
           2       0.82      0.81      0.81     40104

    accuracy                           0.82     80000
   macro avg       0.82      0.82      0.82     80000
weighted avg       0.82      0.82      0.82     80000

