In [35]:
!pip install seaborn


Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting matplotlib!=3.6.1,>=3.4 (from seaborn)
  Downloading matplotlib-3.10.1-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting cycler>=0.10 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Downloading fonttools-4.57.0-cp312-cp312-win_amd64.whl.metadata (104 kB)
     ---------------------------------------- 0.0/104.6 kB ? eta -:--:--
     ---------------------- ---------------- 61.4/104.6 kB 1.7 MB/s eta 0:00:01
     -------------------------------------- 104.6/104.6 kB 1.2 MB/s eta 0:00:00
Collecting kiwisolver>=1.3.1 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Downloading kiwisolver-1.4.8-cp312-cp312-win_amd64.whl.metadata (6.3 kB)
Collecting pyparsing>=2.3.1 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Downloading pyparsing-3.2.3-py3-none-any.whl.metadata (5.0 kB)
Downloading



In [54]:
#1. Data Processing
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [55]:
# Load dataset
df = pd.read_csv('Reviews.csv')
df = df[['Score', 'Summary', 'Text']].dropna()

In [56]:
# Combine summary and review text
df['review'] = df['Summary'] + " " + df['Text']

In [57]:
# Convert Score to sentiment (1-2: negative, 3: neutral, 4-5: positive)
def get_sentiment(score):
    if score <= 2:
        return 'negative'
    elif score == 3:
        return 'neutral'
    else:
        return 'positive'

df['sentiment'] = df['Score'].apply(get_sentiment)

In [58]:
# Sample data for faster training
df_sampled = df.sample(10000, random_state=42)

In [59]:
#Text Preprocessing
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [42]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nafiz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nafiz\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\nafiz\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [43]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [44]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'\@w+|\#','', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

df_sampled['clean_review'] = df_sampled['review'].apply(clean_text)

In [45]:
#2. Feature Extraction 
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df_sampled['clean_review'])


In [46]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(df_sampled['sentiment'])

In [47]:
# 3. Model Selection
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': LinearSVC()
}

results = {}

In [49]:
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"\n{name} - Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred, target_names=le.classes_))
    results[name] = acc


Naive Bayes - Accuracy: 0.7980
              precision    recall  f1-score   support

    negative       0.96      0.08      0.15       303
     neutral       0.00      0.00      0.00       125
    positive       0.80      1.00      0.89      1572

    accuracy                           0.80      2000
   macro avg       0.59      0.36      0.34      2000
weighted avg       0.77      0.80      0.72      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Logistic Regression - Accuracy: 0.8430
              precision    recall  f1-score   support

    negative       0.77      0.45      0.57       303
     neutral       0.23      0.02      0.04       125
    positive       0.85      0.98      0.91      1572

    accuracy                           0.84      2000
   macro avg       0.62      0.49      0.51      2000
weighted avg       0.80      0.84      0.81      2000


SVM - Accuracy: 0.8540
              precision    recall  f1-score   support

    negative       0.72      0.59      0.65       303
     neutral       0.30      0.11      0.16       125
    positive       0.89      0.96      0.92      1572

    accuracy                           0.85      2000
   macro avg       0.64      0.56      0.58      2000
weighted avg       0.83      0.85      0.84      2000



In [50]:
# 4. Lexicon-Based Approach using VADER 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

vader = SentimentIntensityAnalyzer()

def get_vader_sentiment(text):
    score = vader.polarity_scores(text)
    if score['compound'] >= 0.05:
        return 'positive'
    elif score['compound'] <= -0.05:
        return 'negative'
    else:
        return 'neutral'


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\nafiz\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [51]:
df_sampled['vader_sentiment'] = df_sampled['review'].apply(get_vader_sentiment)
vader_accuracy = accuracy_score(df_sampled['sentiment'], df_sampled['vader_sentiment'])
print(f"\nVADER Lexicon-Based Accuracy: {vader_accuracy:.4f}")
results['VADER'] = vader_accuracy


VADER Lexicon-Based Accuracy: 0.8175


In [52]:
# 5. Discussion Section
discussion = """
Discussion: 
Among the models tested, Support Vector Machine (SVM) and Logistic Regression performed the best in terms of accuracy. 
Naive Bayes was slightly behind but still decent. Lexicon-based approach using VADER provided fast results but 
struggled with neutral sentiment and longer reviews due to lack of context understanding.

Strengths:
- SVM and Logistic Regression are robust for high-dimensional text data like TF-IDF.
- VADER is fast and interpretable, ideal for quick sentiment snapshots.

Weaknesses:
- Lexicon-based models cannot adapt to new data patterns or context.
- ML models require training data, computational power, and tuning for best results.
"""

print(discussion)


Discussion: 
Among the models tested, Support Vector Machine (SVM) and Logistic Regression performed the best in terms of accuracy. 
Naive Bayes was slightly behind but still decent. Lexicon-based approach using VADER provided fast results but 
struggled with neutral sentiment and longer reviews due to lack of context understanding.

Strengths:
- SVM and Logistic Regression are robust for high-dimensional text data like TF-IDF.
- VADER is fast and interpretable, ideal for quick sentiment snapshots.

Weaknesses:
- Lexicon-based models cannot adapt to new data patterns or context.
- ML models require training data, computational power, and tuning for best results.

