In [None]:

# Cell 1: Setup and Imports
# ====================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session so cell output stays short.
# NOTE(review): this also hides deprecation and convergence warnings from the
# modelling libraries used in later cells.
warnings.filterwarnings('ignore')

# Set style
# 'seaborn-v0_8-darkgrid' only exists in matplotlib >= 3.6; older releases ship
# the same sheet as 'seaborn-darkgrid'. Use whichever is installed instead of
# failing the first cell; if neither exists the default style is kept.
for _style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette("husl")

print("✓ Libraries imported successfully")


In [None]:
# Cell 2: Create Directories
# ====================================
import os

# Working folders for the notebook: later cells write the dataset CSV under
# 'data' and figures/tables under 'results'.
directories = ['data', 'models', 'results']
for dir_name in directories:
    # exist_ok=True makes the cell safe to re-run when the folder already exists.
    os.makedirs(dir_name, exist_ok=True)
    print(f"✓ Directory '{dir_name}' ready")


In [None]:
# Cell 3: Data Collection
# ====================================
# Section banner: a 60-character rule above and below the step title.
print("="*60)
print("STEP 1: DATA COLLECTION")
print("="*60)

# Create sample dataset
def create_sample_data(n_samples=3000, random_state=42):
    """Build a synthetic, shuffled review dataset with star ratings.

    The data is made by cycling through ten fixed template sentences per
    class; no text variation is added, so the frame contains at most 30
    distinct texts. Class proportions are 50% positive (rating 5),
    30% negative (rating 1) and the remainder neutral (rating 3).

    Args:
        n_samples: Total number of rows to generate (must be >= 0).
        random_state: Seed passed to ``DataFrame.sample`` for the shuffle.

    Returns:
        pandas.DataFrame with columns ``text`` and ``rating``, shuffled and
        re-indexed from 0.

    Raises:
        ValueError: If ``n_samples`` is negative.
    """
    if n_samples < 0:
        raise ValueError("n_samples must be non-negative")

    # Positive samples (rating 4-5)
    positive_texts = [
        "This product is absolutely amazing! Highly recommended.",
        "Excellent quality, exceeded my expectations. Worth every penny!",
        "Best purchase ever! Love it! Will definitely buy again.",
        "Outstanding product, will recommend to friends. Very satisfied.",
        "Perfect! Exactly what I needed. Great quality and fast shipping.",
        "Superb product! Excellent value for money. Very happy with purchase.",
        "Fantastic! Works perfectly. Better than expected quality.",
        "Love this product! Great features and easy to use.",
        "Brilliant purchase! Exceptional quality and service.",
        "Amazing product! Exceeded all my expectations. Five stars!"
    ]

    # Negative samples (rating 1-2)
    negative_texts = [
        "Terrible quality, complete waste of money. Very disappointed.",
        "Very disappointed, does not work as advertised. Don't buy!",
        "Poor quality, broke after one use. Would not recommend.",
        "Not worth the price, very bad quality. Returning immediately.",
        "Horrible experience, waste of time and money. Avoid this product!",
        "Cheaply made, fell apart quickly. Terrible customer service.",
        "Defective product, doesn't work at all. Very frustrating.",
        "Absolutely awful! Not as described. Complete disappointment.",
        "Useless product, total waste. Save your money!",
        "Worst purchase ever! Broke immediately. Very poor quality."
    ]

    # Neutral samples (rating 3)
    neutral_texts = [
        "It's okay, nothing special. Average product for the price.",
        "Average product, does the job but nothing extraordinary.",
        "Not bad, but not great either. Acceptable quality.",
        "Decent for the price. Works as expected, nothing more.",
        "Works as expected, nothing special. Fair quality.",
        "Acceptable product, meets basic expectations.",
        "It's fine, does what it says. Neither good nor bad.",
        "Mediocre quality, but functional. Could be better.",
        "Standard product, nothing to complain about.",
        "Fair quality, serves its purpose adequately."
    ]

    n_positive = int(n_samples * 0.5)
    n_negative = int(n_samples * 0.3)
    n_neutral = n_samples - n_positive - n_negative

    # Generate dataset
    texts, ratings = [], []
    for templates, rating, count in (
        (positive_texts, 5, n_positive),
        (negative_texts, 1, n_negative),
        (neutral_texts, 3, n_neutral),
    ):
        # Repeat the templates past `count`, then cut to exactly `count`.
        # Without the cut each class contributes more texts than ratings, which
        # shifts every later text onto the wrong class's rating.
        repeats = count // len(templates) + 1
        texts.extend((templates * repeats)[:count])
        ratings.extend([rating] * count)

    # Create DataFrame
    df = pd.DataFrame({
        'text': texts,
        'rating': ratings
    })

    # Shuffle so the classes are interleaved; a fixed seed keeps runs repeatable.
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return df

# Create and save data
# Build the 3000-row synthetic frame; it is written to disk further down,
# once the sentiment columns have been added.
df = create_sample_data(3000)

# Add sentiment labels
def rating_to_sentiment(rating):
    """Translate a star rating into an ``(id, label)`` sentiment pair.

    Ratings of 2 or less give ``(0, 'negative')``, a rating of exactly 3
    gives ``(1, 'neutral')``, and anything else gives ``(2, 'positive')``.
    """
    if rating == 3:
        return 1, 'neutral'
    return (0, 'negative') if rating <= 2 else (2, 'positive')

# rating_to_sentiment yields one (id, label) pair per row; zip(*...) transposes
# those pairs into a tuple of ids and a tuple of labels, one per new column.
df['sentiment'], df['sentiment_label'] = zip(*df['rating'].apply(rating_to_sentiment))

# Save
# index=False keeps the positional row index out of the CSV.
df.to_csv('data/raw_reviews.csv', index=False)

# Quick sanity output: row count, per-class counts and the first rows.
print(f"✓ Dataset created: {len(df)} samples")
print(f"\nClass Distribution:")
print(df['sentiment_label'].value_counts())
print(f"\n{df.head()}")


In [None]:
# Cell 4: Data Analysis
# ====================================
# Adds two length columns to `df`, prints their summary statistics and saves a
# 2x2 overview figure to results/data_analysis.png.
print("\n" + "="*60)
print("DATA ANALYSIS")
print("="*60)

# Statistics
# text_length counts characters; word_count counts whitespace-separated tokens.
df['text_length'] = df['text'].apply(len)
df['word_count'] = df['text'].apply(lambda x: len(x.split()))

print("\nText Length Statistics:")
print(df['text_length'].describe())

print("\nWord Count Statistics:")
print(df['word_count'].describe())

# Visualizations
# One figure, four panels; each plot below targets a specific cell of `axes`.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Class distribution
df['sentiment_label'].value_counts().plot(kind='bar', ax=axes[0,0], color='skyblue')
axes[0,0].set_title('Class Distribution', fontsize=14, fontweight='bold')
axes[0,0].set_xlabel('Sentiment')
axes[0,0].set_ylabel('Count')

# Text length by sentiment
# One semi-transparent histogram per label, overlaid on the same axes.
for sentiment in df['sentiment_label'].unique():
    data = df[df['sentiment_label']==sentiment]['text_length']
    axes[0,1].hist(data, alpha=0.6, label=sentiment, bins=30)
axes[0,1].set_title('Text Length Distribution', fontsize=14, fontweight='bold')
axes[0,1].set_xlabel('Text Length')
axes[0,1].legend()

# Word count distribution
df['word_count'].hist(bins=30, ax=axes[1,0], color='coral')
axes[1,0].set_title('Word Count Distribution', fontsize=14, fontweight='bold')
axes[1,0].set_xlabel('Word Count')

# Rating distribution
# sort_index() orders the bars by rating value rather than by frequency.
df['rating'].value_counts().sort_index().plot(kind='bar', ax=axes[1,1], color='lightgreen')
axes[1,1].set_title('Rating Distribution', fontsize=14, fontweight='bold')
axes[1,1].set_xlabel('Rating')

# Save before show so the file is written from the fully laid-out figure.
plt.tight_layout()
plt.savefig('results/data_analysis.png', dpi=300, bbox_inches='tight')
plt.show()




In [None]:
# Cell 5: Text Preprocessing
# ====================================
print("\n" + "="*60)
print("STEP 2: TEXT PREPROCESSING")
print("="*60)

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download required NLTK data
# Each resource is probed on its own. Probing only 'punkt' would skip the other
# downloads whenever punkt happens to be installed, and stopwords.words() below
# would then raise LookupError. 'punkt_tab' is the tokenizer data that recent
# NLTK releases load for word_tokenize in place of 'punkt'.
_NLTK_RESOURCES = {
    'punkt': 'tokenizers/punkt',
    'punkt_tab': 'tokenizers/punkt_tab',
    'stopwords': 'corpora/stopwords',
    'wordnet': 'corpora/wordnet',
    'omw-1.4': 'corpora/omw-1.4',
}
for _package, _resource_path in _NLTK_RESOURCES.items():
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package)

# Shared by clean_text below: a set gives O(1) stop-word membership tests.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalise one review into a space-joined string of lemmatised tokens.

    Steps: lower-case, delete every character that is not an ASCII letter or
    whitespace, collapse whitespace, tokenize, drop tokens found in the
    module-level ``stop_words`` set and lemmatize the rest with ``lemmatizer``.

    Punctuation is deleted rather than replaced by a space, so a contraction
    such as "don't" becomes the single token "dont".
    NOTE(review): negation words may be in the stop-word list and would then be
    removed here — confirm that is acceptable for sentiment features.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    normalised = re.sub(r'\s+', ' ', letters_only).strip()

    kept = []
    for token in word_tokenize(normalised):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

# Apply preprocessing
# Runs clean_text row by row; the original 'text' column is kept next to the
# new 'clean_text' column so both can be compared below.
print("Preprocessing texts...")
df['clean_text'] = df['text'].apply(clean_text)

print("✓ Preprocessing completed")
# Show the first row before and after cleaning as a spot check.
print(f"\nExample:")
print(f"Original: {df['text'].iloc[0]}")
print(f"Cleaned: {df['clean_text'].iloc[0]}")




In [None]:
# Cell 6: Feature Extraction
# ====================================
# Splits the cleaned reviews into train/validation/test sets and fits a TF-IDF
# vocabulary on the training part only.
print("\n" + "="*60)
print("STEP 3: FEATURE EXTRACTION")
print("="*60)

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

# Split data
X = df['clean_text']
y = df['sentiment']

# First cut: hold out 20% of all rows as the test set, stratified by class.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Second cut: 0.1875 of the remaining 80% is 15% of all rows, which gives a
# 65% / 15% / 20% train / validation / test split overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1875, random_state=42, stratify=y_temp
)

print(f"Training set: {len(X_train)} samples")
print(f"Validation set: {len(X_val)} samples")
print(f"Test set: {len(X_test)} samples")

# TF-IDF Features
# Unigrams and bigrams, capped at 5000 features. The vectorizer is fitted on
# the training texts only; validation and test texts are transformed with that
# same fitted vocabulary.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)
X_test_tfidf = tfidf.transform(X_test)

print(f"\n✓ TF-IDF features: {X_train_tfidf.shape[1]} dimensions")




In [None]:
# Cell 7: Train Traditional ML Models
# ====================================
print("\n" + "="*60)
print("STEP 4: TRAINING TRADITIONAL ML MODELS")
print("="*60)

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time


def _fit_and_score(model, X_tr, y_tr, X_va, y_va):
    """Fit ``model`` and summarise it in the record format used by ``results``.

    Args:
        model: Unfitted estimator exposing ``fit`` and ``predict``.
        X_tr, y_tr: Training features and labels.
        X_va, y_va: Validation features and labels.

    Returns:
        dict with keys ``train_acc``, ``val_acc``, ``time`` (seconds spent in
        ``fit``), ``model`` (the fitted estimator) and ``predictions``
        (validation-set predictions).
    """
    start = time.time()
    model.fit(X_tr, y_tr)
    elapsed = time.time() - start

    val_pred = model.predict(X_va)
    return {
        'train_acc': accuracy_score(y_tr, model.predict(X_tr)),
        'val_acc': accuracy_score(y_va, val_pred),
        'time': elapsed,
        'model': model,
        'predictions': val_pred
    }


# The three estimators keep their own names so they stay reachable after the loop.
lr = LogisticRegression(C=1.0, max_iter=1000, random_state=42)
svm = LinearSVC(C=1.0, max_iter=1000, random_state=42)
rf = RandomForestClassifier(n_estimators=100, max_depth=20, random_state=42, n_jobs=-1)

classical_models = {
    'Logistic Regression': lr,
    'Linear SVM': svm,
    'Random Forest': rf,
}

# Re-created on every run so re-executing the cell never keeps stale entries.
results = {}

for step, (model_name, estimator) in enumerate(classical_models.items(), start=1):
    print(f"\n[{step}/{len(classical_models)}] Training {model_name}...")
    results[model_name] = _fit_and_score(
        estimator, X_train_tfidf, y_train, X_val_tfidf, y_val
    )
    print(f"✓ Completed in {results[model_name]['time']:.2f}s | Val Acc: {results[model_name]['val_acc']:.4f}")




In [None]:
# Cell 8: Train Deep Learning Models
# ====================================
print("\n" + "="*60)
print("STEP 5: TRAINING DEEP LEARNING MODELS")
print("="*60)

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable from a fresh kernel.
tf.keras.utils.set_random_seed(42)

# Shared by the tokenizer, the padding calls and the Embedding layer; they must
# agree, so they are named once here.
VOCAB_SIZE = 10000
MAX_SEQ_LEN = 100


def _predict_and_score(model, features, labels):
    """Return ``(accuracy, predicted_class_ids)`` for a softmax classifier.

    Scoring goes through ``model.predict`` on the model's current weights, so
    the numbers describe the same weights that produce the stored predictions.
    """
    predicted = np.argmax(model.predict(features, verbose=0), axis=1)
    return accuracy_score(labels, predicted), predicted


# Prepare sequences
# The tokenizer vocabulary is fitted on the training texts only; words unseen
# at fit time map to the '<OOV>' token.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_SEQ_LEN)
X_val_seq = pad_sequences(tokenizer.texts_to_sequences(X_val), maxlen=MAX_SEQ_LEN)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_SEQ_LEN)

# Dense layers are fed dense arrays; convert the sparse TF-IDF matrices once
# instead of on every use.
X_train_dense = X_train_tfidf.toarray()
X_val_dense = X_val_tfidf.toarray()

# 1. MLP Model
print("\n[1/2] Training MLP...")
mlp = models.Sequential([
    layers.Dense(128, activation='relu', input_shape=(X_train_tfidf.shape[1],)),
    layers.Dropout(0.5),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(3, activation='softmax')
])

mlp.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

early_stop = callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

start = time.time()
history_mlp = mlp.fit(
    X_train_dense, y_train,
    validation_data=(X_val_dense, y_val),
    epochs=20, batch_size=32, callbacks=[early_stop], verbose=0
)
mlp_time = time.time() - start

# restore_best_weights can roll the model back to an earlier epoch, so the last
# entry of the training history need not describe the final weights. Score the
# final model directly; this also measures training accuracy with dropout off,
# like the scikit-learn rows.
mlp_train_acc = _predict_and_score(mlp, X_train_dense, y_train)[0]
mlp_val_acc, y_val_pred_mlp = _predict_and_score(mlp, X_val_dense, y_val)

results['MLP'] = {
    'train_acc': mlp_train_acc,
    'val_acc': mlp_val_acc,
    'time': mlp_time,
    'model': mlp,
    'history': history_mlp.history,
    'predictions': y_val_pred_mlp
}
print(f"✓ Completed in {mlp_time:.2f}s | Val Acc: {results['MLP']['val_acc']:.4f}")

# 2. LSTM Model
print("\n[2/2] Training LSTM...")
lstm = models.Sequential([
    layers.Embedding(VOCAB_SIZE, 64, input_length=MAX_SEQ_LEN),
    layers.LSTM(32, dropout=0.2),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(3, activation='softmax')
])

lstm.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

start = time.time()
history_lstm = lstm.fit(
    X_train_seq, y_train,
    validation_data=(X_val_seq, y_val),
    epochs=10, batch_size=64, callbacks=[early_stop], verbose=0
)
lstm_time = time.time() - start

# Same reasoning as for the MLP: score the weights the model actually ends with.
lstm_train_acc = _predict_and_score(lstm, X_train_seq, y_train)[0]
lstm_val_acc, y_val_pred_lstm = _predict_and_score(lstm, X_val_seq, y_val)

results['LSTM'] = {
    'train_acc': lstm_train_acc,
    'val_acc': lstm_val_acc,
    'time': lstm_time,
    'model': lstm,
    'history': history_lstm.history,
    'predictions': y_val_pred_lstm
}
print(f"✓ Completed in {lstm_time:.2f}s | Val Acc: {results['LSTM']['val_acc']:.4f}")




In [None]:
# Cell 9: Model Comparison
# ====================================
print("\n" + "="*60)
print("STEP 6: MODEL COMPARISON")
print("="*60)

# Create comparison table
# One row per entry in `results`. Values are stored as fixed-precision strings;
# 'Overfitting' is the train-minus-validation accuracy gap.
comparison_data = [
    {
        'Model': model_name,
        'Train Accuracy': f"{res['train_acc']:.4f}",
        'Val Accuracy': f"{res['val_acc']:.4f}",
        'Training Time (s)': f"{res['time']:.2f}",
        'Overfitting': f"{res['train_acc'] - res['val_acc']:.4f}"
    }
    for model_name, res in results.items()
]

# Highest validation accuracy first, so row 0 is the top-ranked model.
comparison_df = (
    pd.DataFrame(comparison_data)
    .sort_values('Val Accuracy', ascending=False)
)

print("\n" + comparison_df.to_string(index=False))

# Save
comparison_df.to_csv('results/model_comparison.csv', index=False)




In [None]:
# Cell 10: Visualizations
# ====================================
print("\n" + "=" * 60)
print("CREATING VISUALIZATIONS")
print("=" * 60)

# Confusion matrices: one panel per trained model, three panels per row. The
# grid grows with the number of models instead of assuming exactly five.
n_cols = 3
n_rows = max(1, (len(results) + n_cols - 1) // n_cols)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 6 * n_rows), squeeze=False)
panel_axes = axes.ravel()

# Sentiment ids as produced by rating_to_sentiment: 0=negative, 1=neutral,
# 2=positive. Passing them explicitly keeps every matrix 3x3 and aligned with
# the tick labels even if a class is missing from the validation predictions.
class_ids = [0, 1, 2]
class_names = ['Neg', 'Neu', 'Pos']

for ax, (model_name, res) in zip(panel_axes, results.items()):
    cm = confusion_matrix(y_val, res['predictions'], labels=class_ids)

    sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    ax.set_title(f"{model_name}\nAcc: {res['val_acc']:.3f}")
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

# Hide every grid cell that did not receive a model.
for ax in panel_axes[len(results):]:
    ax.axis('off')

plt.tight_layout()
plt.savefig('results/confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.show()

# Model comparison bar chart
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Named `model_names` rather than `models`, so the `models` module imported
# from tensorflow.keras in the deep-learning cell is not overwritten.
# comparison_data stores numbers as formatted strings, hence float().
model_names = [res['Model'] for res in comparison_data]
val_accs = [float(res['Val Accuracy']) for res in comparison_data]
times = [float(res['Training Time (s)']) for res in comparison_data]

axes[0].barh(model_names, val_accs, color='skyblue')
axes[0].set_xlabel('Validation Accuracy')
axes[0].set_title('Model Accuracy Comparison')
axes[0].grid(axis='x', alpha=0.3)

axes[1].barh(model_names, times, color='coral')
axes[1].set_xlabel('Training Time (seconds)')
axes[1].set_title('Training Time Comparison')
axes[1].grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig('results/model_comparison_chart.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ All visualizations saved to 'results/' folder")



In [None]:
# Cell 11: Final Report
# ====================================
print("\n" + "=" * 80)
print("FINAL REPORT")
print("=" * 80)

# comparison_df is sorted by validation accuracy (descending), so row 0 is the
# top-ranked model. All figures below are validation-set numbers.
best_model = comparison_df.iloc[0]

print(f"\n✓ BEST PERFORMING MODEL")
print(f"  Model: {best_model['Model']}")
print(f"  Validation Accuracy: {best_model['Val Accuracy']}")
print(f"  Training Time: {best_model['Training Time (s)']}s")
print(f"  Overfitting: {best_model['Overfitting']}")

print(f"\n✓ PROJECT COMPLETED SUCCESSFULLY!")
print(f"\nGenerated Files:")
# Paths exactly as written by the earlier cells; the comparison chart is saved
# as model_comparison_chart.png, not model_comparison.png.
generated_files = [
    'data/raw_reviews.csv',
    'results/data_analysis.png',
    'results/confusion_matrices.png',
    'results/model_comparison_chart.png',
    'results/model_comparison.csv',
]
for file_path in generated_files:
    print(f"  - {file_path}")

print("\n" + "=" * 80)