In [None]:
# Import libraries
import sys
from pathlib import Path

# Make the project's src/ directory importable before loading local modules
sys.path.append('../src')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from data_preprocessing import TextPreprocessor, load_data
from feature_extraction import FeatureExtractor
from train_model import SentimentModel, prepare_train_test_split, train_multiple_models
from evaluate_model import ModelEvaluator, compare_models

import warnings

# Suppress every warning for the rest of the session to keep cell output
# readable. NOTE(review): this is a blanket filter, so warnings raised by
# later cells are hidden as well — consider narrowing it to specific
# categories.
warnings.filterwarnings('ignore')

# Confirmation that every import above succeeded (the check-mark emoji was
# previously stored mis-encoded and printed as garbage characters).
print("✅ All modules imported successfully!")

## 1. Load and Preprocess Data

In [None]:
# Read the raw Coursera reviews file and preview the first rows
raw_reviews_path = '../data/raw/coursera_reviews.csv'
df = load_data(raw_reviews_path)
df.head()

In [None]:
# Build the text preprocessor with stop-word removal and lemmatization enabled
preprocessor = TextPreprocessor(remove_stopwords=True, lemmatize=True)

# Clean the text column and store the result in 'cleaned_review'.
# Adjust 'review' to your actual text column name.
df = preprocessor.preprocess_dataframe(df, text_column='review', output_column='cleaned_review')

# Report completion and show the first cleaned review as a quick sanity check
# (the check-mark emoji was previously stored mis-encoded).
print("\n✅ Preprocessing completed!")
print(f"Sample cleaned review:\n{df['cleaned_review'].iloc[0]}")

In [None]:
# Write the preprocessed frame to disk.
# pandas does not create missing parent directories, so make sure the output
# folder exists first (it may be absent on a fresh checkout).
processed_path = Path('../data/processed/cleaned_reviews.csv')
processed_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_path, index=False)
print("✅ Processed data saved to data/processed/cleaned_reviews.csv")

## 2. Feature Extraction (TF-IDF)

In [None]:
# Configure the TF-IDF extractor: at most 5000 features, n-gram range (1, 2)
tfidf_settings = {'method': 'tfidf', 'max_features': 5000, 'ngram_range': (1, 2)}
extractor = FeatureExtractor(**tfidf_settings)

# Fit the extractor on the cleaned text and build the feature matrix.
# NOTE(review): the extractor is fit on the full dataset before the
# train/test split, so statistics learned here presumably include the test
# documents — consider fitting on the training portion only.
X = extractor.fit_transform(df['cleaned_review'])
y = df['sentiment']  # Adjust to your actual label column

print(f"\nFeature matrix shape: {X.shape}", f"Labels shape: {y.shape}", sep="\n")

# Persist the fitted vectorizer alongside the models
extractor.save_vectorizer('../models/tfidf_vectorizer.pkl')

## 3. Train-Test Split

In [None]:
# Split features and labels into train and test parts (test_size=0.2);
# the fixed random_state keeps the split repeatable across runs
split_parts = prepare_train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## 4. Train Baseline Models

### 4.1 Logistic Regression

In [None]:
# Fit the logistic-regression baseline on the training split
lr_model = SentimentModel(model_type='logistic')
lr_model.train(X_train, y_train)

# Evaluate on the test split: print the metrics (kept for the later
# comparison) and the classification report
sentiment_labels = ['Negative', 'Neutral', 'Positive']
lr_evaluator = ModelEvaluator(
    lr_model.model,
    X_test,
    y_test,
    class_names=sentiment_labels,
)
lr_metrics = lr_evaluator.print_metrics()
lr_evaluator.print_classification_report()

In [None]:
# Plot the logistic-regression confusion matrix and save the figure
lr_cm_path = '../reports/figures/confusion_matrix_lr.png'
lr_evaluator.plot_confusion_matrix(save_path=lr_cm_path)

### 4.2 Naive Bayes

In [None]:
# Fit the naive-Bayes baseline on the training split
nb_model = SentimentModel(model_type='naive_bayes')
nb_model.train(X_train, y_train)

# Evaluate on the test split: print the metrics (kept for the later
# comparison) and the classification report
sentiment_labels = ['Negative', 'Neutral', 'Positive']
nb_evaluator = ModelEvaluator(
    nb_model.model,
    X_test,
    y_test,
    class_names=sentiment_labels,
)
nb_metrics = nb_evaluator.print_metrics()
nb_evaluator.print_classification_report()

In [None]:
# Plot the naive-Bayes confusion matrix and save the figure
nb_cm_path = '../reports/figures/confusion_matrix_nb.png'
nb_evaluator.plot_confusion_matrix(save_path=nb_cm_path)

### 4.3 Random Forest

In [None]:
# Fit the random-forest baseline on the training split
rf_model = SentimentModel(model_type='random_forest')
rf_model.train(X_train, y_train)

# Evaluate on the test split: print the metrics (kept for the later
# comparison) and the classification report
rf_evaluator = ModelEvaluator(rf_model.model, X_test, y_test, class_names=['Negative', 'Neutral', 'Positive'])
rf_metrics = rf_evaluator.print_metrics()
rf_evaluator.print_classification_report()

# Plot and save the confusion matrix, matching the logistic-regression and
# naive-Bayes sections so all three baselines are reported the same way
rf_evaluator.plot_confusion_matrix(save_path='../reports/figures/confusion_matrix_rf.png')

## 5. Compare Models

In [None]:
# Pair each display name with its trained wrapper and its test-set metrics
trained_models = {
    'Logistic Regression': (lr_model, lr_metrics),
    'Naive Bayes': (nb_model, nb_metrics),
    'Random Forest': (rf_model, rf_metrics),
}

# For every model collect the wrapper itself, its accuracy on the training
# split, and the test accuracy recorded during evaluation
results = {
    label: {
        'model': wrapper,
        'train_accuracy': wrapper.model.score(X_train, y_train),
        'test_accuracy': metrics['accuracy'],
    }
    for label, (wrapper, metrics) in trained_models.items()
}

comparison = compare_models(results)

## 6. Save Best Model

In [None]:
# Pick the entry with the highest value in the 'Test Accuracy' column.
# NOTE(review): assumes `comparison` is indexed by the same model names used
# as keys in `results` — confirm against compare_models.
best_model_name = comparison['Test Accuracy'].idxmax()
best_model = results[best_model_name]['model']

# Persist the winning model (the check-mark emoji in the message was
# previously stored mis-encoded)
best_model.save_model('../models/best_baseline_model.pkl')
print(f"\n✅ Best model ({best_model_name}) saved!")

## 7. Test Predictions on Sample Reviews

In [None]:
# Hand-written reviews used as a quick smoke test of the saved pipeline
sample_reviews = [
    "This course is absolutely amazing! I learned so much.",
    "Terrible experience, waste of time and money.",
    "It was okay, nothing special but not bad either."
]

# Run the samples through the same preprocessor and fitted extractor that
# were used for the training data
cleaned_samples = [preprocessor.preprocess(review) for review in sample_reviews]
sample_features = extractor.transform(cleaned_samples)

# Predicted labels and per-class probabilities for each sample
predictions = best_model.predict(sample_features)
probabilities = best_model.predict_proba(sample_features)

# Display each review with its predicted label and the highest class
# probability (the header emoji was previously stored mis-encoded)
print("\n🔮 Sample Predictions:\n")
for review, label, class_probs in zip(sample_reviews, predictions, probabilities):
    print(f"Review: {review}")
    print(f"Predicted Sentiment: {label}")
    print(f"Confidence: {class_probs.max():.2%}")
    print("-" * 80)

## 8. Summary & Next Steps

**Results:** _(TODO: fill in after running the notebook end to end)_
- Best model: [Model name]
- Test accuracy: [X%]
- Key findings: [List]

**Next Steps:**
1. Hyperparameter tuning (Grid Search / Random Search)
2. Try different feature representations (Word2Vec, FastText)
3. Experiment with deep learning models (LSTM, BERT)
4. Implement aspect-based sentiment analysis