In [1]:
# Import libraries
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

from data_preprocessing import load_data
from feature_extraction import FeatureExtractor
from train_model import prepare_train_test_split
from evaluate_model import ModelEvaluator

import warnings
warnings.filterwarnings('ignore')

print("‚úÖ All modules imported successfully!")

‚úÖ All modules imported successfully!


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\param\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 1. Load Processed Data

In [2]:
# Load cleaned data
df = load_data('../data/processed/cleaned_reviews.csv')
print(f"Loaded {len(df)} reviews")

# Remove rows with NaN or empty cleaned_review
print(f"\nüîç Original dataset size: {len(df)}")
df = df[df['cleaned_review'].notna() & (df['cleaned_review'].str.strip() != '')].reset_index(drop=True)
print(f"‚úÖ After removing empty reviews: {len(df)}")

df.head()

‚úÖ Loaded 140320 reviews from ../data/processed/cleaned_reviews.csv
Loaded 140320 reviews

üîç Original dataset size: 140320
‚úÖ After removing empty reviews: 135939


Unnamed: 0,CourseId,Review,Label,cleaned_review
0,2-speed-it,BOring,1,boring
1,2-speed-it,Bravo !,5,bravo
2,2-speed-it,Very goo,5,goo
3,2-speed-it,"Great course - I recommend it for all, especia...",5,great course recommend especially business man...
4,2-speed-it,One of the most useful course on IT Management!,5,one useful course management


## 2. Feature Extraction

In [3]:
# Load saved vectorizer or create new one
import joblib

# Convert ratings to sentiment if needed
if 'sentiment' not in df.columns and 'Label' in df.columns:
    print("\nüîÑ Converting ratings to sentiment labels...")
    def rating_to_sentiment(rating):
        if rating >= 4:
            return 'Positive'
        elif rating <= 2:
            return 'Negative'
        else:
            return 'Neutral'
    df['sentiment'] = df['Label'].apply(rating_to_sentiment)
    print("‚úÖ Conversion complete!")
    print(f"Sentiment distribution:\n{df['sentiment'].value_counts()}")

# Extract features
try:
    extractor = FeatureExtractor()
    extractor.load_vectorizer('../models/tfidf_vectorizer.pkl')
    X = extractor.transform(df['cleaned_review'])
except:
    print("Creating new vectorizer...")
    extractor = FeatureExtractor(method='tfidf', max_features=5000, ngram_range=(1, 2))
    X = extractor.fit_transform(df['cleaned_review'])

y = df['sentiment']  # Use sentiment labels

print(f"\nFeature matrix shape: {X.shape}")
print(f"Labels shape: {y.shape}")
print(f"\nLabel distribution:\n{y.value_counts()}")


üîÑ Converting ratings to sentiment labels...
‚úÖ Conversion complete!
Sentiment distribution:
sentiment
Positive    124855
Neutral       5781
Negative      5303
Name: count, dtype: int64
‚úÖ Vectorizer loaded from ../models/tfidf_vectorizer.pkl

Feature matrix shape: (135939, 5000)
Labels shape: (135939,)

Label distribution:
sentiment
Positive    124855
Neutral       5781
Negative      5303
Name: count, dtype: int64


In [4]:
# Split data
X_train, X_test, y_train, y_test = prepare_train_test_split(X, y, test_size=0.2, random_state=42)


üìä Data Split:
   Training samples: 108751
   Testing samples: 27188
   Features: 5000


## 3. Hyperparameter Tuning - Logistic Regression

In [None]:
# Define parameter grid
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [1000, 2000]
}

# Initialize model
lr = LogisticRegression(random_state=42)

# Grid search
print("üîç Performing Grid Search for Logistic Regression...")
grid_lr = GridSearchCV(
    lr,
    param_grid_lr,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

grid_lr.fit(X_train, y_train)

print(f"\n‚úÖ Best parameters: {grid_lr.best_params_}")
print(f"‚úÖ Best cross-validation score: {grid_lr.best_score_:.4f}")
print(f"‚úÖ Test accuracy: {grid_lr.score(X_test, y_test):.4f}")

üîç Performing Grid Search for Logistic Regression...
Fitting 5 folds for each of 40 candidates, totalling 200 fits


## 4. Hyperparameter Tuning - Random Forest

In [None]:
# Define parameter grid
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Use RandomizedSearchCV for faster results
rf = RandomForestClassifier(random_state=42)

print("üîç Performing Randomized Search for Random Forest...")
random_rf = RandomizedSearchCV(
    rf,
    param_grid_rf,
    n_iter=20,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=42
)

random_rf.fit(X_train, y_train)

print(f"\n‚úÖ Best parameters: {random_rf.best_params_}")
print(f"‚úÖ Best cross-validation score: {random_rf.best_score_:.4f}")
print(f"‚úÖ Test accuracy: {random_rf.score(X_test, y_test):.4f}")

## 5. Try Gradient Boosting

In [6]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier

print("üöÄ Training Gradient Boosting Classifier...")
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)

gb.fit(X_train, y_train)

print(f"\n‚úÖ Training accuracy: {gb.score(X_train, y_train):.4f}")
print(f"‚úÖ Test accuracy: {gb.score(X_test, y_test):.4f}")

üöÄ Training Gradient Boosting Classifier...


NameError: name 'X_train' is not defined

## 6. Compare Optimized Models

In [None]:
# Evaluate all models
models = {
    'Optimized Logistic Regression': grid_lr.best_estimator_,
    'Optimized Random Forest': random_rf.best_estimator_,
    'Gradient Boosting': gb
}

results = {}

for name, model in models.items():
    train_acc = model.score(X_train, y_train)
    test_acc = model.score(X_test, y_test)
    
    results[name] = {
        'Train Accuracy': train_acc,
        'Test Accuracy': test_acc
    }

# Create comparison dataframe
comparison_df = pd.DataFrame(results).T
print("\nüìä Model Comparison:")
print(comparison_df)

# Plot comparison
comparison_df.plot(kind='bar', figsize=(10, 6), rot=0)
plt.title('Optimized Models Comparison', fontsize=14, fontweight='bold')
plt.ylabel('Accuracy', fontsize=12)
plt.xlabel('Model', fontsize=12)
plt.legend(['Train Accuracy', 'Test Accuracy'])
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('../reports/figures/optimized_models_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. Detailed Evaluation of Best Model

In [None]:
# Find best model
best_model_name = comparison_df['Test Accuracy'].idxmax()
best_model = models[best_model_name]

print(f"\nüèÜ Best Model: {best_model_name}")
print(f"   Test Accuracy: {comparison_df.loc[best_model_name, 'Test Accuracy']:.4f}")

# Comprehensive evaluation
evaluator = ModelEvaluator(
    best_model,
    X_test,
    y_test,
    class_names=['Negative', 'Neutral', 'Positive']  # Adjust as needed
)

evaluator.print_metrics()
evaluator.print_classification_report()

In [None]:
# Confusion matrix
evaluator.plot_confusion_matrix(save_path='../reports/figures/confusion_matrix_best.png')

## 8. Save Optimized Model

In [None]:
# Save the best model
import joblib

model_path = '../models/optimized_best_model.pkl'
joblib.dump(best_model, model_path)
print(f"‚úÖ Best optimized model saved to {model_path}")

## 9. Feature Importance (if applicable)

In [None]:
# Get feature importance for tree-based models
if hasattr(best_model, 'feature_importances_'):
    feature_names = extractor.get_feature_names()
    importances = best_model.feature_importances_
    
    # Get top 20 features
    indices = np.argsort(importances)[-20:][::-1]
    top_features = [(feature_names[i], importances[i]) for i in indices]
    
    # Create dataframe
    importance_df = pd.DataFrame(top_features, columns=['Feature', 'Importance'])
    
    # Plot
    plt.figure(figsize=(10, 8))
    sns.barplot(data=importance_df, x='Importance', y='Feature', palette='viridis')
    plt.title('Top 20 Most Important Features', fontsize=14, fontweight='bold')
    plt.xlabel('Importance', fontsize=12)
    plt.ylabel('Feature', fontsize=12)
    plt.tight_layout()
    plt.savefig('../reports/figures/feature_importance.png', dpi=300, bbox_inches='tight')
    plt.show()
    
    print("\nüîù Top 10 Most Important Features:")
    print(importance_df.head(10).to_string(index=False))
else:
    print("\n‚ö†Ô∏è Selected model doesn't support feature importance.")

## 10. Summary

**Optimization Results:**
- Best optimized model: [Name]
- Improvement over baseline: [X%]
- Final test accuracy: [Y%]

**Key Findings:**
- Most important features identified
- Optimal hyperparameters found
- Model ready for deployment

**Next Steps:**
1. Experiment with transformer models (BERT)
2. Implement aspect-based sentiment analysis
3. Build deployment pipeline