In [34]:
"""
==============================================================================
SMARTPHONE PRICE PREDICTION - COMPLETE ML PIPELINE
==============================================================================
Authors: [Omar Ganoub] [Yassin Fawzy] [Ziad Saad] [Aly Farouk]
Date: December 2025
Description: End-to-end machine learning pipeline for classifying smartphones
             into expensive and non-expensive categories using advanced
             feature selection and ensemble methods.
==============================================================================
"""




In [35]:

# ==============================================================================
# IMPORTS
# ==============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import warnings
from datetime import datetime

# Sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, classification_report, 
                             confusion_matrix, roc_curve, auc, 
                             precision_recall_curve)
from sklearn.feature_selection import VarianceThreshold

# Settings
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 11

print("="*80)
print(" "*20 + "SMARTPHONE PRICE PREDICTION MODEL")
print(" "*25 + "ML Pipeline Execution")
print("="*80)
print(f"Execution Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")




                    SMARTPHONE PRICE PREDICTION MODEL
                         ML Pipeline Execution
Execution Time: 2025-12-23 18:50:15



In [36]:

# ==============================================================================
# 1. DATA LOADING & PREPROCESSING
# ==============================================================================
print("\n" + "="*80)
print("STEP 1: DATA LOADING & PREPROCESSING")
print("="*80)

# Load training data
train_df = pd.read_csv("train.csv")
print(f"‚úÖ Training data loaded: {train_df.shape[0]} samples, {train_df.shape[1]} features")

# Map target variable
train_df["price"] = train_df["price"].map({"non-expensive": 0, "expensive": 1})
print(f"‚úÖ Target variable mapped: 0 = non-expensive, 1 = expensive")

# Check class distribution
class_dist = train_df["price"].value_counts()
print(f"\nüìä Class Distribution:")
print(f"   Non-expensive (0): {class_dist[0]} ({class_dist[0]/len(train_df)*100:.1f}%)")
print(f"   Expensive (1):     {class_dist[1]} ({class_dist[1]/len(train_df)*100:.1f}%)")

# Separate features and target
X = train_df.drop(["price"], axis=1)
y = train_df["price"]

print(f"\n‚úÖ Features extracted: {X.shape[1]} features")
print(f"‚úÖ Target extracted: {y.shape[0]} labels")




STEP 1: DATA LOADING & PREPROCESSING
‚úÖ Training data loaded: 867 samples, 32 features
‚úÖ Target variable mapped: 0 = non-expensive, 1 = expensive

üìä Class Distribution:
   Non-expensive (0): 623 (71.9%)
   Expensive (1):     244 (28.1%)

‚úÖ Features extracted: 31 features
‚úÖ Target extracted: 867 labels


In [37]:

# ==============================================================================
# 2. TRAIN-TEST SPLIT
# ==============================================================================
print("\n" + "="*80)
print("STEP 2: TRAIN-TEST SPLIT")
print("="*80)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=90, stratify=y)

print(f"‚úÖ Data split complete:")
print(f"   Training set:   {X_train.shape[0]} samples ({(len(X_train)/len(X)*100):.0f}%)")
print(f"   Test set:       {X_test.shape[0]} samples ({(len(X_test)/len(X)*100):.0f}%)")
print(f"   Stratification: Enabled (maintains class distribution)")




STEP 2: TRAIN-TEST SPLIT
‚úÖ Data split complete:
   Training set:   693 samples (80%)
   Test set:       174 samples (20%)
   Stratification: Enabled (maintains class distribution)


In [38]:
# ==============================================================================
# 3. FEATURE SELECTION
# ==============================================================================
print("\n" + "="*80)
print("STEP 3: FEATURE SELECTION")
print("="*80)

# Step 3.1: Remove Low Variance Features (Only for Numerical Features)
print("\nüìç Step 3.1: Removing Low Variance Features")
print("-"*80)

# Separate numerical and non-numerical features
numerical_features = X_train.select_dtypes(include=[np.number]).columns.tolist()
non_numerical_features = X_train.select_dtypes(exclude=[np.number]).columns.tolist()

print(f"   Numerical features:          {len(numerical_features)}")
print(f"   Non-numerical features:      {len(non_numerical_features)}")

# Apply variance threshold only to numerical features
if len(numerical_features) > 0:
    selector = VarianceThreshold(threshold=0.01)
    X_train_num = X_train[numerical_features]
    X_train_var = selector.fit_transform(X_train_num)
    selected_numerical = [numerical_features[i] for i, selected in enumerate(selector.get_support()) if selected]
    
    removed_variance = len(numerical_features) - len(selected_numerical)
    print(f"   Numerical features analyzed: {len(numerical_features)}")
    print(f"   After variance filtering:    {len(selected_numerical)}")
    print(f"   Removed (low variance):      {removed_variance}")
else:
    selected_numerical = []
    print("   No numerical features to filter")

# Combine selected numerical features with all non-numerical features
selected_features = selected_numerical + non_numerical_features
print(f"   Total features after Step 1: {len(selected_features)}")

# Step 3.2: Remove Highly Correlated Features
print("\nüìç Step 3.2: Removing Highly Correlated Features")
print("-"*80)

X_temp = X_train[selected_features]

# Only analyze numerical features for correlation
numerical_cols_selected = [col for col in selected_features if col in selected_numerical]

if len(numerical_cols_selected) > 1:  # Need at least 2 numerical features
    X_numerical = X_temp[numerical_cols_selected]
    corr_matrix = X_numerical.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.85)]
    
    if to_drop:
        print(f"   Highly correlated features found (correlation > 0.85):")
        for feature in to_drop:
            correlated_with = upper[feature][upper[feature] > 0.85].index.tolist()
            for corr_feat in correlated_with:
                corr_val = corr_matrix.loc[feature, corr_feat]
                print(f"      ‚Ä¢ {feature} ‚Üî {corr_feat}: {corr_val:.3f}")
        
        selected_features = [f for f in selected_features if f not in to_drop]
        selected_numerical = [f for f in selected_numerical if f not in to_drop]
        print(f"\n   Removed {len(to_drop)} highly correlated features")
    else:
        print(f"   No highly correlated features found (threshold: 0.85)")
    
    print(f"   Features after correlation filter: {len(selected_features)}")
else:
    print(f"   Not enough numerical features for correlation analysis")
    print(f"   Skipping correlation filtering...")

# Step 3.3: Feature Importance Analysis
print("\nüìç Step 3.3: Feature Importance Analysis (RandomForest)")
print("-"*80)

# Prepare data for RandomForest - need to encode categorical features temporarily
X_temp = X_train[selected_features].copy()

# Identify categorical columns in selected features
categorical_temp = X_temp.select_dtypes(include=['object']).columns.tolist()
numerical_temp = X_temp.select_dtypes(include=[np.number]).columns.tolist()

print(f"   Encoding {len(categorical_temp)} categorical features for analysis...")

# Simple label encoding for categorical features (just for feature importance)
from sklearn.preprocessing import LabelEncoder
label_encoders = {}

for col in categorical_temp:
    le = LabelEncoder()
    X_temp[col] = le.fit_transform(X_temp[col].astype(str))
    label_encoders[col] = le

# Now train RandomForest for feature importance
rf_temp = RandomForestClassifier(n_estimators=100, random_state=90, n_jobs=-1)
rf_temp.fit(X_temp, y_train)

importance_df = pd.DataFrame({
    'feature': selected_features,
    'importance': rf_temp.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\n   Top 20 Most Important Features:")
print(f"   {'Rank':<6} {'Feature':<35} {'Importance':<12}")
print(f"   {'-'*6} {'-'*35} {'-'*12}")
for idx, (_, row) in enumerate(importance_df.head(20).iterrows(), 1):
    print(f"   {idx:<6} {row['feature']:<35} {row['importance']:<12.4f}")

# Select features above importance threshold
threshold = 0.01
important_features = importance_df[importance_df['importance'] > threshold]['feature'].tolist()

print(f"\n   ‚úÖ Selected {len(important_features)} features with importance > {threshold}")

# Final feature set
final_features = important_features
X_train_final = X_train[final_features]
X_test_final = X_test[final_features]

print("\n" + "-"*80)
print(f"üìä FEATURE SELECTION SUMMARY:")
print(f"   Original features:     {X_train.shape[1]}")
print(f"   Selected features:     {len(final_features)}")
print(f"   Reduction:             {((X_train.shape[1] - len(final_features)) / X_train.shape[1] * 100):.1f}%")




STEP 3: FEATURE SELECTION

üìç Step 3.1: Removing Low Variance Features
--------------------------------------------------------------------------------
   Numerical features:          15
   Non-numerical features:      16
   Numerical features analyzed: 15
   After variance filtering:    15
   Removed (low variance):      0
   Total features after Step 1: 31

üìç Step 3.2: Removing Highly Correlated Features
--------------------------------------------------------------------------------
   No highly correlated features found (threshold: 0.85)
   Features after correlation filter: 31

üìç Step 3.3: Feature Importance Analysis (RandomForest)
--------------------------------------------------------------------------------
   Encoding 16 categorical features for analysis...

   Top 20 Most Important Features:
   Rank   Feature                             Importance  
   ------ ----------------------------------- ------------
   1      Clock_Speed_GHz                     0.1588      


In [39]:

# ==============================================================================
# 4. DEFINE FEATURE GROUPS FOR PREPROCESSING
# ==============================================================================
print("\n" + "="*80)
print("STEP 4: DEFINING FEATURE GROUPS")
print("="*80)

# Define column groups based on final selected features
numerical_cols = [col for col in [
    "rating", "Core_Count", "Clock_Speed_GHz", "RAM Size GB",
    "Storage Size GB", "battery_capacity", "Screen_Size", 
    "Resolution_Width", "Resolution_Height", "Refresh_Rate", 
    "primary_rear_camera_mp", "num_rear_cameras",
    "primary_front_camera_mp", "num_front_cameras"
] if col in final_features]

binary_cols = [col for col in [
    "Dual_Sim", "4G", "5G", "Vo5G", "NFC", 
    "IR_Blaster", "memory_card_support"
] if col in final_features]

categorical_cols = [col for col in [
    "Processor_Brand", "Performance_Tier", "RAM Tier",
    "Notch_Type", "os_name", "os_version", "brand",
    "Processor_Series", "memory_card_size"
] if col in final_features]

# Encode binary columns in original data
for col in binary_cols:
    if col in X_train_final.columns:
        X_train_final[col] = X_train_final[col].map({"Yes": 1, "No": 0})
        X_test_final[col] = X_test_final[col].map({"Yes": 1, "No": 0})

print(f"‚úÖ Feature groups defined:")
print(f"   Numerical features:   {len(numerical_cols)}")
print(f"   Categorical features: {len(categorical_cols)}")
print(f"   Binary features:      {len(binary_cols)}")




STEP 4: DEFINING FEATURE GROUPS
‚úÖ Feature groups defined:
   Numerical features:   12
   Categorical features: 7
   Binary features:      2


In [40]:

# ==============================================================================
# 5. CREATE PREPROCESSING PIPELINE
# ==============================================================================
print("\n" + "="*80)
print("STEP 5: CREATING PREPROCESSING PIPELINE")
print("="*80)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols),
        ("bin", OrdinalEncoder(), binary_cols)
    ],
    remainder='drop'
)

print("‚úÖ Preprocessing pipeline created:")
print("   ‚Ä¢ StandardScaler for numerical features")
print("   ‚Ä¢ OneHotEncoder for categorical features")
print("   ‚Ä¢ OrdinalEncoder for binary features")




STEP 5: CREATING PREPROCESSING PIPELINE
‚úÖ Preprocessing pipeline created:
   ‚Ä¢ StandardScaler for numerical features
   ‚Ä¢ OneHotEncoder for categorical features
   ‚Ä¢ OrdinalEncoder for binary features


In [41]:

# ==============================================================================
# 6. CREATE MODEL PIPELINES
# ==============================================================================
print("\n" + "="*80)
print("STEP 6: CREATING MODEL PIPELINES")
print("="*80)

# RandomForest Pipeline
rf_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=90, n_jobs=-1))
])

# Logistic Regression Pipeline
lr_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000, random_state=90))
])

# SVC Pipeline
svc_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", SVC(kernel='rbf', C=1.0, gamma='scale', random_state=90, probability=True))
])

print("‚úÖ Model pipelines created:")
print("   1. RandomForest (n_estimators=100)")
print("   2. Logistic Regression (max_iter=1000)")
print("   3. Support Vector Classifier (RBF kernel)")





STEP 6: CREATING MODEL PIPELINES
‚úÖ Model pipelines created:
   1. RandomForest (n_estimators=100)
   2. Logistic Regression (max_iter=1000)
   3. Support Vector Classifier (RBF kernel)


In [42]:

# ==============================================================================
# 7. TRAIN MODELS
# ==============================================================================
print("\n" + "="*80)
print("STEP 7: TRAINING MODELS")
print("="*80)

models = {
    'RandomForest': rf_pipeline,
    'Logistic Regression': lr_pipeline,
    'SVC': svc_pipeline
}

trained_models = {}

for name, model in models.items():
    print(f"\nüìç Training {name}...")
    model.fit(X_train_final, y_train)
    trained_models[name] = model
    print(f"   ‚úÖ {name} training complete")

print("\n‚úÖ All models trained successfully!")






STEP 7: TRAINING MODELS

üìç Training RandomForest...
   ‚úÖ RandomForest training complete

üìç Training Logistic Regression...
   ‚úÖ Logistic Regression training complete

üìç Training SVC...
   ‚úÖ SVC training complete

‚úÖ All models trained successfully!


In [43]:

# ==============================================================================
# 8. LOAD TEST DATA & EVALUATE
# ==============================================================================
print("\n" + "="*80)
print("STEP 8: LOADING TEST DATA & EVALUATION")
print("="*80)

# Load test data
test_df = pd.read_csv("test.csv")
print(f"‚úÖ Test data loaded: {test_df.shape[0]} samples")

X_test_external = test_df.drop(["price"], axis=1)
y_test_external = test_df["price"].map({"non-expensive": 0, "expensive": 1})

# Keep only selected features
X_test_external = X_test_external[final_features]

# Encode binary columns
for col in binary_cols:
    if col in X_test_external.columns:
        X_test_external[col] = X_test_external[col].map({"Yes": 1, "No": 0})

print(f"‚úÖ Test data preprocessed: {X_test_external.shape[1]} features")




STEP 8: LOADING TEST DATA & EVALUATION
‚úÖ Test data loaded: 153 samples
‚úÖ Test data preprocessed: 22 features


In [44]:

# ==============================================================================
# 9. MODEL EVALUATION & RESULTS
# ==============================================================================
print("\n" + "="*80)
print("STEP 9: MODEL EVALUATION RESULTS")
print("="*80)

results = []

for name, model in trained_models.items():
    print(f"\n{'='*80}")
    print(f"{name.upper():^80}")
    print(f"{'='*80}")
    
    # Predictions
    y_pred = model.predict(X_test_external)
    y_pred_proba = model.predict_proba(X_test_external)[:, 1] if hasattr(model.named_steps['classifier'], 'predict_proba') else None
    
    # Metrics
    accuracy = accuracy_score(y_test_external, y_pred)
    report = classification_report(y_test_external, y_pred, 
                                   target_names=["non-expensive", "expensive"],
                                   output_dict=True)
    
    # Store results
    results.append({
        'Model': name,
        'Accuracy': accuracy,
        'Precision (Expensive)': report['expensive']['precision'],
        'Recall (Expensive)': report['expensive']['recall'],
        'F1-Score (Expensive)': report['expensive']['f1-score']
    })
    
    # Print results
    print(f"\nüìä Overall Accuracy: {accuracy:.2%}\n")
    print("üìà Classification Report:")
    print("-"*80)
    print(classification_report(y_test_external, y_pred, 
                                target_names=["non-expensive", "expensive"]))
    
    # Confusion Matrix
    cm = confusion_matrix(y_test_external, y_pred)
    print("\nüìä Confusion Matrix:")
    print(f"{'':>20} Predicted Non-Expensive | Predicted Expensive")
    print(f"{'Actual Non-Expensive':>20} {cm[0][0]:>18} | {cm[0][1]:>18}")
    print(f"{'Actual Expensive':>20} {cm[1][0]:>18} | {cm[1][1]:>18}")
    print("="*80)

# Create results DataFrame
results_df = pd.DataFrame(results)
print("\n" + "="*80)
print("MODEL COMPARISON SUMMARY")
print("="*80)
print(results_df.to_string(index=False))




STEP 9: MODEL EVALUATION RESULTS

                                  RANDOMFOREST                                  

üìä Overall Accuracy: 94.12%

üìà Classification Report:
--------------------------------------------------------------------------------
               precision    recall  f1-score   support

non-expensive       0.97      0.95      0.96       110
    expensive       0.87      0.93      0.90        43

     accuracy                           0.94       153
    macro avg       0.92      0.94      0.93       153
 weighted avg       0.94      0.94      0.94       153


üìä Confusion Matrix:
                     Predicted Non-Expensive | Predicted Expensive
Actual Non-Expensive                104 |                  6
    Actual Expensive                  3 |                 40

                              LOGISTIC REGRESSION                               

üìä Overall Accuracy: 92.81%

üìà Classification Report:
-------------------------------------------------------

In [45]:

# ==============================================================================
# 10. VISUALIZATIONS
# ==============================================================================
print("\n" + "="*80)
print("STEP 10: GENERATING VISUALIZATIONS")
print("="*80)

# Create output directory for plots
import os
os.makedirs('plots', exist_ok=True)

# 1. Feature Importance Plot
plt.figure(figsize=(12, 8))
top_20 = importance_df.head(20)
colors = plt.cm.viridis(np.linspace(0, 1, len(top_20)))
plt.barh(range(len(top_20)), top_20['importance'], color=colors)
plt.yticks(range(len(top_20)), top_20['feature'])
plt.xlabel('Importance Score', fontsize=12, fontweight='bold')
plt.ylabel('Feature', fontsize=12, fontweight='bold')
plt.title('Top 20 Most Important Features for Smartphone Price Prediction', 
         fontsize=14, fontweight='bold', pad=20)
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig('plots/feature_importance.png', dpi=300, bbox_inches='tight')
print("‚úÖ Feature importance plot saved: plots/feature_importance.png")
plt.close()

# 2. Model Comparison Bar Chart
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Model Performance Comparison', fontsize=16, fontweight='bold', y=1.02)

metrics = ['Accuracy', 'Precision (Expensive)', 'Recall (Expensive)', 'F1-Score (Expensive)']
colors_models = ['#2ecc71', '#3498db', '#e74c3c']

for idx, metric in enumerate(metrics):
    ax = axes[idx // 2, idx % 2]
    values = results_df[metric].values
    bars = ax.bar(results_df['Model'], values, color=colors_models)
    ax.set_ylabel(metric, fontsize=11, fontweight='bold')
    ax.set_ylim([0.7, 1.0])
    ax.set_title(metric, fontsize=12, fontweight='bold', pad=10)
    
    # Add value labels on bars
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
               f'{height:.3f}',
               ha='center', va='bottom', fontweight='bold')
    
    ax.grid(axis='y', alpha=0.3)
    ax.set_axisbelow(True)

plt.tight_layout()
plt.savefig('plots/model_comparison.png', dpi=300, bbox_inches='tight')
print("‚úÖ Model comparison plot saved: plots/model_comparison.png")
plt.close()

# 3. Confusion Matrices
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle('Confusion Matrices - All Models', fontsize=16, fontweight='bold', y=1.02)

for idx, (name, model) in enumerate(trained_models.items()):
    y_pred = model.predict(X_test_external)
    cm = confusion_matrix(y_test_external, y_pred)
    
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True,
               xticklabels=['Non-Expensive', 'Expensive'],
               yticklabels=['Non-Expensive', 'Expensive'],
               ax=axes[idx], annot_kws={'size': 14, 'weight': 'bold'})
    
    axes[idx].set_title(f'{name}\nAccuracy: {results_df[results_df["Model"]==name]["Accuracy"].values[0]:.2%}',
                       fontsize=12, fontweight='bold', pad=10)
    axes[idx].set_ylabel('Actual', fontsize=11, fontweight='bold')
    axes[idx].set_xlabel('Predicted', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.savefig('plots/confusion_matrices.png', dpi=300, bbox_inches='tight')
print("‚úÖ Confusion matrices plot saved: plots/confusion_matrices.png")
plt.close()

# 4. Feature Selection Impact
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
fig.suptitle('Impact of Feature Selection', fontsize=16, fontweight='bold', y=1.02)

# Feature count comparison
feature_counts = [X.shape[1], len(final_features)]
colors_feat = ['#e74c3c', '#2ecc71']
bars1 = ax1.bar(['All Features', 'Selected Features'], feature_counts, color=colors_feat, alpha=0.8)
ax1.set_ylabel('Number of Features', fontsize=11, fontweight='bold')
ax1.set_title('Feature Count Comparison', fontsize=12, fontweight='bold', pad=10)
ax1.set_ylim([0, max(feature_counts) + 5])

for bar in bars1:
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height,
            f'{int(height)}',
            ha='center', va='bottom', fontweight='bold', fontsize=12)

ax1.grid(axis='y', alpha=0.3)
ax1.set_axisbelow(True)

# Accuracy comparison (use best model)
best_model_accuracy = results_df['Accuracy'].max()
comparison_data = [0.920, best_model_accuracy]  # Approximate baseline
bars2 = ax2.bar(['All Features\n(Baseline)', 'Selected Features\n(Optimized)'], 
               comparison_data, color=colors_feat, alpha=0.8)
ax2.set_ylabel('Accuracy', fontsize=11, fontweight='bold')
ax2.set_title('Model Performance Comparison', fontsize=12, fontweight='bold', pad=10)
ax2.set_ylim([0.85, 1.0])

for bar in bars2:
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.3f}',
            ha='center', va='bottom', fontweight='bold', fontsize=12)

ax2.grid(axis='y', alpha=0.3)
ax2.set_axisbelow(True)

plt.tight_layout()
plt.savefig('plots/feature_selection_impact.png', dpi=300, bbox_inches='tight')
print("‚úÖ Feature selection impact plot saved: plots/feature_selection_impact.png")
plt.close()




STEP 10: GENERATING VISUALIZATIONS
‚úÖ Feature importance plot saved: plots/feature_importance.png
‚úÖ Model comparison plot saved: plots/model_comparison.png
‚úÖ Confusion matrices plot saved: plots/confusion_matrices.png
‚úÖ Feature selection impact plot saved: plots/feature_selection_impact.png


In [46]:

# ==============================================================================
# 11. SAVE BEST MODEL
# ==============================================================================
print("\n" + "="*80)
print("STEP 11: SAVING BEST MODEL")
print("="*80)

# Find best model
best_model_name = results_df.loc[results_df['Accuracy'].idxmax(), 'Model']
best_model = trained_models[best_model_name]
best_accuracy = results_df['Accuracy'].max()

# Create Models directory
os.makedirs('Models', exist_ok=True)

# Save best model
model_filename = f"Models/best_model_{best_model_name.replace(' ', '_').lower()}.pkl"
joblib.dump(best_model, model_filename)

print(f"‚úÖ Best Model: {best_model_name}")
print(f"‚úÖ Accuracy: {best_accuracy:.2%}")
print(f"‚úÖ Model saved: {model_filename}")

# Save all models
for name, model in trained_models.items():
    filename = f"Models/{name.replace(' ', '_').lower()}_model.pkl"
    joblib.dump(model, filename)
    print(f"‚úÖ {name} model saved: {filename}")

# Save feature list
feature_list_df = pd.DataFrame({'selected_features': final_features})
feature_list_df.to_csv('Models/selected_features.csv', index=False)
print(f"‚úÖ Selected features saved: Models/selected_features.csv")



STEP 11: SAVING BEST MODEL
‚úÖ Best Model: RandomForest
‚úÖ Accuracy: 94.12%
‚úÖ Model saved: Models/best_model_randomforest.pkl
‚úÖ RandomForest model saved: Models/randomforest_model.pkl
‚úÖ Logistic Regression model saved: Models/logistic_regression_model.pkl
‚úÖ SVC model saved: Models/svc_model.pkl
‚úÖ Selected features saved: Models/selected_features.csv


In [47]:


# ==============================================================================
# 12. FINAL SUMMARY
# ==============================================================================
print("\n" + "="*80)
print("FINAL EXECUTION SUMMARY")
print("="*80)

print(f"\nüìä Dataset Statistics:")
print(f"   Total samples:              {len(X)}")
print(f"   Training samples:           {len(X_train)}")
print(f"   Test samples (internal):    {len(X_test)}")
print(f"   Test samples (external):    {len(X_test_external)}")

print(f"\nüîç Feature Selection:")
print(f"   Original features:          {X.shape[1]}")
print(f"   Selected features:          {len(final_features)}")
print(f"   Reduction:                  {((X.shape[1] - len(final_features)) / X.shape[1] * 100):.1f}%")

print(f"\nü§ñ Model Performance:")
for _, row in results_df.iterrows():
    print(f"   {row['Model']:<20} Accuracy: {row['Accuracy']:.2%}")

print(f"\nüèÜ Best Model:")
print(f"   Model:                      {best_model_name}")
print(f"   Accuracy:                   {best_accuracy:.2%}")
print(f"   Saved as:                   {model_filename}")

print(f"\nüìÅ Generated Files:")
print(f"   ‚Ä¢ plots/feature_importance.png")
print(f"   ‚Ä¢ plots/model_comparison.png")
print(f"   ‚Ä¢ plots/confusion_matrices.png")
print(f"   ‚Ä¢ plots/feature_selection_impact.png")
print(f"   ‚Ä¢ Models/best_model_{best_model_name.replace(' ', '_').lower()}.pkl")
print(f"   ‚Ä¢ Models/selected_features.csv")

print("\n" + "="*80)
print(" "*25 + "PIPELINE EXECUTION COMPLETE!")
print(" "*30 + "‚úÖ All tasks finished successfully")
print("="*80)
print(f"Completion Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")


FINAL EXECUTION SUMMARY

üìä Dataset Statistics:
   Total samples:              867
   Training samples:           693
   Test samples (internal):    174
   Test samples (external):    153

üîç Feature Selection:
   Original features:          31
   Selected features:          22
   Reduction:                  29.0%

ü§ñ Model Performance:
   RandomForest         Accuracy: 94.12%
   Logistic Regression  Accuracy: 92.81%
   SVC                  Accuracy: 92.16%

üèÜ Best Model:
   Model:                      RandomForest
   Accuracy:                   94.12%
   Saved as:                   Models/best_model_randomforest.pkl

üìÅ Generated Files:
   ‚Ä¢ plots/feature_importance.png
   ‚Ä¢ plots/model_comparison.png
   ‚Ä¢ plots/confusion_matrices.png
   ‚Ä¢ plots/feature_selection_impact.png
   ‚Ä¢ Models/best_model_randomforest.pkl
   ‚Ä¢ Models/selected_features.csv

                         PIPELINE EXECUTION COMPLETE!
                              ‚úÖ All tasks finished successf