# Model Deployment & Export

This notebook handles the export of the trained model for production deployment.
It creates .pkl files and supporting JSON metadata in two export directories: one for the FastAPI deployment and one for general-purpose use.

**Prerequisites**: Run `model-building-evaluation.ipynb` first to train and select the best model.

## 1. Environment Setup & Model Loading

In [None]:
# Import required libraries
import os
import pandas as pd
import numpy as np
import joblib
import json
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set working directory.
# Every relative path used later in this notebook is resolved against this
# directory. The location can be overridden with the SURVEY_SEEDING_BACKEND_DIR
# environment variable so the notebook is not tied to a single machine; when
# the variable is unset (or empty) the original location is used.
BACKEND_DIR = (
    os.environ.get('SURVEY_SEEDING_BACKEND_DIR')
    or 'C:/Users/oldbe/Machine Learning/survey-seeding/backend'
)
print("Current directory:", os.getcwd())
os.chdir(BACKEND_DIR)  # raises an OSError (e.g. FileNotFoundError) if the directory is unusable
print("Changed to:", os.getcwd())

In [None]:
# Rebuild the modelling inputs (feature matrix X, label series y) from the
# final ML dataset. This assumes the model-building-evaluation notebook has
# already been run.

# The first CSV column is read back as the row index
target_column = 'target_dropoff'
df = pd.read_csv('model_framing_assembling/ml_dataset_final.csv', index_col=0)

# Everything except the target column is a feature
X = df.drop(columns=[target_column])
y = df[target_column]

# Quick sanity report on what was loaded
print(f"Dataset loaded: {df.shape}")
print(f"Features: {X.shape[1]}")
print(f"Target distribution: {y.value_counts().to_dict()}")

In [None]:
# Rebuild the stratified hold-out split and rebalance the training portion
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTETomek

# 75/25 split stratified on the target, fixed seed (same as in evaluation notebook)
split_parts = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)
X_train, X_test, y_train, y_test = split_parts

# SMOTE+Tomek is fitted on the training portion only; the test set is left untouched
smote_tomek = SMOTETomek(random_state=42)
resampled_features, y_train_balanced = smote_tomek.fit_resample(X_train, y_train)

# Wrap the resampled features in a DataFrame carrying the original column names
X_train_balanced = pd.DataFrame(resampled_features, columns=X_train.columns)

print(f"Training data prepared:")
print(f"  Original: {len(X_train)} samples")
print(f"  Balanced: {len(X_train_balanced)} samples")
print(f"  Test set: {len(X_test)} samples")

## 2. Train Best Model for Export

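All candidate models are retrained with the same configuration as the evaluation notebook, and the best performer is then selected with a weighted score over the test-set metrics.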

In [None]:
# Fit every candidate classifier on the balanced training data and score it
# on the held-out test split, so the best one can be picked afterwards.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

# Candidate estimators (same configuration as evaluation)
neural_network = MLPClassifier(
    random_state=42, max_iter=1000, hidden_layer_sizes=(100, 50),
    early_stopping=True, validation_fraction=0.1
)
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42, max_depth=10),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10),
    'Naive Bayes': GaussianNB(),
    'Neural Network': neural_network,
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5)
}

print("🚀 Training models for deployment export...")

# results: model name -> metric dict; trained_models: model name -> fitted estimator
results = {}
trained_models = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Fit on the rebalanced training data and remember the fitted estimator
    model.fit(X_train_balanced, y_train_balanced)
    trained_models[name] = model

    # Hard predictions, plus positive-class probabilities when the estimator offers them
    test_predictions = model.predict(X_test)
    if hasattr(model, 'predict_proba'):
        positive_proba = model.predict_proba(X_test)[:, 1]
    else:
        positive_proba = None

    # Test-set metrics; ROC-AUC stays None when no probabilities are available
    metrics = {
        'Accuracy': accuracy_score(y_test, test_predictions),
        'F1-Score': f1_score(y_test, test_predictions),
        'Precision': precision_score(y_test, test_predictions),
        'Recall': recall_score(y_test, test_predictions),
    }
    if positive_proba is None:
        metrics['ROC-AUC'] = None
    else:
        metrics['ROC-AUC'] = roc_auc_score(y_test, positive_proba)
    results[name] = metrics

    print(f"  Accuracy: {results[name]['Accuracy']:.4f}")
    print(f"  F1-Score: {results[name]['F1-Score']:.4f}")

print("\n✅ All models trained successfully!")

In [None]:
# Select the best model using weighted scoring.
#
# Every model gets a weighted average of its test-set metrics. The weights
# below sum to 1.0; when a metric is unavailable for a model (recorded as
# None, e.g. ROC-AUC without probability output) it is skipped and the score
# is renormalised over the weights that were actually used.
weights = {
    'F1-Score': 0.30,
    'Accuracy': 0.25,
    'ROC-AUC': 0.20,
    'Precision': 0.15,
    'Recall': 0.10
}

# Calculate weighted scores
weighted_scores = {}
for model_name, model_metrics in results.items():
    score = 0
    total_weight = 0

    for metric, weight in weights.items():
        value = model_metrics[metric]
        if value is None:
            # Missing metric: leave it out instead of failing on None * weight
            continue

        score += value * weight
        total_weight += weight

    weighted_scores[model_name] = score / total_weight if total_weight > 0 else 0

# Find best model (on ties, the first model in insertion order wins)
best_model_name = max(weighted_scores, key=weighted_scores.get)
best_model = trained_models[best_model_name]
best_score = weighted_scores[best_model_name]

print(f"🏆 BEST MODEL SELECTED: {best_model_name}")
print(f"Weighted Score: {best_score:.4f}")
print(f"\nPerformance Metrics:")
for metric, value in results[best_model_name].items():
    if value is not None:
        print(f"  {metric:12s}: {value:.4f} ({value:.1%})")
    else:
        print(f"  {metric:12s}: N/A")

# Check if target is met
target_accuracy = 0.70
accuracy_achieved = results[best_model_name]['Accuracy']
# Cast to a plain bool: comparing a NumPy float yields numpy.bool_, which the
# JSON export (json.dump(..., default=str)) would write as the string "True"
# instead of a JSON boolean.
target_met = bool(accuracy_achieved >= target_accuracy)

# The threshold shown in the messages is derived from target_accuracy so the
# text cannot drift from the value actually used in the comparison.
print(f"\n🎯 TARGET ACHIEVEMENT:")
if target_met:
    print(f"✅ EXCEEDS {target_accuracy:.0%} accuracy target: {accuracy_achieved:.1%}")
else:
    print(f"❌ Below {target_accuracy:.0%} accuracy target: {accuracy_achieved:.1%}")
    print(f"⚠️  Proceeding with best available model")

## 3. Export Model Files

### 3.1 Create Export Directories

In [None]:
# Output folders, keyed by deployment target
export_dirs = dict(
    fastapi='fastapi_models',
    general='exported_models',  # For general use
)

# exist_ok=True makes this a no-op for folders that are already present
for dir_name in export_dirs.values():
    os.makedirs(dir_name, exist_ok=True)
    print(f"✅ Created directory: {dir_name}")

print(f"\n📁 Export directories ready for deployment files")

### 3.2 Export Core Model Files

In [None]:
# Export the trained model and preprocessing components.
#
# For every export directory this writes:
#   movie_dropoff_model.pkl  - the selected fitted estimator (joblib)
#   smote_transformer.pkl    - the SMOTETomek resampler used on the training data
#   model_info.json          - model metadata and test-set metrics
#   class_mappings.json      - label names and class counts
export_timestamp = datetime.now().isoformat()

print("📦 EXPORTING MODEL FOR PRODUCTION DEPLOYMENT")
print("=" * 60)

# The class mappings do not depend on the target directory, so build them once.
# JSON object keys are always strings: json.dump would turn integer keys 0/1
# into "0"/"1" and write them next to the string keys, leaving duplicate keys
# in the file. Only the string keys are kept; a reader of the JSON file sees
# the same mapping as before.
class_mappings = {
    'label_mapping': {
        '0': 'Will Complete Movie',
        '1': 'Will Drop Off'
    },
    'class_distribution': {
        'original': {
            'class_0': int((y == 0).sum()),
            'class_1': int((y == 1).sum())
        },
        'training_balanced': {
            'class_0': int((y_train_balanced == 0).sum()),
            'class_1': int((y_train_balanced == 1).sum())
        }
    },
    'target_names': ['Complete', 'Dropout']
}

# Export to both directories
for framework, models_dir in export_dirs.items():
    print(f"\nExporting to {framework.upper()} directory: {models_dir}/")

    # 1. Export the trained model
    model_filename = f"{models_dir}/movie_dropoff_model.pkl"
    joblib.dump(best_model, model_filename)
    print(f"✅ Model: {model_filename}")

    # 2. Export SMOTE+Tomek transformer
    smote_filename = f"{models_dir}/smote_transformer.pkl"
    joblib.dump(smote_tomek, smote_filename)
    print(f"✅ SMOTE transformer: {smote_filename}")

    # 3. Export model metadata
    model_info = {
        'model_type': best_model_name,
        'model_class': str(type(best_model).__name__),
        'training_date': export_timestamp,
        'framework': framework,
        'target_column': target_column,
        'feature_names': X.columns.tolist(),
        'feature_count': len(X.columns),
        'performance_metrics': results[best_model_name],
        'weighted_score': best_score,
        # Plain bool so the value is written as a JSON boolean; a numpy.bool_
        # would fall through to default=str and be stored as the string "True".
        'target_achieved': bool(target_met),
        'training_samples': {
            'original': len(X_train),
            'balanced': len(X_train_balanced),
            'test': len(X_test)
        },
        'preprocessing': {
            'balancing_method': 'SMOTE+Tomek',
            'train_test_split': 0.25,
            'random_state': 42
        },
        'version': '1.0',
        'export_timestamp': export_timestamp
    }

    info_filename = f"{models_dir}/model_info.json"
    with open(info_filename, 'w') as f:
        # default=str is a fallback for values json cannot encode natively
        json.dump(model_info, f, indent=2, default=str)
    print(f"✅ Model info: {info_filename}")

    # 4. Export class mappings
    mappings_filename = f"{models_dir}/class_mappings.json"
    with open(mappings_filename, 'w') as f:
        json.dump(class_mappings, f, indent=2)
    print(f"✅ Class mappings: {mappings_filename}")

print(f"\n🎉 Model export completed successfully!")
print(f"Exported model: {best_model_name} with {accuracy_achieved:.1%} accuracy")

### 3.3 Export Survey Questions for Frontend

In [None]:
# Survey form definition exported for web applications.
#
# `survey_questions` is a hand-written list of 10 question dicts. Every entry
# carries the same keys:
#   'id'       - identifier string of the question
#   'question' - the question text
#   'type'     - always 'select' in this list
#   'options'  - list of {'value': ..., 'label': ...} choices
#   'required' - always True in this list
#
# NOTE(review): nothing in this notebook checks these ids or option values
# against the model's feature columns (X.columns) - confirm they correspond to
# the features the exported model expects before wiring the form to the API.

survey_questions = [
    {
        'id': 'age_group',
        'question': 'What is your age group?',
        'type': 'select',
        'options': [
            {'value': '18-25', 'label': '18-25 years'},
            {'value': '26-35', 'label': '26-35 years'},
            {'value': '36-45', 'label': '36-45 years'},
            {'value': '46-55', 'label': '46-55 years'},
            {'value': '56+', 'label': '56+ years'}
        ],
        'required': True
    },
    {
        'id': 'viewing_frequency',
        'question': 'How often do you watch movies?',
        'type': 'select',
        'options': [
            {'value': 'Daily', 'label': 'Daily'},
            {'value': 'Weekly', 'label': 'Several times a week'},
            {'value': 'Monthly', 'label': 'A few times a month'},
            {'value': 'Rarely', 'label': 'Rarely'}
        ],
        'required': True
    },
    {
        'id': 'preferred_genre',
        'question': 'What is your preferred movie genre?',
        'type': 'select',
        'options': [
            {'value': 'Action', 'label': 'Action'},
            {'value': 'Comedy', 'label': 'Comedy'},
            {'value': 'Drama', 'label': 'Drama'},
            {'value': 'Horror', 'label': 'Horror'},
            {'value': 'Romance', 'label': 'Romance'},
            {'value': 'Sci-Fi', 'label': 'Science Fiction'},
            {'value': 'Thriller', 'label': 'Thriller'}
        ],
        'required': True
    },
    {
        'id': 'attention_span',
        'question': 'How would you describe your attention span for movies?',
        'type': 'select',
        'options': [
            {'value': 'Short', 'label': 'Short (prefer movies under 90 minutes)'},
            {'value': 'Medium', 'label': 'Medium (comfortable with 90-120 minutes)'},
            {'value': 'Long', 'label': 'Long (enjoy movies over 2 hours)'}
        ],
        'required': True
    },
    {
        'id': 'device_preference',
        'question': 'What device do you primarily use to watch movies?',
        'type': 'select',
        'options': [
            {'value': 'TV', 'label': 'Television'},
            {'value': 'Laptop', 'label': 'Laptop/Computer'},
            {'value': 'Tablet', 'label': 'Tablet'},
            {'value': 'Phone', 'label': 'Smartphone'}
        ],
        'required': True
    },
    {
        'id': 'viewing_companion',
        'question': 'Who do you usually watch movies with?',
        'type': 'select',
        'options': [
            {'value': 'Alone', 'label': 'Alone'},
            {'value': 'Partner', 'label': 'Partner/Spouse'},
            {'value': 'Family', 'label': 'Family'},
            {'value': 'Friends', 'label': 'Friends'}
        ],
        'required': True
    },
    {
        'id': 'interruption_tolerance',
        'question': 'How well do you handle interruptions while watching movies?',
        'type': 'select',
        'options': [
            {'value': 'Low', 'label': 'Low - I prefer no interruptions'},
            {'value': 'Medium', 'label': 'Medium - Some interruptions are okay'},
            {'value': 'High', 'label': 'High - Interruptions don\'t bother me'}
        ],
        'required': True
    },
    {
        'id': 'mood_influence',
        'question': 'How much does your mood influence your movie completion?',
        'type': 'select',
        'options': [
            {'value': 'High', 'label': 'High - My mood greatly affects viewing'},
            {'value': 'Medium', 'label': 'Medium - Some mood influence'},
            {'value': 'Low', 'label': 'Low - Mood rarely affects viewing'}
        ],
        'required': True
    },
    {
        'id': 'content_discovery',
        'question': 'How do you usually discover movies to watch?',
        'type': 'select',
        'options': [
            {'value': 'Recommendations', 'label': 'Platform recommendations'},
            {'value': 'Browse', 'label': 'Browsing categories'},
            {'value': 'Search', 'label': 'Searching for specific titles'},
            {'value': 'Social', 'label': 'Social media/friends'}
        ],
        'required': True
    },
    {
        'id': 'time_of_day',
        'question': 'When do you prefer to watch movies?',
        'type': 'select',
        'options': [
            {'value': 'Morning', 'label': 'Morning (6 AM - 12 PM)'},
            {'value': 'Afternoon', 'label': 'Afternoon (12 PM - 6 PM)'},
            {'value': 'Evening', 'label': 'Evening (6 PM - 10 PM)'},
            {'value': 'Night', 'label': 'Night (10 PM - 2 AM)'}
        ],
        'required': True
    }
]

# Write the same question list, as indented JSON, into every export directory
for models_dir in export_dirs.values():
    questions_filename = f"{models_dir}/survey_questions.json"
    with open(questions_filename, 'w') as questions_file:
        json.dump(survey_questions, questions_file, indent=2)
    print(f"✅ Survey questions exported: {questions_filename}")

print(f"\n📝 Survey form ready with {len(survey_questions)} questions")
print("Questions cover key behavioral and preference factors for dropout prediction")

## 4. Export Training Data for Retraining

In [None]:
# Persist the data splits used above so the model can be retrained or
# re-validated later. All CSVs are written without the row index.
data_export_dir = 'exported_datasets'
os.makedirs(data_export_dir, exist_ok=True)

print("📊 EXPORTING TRAINING DATASETS")
print("=" * 50)

# Destination of every exported table
train_file = f"{data_export_dir}/training_set_original.csv"
balanced_file = f"{data_export_dir}/training_set_smote_balanced.csv"
test_file = f"{data_export_dir}/test_set.csv"
full_file = f"{data_export_dir}/full_dataset_original.csv"


def _describe_split(path, features, labels):
    """Summarise one exported table: file path, row count, feature count, class counts."""
    return {
        'file': path,
        'samples': len(features),
        'features': len(features.columns),
        'class_distribution': labels.value_counts().to_dict()
    }


# 1. Original training split, with the target appended as an extra column
X_train_export = X_train.assign(**{target_column: y_train})
X_train_export.to_csv(train_file, index=False)
print(f"✅ Original training set: {train_file} ({len(X_train_export)} samples)")

# 2. SMOTE+Tomek balanced training split
X_train_balanced_export = X_train_balanced.assign(**{target_column: y_train_balanced})
X_train_balanced_export.to_csv(balanced_file, index=False)
print(f"✅ Balanced training set: {balanced_file} ({len(X_train_balanced_export)} samples)")

# 3. Held-out test split
X_test_export = X_test.assign(**{target_column: y_test})
X_test_export.to_csv(test_file, index=False)
print(f"✅ Test set: {test_file} ({len(X_test_export)} samples)")

# 4. Complete dataset exactly as loaded (features and target)
df.to_csv(full_file, index=False)
print(f"✅ Full dataset: {full_file} ({len(df)} samples)")

# Machine-readable summary of everything written above
balanced_summary = _describe_split(balanced_file, X_train_balanced, pd.Series(y_train_balanced))
balanced_summary['balancing_method'] = 'SMOTE+Tomek'

dataset_summary = {
    'export_timestamp': export_timestamp,
    'datasets': {
        'full_original': _describe_split(full_file, X, y),
        'training_original': _describe_split(train_file, X_train, y_train),
        'training_balanced': balanced_summary,
        'test_set': _describe_split(test_file, X_test, y_test)
    }
}

summary_file = f"{data_export_dir}/dataset_summary.json"
with open(summary_file, 'w') as summary_handle:
    json.dump(dataset_summary, summary_handle, indent=2, default=str)
print(f"✅ Dataset summary: {summary_file}")

print(f"\n📁 All datasets exported to: {data_export_dir}/")
print("These files can be used for model retraining and validation")

## 5. Deployment Verification

In [None]:
# Verify that all exported files can be loaded correctly and that the
# reloaded model behaves like the model that was exported.
#
# Reloaded artefacts are kept in their own `loaded_*` names so this cell does
# not overwrite the `model_info` / `class_mappings` objects built by the
# export cell.
print("🔍 DEPLOYMENT VERIFICATION")
print("=" * 50)

verification_passed = True

for framework, models_dir in export_dirs.items():
    print(f"\nVerifying {framework.upper()} exports:")

    try:
        # Test model loading.
        # NOTE: joblib.load unpickles arbitrary objects - only use it on files
        # this notebook (or another trusted source) produced.
        loaded_model = joblib.load(f"{models_dir}/movie_dropoff_model.pkl")
        print(f"  ✅ Model loads successfully: {type(loaded_model).__name__}")

        # Test SMOTE transformer loading
        loaded_smote = joblib.load(f"{models_dir}/smote_transformer.pkl")
        print(f"  ✅ SMOTE transformer loads successfully")

        # Test model info loading
        with open(f"{models_dir}/model_info.json", 'r') as f:
            loaded_model_info = json.load(f)
        print(f"  ✅ Model info loads successfully")

        # Test class mappings loading
        with open(f"{models_dir}/class_mappings.json", 'r') as f:
            loaded_class_mappings = json.load(f)
        print(f"  ✅ Class mappings load successfully")

        # Test survey questions loading
        with open(f"{models_dir}/survey_questions.json", 'r') as f:
            questions = json.load(f)
        print(f"  ✅ Survey questions load successfully ({len(questions)} questions)")

        # Test prediction with sample data
        sample_input = X_test.iloc[:1]
        prediction = loaded_model.predict(sample_input)
        probability = loaded_model.predict_proba(sample_input) if hasattr(loaded_model, 'predict_proba') else None
        print(f"  ✅ Model prediction works: {prediction[0]} (prob: {probability[0] if probability is not None else 'N/A'})")

        # Round-trip check: the reloaded estimator must give exactly the same
        # test-set predictions as the in-memory model that was exported.
        if not np.array_equal(loaded_model.predict(X_test), best_model.predict(X_test)):
            raise ValueError("reloaded model predictions differ from the in-memory model")
        print(f"  ✅ Reloaded model reproduces the in-memory model's test-set predictions")

    except Exception as e:
        # Deliberately broad: any failure is reported and flagged, and the
        # remaining directories are still checked.
        print(f"  ❌ Verification failed: {str(e)}")
        verification_passed = False

# Verify dataset exports
print(f"\nVerifying dataset exports:")
dataset_files = [
    f"{data_export_dir}/training_set_original.csv",
    f"{data_export_dir}/training_set_smote_balanced.csv",
    f"{data_export_dir}/test_set.csv",
    f"{data_export_dir}/full_dataset_original.csv"
]

for file_path in dataset_files:
    try:
        test_df = pd.read_csv(file_path)
        print(f"  ✅ {os.path.basename(file_path)}: {test_df.shape}")
    except Exception as e:
        print(f"  ❌ {os.path.basename(file_path)}: {str(e)}")
        verification_passed = False

# The JSON summary is part of the dataset export as well, so check that it parses
summary_path = f"{data_export_dir}/dataset_summary.json"
try:
    with open(summary_path, 'r') as f:
        loaded_summary = json.load(f)
    print(f"  ✅ {os.path.basename(summary_path)}: {len(loaded_summary['datasets'])} datasets described")
except Exception as e:
    print(f"  ❌ {os.path.basename(summary_path)}: {str(e)}")
    verification_passed = False

print(f"\n{'🎉 ALL VERIFICATIONS PASSED!' if verification_passed else '❌ SOME VERIFICATIONS FAILED!'}")
if verification_passed:
    print("Model is ready for production deployment!")
else:
    print("Please check the failed components before deployment.")

## 6. Deployment Summary & Next Steps

In [None]:
# Final human-readable recap: which model was selected, which files were
# written where, and what to do next. The file lists are static descriptions;
# they are not read from disk.

fastapi_files = [
    "movie_dropoff_model.pkl - Trained model",
    "smote_transformer.pkl - Data preprocessing",
    "model_info.json - Model metadata",
    "class_mappings.json - Label mappings",
    "survey_questions.json - Frontend form questions",
    "fastapi_prediction_utils.py - API utilities (pre-created)",
    "fastapi_retraining_utils.py - Retraining system (pre-created)",
    "requirements.txt - Dependencies (pre-created)"
]
general_files = [
    "movie_dropoff_model.pkl - Trained model",
    "smote_transformer.pkl - Data preprocessing",
    "model_info.json - Model metadata",
    "class_mappings.json - Label mappings",
    "survey_questions.json - Frontend form questions"
]
data_files = [
    "training_set_original.csv - Original training data",
    "training_set_smote_balanced.csv - Balanced training data",
    "test_set.csv - Test/validation data",
    "full_dataset_original.csv - Complete dataset",
    "dataset_summary.json - Data statistics"
]
next_steps = [
    "Review the FastAPI implementation guide (FASTAPI_IMPLEMENTATION_GUIDE.md)",
    "Set up production environment with required dependencies",
    "Deploy FastAPI application using uvicorn or container deployment",
    "Implement frontend integration using the survey questions JSON",
    "Set up monitoring and logging for prediction requests",
    "Configure automated retraining pipeline with user feedback",
    "Perform A/B testing to validate prediction effectiveness",
    "Monitor model performance and retrain as needed"
]


def _print_bullets(entries):
    """Print every entry on its own indented bullet line."""
    for file_desc in entries:
        print(f"    • {file_desc}")


print("🚀 DEPLOYMENT SUMMARY")
print("=" * 60)

# Selected model and its headline numbers
print(f"\n📋 MODEL DETAILS:")
print(f"  Selected Model: {best_model_name}")
print(f"  Accuracy: {accuracy_achieved:.1%}")
print(f"  F1-Score: {results[best_model_name]['F1-Score']:.3f}")
print(f"  Target Achievement: {'✅ EXCEEDED' if target_met else '❌ Below target'}")
print(f"  Export Date: {export_timestamp}")

print(f"\n📁 EXPORTED FILES:")

# FastAPI directory
print(f"\n  FastAPI Deployment ({export_dirs['fastapi']}):")
_print_bullets(fastapi_files)

# General-purpose directory
print(f"\n  General Export ({export_dirs['general']}):")
_print_bullets(general_files)

# Dataset directory
print(f"\n  Training Data ({data_export_dir}):")
_print_bullets(data_files)

# Numbered follow-up actions, counted from 1
print(f"\n🎯 RECOMMENDED NEXT STEPS:")
i = 0
for step in next_steps:
    i += 1
    print(f"{i}. {step}")

print(f"\n✨ Model deployment package ready!")
print(f"🔗 Use the FastAPI directory for immediate web deployment")
print(f"📚 Refer to the implementation guides for detailed setup instructions")