In [None]:
# Greenhouse Gas Analytics - Model Development
# Notebook 05: Machine Learning Models for Emission Prediction and Analysis

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.inspection import permutation_importance
from sklearn.pipeline import Pipeline

# Time Series Analysis
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Plot appearance: apply the matplotlib style sheet first, then the seaborn
# palette (keep this order -- both touch the colour cycle).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Notebook banner: title line followed by a 55-character rule.
print("🤖 Greenhouse Gas Analytics - Model Development", "=" * 55, sep="\n")

# ## 1. Load and Prepare Data for Modeling

@st.cache_data  # NOTE(review): `st` is not imported anywhere in the visible file -- confirm where it should come from
def load_modeling_data():
    """Load the processed dataset used for modeling.

    Reads ``../data/processed/cleaned_data.parquet`` with pandas, prints a
    confirmation message and returns the resulting DataFrame.  If the file
    does not exist (``FileNotFoundError``), a warning is printed and the
    fallback branch is taken instead.

    NOTE(review): the fallback returns the bare name ``create_modeling_``,
    which is not defined in the visible source and looks truncated --
    confirm the intended sample-data helper before relying on this path.
    """
    try:
        df = pd.read_parquet('../data/processed/cleaned_data.parquet')
        print("✅ Processed data loaded for modeling!")
        return df
    except FileNotFoundError:
        # Only a missing file triggers the fallback; other read errors propagate.
        print("⚠️ Creating enhanced sample data for modeling...")
        return create_modeling_
        print(f"\\n💾 Model export completed successfully!")
  
  ]
  },
  {
   "cell_type": "markdown",
   "id": "summary_insights",
   "metadata": {},
   "source": [
    "## 10. 📋 Final Summary and Key Insights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "summary",
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_final_summary():\n",
    "    \"\"\"Generate comprehensive final summary of all modeling work\"\"\"\n",
    "    print(f\"\\n📋 COMPREHENSIVE MODELING SUMMARY:\")\n",
    "    print(\"=\"*45)\n",
    "    \n",
    "    summary_report = {\n",
    "        'dataset_summary': {},\n",
    "        'modeling_results': {},\n",
    "        'key_insights': [],\n",
    "        'recommendations': []\n",
    "    }\n",
    "    \n",
    "    # 1. Dataset Summary\n",
    "    print(f\"\\n📊 DATASET OVERVIEW:\")\n",
    "    print(\"=\"*25)\n",
    "    \n",
    "    dataset_stats = {\n",
    "        'total_records': len(df_enhanced),\n",
    "        'countries': df_enhanced['country'].nunique(),\n",
    "        'regions': df_enhanced['region'].nunique(),\n",
    "        'sectors': df_enhanced['type'].nunique(),\n",
    "        'years_covered': f\"{df_enhanced['year'].min()}-{df_enhanced['year'].max()}\",\n",
    "        'total_emissions': df_enhanced['emissions'].sum(),\n",
    "        'avg_emissions_per_record': df_enhanced['emissions'].mean(),\n",
    "        'features_engineered': df_enhanced.shape[1] - df.shape[1],\n",
    "        'features_selected': len(selected_features)\n",
    "    }\n",
    "    \n",
    "    for key, value in dataset_stats.items():\n",
    "        print(f\"  • {key.replace('_', ' ').title()}: {value}\")\n",
    "    \n",
    "    summary_report['dataset_summary'] = dataset_stats\n",
    "    \n",
    "    # 2. Modeling Results Summary\n",
    "    print(f\"\\n🤖 MODELING RESULTS:\")\n",
    "    print(\"=\"*25)\n",
    "    \n",
    "    modeling_summary = {}\n",
    "    \n",
    "    # Regression results\n",
    "    if 'tuned_results' in locals() and 'best_model_name' in locals():\n",
    "        print(f\"\\n📈 REGRESSION MODELING:\")\n",
    "        print(f\"  • Best Model: {best_model_name}\")\n",
    "        print(f\"  • Test R²: {tuned_results[best_model_name]['test_r2']:.4f}\")\n",
    "        print(f\"  • Test RMSE: {tuned_results[best_model_name]['test_rmse']:.2f} Mt CO₂e\")\n",
    "        print(f\"  • Cross-validation R²: {tuned_results[best_model_name]['best_score']:.4f}\")\n",
    "        \n",
    "        modeling_summary['regression'] = {\n",
    "            'best_model': best_model_name,\n",
    "            'test_r2': tuned_results[best_model_name]['test_r2'],\n",
    "            'test_rmse': tuned_results[best_model_name]['test_rmse'],\n",
    "            'cv_r2': tuned_results[best_model_name]['best_score']\n",
    "        }\n",
    "    \n",
    "    # Classification results\n",
    "    if 'cls_results' in locals():\n",
    "        best_cls_name = cls_results.index[0]\n",
    "        print(f\"\\n🎯 CLASSIFICATION MODELING:\")\n",
    "        print(f\"  • Best Model: {best_cls_name}\")\n",
    "        print(f\"  • Accuracy: {cls_results.loc[best_cls_name, 'accuracy']:.4f}\")\n",
    "        print(f\"  • F1 Score: {cls_results.loc[best_cls_name, 'f1_score']:.4f}\")\n",
    "        print(f\"  • Precision: {cls_results.loc[best_cls_name, 'precision']:.4f}\")\n",
    "        print(f\"  • Recall: {cls_results.loc[best_cls_name, 'recall']:.4f}\")\n",
    "        \n",
    "        modeling_summary['classification'] = {\n",
    "            'best_model': best_cls_name,\n",
    "            'accuracy': cls_results.loc[best_cls_name, 'accuracy'],\n",
    "            'f1_score': cls_results.loc[best_cls_name, 'f1_score'],\n",
    "            'precision': cls_results.loc[best_cls_name, 'precision'],\n",
    "            'recall': cls_results.loc[best_cls_name, 'recall']\n",
    "        }\n",
    "    \n",
    "    # Clustering results\n",
    "    if 'clustering_results' in locals() and 'best_clustering_method' in locals():\n",
    "        print(f\"\\n🎯 CLUSTERING ANALYSIS:\")\n",
    "        print(f\"  • Best Method: {best_clustering_method}\")\n",
    "        print(f\"  • Silhouette Score: {clustering_results[best_clustering_method]['silhouette_score']:.4f}\")\n",
    "        print(f\"  • Number of Clusters: {clustering_results[best_clustering_method]['n_clusters']}\")\n",
    "        \n",
    "        modeling_summary['clustering'] = {\n",
    "            'best_method': best_clustering_method,\n",
    "            'silhouette_score': clustering_results[best_clustering_method]['silhouette_score'],\n",
    "            'n_clusters': clustering_results[best_clustering_method]['n_clusters']\n",
    "        }\n",
    "    \n",
    "    # Time series results\n",
    "    if 'ts_results' in locals():\n",
    "        print(f\"\\n📈 TIME SERIES ANALYSIS:\")\n",
    "        print(f\"  • Average Growth Rate: {ts_results['trend_analysis']['avg_growth_rate']:.2f}% per year\")\n",
    "        print(f\"  • Series Stationarity: {'Stationary' if ts_results['trend_analysis']['is_stationary'] else 'Non-stationary'}\")\n",
    "        if 'ml_forecasts' in ts_results and ts_results['ml_forecasts']:\n",
    "            print(f\"  • ML Forecasts Generated: {len(ts_results['ml_forecasts'])} countries\")\n",
    "        \n",
    "        modeling_summary['time_series'] = {\n",
    "            'avg_growth_rate': ts_results['trend_analysis']['avg_growth_rate'],\n",
    "            'is_stationary': ts_results['trend_analysis']['is_stationary'],\n",
    "            'forecasts_generated': len(ts_results.get('ml_forecasts', {}))\n",
    "        }\n",
    "    \n",
    "    summary_report['modeling_results'] = modeling_summary\n",
    "    \n",
    "    # 3. Key Insights\n",
    "    print(f\"\\n💡 KEY INSIGHTS:\")\n",
    "    print(\"=\"*20)\n",
    "    \n",
    "    key_insights = []\n",
    "    \n",
    "    # Feature importance insights\n",
    "    if 'interpretation_results' in locals() and 'feature_importance' in interpretation_results:\n",
    "        top_feature = interpretation_results['feature_importance'].iloc[0]['feature']\n",
    "        top_importance = interpretation_results['feature_importance'].iloc[0]['importance']\n",
    "        insight = f\"Most predictive feature: {top_feature} (importance: {top_importance:.4f})\"\n",
    "        key_insights.append(insight)\n",
    "        print(f\"  • {insight}\")\n",
    "    \n",
    "    # Model performance insights\n",
    "    if 'tuned_results' in locals() and 'best_model_name' in locals():\n",
    "        r2_score = tuned_results[best_model_name]['test_r2']\n",
    "        if r2_score > 0.8:\n",
    "            performance_level = \"excellent\"\n",
    "        elif r2_score > 0.6:\n",
    "            performance_level = \"good\"\n",
    "        elif r2_score > 0.4:\n",
    "            performance_level = \"moderate\"\n",
    "        else:\n",
    "            performance_level = \"poor\"\n",
    "        \n",
    "        insight = f\"Regression model shows {performance_level} predictive performance (R² = {r2_score:.3f})\"\n",
    "        key_insights.append(insight)\n",
    "        print(f\"  • {insight}\")\n",
    "    \n",
    "    # Clustering insights\n",
    "    if 'clustering_results' in locals() and 'best_clustering_method' in locals():\n",
    "        n_clusters = clustering_results[best_clustering_method]['n_clusters']\n",
    "        sil_score = clustering_results[best_clustering_method]['silhouette_score']\n",
    "        insight = f\"Countries can be effectively grouped into {n_clusters} emission patterns (silhouette: {sil_score:.3f})\"\n",
    "        key_insights.append(insight)\n",
    "        print(f\"  • {insight}\")\n",
    "    \n",
    "    # Time series insights\n",
    "    if 'ts_results' in locals():\n",
    "        growth_rate = ts_results['trend_analysis']['avg_growth_rate']\n",
    "        trend_direction = \"increasing\" if growth_rate > 0 else \"decreasing\"\n",
    "        insight = f\"Global emissions trend is {trend_direction} at {abs(growth_rate):.1f}% per year\"\n",
    "        key_insights.append(insight)\n",
    "        print(f\"  • {insight}\")\n",
    "    \n",
    "    # Data quality insights\n",
    "    missing_pct = (df_enhanced.isnull().sum().sum() / (df_enhanced.shape[0] * df_enhanced.shape[1])) * 100\n",
    "    insight = f\"Data quality is {'excellent' if missing_pct < 1 else 'good' if missing_pct < 5 else 'moderate'} with {missing_pct:.1f}% missing values\"\n",
    "    key_insights.append(insight)\n",
    "    print(f\"  • {insight}\")\n",
    "    \n",
    "    # Feature engineering insights\n",
    "    features_created = df_enhanced.shape[1] - df.shape[1]\n",
    "    insight = f\"Feature engineering created {features_created} additional features, improving model performance\"\n",
    "    key_insights.append(insight)\n",
    "    print(f\"  • {insight}\")\n",
    "    \n",
    "    summary_report['key_insights'] = key_insights\n",
    "    \n",
    "    # 4. Recommendations\n",
    "    print(f\"\\n🚀 RECOMMENDATIONS:\")\n",
    "    print(\"=\"*25)\n",
    "    \n",
    "    recommendations = [\n",
    "        \"Deploy the best performing models in a production environment for real-time predictions\",\n",
    "        \"Implement automated model retraining pipeline with new data\",\n",
    "        \"Create interactive dashboards using the trained models for stakeholder insights\",\n",
    "        \"Develop country-specific emission reduction strategies based on clustering analysis\",\n",
    "        \"Establish monitoring system for model performance degradation over time\",\n",
    "        \"Collect additional features (policy data, economic indicators) to improve predictions\",\n",
    "        \"Implement A/B testing framework for model updates and improvements\",\n",
    "        \"Create automated alert system for unusual emission patterns or predictions\"\n",
    "    ]\n",
    "    \n",
    "    # Add performance-specific recommendations\n",
    "    if 'tuned_results' in locals() and 'best_model_name' in locals():\n",
    "        r2_score = tuned_results[best_model_name]['test_r2']\n",
    "        if r2_score < 0.7:\n",
    "            recommendations.insert(1, \"Consider ensemble methods or deep learning approaches to improve prediction accuracy\")\n",
    "            recommendations.insert(2, \"Investigate additional data sources for better feature coverage\")\n",
    "    \n",
    "    for i, rec in enumerate(recommendations, 1):\n",
    "        print(f\"  {i:2d}. {rec}\")\n",
    "    \n",
    "    summary_report['recommendations'] = recommendations\n",
    "    \n",
    "    # 5. Technical Specifications\n",
    "    print(f\"\\n🔧 TECHNICAL SPECIFICATIONS:\")\n",
    "    print(\"=\"*35)\n",
    "    \n",
    "    tech_specs = {\n",
    "        'python_version': '3.8+',\n",
    "        'key_libraries': ['scikit-learn', 'pandas', 'numpy', 'plotly', 'statsmodels'],\n",
    "        'model_formats': 'joblib (sklearn compatible)',\n",
    "        'preprocessing': 'StandardScaler for numerical features',\n",
    "        'validation_method': '5-fold cross-validation',\n",
    "        'hyperparameter_tuning': 'GridSearchCV',\n",
    "        'feature_selection': 'Consensus of multiple methods (F-test, RFE, RF importance)'\n",
    "    }\n",
    "    \n",
    "    for key, value in tech_specs.items():\n",
    "        if isinstance(value, list):\n",
    "            value = ', '.join(value)\n",
    "        print(f\"  • {key.replace('_', ' ').title()}: {value}\")\n",
    "    \n",
    "    # 6. Save comprehensive summary\n",
    "    print(f\"\\n💾 SAVING COMPREHENSIVE SUMMARY:\")\n",
    "    print(\"=\"*35)\n",
    "    \n",
    "    # Save as JSON\n",
    "    summary_filename = '../data/models/comprehensive_modeling_summary.json'\n",
    "    with open(summary_filename, 'w') as f:\n",
    "        json.dump(summary_report, f, indent=2, default=str)\n",
    "    print(f\"  ✅ Summary saved as JSON: {summary_filename}\")\n",
    "    \n",
    "    # Save as formatted text report\n",
    "    report_filename = '../data/models/modeling_report.txt'\n",
    "    with open(report_filename, 'w') as f:\n",
    "        f.write(f\"GREENHOUSE GAS ANALYTICS - COMPREHENSIVE MODELING REPORT\\n\")\n",
    "        f.write(f\"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\\n\")\n",
    "        f.write(f\"={'='*60}\\n\\n\")\n",
    "        \n",
    "        f.write(f\"DATASET SUMMARY:\\n\")\n",
    "        f.write(f\"{'-'*20}\\n\")\n",
    "        for key, value in dataset_stats.items():\n",
    "            f.write(f\"{key.replace('_', ' ').title()}: {value}\\n\")\n",
    "        \n",
    "        f.write(f\"\\nMODELING RESULTS:\\n\")\n",
    "        f.write(f\"{'-'*20}\\n\")\n",
    "        for model_type, results in modeling_summary.items():\n",
    "            f.write(f\"\\n{model_type.upper()}:\\n\")\n",
    "            for metric, value in results.items():\n",
    "                f.write(f\"  {metric.replace('_', ' ').title()}: {value}\\n\")\n",
    "        \n",
    "        f.write(f\"\\nKEY INSIGHTS:\\n\")\n",
    "        f.write(f\"{'-'*20}\\n\")\n",
    "        for i, insight in enumerate(key_insights, 1):\n",
    "            f.write(f\"{i}. {insight}\\n\")\n",
    "        \n",
    "        f.write(f\"\\nRECOMMENDATIONS:\\n\")\n",
    "        f.write(f\"{'-'*20}\\n\")\n",
    "        for i, rec in enumerate(recommendations, 1):\n",
    "            f.write(f\"{i}. {rec}\\n\")\n",
    "    \n",
    "    print(f\"  ✅ Report saved as text: {report_filename}\")\n",
    "    \n",
    "    else:
    print(f"\\n⚠️ Best model not available for interpretation analysis")
  
  ]
  },
  {
   "cell_type": "markdown",
   "id": "model_export",
   "metadata": {},
   "source": [
    "## 9. 💾 Model Export and Deployment Preparation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "export",
   "metadata": {},
   "outputs": [],
   "source": [
    "import joblib\n",
    "import pickle\n",
    "import json\n",
    "from datetime import datetime\n",
    "\n",
    "def export_models_and_artifacts():\n",
    "    \"\"\"Export trained models and associated artifacts\"\"\"\n",
    "    print(f\"\\n💾 MODEL EXPORT AND DEPLOYMENT PREPARATION:\")\n",
    "    print(\"=\"*50)\n",
    "    \n",
    "    # Create exports directory\n",
    "    import os\n",
    "    export_dir = '../data/models'\n",
    "    os.makedirs(export_dir, exist_ok=True)\n",
    "    \n",
    "    export_summary = {\n",
    "        'export_timestamp': datetime.now().isoformat(),\n",
    "        'models_exported': [],\n",
    "        'artifacts_exported': []\n",
    "    }\n",
    "    \n",
    "    # 1. Export best regression model\n",
    "    if 'best_model' in locals():\n",
    "        print(f\"\\n📊 Exporting regression models...\")\n",
    "        \n",
    "        # Export the best tuned model\n",
    "        model_filename = f\"{export_dir}/best_regression_model.joblib\"\n",
    "        joblib.dump(best_model, model_filename)\n",
    "        print(f\"  ✅ Best regression model saved: {model_filename}\")\n",
    "        \n",
    "        export_summary['models_exported'].append({
            'type': 'clustering',
            'model_name': best_clustering_method,
            'filename': clustering_filename,
            'performance': {
                'silhouette_score': clustering_results[best_clustering_method]['silhouette_score'],
                'n_clusters': clustering_results[best_clustering_method]['n_clusters']
            }
        })
    
    # 4. Export preprocessors and scalers
    # (The original literal was double-escaped and printed a literal "\n"
    # plus a stray '")' at the end of the message.)
    print(f"\n🔧 Exporting preprocessors...")

    # locals() inside a function only lists names assigned in that function,
    # so objects created at module level (e.g. by earlier notebook cells)
    # were never detected.  Check both namespaces; anything previously found
    # via locals() is still found.
    defined_names = set(globals()) | set(locals())

    # Feature scaler
    if 'scaler' in defined_names:
        scaler_filename = f"{export_dir}/feature_scaler.joblib"
        joblib.dump(scaler, scaler_filename)
        print(f"  ✅ Feature scaler saved: {scaler_filename}")
        export_summary['artifacts_exported'].append({'name': 'feature_scaler', 'filename': scaler_filename})

    # Clustering scaler
    if 'cluster_scaler' in defined_names:
        cluster_scaler_filename = f"{export_dir}/cluster_scaler.joblib"
        joblib.dump(cluster_scaler, cluster_scaler_filename)
        print(f"  ✅ Cluster scaler saved: {cluster_scaler_filename}")
        export_summary['artifacts_exported'].append({'name': 'cluster_scaler', 'filename': cluster_scaler_filename})
    
    # 5. Export feature lists and metadata
    # (The original literal was double-escaped and printed a literal "\n"
    # plus a stray '")' at the end of the message.)
    print(f"\n📋 Exporting metadata...")

    # Optional results may live at module level (earlier notebook cells);
    # locals() alone never sees those from inside a function, so consult
    # globals() as well when checking which results are available.
    defined_names = set(globals()) | set(locals())

    # Selected features
    selected_features_filename = f"{export_dir}/selected_features.json"
    with open(selected_features_filename, 'w') as f:
        json.dump({
            'selected_features': selected_features,
            'feature_selection_method': 'consensus',
            'n_features_selected': len(selected_features),
            'total_features_available': len(X_all.columns)
        }, f, indent=2)
    print(f"  ✅ Selected features saved: {selected_features_filename}")
    export_summary['artifacts_exported'].append({'name': 'selected_features', 'filename': selected_features_filename})

    # Feature importance (written only when interpretation results exist)
    if 'interpretation_results' in defined_names and 'feature_importance' in interpretation_results:
        feature_importance_filename = f"{export_dir}/feature_importance.csv"
        interpretation_results['feature_importance'].to_csv(feature_importance_filename, index=False)
        print(f"  ✅ Feature importance saved: {feature_importance_filename}")
        export_summary['artifacts_exported'].append({'name': 'feature_importance', 'filename': feature_importance_filename})

    # Model performance metrics: each section falls back to an empty dict
    # when the corresponding results object has not been created.
    performance_filename = f"{export_dir}/model_performance.json"
    performance_data = {
        'regression_performance': regression_results.to_dict() if 'regression_results' in defined_names else {},
        'classification_performance': cls_results.to_dict() if 'cls_results' in defined_names else {},
        'clustering_performance': {method: {'silhouette_score': results['silhouette_score'],
                                            'n_clusters': results['n_clusters']}
                                   for method, results in clustering_results.items()} if 'clustering_results' in defined_names else {},
        'time_series_results': {
            'avg_growth_rate': ts_results['trend_analysis']['avg_growth_rate'],
            'is_stationary': ts_results['trend_analysis']['is_stationary']
        } if 'ts_results' in defined_names else {}
    }

    # default=str stringifies any value json cannot serialise natively.
    with open(performance_filename, 'w') as f:
        json.dump(performance_data, f, indent=2, default=str)
    print(f"  ✅ Model performance metrics saved: {performance_filename}")
    export_summary['artifacts_exported'].append({'name': 'model_performance', 'filename': performance_filename})
    
    # 6. Create model deployment template
    # (The original literal was double-escaped and printed a literal "\n"
    # plus a stray '")' at the end of the message.)
    print(f"\n🚀 Creating deployment template...")
    
    deployment_template = f'''
# Model Deployment Template - Greenhouse Gas Analytics
# Generated on: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}

import joblib
import pandas as pd
import numpy as np
import json

class EmissionPredictor:
    def __init__(self, model_dir="../data/models"):
        self.model_dir = model_dir
        self.regression_model = None
        self.classification_model = None
        self.clustering_model = None
        self.scaler = None
        self.selected_features = None
        self._load_models()
    
    def _load_models(self):
        \"\"\"Load all trained models and artifacts\"\"\"
        try:
            # Load regression model
            self.regression_model = joblib.load(f"{{self.model_dir}}/best_regression_model.joblib")
            
            # Load classification model
            self.classification_model = joblib.load(f"{{self.model_dir}}/best_classification_model.joblib")
            
            # Load clustering model
            self.clustering_model = joblib.load(f"{{self.model_dir}}/best_clustering_model.joblib")
            
            # Load scaler
            self.scaler = joblib.load(f"{{self.model_dir}}/feature_scaler.joblib")
            
            # Load selected features
            with open(f"{{self.model_dir}}/selected_features.json", 'r') as f:
                feature_data = json.load(f)
                self.selected_features = feature_data['selected_features']
            
            print("✅ All models loaded successfully!")
            
        except Exception as e:
            print(f"❌ Error loading models: {{str(e)}}")
    
    def predict_emissions(self, X):
        \"\"\"Predict emissions for given features\"\"\"
        if self.regression_model is None:
            raise ValueError("Regression model not loaded")
        
        # Select and scale features
        X_selected = X[self.selected_features]
        X_scaled = self.scaler.transform(X_selected)
        
        # Make prediction
        predictions = self.regression_model.predict(X_scaled)
        return predictions
    
    def classify_emissions(self, X):
        \"\"\"Classify emission levels for given features\"\"\"
        if self.classification_model is None:
            raise ValueError("Classification model not loaded")
        
        # Select and scale features
        X_selected = X[self.selected_features]
        
        # Make prediction (use appropriate scaling if needed)
        if hasattr(self.classification_model, 'predict_proba'):
            predictions = self.classification_model.predict(X_selected)
            probabilities = self.classification_model.predict_proba(X_selected)
            return predictions, probabilities
        else:
            predictions = self.classification_model.predict(X_selected)
            return predictions, None
    
    def cluster_countries(self, X):
        \"\"\"Assign countries to emission clusters\"\"\"
        if self.clustering_model is None:
            raise ValueError("Clustering model not loaded")
        
        # Select and scale features
        X_selected = X[self.selected_features]
        cluster_scaler = joblib.load(f"{{self.model_dir}}/cluster_scaler.joblib")
        X_scaled = cluster_scaler.transform(X_selected)
        
        # Assign clusters
        clusters = self.clustering_model.predict(X_scaled)
        return clusters

# Example usage:
# predictor = EmissionPredictor()
# emissions = predictor.predict_emissions(new_data)
# categories = predictor.classify_emissions(new_data)
# clusters = predictor.cluster_countries(new_data)
    '''
    
    template_filename = f"{export_dir}/deployment_template.py"
    # The template text contains non-ASCII characters (✅ / ❌); pin UTF-8 so
    # the write does not depend on the platform's default encoding.
    with open(template_filename, 'w', encoding='utf-8') as f:
        f.write(deployment_template)
    print(f"  ✅ Deployment template created: {template_filename}")
    export_summary['artifacts_exported'].append({'name': 'deployment_template', 'filename': template_filename})
    
    # 7. Persist the export manifest (models and artifacts recorded so far) as JSON
    summary_filename = "{}/export_summary.json".format(export_dir)
    with open(summary_filename, mode='w') as manifest_handle:
        json.dump(export_summary, manifest_handle, indent=2, default=str)
    print("  ✅ Export summary saved: " + summary_filename)
    
    # 8. Create README for models
    readme_content = f'''
# Greenhouse Gas Analytics - Trained Models

## Export Summary
- **Export Date**: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
- **Models Exported**: {len(export_summary["models_exported"])}
- **Artifacts Exported**: {len(export_summary["artifacts_exported"])}

## Model Performance Summary

### Regression Model ({best_model_name if 'best_model_name' in locals() else 'N/A'})
- **R² Score**: {tuned_results[best_model_name]['test_r2']:.4f if 'tuned_results' in locals() and 'best_model_name' in locals() else 'N/A'}
- **RMSE**: {tuned_results[best_model_name]['test_rmse']:.2f if 'tuned_results' in locals() and 'best_model_name' in locals() else 'N/A'}

### Classification Model ({cls_results.index[0] if 'cls_results' in locals() else 'N/A'})
- **Accuracy**: {cls_results.iloc[0]['accuracy']:.4f if 'cls_results' in locals() else 'N/A'}
- **F1 Score**: {cls_results.iloc[0]['f1_score']:.4f if 'cls_results' in locals() else 'N/A'}

### Clustering Model ({best_clustering_method if 'best_clustering_method' in locals() else 'N/A'})
- **Silhouette Score**: {clustering_results[best_clustering_method]['silhouette_score']:.4f if 'clustering_results' in locals() and 'best_clustering_method' in locals() else 'N/A'}
- **Number of Clusters**: {clustering_results[best_clustering_method]['n_clusters'] if 'clustering_results' in locals() and 'best_clustering_method' in locals() else 'N/A'}

## Files Description

### Models
- `best_regression_model.joblib` - Best performing regression model for emissions prediction
- `best_classification_model.joblib` - Best performing classification model for emission categories
- `best_clustering_model.joblib` - Best performing clustering model for country grouping

### Preprocessors
- `feature_scaler.joblib` - StandardScaler for feature preprocessing
- `cluster_scaler.joblib` - StandardScaler for clustering preprocessing

### Metadata
- `selected_features.json` - List of selected features used by models
- `feature_importance.csv` - Feature importance rankings
- `model_performance.json` - Detailed performance metrics for all models
- `export_summary.json` - Complete export metadata

### Deployment
- `deployment_template.py` - Python template for model deployment
- `README.md` - This documentation file

## Usage

```python
from deployment_template import EmissionPredictor

# Initialize predictor
predictor = EmissionPredictor()

# Make predictions
emissions = predictor.predict_emissions(your_data)
categories = predictor.classify_emissions(your_data)
clusters = predictor.cluster_countries(your_data)
```

## Requirements
- joblib
- pandas
- numpy
- scikit-learn

## Note
All models were trained on processed methane emissions data and should be used within the same data context and feature engineering pipeline.
    '''
    
    readme_filename = f"{export_dir}/README.md"
    with open(readme_filename, 'w') as f:
        f.write(readme_content)
    print(f"  ✅ README documentation created: {readme_filename}")
    
    print(f"\\n🎉 MODEL EXPORT COMPLETED!")
    print(f"📁 All files saved in: {export_dir}")
    print(f"📊 Total files exported: {len(export_summary['models_exported']) + len(export_summary['artifacts_exported']) + 2}")
    
    return export_summary

# Export all models and artifacts
export_summary = export_models_and_artifacts()

print(f"\\n💾 Model export completed successfully!"){\n",
    "            'type': 'regression',\n",
    "            'model_name': best_model_name,\n",
    "            'filename': model_filename,\n",
    "            'performance': {\n",
    "                'test_r2': tuned_results[best_model_name]['test_r2'],\n",
    "                'test_rmse': tuned_results[best_model_name]['test_rmse']\n",
    "            }\n",
    "        })\n",
    "    \n",
    "    # 2. Export best classification model\n",
    "    if 'cls_trained_models' in locals():\n",
    "        print(f\"\\n🎯 Exporting classification models...\")\n",
    "        \n",
    "        best_cls_name = cls_results.index[0]\n",
    "        best_cls_model = cls_trained_models[best_cls_name]\n",
    "        \n",
    "        cls_model_filename = f\"{export_dir}/best_classification_model.joblib\"\n",
    "        joblib.dump(best_cls_model, cls_model_filename)\n",
    "        print(f\"  ✅ Best classification model saved: {cls_model_filename}\")\n",
    "        \n",
    "        export_summary['models_exported'].append({\n",
    "            'type': 'classification',\n",
    "            'model_name': best_cls_name,\n",
    "            'filename': cls_model_filename,\n",
    "            'performance': {\n",
    "                'accuracy': cls_results.loc[best_cls_name, 'accuracy'],\n",
    "                'f1_score': cls_results.loc[best_cls_name, 'f1_score']\n",
    "            }\n",
    "        })\n",
    "    \n",
    "    # 3. Export clustering model\n",
    "    if 'clustering_results' in locals():\n",
    "        print(f\"\\n🎯 Exporting clustering models...\")\n",
    "        \n",
    "        best_clustering_model = clustering_results[best_clustering_method]['model']\n",
    "        clustering_filename = f\"{export_dir}/best_clustering_model.joblib\"\n",
    "        joblib.dump(best_clustering_model, clustering_filename)\n",
    "        print(f\"  ✅ Best clustering model saved: {clustering_filename}\")\n",
    "        \n",
    "        export_summary['models_exported'].append(print(f"\\n🎯 Clustering analysis completed with {best_clustering_method}")
  
  ]
  },
  {
   "cell_type": "markdown",
   "id": "time_series_models",
   "metadata": {},
   "source": [
    "## 7. 📈 Time Series Analysis and Forecasting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "time_series",
   "metadata": {},
   "outputs": [],
   "source": [
    "def time_series_analysis(df_enhanced):\n",
    "    \"\"\"Run trend, stationarity, decomposition, ARIMA and ML forecasting steps.\n",
    "\n",
    "    Args:\n",
    "        df_enhanced: DataFrame; the columns read here are 'year', 'emissions',\n",
    "            'type', 'region', 'country', 'population', 'gdp_per_capita',\n",
    "            'energy_intensity' and 'year_normalized'.\n",
    "\n",
    "    Returns:\n",
    "        dict with 'arima_forecast' and 'forecast_ci' (both None when ARIMA\n",
    "        fails), 'ml_forecasts' (country -> predicted emissions) and\n",
    "        'trend_analysis' ('avg_growth_rate', 'is_stationary',\n",
    "        'time_series_data'). 'is_stationary' is None when the ADF test\n",
    "        could not be run.\n",
    "    \"\"\"\n",
    "    print(f\"\\n📈 TIME SERIES ANALYSIS:\")\n",
    "    print(\"=\"*35)\n",
    "    \n",
    "    # Yearly total, mean and record count over the whole dataset\n",
    "    ts_data = df_enhanced.groupby(['year'])['emissions'].agg(['sum', 'mean', 'count']).reset_index()\n",
    "    ts_data = ts_data.sort_values('year')\n",
    "    \n",
    "    print(f\"📊 Time series data shape: {ts_data.shape}\")\n",
    "    print(f\"📅 Years covered: {ts_data['year'].min()} - {ts_data['year'].max()}\")\n",
    "    \n",
    "    # Year x sector table of summed emissions (missing combinations become 0)\n",
    "    ts_sector = df_enhanced.groupby(['year', 'type'])['emissions'].sum().reset_index()\n",
    "    ts_sector_pivot = ts_sector.pivot(index='year', columns='type', values='emissions').fillna(0)\n",
    "    \n",
    "    # Year x region table of summed emissions (missing combinations become 0)\n",
    "    ts_region = df_enhanced.groupby(['year', 'region'])['emissions'].sum().reset_index()\n",
    "    ts_region_pivot = ts_region.pivot(index='year', columns='region', values='emissions').fillna(0)\n",
    "    \n",
    "    ts_results = {}\n",
    "    \n",
    "    # 1. Trend Analysis\n",
    "    print(f\"\\n📊 TREND ANALYSIS:\")\n",
    "    print(\"=\"*20)\n",
    "    \n",
    "    # Year-over-year growth rates in percent (first year is NaN)\n",
    "    ts_data['total_growth_rate'] = ts_data['sum'].pct_change() * 100\n",
    "    ts_data['avg_growth_rate'] = ts_data['mean'].pct_change() * 100\n",
    "    \n",
    "    avg_growth = ts_data['total_growth_rate'].mean()\n",
    "    print(f\"  • Average annual growth rate: {avg_growth:.2f}%\")\n",
    "    print(f\"  • Total emissions trend: {'Increasing' if avg_growth > 0 else 'Decreasing'}\")\n",
    "    \n",
    "    # 2. Stationarity Test\n",
    "    print(f\"\\n🔍 STATIONARITY TESTING:\")\n",
    "    print(\"=\"*25)\n",
    "    \n",
    "    # Augmented Dickey-Fuller test on the yearly totals\n",
    "    total_emissions = ts_data['sum'].values\n",
    "    try:\n",
    "        adf_result = adfuller(total_emissions)\n",
    "    except ValueError as e:\n",
    "        # Guarded like the decomposition and ARIMA steps below, so a series the\n",
    "        # test cannot handle does not abort the rest of the analysis\n",
    "        print(f\"  ⚠️ ADF test failed: {str(e)}\")\n",
    "        is_stationary = None\n",
    "    else:\n",
    "        print(f\"  • ADF Statistic: {adf_result[0]:.4f}\")\n",
    "        print(f\"  • p-value: {adf_result[1]:.4f}\")\n",
    "        print(f\"  • Critical Values: {adf_result[4]}\")\n",
    "        \n",
    "        # Stationary when the p-value is at or below the 5% level\n",
    "        is_stationary = adf_result[1] <= 0.05\n",
    "        print(f\"  • Series is {'stationary' if is_stationary else 'non-stationary'}\")\n",
    "    \n",
    "    # 3. Seasonal Decomposition\n",
    "    print(f\"\\n🔄 SEASONAL DECOMPOSITION:\")\n",
    "    print(\"=\"*30)\n",
    "    \n",
    "    if len(ts_data) >= 4:  # Need at least 4 observations\n",
    "        try:\n",
    "            # Create time index\n",
    "            ts_data['date'] = pd.to_datetime(ts_data['year'], format='%Y')\n",
    "            ts_series = ts_data.set_index('date')['sum']\n",
    "            \n",
    "            # Perform seasonal decomposition\n",
    "            decomposition = seasonal_decompose(ts_series, model='additive', period=2)\n",
    "            \n",
    "            # Visualize decomposition\n",
    "            fig, axes = plt.subplots(4, 1, figsize=(15, 12))\n",
    "            fig.suptitle('📊 Time Series Decomposition', fontsize=16, fontweight='bold')\n",
    "            \n",
    "            decomposition.observed.plot(ax=axes[0], title='Original', color='blue')\n",
    "            decomposition.trend.plot(ax=axes[1], title='Trend', color='green')\n",
    "            decomposition.seasonal.plot(ax=axes[2], title='Seasonal', color='orange')\n",
    "            decomposition.resid.plot(ax=axes[3], title='Residual', color='red')\n",
    "            \n",
    "            for ax in axes:\n",
    "                ax.grid(True, alpha=0.3)\n",
    "                ax.set_ylabel('Emissions')\n",
    "            \n",
    "            plt.tight_layout()\n",
    "            plt.show()\n",
    "            \n",
    "            # Strength = 1 - Var(residual) / Var(component + residual)\n",
    "            trend_strength = 1 - (decomposition.resid.var() / (decomposition.trend + decomposition.resid).var())\n",
    "            seasonal_strength = 1 - (decomposition.resid.var() / (decomposition.seasonal + decomposition.resid).var())\n",
    "            \n",
    "            print(f\"  • Trend strength: {trend_strength:.4f}\")\n",
    "            print(f\"  • Seasonal strength: {seasonal_strength:.4f}\")\n",
    "            \n",
    "        except Exception as e:\n",
    "            print(f\"  ⚠️ Seasonal decomposition failed: {str(e)}\")\n",
    "    \n",
    "    # 4. ARIMA Modeling\n",
    "    print(f\"\\n🎯 ARIMA MODELING:\")\n",
    "    print(\"=\"*20)\n",
    "    \n",
    "    try:\n",
    "        # The model is fitted on the undifferenced totals; order=(1,1,1) asks\n",
    "        # ARIMA itself for one level of differencing\n",
    "        arima_model = ARIMA(total_emissions, order=(1,1,1))\n",
    "        arima_fit = arima_model.fit()\n",
    "        \n",
    "        print(f\"  • ARIMA(1,1,1) Model Summary:\")\n",
    "        print(f\"  • AIC: {arima_fit.aic:.2f}\")\n",
    "        print(f\"  • BIC: {arima_fit.bic:.2f}\")\n",
    "        \n",
    "        # Make forecasts\n",
    "        forecast_steps = 2\n",
    "        forecast = arima_fit.forecast(steps=forecast_steps)\n",
    "        forecast_ci = arima_fit.get_forecast(steps=forecast_steps).conf_int()\n",
    "        \n",
    "        ts_results['arima_forecast'] = forecast\n",
    "        ts_results['forecast_ci'] = forecast_ci\n",
    "        \n",
    "        print(f\"  • Forecast for next {forecast_steps} periods: {forecast.round(2).tolist()}\")\n",
    "        \n",
    "    except Exception as e:\n",
    "        print(f\"  ⚠️ ARIMA modeling failed: {str(e)}\")\n",
    "        # Keep both keys present so callers can test for None instead of a missing key\n",
    "        ts_results['arima_forecast'] = None\n",
    "        ts_results['forecast_ci'] = None\n",
    "    \n",
    "    # 5. Visualize Time Series Analysis\n",
    "    fig, axes = plt.subplots(2, 2, figsize=(16, 12))\n",
    "    fig.suptitle('📈 Time Series Analysis Dashboard', fontsize=16, fontweight='bold')\n",
    "    \n",
    "    # Total emissions over time\n",
    "    axes[0,0].plot(ts_data['year'], ts_data['sum'], 'bo-', linewidth=2, markersize=6)\n",
    "    axes[0,0].set_xlabel('Year')\n",
    "    axes[0,0].set_ylabel('Total Emissions (Mt CO₂e)')\n",
    "    axes[0,0].set_title('Total Emissions Trend')\n",
    "    axes[0,0].grid(True, alpha=0.3)\n",
    "    \n",
    "    # Degree-1 least-squares fit drawn as the trend line; z[0] is its slope\n",
    "    z = np.polyfit(ts_data['year'], ts_data['sum'], 1)\n",
    "    p = np.poly1d(z)\n",
    "    axes[0,0].plot(ts_data['year'], p(ts_data['year']), 'r--', alpha=0.7, label=f'Trend: {z[0]:.1f}/year')\n",
    "    axes[0,0].legend()\n",
    "    \n",
    "    # Growth rates (the first year has no predecessor, so it is skipped)\n",
    "    axes[0,1].bar(ts_data['year'][1:], ts_data['total_growth_rate'][1:], alpha=0.7, color='green')\n",
    "    axes[0,1].axhline(y=0, color='red', linestyle='--', alpha=0.7)\n",
    "    axes[0,1].set_xlabel('Year')\n",
    "    axes[0,1].set_ylabel('Growth Rate (%)')\n",
    "    axes[0,1].set_title('Year-over-Year Growth Rate')\n",
    "    axes[0,1].grid(True, alpha=0.3)\n",
    "    \n",
    "    # Emissions by sector over time\n",
    "    for sector in ts_sector_pivot.columns:\n",
    "        axes[1,0].plot(ts_sector_pivot.index, ts_sector_pivot[sector],\n",
    "                      marker='o', linewidth=2, label=sector)\n",
    "    axes[1,0].set_xlabel('Year')\n",
    "    axes[1,0].set_ylabel('Emissions (Mt CO₂e)')\n",
    "    axes[1,0].set_title('Emissions by Sector')\n",
    "    axes[1,0].legend()\n",
    "    axes[1,0].grid(True, alpha=0.3)\n",
    "    \n",
    "    # Emissions by region over time\n",
    "    for region in ts_region_pivot.columns:\n",
    "        axes[1,1].plot(ts_region_pivot.index, ts_region_pivot[region],\n",
    "                      marker='s', linewidth=2, label=region)\n",
    "    axes[1,1].set_xlabel('Year')\n",
    "    axes[1,1].set_ylabel('Emissions (Mt CO₂e)')\n",
    "    axes[1,1].set_title('Emissions by Region')\n",
    "    axes[1,1].legend()\n",
    "    axes[1,1].grid(True, alpha=0.3)\n",
    "    \n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "    \n",
    "    # 6. Forecasting with Machine Learning\n",
    "    print(f\"\\n🤖 ML-BASED FORECASTING:\")\n",
    "    print(\"=\"*30)\n",
    "    \n",
    "    # Prepare data for ML forecasting\n",
    "    forecast_features = ['population', 'gdp_per_capita', 'energy_intensity', 'year_normalized']\n",
    "    \n",
    "    # The five countries with the largest summed emissions\n",
    "    major_countries = df_enhanced.groupby('country')['emissions'].sum().nlargest(5).index\n",
    "    \n",
    "    ml_forecasts = {}\n",
    "    \n",
    "    for country in major_countries:\n",
    "        country_data = df_enhanced[df_enhanced['country'] == country].copy()\n",
    "        # One row per year: summed emissions plus the first value of each feature\n",
    "        country_ts = country_data.groupby('year').agg({\n",
    "            'emissions': 'sum',\n",
    "            'population': 'first',\n",
    "            'gdp_per_capita': 'first',\n",
    "            'energy_intensity': 'first',\n",
    "            'year_normalized': 'first'\n",
    "        }).reset_index()\n",
    "        \n",
    "        if len(country_ts) >= 3:  # Need at least 3 data points\n",
    "            # Use Random Forest for forecasting\n",
    "            X_country = country_ts[forecast_features]\n",
    "            y_country = country_ts['emissions']\n",
    "            \n",
    "            rf_forecast = RandomForestRegressor(n_estimators=50, random_state=RANDOM_STATE)\n",
    "            rf_forecast.fit(X_country, y_country)\n",
    "            \n",
    "            # Next-year input: the last observed row with year_normalized advanced\n",
    "            # by one year's share of the dataset's year span; other features are\n",
    "            # carried over unchanged\n",
    "            next_year_features = X_country.iloc[-1:].copy()\n",
    "            next_year_features['year_normalized'] += 1 / (df_enhanced['year'].max() - df_enhanced['year'].min())\n",
    "            \n",
    "            forecast_emission = rf_forecast.predict(next_year_features)[0]\n",
    "            ml_forecasts[country] = forecast_emission\n",
    "    \n",
    "    print(f\"  • ML forecasts for major emitters (next year):\")\n",
    "    for country, predicted in ml_forecasts.items():\n",
    "        # Baseline is the country's total emissions divided by the number of\n",
    "        # distinct years in the whole dataset, not its latest-year value\n",
    "        avg_annual_emission = df_enhanced[df_enhanced['country'] == country]['emissions'].sum() / df_enhanced['year'].nunique()\n",
    "        change_pct = (predicted - avg_annual_emission) / avg_annual_emission * 100\n",
    "        print(f\"    - {country}: {predicted:.1f} Mt CO₂e ({change_pct:+.1f}%)\")\n",
    "    \n",
    "    ts_results['ml_forecasts'] = ml_forecasts\n",
    "    ts_results['trend_analysis'] = {\n",
    "        'avg_growth_rate': avg_growth,\n",
    "        'is_stationary': is_stationary,\n",
    "        'time_series_data': ts_data\n",
    "    }\n",
    "    \n",
    "    return ts_results\n",
    "\n",
    "# Perform time series analysis\n",
    "ts_results = time_series_analysis(df_enhanced)\n",
    "\n",
    "print(f\"\\n📈 Time series analysis completed!\")"print(f"\\n🏆 Best tuned model: {best_model_name}")
print(f"📊 Final test R²: {tuned_results[best_model_name]['test_r2']:.4f}")
  
  ]
  },
  {
   "cell_type": "markdown",
   "id": "classification_models",
   "metadata": {},
   "source": [
    "## 5. 🎯 Classification Models for Emission Categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "classification",
   "metadata": {},
   "outputs": [],
   "source": [
    "def train_classification_models(X, y_continuous, df_enhanced):\n",
    "    \"\"\"Train and compare classifiers that predict Low/Medium/High emission bands.\n",
    "\n",
    "    Args:\n",
    "        X: feature matrix passed to train_test_split and the models.\n",
    "        y_continuous: pandas Series of emission values; it is binned at its\n",
    "            0.33 and 0.66 quantiles into 'Low', 'Medium' and 'High'.\n",
    "        df_enhanced: not used by this function; kept so existing calls work.\n",
    "\n",
    "    Returns:\n",
    "        Tuple of (results DataFrame indexed by model name and sorted by\n",
    "        'f1_score' descending, dict of model name -> fitted model,\n",
    "        (X_train, X_test, y_train, y_test), fitted StandardScaler).\n",
    "    \"\"\"\n",
    "    print(f\"\\n🎯 CLASSIFICATION MODEL TRAINING:\")\n",
    "    print(\"=\"*40)\n",
    "    \n",
    "    # Create classification target based on emission levels\n",
    "    emission_quantiles = y_continuous.quantile([0.33, 0.66, 1.0])\n",
    "    \n",
    "    def categorize_emissions(emission):\n",
    "        if emission <= emission_quantiles.iloc[0]:\n",
    "            return 'Low'\n",
    "        elif emission <= emission_quantiles.iloc[1]:\n",
    "            return 'Medium'\n",
    "        else:\n",
    "            return 'High'\n",
    "    \n",
    "    y_categorical = y_continuous.apply(categorize_emissions)\n",
    "    \n",
    "    print(f\"📊 Emission categories distribution:\")\n",
    "    print(y_categorical.value_counts())\n",
    "    \n",
    "    # Stratified 80/20 split so each band keeps its share in both parts\n",
    "    X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(\n",
    "        X, y_categorical, test_size=0.2, random_state=RANDOM_STATE, stratify=y_categorical\n",
    "    )\n",
    "    \n",
    "    # Define classification models\n",
    "    cls_models = {\n",
    "        'Logistic Regression': LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),\n",
    "        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),\n",
    "        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=RANDOM_STATE),\n",
    "        'SVM': SVC(random_state=RANDOM_STATE, probability=True),\n",
    "        'Naive Bayes': GaussianNB()\n",
    "    }\n",
    "    \n",
    "    # Train and evaluate classification models\n",
    "    cls_results = {}\n",
    "    cls_model_objects = {}\n",
    "    \n",
    "    # Scaler fitted on the training split only; it is also returned to the caller\n",
    "    scaler_cls = StandardScaler()\n",
    "    X_train_cls_scaled = scaler_cls.fit_transform(X_train_cls)\n",
    "    X_test_cls_scaled = scaler_cls.transform(X_test_cls)\n",
    "    \n",
    "    for name, model in cls_models.items():\n",
    "        print(f\"\\n🔧 Training {name}...\")\n",
    "        \n",
    "        # Logistic Regression and SVM are trained on standardized features,\n",
    "        # the remaining models on the raw features\n",
    "        if name in ['Logistic Regression', 'SVM']:\n",
    "            X_fit, X_eval = X_train_cls_scaled, X_test_cls_scaled\n",
    "            # Scaling happens inside each CV fold so the validation part of a\n",
    "            # fold never contributes to the scaling statistics\n",
    "            cv_estimator = Pipeline([('scaler', StandardScaler()), ('model', model)])\n",
    "        else:\n",
    "            X_fit, X_eval = X_train_cls, X_test_cls\n",
    "            cv_estimator = model\n",
    "        \n",
    "        model.fit(X_fit, y_train_cls)\n",
    "        y_pred_cls = model.predict(X_eval)\n",
    "        y_pred_proba = model.predict_proba(X_eval) if hasattr(model, 'predict_proba') else None\n",
    "        cv_scores = cross_val_score(cv_estimator, X_train_cls, y_train_cls, cv=5, scoring='accuracy')\n",
    "        \n",
    "        # Calculate metrics\n",
    "        accuracy = accuracy_score(y_test_cls, y_pred_cls)\n",
    "        precision = precision_score(y_test_cls, y_pred_cls, average='weighted')\n",
    "        recall = recall_score(y_test_cls, y_pred_cls, average='weighted')\n",
    "        f1 = f1_score(y_test_cls, y_pred_cls, average='weighted')\n",
    "        \n",
    "        cls_results[name] = {\n",
    "            'accuracy': accuracy,\n",
    "            'precision': precision,\n",
    "            'recall': recall,\n",
    "            'f1_score': f1,\n",
    "            'cv_accuracy_mean': cv_scores.mean(),\n",
    "            'cv_accuracy_std': cv_scores.std(),\n",
    "            'predictions': y_pred_cls,\n",
    "            'probabilities': y_pred_proba\n",
    "        }\n",
    "        \n",
    "        cls_model_objects[name] = model\n",
    "        \n",
    "        print(f\"  ✅ Accuracy: {accuracy:.4f}, F1: {f1:.4f}\")\n",
    "    \n",
    "    # One row per model, best F1 first\n",
    "    cls_results_df = pd.DataFrame(cls_results).T\n",
    "    cls_results_df = cls_results_df.sort_values('f1_score', ascending=False)\n",
    "    \n",
    "    print(f\"\\n🏆 CLASSIFICATION PERFORMANCE:\")\n",
    "    print(\"=\"*35)\n",
    "    # The frame also stores arrays, so its columns are object dtype; cast the\n",
    "    # metric columns to float, otherwise round() leaves them unchanged\n",
    "    metric_cols = ['accuracy', 'precision', 'recall', 'f1_score']\n",
    "    print(cls_results_df[metric_cols].astype(float).round(4))\n",
    "    \n",
    "    # Visualize classification results\n",
    "    fig, axes = plt.subplots(2, 2, figsize=(16, 12))\n",
    "    fig.suptitle('🎯 Classification Models Performance', fontsize=16, fontweight='bold')\n",
    "    \n",
    "    models_list = cls_results_df.index.tolist()\n",
    "    \n",
    "    # Accuracy comparison\n",
    "    accuracy_vals = cls_results_df['accuracy'].astype(float).values\n",
    "    axes[0,0].bar(models_list, accuracy_vals, color='lightblue', alpha=0.7)\n",
    "    axes[0,0].set_ylabel('Accuracy')\n",
    "    axes[0,0].set_title('Model Accuracy Comparison')\n",
    "    axes[0,0].tick_params(axis='x', rotation=45)\n",
    "    axes[0,0].grid(True, alpha=0.3)\n",
    "    \n",
    "    for i, v in enumerate(accuracy_vals):\n",
    "        axes[0,0].text(i, v + 0.01, f'{v:.3f}', ha='center', va='bottom')\n",
    "    \n",
    "    # F1 Score comparison\n",
    "    f1_vals = cls_results_df['f1_score'].astype(float).values\n",
    "    axes[0,1].bar(models_list, f1_vals, color='lightgreen', alpha=0.7)\n",
    "    axes[0,1].set_ylabel('F1 Score')\n",
    "    axes[0,1].set_title('F1 Score Comparison')\n",
    "    axes[0,1].tick_params(axis='x', rotation=45)\n",
    "    axes[0,1].grid(True, alpha=0.3)\n",
    "    \n",
    "    for i, v in enumerate(f1_vals):\n",
    "        axes[0,1].text(i, v + 0.01, f'{v:.3f}', ha='center', va='bottom')\n",
    "    \n",
    "    # Cross-validation scores with their standard deviation as error bars\n",
    "    cv_means = cls_results_df['cv_accuracy_mean'].astype(float).values\n",
    "    cv_stds = cls_results_df['cv_accuracy_std'].astype(float).values\n",
    "    \n",
    "    axes[1,0].bar(models_list, cv_means, yerr=cv_stds, capsize=5, alpha=0.7, color='orange')\n",
    "    axes[1,0].set_ylabel('CV Accuracy')\n",
    "    axes[1,0].set_title('5-Fold Cross-Validation Accuracy')\n",
    "    axes[1,0].tick_params(axis='x', rotation=45)\n",
    "    axes[1,0].grid(True, alpha=0.3)\n",
    "    \n",
    "    # Confusion Matrix for best model\n",
    "    best_cls_model_name = cls_results_df.index[0]\n",
    "    best_cls_predictions = cls_results_df.loc[best_cls_model_name, 'predictions']\n",
    "    \n",
    "    # The same label list orders the matrix and names the ticks, so they cannot disagree\n",
    "    class_labels = sorted(np.unique(y_test_cls))\n",
    "    cm = confusion_matrix(y_test_cls, best_cls_predictions, labels=class_labels)\n",
    "    im = axes[1,1].imshow(cm, interpolation='nearest', cmap='Blues')\n",
    "    axes[1,1].set_title(f'Confusion Matrix: {best_cls_model_name}')\n",
    "    \n",
    "    # Add colorbar\n",
    "    plt.colorbar(im, ax=axes[1,1])\n",
    "    \n",
    "    # Cell counts, white on cells darker than half the maximum count\n",
    "    thresh = cm.max() / 2\n",
    "    for i in range(cm.shape[0]):\n",
    "        for j in range(cm.shape[1]):\n",
    "            axes[1,1].text(j, i, format(cm[i, j], 'd'),\n",
    "                         ha='center', va='center',\n",
    "                         color='white' if cm[i, j] > thresh else 'black')\n",
    "    \n",
    "    # Set tick labels\n",
    "    tick_marks = np.arange(len(class_labels))\n",
    "    axes[1,1].set_xticks(tick_marks)\n",
    "    axes[1,1].set_yticks(tick_marks)\n",
    "    axes[1,1].set_xticklabels(class_labels)\n",
    "    axes[1,1].set_yticklabels(class_labels)\n",
    "    axes[1,1].set_xlabel('Predicted Label')\n",
    "    axes[1,1].set_ylabel('True Label')\n",
    "    \n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "    \n",
    "    # Detailed classification report for best model\n",
    "    print(f\"\\n📋 DETAILED CLASSIFICATION REPORT - {best_cls_model_name}:\")\n",
    "    print(\"=\"*60)\n",
    "    print(classification_report(y_test_cls, best_cls_predictions))\n",
    "    \n",
    "    return cls_results_df, cls_model_objects, (X_train_cls, X_test_cls, y_train_cls, y_test_cls), scaler_cls\n",
    "\n",
    "# Train classification models\n",
    "cls_results, cls_trained_models, cls_data_splits, cls_scaler = train_classification_models(\n",
    "    X_selected, y, df_enhanced\n",
    ")\n",
    "\n",
    "print(f\"\\n🏆 Best classification model: {cls_results.index[0]}\")\n",
    "print(f\"📊 Best F1 score: {cls_results.iloc[0]['f1_score']:.4f}\")"
  
  ]
  },
  {
   "cell_type": "markdown",
   "id": "clustering_analysis",
   "metadata": {},
   "source": [
    "## 6. 🎯 Clustering Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "clustering",
   "metadata": {},
   "outputs": [],
   "source": [
    "def perform_clustering_analysis(X, df_enhanced):\n",
    "    \"\"\"Cluster the samples with K-Means, DBSCAN and agglomerative clustering.\n",
    "\n",
    "    Args:\n",
    "        X: feature matrix; it is standardized before clustering.\n",
    "        df_enhanced: DataFrame that receives the winning labels as a 'cluster'\n",
    "            column (on a copy). The columns 'emissions', 'population',\n",
    "            'gdp_per_capita', 'energy_intensity', 'forest_cover_pct', 'country'\n",
    "            and 'type' are read for the per-cluster report.\n",
    "\n",
    "    Returns:\n",
    "        Tuple of (dict of method name -> result dict with 'labels',\n",
    "        'n_clusters', 'silhouette_score' and 'model', name of the method with\n",
    "        the highest silhouette score, labelled copy of df_enhanced, fitted\n",
    "        StandardScaler).\n",
    "    \"\"\"\n",
    "    print(f\"\\n🎯 CLUSTERING ANALYSIS:\")\n",
    "    print(\"=\"*30)\n",
    "    \n",
    "    # Prepare data for clustering\n",
    "    scaler_cluster = StandardScaler()\n",
    "    X_scaled = scaler_cluster.fit_transform(X)\n",
    "    \n",
    "    clustering_results = {}\n",
    "    \n",
    "    # 1. K-Means Clustering\n",
    "    print(\"\\n🔍 K-Means Clustering Analysis...\")\n",
    "    \n",
    "    # Record inertia (elbow plot) and silhouette score for k = 2..10\n",
    "    k_range = range(2, 11)\n",
    "    inertias = []\n",
    "    silhouette_scores = []\n",
    "    \n",
    "    for k in k_range:\n",
    "        kmeans = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init=10)\n",
    "        cluster_labels = kmeans.fit_predict(X_scaled)\n",
    "        inertias.append(kmeans.inertia_)\n",
    "        \n",
    "        # A single resulting label is scored 0 instead of calling silhouette_score\n",
    "        if len(set(cluster_labels)) > 1:\n",
    "            sil_score = silhouette_score(X_scaled, cluster_labels)\n",
    "            silhouette_scores.append(sil_score)\n",
    "        else:\n",
    "            silhouette_scores.append(0)\n",
    "    \n",
    "    # The k with the highest silhouette score wins\n",
    "    optimal_k = k_range[np.argmax(silhouette_scores)]\n",
    "    \n",
    "    print(f\"  • Optimal number of clusters: {optimal_k}\")\n",
    "    print(f\"  • Best silhouette score: {max(silhouette_scores):.4f}\")\n",
    "    \n",
    "    # Perform final K-means clustering\n",
    "    final_kmeans = KMeans(n_clusters=optimal_k, random_state=RANDOM_STATE, n_init=10)\n",
    "    kmeans_labels = final_kmeans.fit_predict(X_scaled)\n",
    "    \n",
    "    clustering_results['K-Means'] = {\n",
    "        'labels': kmeans_labels,\n",
    "        'n_clusters': optimal_k,\n",
    "        'silhouette_score': silhouette_score(X_scaled, kmeans_labels),\n",
    "        'model': final_kmeans\n",
    "    }\n",
    "    \n",
    "    # 2. DBSCAN Clustering\n",
    "    print(\"\\n🌐 DBSCAN Clustering Analysis...\")\n",
    "    \n",
    "    # Try different eps values; 0.5 is kept when no candidate qualifies\n",
    "    eps_range = [0.3, 0.5, 0.7, 1.0, 1.5]\n",
    "    best_dbscan_score = -1\n",
    "    best_dbscan_eps = 0.5\n",
    "    \n",
    "    for eps in eps_range:\n",
    "        dbscan = DBSCAN(eps=eps, min_samples=5)\n",
    "        dbscan_labels = dbscan.fit_predict(X_scaled)\n",
    "        \n",
    "        # Label -1 marks noise and is not counted as a cluster\n",
    "        n_clusters = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)\n",
    "        n_noise = list(dbscan_labels).count(-1)\n",
    "        \n",
    "        # Only score runs with several clusters and under 50% noise; noise\n",
    "        # points are passed to silhouette_score together with the clustered ones\n",
    "        if n_clusters > 1 and n_noise < len(dbscan_labels) * 0.5:\n",
    "            try:\n",
    "                sil_score = silhouette_score(X_scaled, dbscan_labels)\n",
    "                if sil_score > best_dbscan_score:\n",
    "                    best_dbscan_score = sil_score\n",
    "                    best_dbscan_eps = eps\n",
    "            except ValueError:\n",
    "                # Skip an eps whose labelling cannot be scored; anything other\n",
    "                # than a ValueError is a real error and is allowed to propagate\n",
    "                continue\n",
    "    \n",
    "    # Final DBSCAN with best parameters\n",
    "    final_dbscan = DBSCAN(eps=best_dbscan_eps, min_samples=5)\n",
    "    dbscan_labels = final_dbscan.fit_predict(X_scaled)\n",
    "    \n",
    "    dbscan_n_clusters = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)\n",
    "    dbscan_n_noise = list(dbscan_labels).count(-1)\n",
    "    \n",
    "    clustering_results['DBSCAN'] = {\n",
    "        'labels': dbscan_labels,\n",
    "        'n_clusters': dbscan_n_clusters,\n",
    "        'n_noise': dbscan_n_noise,\n",
    "        'eps': best_dbscan_eps,\n",
    "        # 0 is reported when no eps candidate qualified for scoring\n",
    "        'silhouette_score': best_dbscan_score if best_dbscan_score > -1 else 0,\n",
    "        'model': final_dbscan\n",
    "    }\n",
    "    \n",
    "    print(f\"  • Number of clusters: {dbscan_n_clusters}\")\n",
    "    print(f\"  • Number of noise points: {dbscan_n_noise}\")\n",
    "    print(f\"  • Best eps: {best_dbscan_eps}\")\n",
    "    \n",
    "    # 3. Hierarchical Clustering (reuses the k chosen for K-Means)\n",
    "    print(\"\\n🌳 Agglomerative Clustering Analysis...\")\n",
    "    \n",
    "    agg_clustering = AgglomerativeClustering(n_clusters=optimal_k)\n",
    "    agg_labels = agg_clustering.fit_predict(X_scaled)\n",
    "    agg_silhouette = silhouette_score(X_scaled, agg_labels)\n",
    "    \n",
    "    clustering_results['Agglomerative'] = {\n",
    "        'labels': agg_labels,\n",
    "        'n_clusters': optimal_k,\n",
    "        'silhouette_score': agg_silhouette,\n",
    "        'model': agg_clustering\n",
    "    }\n",
    "    \n",
    "    print(f\"  • Silhouette score: {agg_silhouette:.4f}\")\n",
    "    \n",
    "    # Visualize clustering results\n",
    "    fig, axes = plt.subplots(2, 3, figsize=(20, 12))\n",
    "    fig.suptitle('🎯 Clustering Analysis Results', fontsize=16, fontweight='bold')\n",
    "    \n",
    "    # Elbow curve and silhouette scores, with the chosen k marked in red\n",
    "    axes[0,0].plot(k_range, inertias, 'bo-')\n",
    "    axes[0,0].set_xlabel('Number of clusters (k)')\n",
    "    axes[0,0].set_ylabel('Inertia')\n",
    "    axes[0,0].set_title('Elbow Method')\n",
    "    axes[0,0].grid(True, alpha=0.3)\n",
    "    axes[0,0].axvline(x=optimal_k, color='red', linestyle='--', alpha=0.7)\n",
    "    \n",
    "    axes[0,1].plot(k_range, silhouette_scores, 'ro-')\n",
    "    axes[0,1].set_xlabel('Number of clusters (k)')\n",
    "    axes[0,1].set_ylabel('Silhouette Score')\n",
    "    axes[0,1].set_title('Silhouette Analysis')\n",
    "    axes[0,1].grid(True, alpha=0.3)\n",
    "    axes[0,1].axvline(x=optimal_k, color='red', linestyle='--', alpha=0.7)\n",
    "    \n",
    "    # Project the scaled features onto two principal components for plotting\n",
    "    pca = PCA(n_components=2, random_state=RANDOM_STATE)\n",
    "    X_pca = pca.fit_transform(X_scaled)\n",
    "    \n",
    "    # K-Means clusters in PCA space\n",
    "    scatter = axes[0,2].scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans_labels, cmap='viridis', alpha=0.6)\n",
    "    axes[0,2].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')\n",
    "    axes[0,2].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')\n",
    "    axes[0,2].set_title(f'K-Means Clusters (k={optimal_k})')\n",
    "    plt.colorbar(scatter, ax=axes[0,2])\n",
    "    \n",
    "    # DBSCAN clusters in PCA space\n",
    "    scatter2 = axes[1,0].scatter(X_pca[:, 0], X_pca[:, 1], c=dbscan_labels, cmap='viridis', alpha=0.6)\n",
    "    axes[1,0].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')\n",
    "    axes[1,0].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')\n",
    "    axes[1,0].set_title(f'DBSCAN Clusters ({dbscan_n_clusters} clusters)')\n",
    "    plt.colorbar(scatter2, ax=axes[1,0])\n",
    "    \n",
    "    # Agglomerative clusters in PCA space\n",
    "    scatter3 = axes[1,1].scatter(X_pca[:, 0], X_pca[:, 1], c=agg_labels, cmap='viridis', alpha=0.6)\n",
    "    axes[1,1].set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')\n",
    "    axes[1,1].set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')\n",
    "    axes[1,1].set_title(f'Agglomerative Clusters (k={optimal_k})')\n",
    "    plt.colorbar(scatter3, ax=axes[1,1])\n",
    "    \n",
    "    # Silhouette comparison\n",
    "    methods = ['K-Means', 'DBSCAN', 'Agglomerative']\n",
    "    sil_scores = [clustering_results[method]['silhouette_score'] for method in methods]\n",
    "    \n",
    "    bars = axes[1,2].bar(methods, sil_scores, color=['#FF6B6B', '#4ECDC4', '#45B7D1'])\n",
    "    axes[1,2].set_ylabel('Silhouette Score')\n",
    "    axes[1,2].set_title('Clustering Methods Comparison')\n",
    "    axes[1,2].grid(True, alpha=0.3)\n",
    "    \n",
    "    for bar, score in zip(bars, sil_scores):\n",
    "        axes[1,2].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.005,\n",
    "                      f'{score:.3f}', ha='center', va='bottom')\n",
    "    \n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "    \n",
    "    # The method with the highest stored silhouette score is reported as best\n",
    "    best_clustering_method = max(clustering_results.keys(),\n",
    "                                key=lambda x: clustering_results[x]['silhouette_score'])\n",
    "    best_labels = clustering_results[best_clustering_method]['labels']\n",
    "    \n",
    "    print(f\"\\n🏆 Best clustering method: {best_clustering_method}\")\n",
    "    print(f\"📊 Silhouette score: {clustering_results[best_clustering_method]['silhouette_score']:.4f}\")\n",
    "    \n",
    "    # Attach labels to a copy of the data\n",
    "    # NOTE(review): assumes X and df_enhanced have the same rows in the same order - confirm\n",
    "    cluster_df = df_enhanced.copy()\n",
    "    cluster_df['cluster'] = best_labels\n",
    "    \n",
    "    print(f\"\\n📋 CLUSTER CHARACTERISTICS:\")\n",
    "    print(\"=\"*30)\n",
    "    \n",
    "    # Analyze key features by cluster\n",
    "    key_features = ['emissions', 'population', 'gdp_per_capita', 'energy_intensity', 'forest_cover_pct']\n",
    "    \n",
    "    print(\"Cluster Statistics (Mean ± Std):\")\n",
    "    for cluster in sorted(cluster_df['cluster'].unique()):\n",
    "        if cluster != -1:  # Skip noise points for DBSCAN\n",
    "            print(f\"\\nCluster {cluster}:\")\n",
    "            cluster_data = cluster_df[cluster_df['cluster'] == cluster]\n",
    "            print(f\"  • Size: {len(cluster_data)} ({len(cluster_data)/len(cluster_df)*100:.1f}%)\")\n",
    "            \n",
    "            for feature in key_features:\n",
    "                mean_val = cluster_data[feature].mean()\n",
    "                std_val = cluster_data[feature].std()\n",
    "                print(f\"  • {feature}: {mean_val:.1f} ± {std_val:.1f}\")\n",
    "            \n",
    "            # Most common countries in cluster\n",
    "            top_countries = cluster_data['country'].value_counts().head(3)\n",
    "            print(f\"  • Top countries: {', '.join(top_countries.index.tolist())}\")\n",
    "            \n",
    "            # Most common sectors\n",
    "            top_sectors = cluster_data['type'].value_counts().head(2)\n",
    "            print(f\"  • Main sectors: {', '.join(top_sectors.index.tolist())}\")\n",
    "    \n",
    "    return clustering_results, best_clustering_method, cluster_df, scaler_cluster\n",
    "\n",
    "# Perform clustering analysis\n",
    "clustering_results, best_clustering_method, cluster_df, cluster_scaler = perform_clustering_analysis(\n",
    "    X_selected, df_enhanced\n",
    ")\n",
    "\n",
    "print(f\"\\n🎯 Clustering analysis completed with {best_clustering_method}\")"{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "header",
   "metadata": {},
   "source": [
    "# 🤖 Greenhouse Gas Analytics - Model Development\n",
    "## Notebook 05: Advanced Machine Learning Models for Methane Emissions\n",
    "\n",
    "**Objective:** Develop predictive and classification models for analyzing methane emissions\n",
    "\n",
    "**Models implemented:**\n",
    "- Regression for emission forecasting\n",
    "- Classification of countries by emission level\n",
    "- Clustering of emission patterns\n",
    "- Time series models\n",
    "- Feature importance analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "imports",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Greenhouse Gas Analytics - Model Development\n",
    "# Notebook 05: Advanced Machine Learning for Methane Emissions Prediction\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import plotly.express as px\n",
    "import plotly.graph_objects as go\n",
    "from plotly.subplots import make_subplots\n",
    "\n",
    "# Machine Learning libraries\n",
    "from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, TimeSeriesSplit\n",
    "from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.pipeline import Pipeline\n",
    "\n",
    "# Regression models\n",
    "from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet\n",
    "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor\n",
    "from sklearn.tree import DecisionTreeRegressor\n",
    "from sklearn.svm import SVR\n",
    "from sklearn.neighbors import KNeighborsRegressor\n",
    "\n",
    "# Classification models\n",
    "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "\n",
    "# Clustering\n",
    "from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering\n",
    "from sklearn.manifold import TSNE\n",
    "from sklearn.decomposition import PCA\n",
    "\n",
    "# Model evaluation\n",
    "from sklearn.metrics import (mean_squared_error, mean_absolute_error, r2_score,\n",
    "                           classification_report, confusion_matrix, silhouette_score,\n",
    "                           accuracy_score, precision_score, recall_score, f1_score)\n",
    "\n",
    "# Time series\n",
    "from statsmodels.tsa.arima.model import ARIMA\n",
    "from statsmodels.tsa.seasonal import seasonal_decompose\n",
    "from statsmodels.tsa.stattools import adfuller\n",
    "\n",
    "# Feature selection and interpretation\n",
    "from sklearn.feature_selection import SelectKBest, f_regression, RFE\n",
    "import shap\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# Set styling\n",
    "plt.style.use('seaborn-v0_8')\n",
    "sns.set_palette(\"husl\")\n",
    "plt.rcParams['figure.figsize'] = (12, 8)\n",
    "\n",
    "print(\"🤖 Greenhouse Gas Analytics - Model Development\")\n",
    "print(\"=\"*55)\n",
    "\n",
    "# Set random seed for reproducibility\n",
    "RANDOM_STATE = 42\n",
    "np.random.seed(RANDOM_STATE)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "data_loading",
   "metadata": {},
   "source": [
    "## 1. 📊 Data Loading and Preparation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "load_data",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_model_data():\n",
    "    \"\"\"Load and prepare data for machine learning\"\"\"\n",
    "    try:\n",
    "        # Try to load processed data first\n",
    "        df = pd.read_parquet('../data/processed/cleaned_data.parquet')\n",
    "        print(\"✅ Processed data loaded successfully!\")\n",
    "        return df\n",
    "    except FileNotFoundError:\n",
    "        try:\n",
    "            df = pd.read_csv('../data/processed/cleaned_data.csv')\n",
    "            print(\"✅ CSV data loaded successfully!\")\n",
    "            return df\n",
    "        except FileNotFoundError:\n",
    "            print(\"⚠️ Creating enhanced sample data for ML modeling...\")\n",
    "            return create_ml_sample_data()\n",
    "\n",
    "def create_ml_sample_data():\n",
    "    \"\"\"Create rich sample data optimized for machine learning\"\"\"\n",
    "    np.random.seed(42)\n",
    "    \n",
    "    # Enhanced country data with socioeconomic indicators\n",
    "    countries_data = {\n",
    "        'China': {'region': 'Asia', 'pop': 1400, 'gdp': 17734, 'dev_level': 'Developing', \n",
    "                 'area': 9596, 'forest_cover': 23.0, 'urban_pop': 61.4, 'energy_intensity': 4.1},\n",
    "        'India': {'region': 'Asia', 'pop': 1380, 'gdp': 3737, 'dev_level': 'Developing',\n",
    "                 'area': 3287, 'forest_cover': 24.4, 'urban_pop': 34.9, 'energy_intensity': 2.9},\n",
    "        'United States': {'region': 'North America', 'pop': 330, 'gdp': 63544, 'dev_level': 'Developed',\n",
    "                         'area': 9834, 'forest_cover': 33.9, 'urban_pop': 82.7, 'energy_intensity': 1.8},\n",
    "        'Indonesia': {'region': 'Asia', 'pop': 270, 'gdp': 4256, 'dev_level': 'Developing',\n",
    "                     'area': 1905, 'forest_cover': 50.2, 'urban_pop': 56.0, 'energy_intensity': 3.2},\n",
    "        'Brazil': {'region': 'South America', 'pop': 215, 'gdp': 8897, 'dev_level': 'Developing',\n",
    "                  'area': 8515, 'forest_cover': 59.4, 'urban_pop': 87.1, 'energy_intensity': 2.1},\n",
    "        'Nigeria': {'region': 'Africa', 'pop': 220, 'gdp': 2229, 'dev_level': 'Developing',\n",
    "                   'area': 924, 'forest_cover': 6.5, 'urban_pop': 52.0, 'energy_intensity': 4.8},\n",
    "        'Russia': {'region': 'Europe', 'pop': 145, 'gdp': 11305, 'dev_level': 'Developed',\n",
    "                  'area': 17098, 'forest_cover': 49.8, 'urban_pop': 74.4, 'energy_intensity': 5.2},\n",
    "        'Mexico': {'region': 'North America', 'pop': 130, 'gdp': 9946, 'dev_level': 'Developing',\n",
    "                  'area': 1964, 'forest_cover': 33.6, 'urban_pop': 80.2, 'energy_intensity': 2.4},\n",
    "        'Iran': {'region': 'Asia', 'pop': 85, 'gdp': 5627, 'dev_level': 'Developing',\n",
    "                'area': 1648, 'forest_cover': 6.8, 'urban_pop': 75.4, 'energy_intensity': 6.1},\n",
    "        'Germany': {'region': 'Europe', 'pop': 83, 'gdp': 46259, 'dev_level': 'Developed',\n",
    "                   'area': 357, 'forest_cover': 32.7, 'urban_pop': 77.4, 'energy_intensity': 1.4},\n",
    "        'Turkey': {'region': 'Europe', 'pop': 85, 'gdp': 9127, 'dev_level': 'Developing',\n",
    "                  'area': 784, 'forest_cover': 15.2, 'urban_pop': 76.0, 'energy_intensity': 2.8},\n",
    "        'Canada': {'region': 'North America', 'pop': 38, 'gdp': 43242, 'dev_level': 'Developed',\n",
    "                  'area': 9985, 'forest_cover': 38.7, 'urban_pop': 81.6, 'energy_intensity': 2.3},\n",
    "        'Australia': {'region': 'Oceania', 'pop': 26, 'gdp': 54907, 'dev_level': 'Developed',\n",
    "                     'area': 7692, 'forest_cover': 17.4, 'urban_pop': 86.2, 'energy_intensity': 1.9},\n",
    "        'Argentina': {'region': 'South America', 'pop': 45, 'gdp': 8449, 'dev_level': 'Developing',\n",
    "                     'area': 2780, 'forest_cover': 9.9, 'urban_pop': 92.0, 'energy_intensity': 2.2},\n",
    "        'Saudi Arabia': {'region': 'Asia', 'pop': 35, 'gdp': 23139, 'dev_level': 'Developed',\n",
    "                        'area': 2150, 'forest_cover': 0.5, 'urban_pop': 84.3, 'energy_intensity': 4.5},\n",
    "    }\n",
    "    \n",
    "    emission_types = ['Agriculture', 'Energy', 'Waste', 'Other']\n",
    "    segments = ['Livestock', 'Oil & Gas', 'Landfills', 'Rice Cultivation', 'Coal Mining', 'Bioenergy', 'Gas pipelines']\n",
    "    \n",
    "    data = []\n",
    "    record_id = 1\n",
    "    \n",
    "    for country, info in countries_data.items():\n",
    "        # Create country-specific emission patterns\n",
    "        base_factor = (info['pop'] / 100) + (info['gdp'] / 10000)\n",
    "        \n",
    "        for year in range(2018, 2023):  # Extended time range\n",
    "            # Add temporal trend\n",
    "            year_factor = 1 + (year - 2020) * 0.02  # Small yearly change\n",
    "            \n",
    "            for emission_type in emission_types:\n",
    "                for segment in np.random.choice(segments, size=np.random.randint(3, 6), replace=False):\n",
    "                    \n",
    "                    # Sector-specific patterns\n",
    "                    sector_multipliers = {\n",
    "                        'Agriculture': (1.8 if info['dev_level'] == 'Developing' else 0.9) * (info['forest_cover'] / 50),\n",
    "                        'Energy': (1.5 if info['dev_level'] == 'Developed' else 1.1) * (info['energy_intensity'] / 3),\n",
    "                        'Waste': 0.8 * (info['urban_pop'] / 100),\n",
    "                        'Other': 0.6\n",
    "                    }\n",
    "                    \n",
    "                    # Calculate emission value with multiple factors\n",
    "                    emission_value = max(0, \n",
    "                        base_factor * \n",
    "                        sector_multipliers[emission_type] * \n",
    "                        year_factor * \n",
    "                        np.random.uniform(0.4, 2.0) + \n",
    "                        np.random.normal(0, 8)\n",
    "                    )\n",
    "                    \n",
    "                    # Create emission intensity categories\n",
    "                    if emission_value < 20:\n",
    "                        emission_category = 'Low'\n",
    "                    elif emission_value < 60:\n",
    "                        emission_category = 'Medium'\n",
    "                    elif emission_value < 100:\n",
    "                        emission_category = 'High'\n",
    "                    else:\n",
    "                        emission_category = 'Very High'\n",
    "                    \n",
    "                    data.append({\n",
    "                        'id': record_id,\n",
    "                        'country': country,\n",
    "                        'region': info['region'],\n",
    "                        'population': info['pop'],\n",
    "                        'gdp_per_capita': info['gdp'],\n",
    "                        'development_level': info['dev_level'],\n",
    "                        'area_km2': info['area'],\n",
    "                        'forest_cover_pct': info['forest_cover'],\n",
    "                        'urban_population_pct': info['urban_pop'],\n",
    "                        'energy_intensity': info['energy_intensity'],\n",
    "                        'type': emission_type,\n",
    "                        'segment': segment,\n",
    "                        'emissions': emission_value,\n",
    "                        'emission_category': emission_category,\n",
    "                        'year': year,\n",
    "                        'quarter': np.random.choice(['Q1', 'Q2', 'Q3', 'Q4']),\n",
    "                        'confidence_level': np.random.choice(['High', 'Medium', 'Low'], p=[0.7, 0.25, 0.05]),\n",
    "                        # Derived features for ML\n",
    "                        'emissions_per_capita': emission_value / (info['pop'] / 1000) if info['pop'] > 0 else 0,\n",
    "                        'emissions_per_gdp': emission_value / (info['gdp'] / 1000) if info['gdp'] > 0 else 0,\n",
    "                        'population_density': info['pop'] / info['area'] if info['area'] > 0 else 0\n",
    "                    })\n",
    "                    \n",
    "                    record_id += 1\n",
    "    \n",
    "    return pd.DataFrame(data)\n",
    "\n",
    "# Load the data\n",
    "df = load_model_data()\n",
    "\n",
    "print(f\"\\n📊 DATASET FOR MACHINE LEARNING:\")\n",
    "print(\"=\"*40)\n",
    "print(f\"Shape: {df.shape}\")\n",
    "print(f\"Columns: {len(df.columns)}\")\n",
    "print(f\"Time range: {df['year'].min()}-{df['year'].max()}\")\n",
    "print(f\"Countries: {df['country'].nunique()}\")\n",
    "print(f\"Total emissions: {df['emissions'].sum():,.0f} Mt CO₂e\")\n",
    "\n",
    "# Check data quality\n",
    "print(f\"\\nData Quality Check:\")\n",
    "print(f\"• Missing values: {df.isnull().sum().sum()}\")\n",
    "print(f\"• Duplicate records: {df.duplicated().sum()}\")\n",
    "print(f\"• Negative emissions: {(df['emissions'] < 0).sum()}\")\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "feature_engineering",
   "metadata": {},
   "source": [
    "## 2. 🔧 Feature Engineering and Selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "feature_eng",
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_advanced_features(df):\n",
    "    \"\"\"Create advanced features for machine learning models\"\"\"\n",
    "    print(\"🔧 Creating advanced features...\")\n",
    "    \n",
    "    df_enhanced = df.copy()\n",
    "    \n",
    "    # 1. Time-based features\n",
    "    df_enhanced['year_normalized'] = (df_enhanced['year'] - df_enhanced['year'].min()) / (df_enhanced['year'].max() - df_enhanced['year'].min())\n",
    "    df_enhanced['is_recent'] = (df_enhanced['year'] >= df_enhanced['year'].max() - 1).astype(int)\n",
    "    \n",
    "    # 2. Ratio features\n",
    "    df_enhanced['gdp_per_area'] = df_enhanced['gdp_per_capita'] * df_enhanced['population'] / df_enhanced['area_km2']\n",
    "    df_enhanced['forest_per_capita'] = df_enhanced['forest_cover_pct'] * df_enhanced['area_km2'] / df_enhanced['population']\n",
    "    df_enhanced['urban_density_proxy'] = df_enhanced['urban_population_pct'] * df_enhanced['population_density']\n",
    "    \n",
    "    # 3. Lagged features (for time series)\n",
    "    df_enhanced = df_enhanced.sort_values(['country', 'type', 'segment', 'year'])\n",
    "    df_enhanced['emissions_lag1'] = df_enhanced.groupby(['country', 'type', 'segment'])['emissions'].shift(1)\n",
    "    df_enhanced['emissions_lag2'] = df_enhanced.groupby(['country', 'type', 'segment'])['emissions'].shift(2)\n",
    "    \n",
    "    # 4. Rolling statistics\n",
    "    df_enhanced['emissions_ma3'] = df_enhanced.groupby(['country', 'type', 'segment'])['emissions'].transform(lambda x: x.rolling(3, min_periods=1).mean())\n",
    "    df_enhanced['emissions_std3'] = df_enhanced.groupby(['country', 'type', 'segment'])['emissions'].transform(lambda x: x.rolling(3, min_periods=1).std())\n",
    "    \n",
    "    # 5. Country-level aggregated features\n",
    "    country_stats = df_enhanced.groupby('country').agg({\n",
    "        'emissions': ['mean', 'std', 'sum'],\n",
    "        'type': 'nunique',\n",
    "        'segment': 'nunique'\n",
    "    }).round(3)\n",
    "    \n",
    "    country_stats.columns = ['country_avg_emissions', 'country_std_emissions', 'country_total_emissions',\n",
    "                           'country_emission_types', 'country_segments']\n",
    "    \n",
    "    df_enhanced = df_enhanced.merge(country_stats, left_on='country', right_index=True, how='left')\n",
    "    \n",
    "    # 6. Sector-level features\n",
    "    sector_stats = df_enhanced.groupby('type').agg({\n",
    "        'emissions': ['mean', 'std']\n",
    "    }).round(3)\n",
    "    \n",
    "    sector_stats.columns = ['sector_avg_emissions', 'sector_std_emissions']\n",
    "    df_enhanced = df_enhanced.merge(sector_stats, left_on='type', right_index=True, how='left')\n",
    "    \n",
    "    # 7. Development level encoding with target encoding\n",
    "    dev_level_emissions = df_enhanced.groupby('development_level')['emissions'].mean().to_dict()\n",
    "    df_enhanced['dev_level_target_encoded'] = df_enhanced['development_level'].map(dev_level_emissions)\n",
    "    \n",
    "    # 8. Interaction features\n",
    "    df_enhanced['pop_gdp_interaction'] = df_enhanced['population'] * df_enhanced['gdp_per_capita'] / 1000\n",
    "    df_enhanced['energy_forest_interaction'] = df_enhanced['energy_intensity'] * df_enhanced['forest_cover_pct']\n",
    "    \n",
    "    # 9. Polynomial features for key variables\n",
    "    df_enhanced['population_squared'] = df_enhanced['population'] ** 2\n",
    "    df_enhanced['gdp_squared'] = df_enhanced['gdp_per_capita'] ** 2\n",
    "    \n",
    "    # 10. Binary flags\n",
    "    df_enhanced['is_developed'] = (df_enhanced['development_level'] == 'Developed').astype(int)\n",
    "    df_enhanced['is_high_forest'] = (df_enhanced['forest_cover_pct'] > df_enhanced['forest_cover_pct'].median()).astype(int)\n",
    "    df_enhanced['is_energy_intensive'] = (df_enhanced['energy_intensity'] > df_enhanced['energy_intensity'].median()).astype(int)\n",
    "    \n",
    "    # Fill NaN values (from lagged features and single-observation rolling std) with column medians\n",
    "    numeric_columns = df_enhanced.select_dtypes(include=[np.number]).columns\n",
    "    df_enhanced[numeric_columns] = df_enhanced[numeric_columns].fillna(df_enhanced[numeric_columns].median())\n",
    "    \n",
    "    print(f\"✅ Enhanced dataset shape: {df_enhanced.shape}\")\n",
    "    print(f\"📊 New features created: {df_enhanced.shape[1] - df.shape[1]}\")\n",
    "    \n",
    "    return df_enhanced\n",
    "\n",
    "def select_features(X, y, method='all', k=15):\n",
    "    \"\"\"Feature selection using multiple methods\"\"\"\n",
    "    print(f\"\\n🎯 FEATURE SELECTION:\")\n",
    "    print(\"=\"*25)\n",
    "    \n",
    "    feature_importance_results = {}\n",
    "    \n",
    "    # 1. Statistical feature selection\n",
    "    if method in ['statistical', 'all']:\n",
    "        selector_f = SelectKBest(score_func=f_regression, k=k)\n",
    "        X_selected_f = selector_f.fit_transform(X, y)\n",
    "        selected_features_f = X.columns[selector_f.get_support()].tolist()\n",
    "        feature_scores_f = dict(zip(selected_features_f, selector_f.scores_[selector_f.get_support()]))\n",
    "        feature_importance_results['F-test'] = feature_scores_f\n",
    "        print(f\"🔍 F-test selected features: {len(selected_features_f)}\")\n",
    "    \n",
    "    # 2. Recursive Feature Elimination\n",
    "    if method in ['rfe', 'all']:\n",
    "        rf_estimator = RandomForestRegressor(n_estimators=50, random_state=RANDOM_STATE)\n",
    "        rfe_selector = RFE(estimator=rf_estimator, n_features_to_select=k, step=1)\n",
    "        X_selected_rfe = rfe_selector.fit_transform(X, y)\n",
    "        selected_features_rfe = X.columns[rfe_selector.get_support()].tolist()\n",
    "        feature_importance_results['RFE'] = dict(zip(selected_features_rfe, rfe_selector.ranking_[rfe_selector.get_support()]))\n",
    "        print(f\"🔄 RFE selected features: {len(selected_features_rfe)}\")\n",
    "    \n",
    "    # 3. Random Forest feature importance\n",
    "    if method in ['rf', 'all']:\n",
    "        rf_importance = RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE)\n",
    "        rf_importance.fit(X, y)\n",
    "        feature_importance_rf = dict(zip(X.columns, rf_importance.feature_importances_))\n",
    "        top_features_rf = sorted(feature_importance_rf.items(), key=lambda x: x[1], reverse=True)[:k]\n",
    "        selected_features_rf = [feat[0] for feat in top_features_rf]\n",
    "        feature_importance_results['Random Forest'] = dict(top_features_rf)\n",
    "        print(f\"🌳 Random Forest selected features: {len(selected_features_rf)}\")\n",
    "    \n",
    "    # Combine results and create consensus\n",
    "    if method == 'all':\n",
    "        all_selected = set()\n",
    "        for method_name, features in feature_importance_results.items():\n",
    "            all_selected.update(features.keys())\n",
    "        \n",
    "        # Create consensus ranking\n",
    "        consensus_scores = {}\n",
    "        for feature in all_selected:\n",
    "            score = 0\n",
    "            count = 0\n",
    "            for method_name, features in feature_importance_results.items():\n",
    "                if feature in features:\n",
    "                    # Normalize scores for different methods\n",
    "                    if method_name == 'F-test':\n",
    "                        score += features[feature] / max(features.values())\n",
    "                    elif method_name == 'RFE':\n",
    "                        score += (k + 1 - features[feature]) / k  # Lower rank is better\n",
    "                    else:  # Random Forest\n",
    "                        score += features[feature]\n",
    "                    count += 1\n",
    "            consensus_scores[feature] = score / count if count > 0 else 0\n",
    "        \n",
    "        # Select top k features from consensus\n",
    "        final_features = sorted(consensus_scores.items(), key=lambda x: x[1], reverse=True)[:k]\n",
    "        selected_features_final = [feat[0] for feat in final_features]\n",
    "        \n",
    "        print(f\"\\n🎯 Consensus top {k} features:\")\n",
    "        for i, (feature, score) in enumerate(final_features, 1):\n",
    "            print(f\"  {i:2d}. {feature}: {score:.4f}\")\n",
    "        \n",
    "        return selected_features_final, feature_importance_results\n",
    "    \n",
    "    return list(feature_importance_results.values())[0].keys(), feature_importance_results\n",
    "\n",
    "# Apply feature engineering\n",
    "df_enhanced = create_advanced_features(df)\n",
    "\n",
    "# Prepare features for selection\n",
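    "# Select only numeric features for initial modeling\n",
    "# NOTE(review): columns computed from the target (emissions_ma3, emissions_std3, country_*/sector_* aggregates) stay in the feature set and may leak 'emissions' - confirm this is intended\n",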
    "numeric_features = df_enhanced.select_dtypes(include=[np.number]).columns\n",
    "feature_columns = [col for col in numeric_features if col not in ['emissions', 'id', 'year']]\n",
    "\n",
    "X_all = df_enhanced[feature_columns].copy()\n",
    "y = df_enhanced['emissions'].copy()\n",
    "\n",
    "print(f\"\\n📋 Features available for selection: {len(X_all.columns)}\")\n",
    "print(f\"📊 Target variable statistics:\")\n",
    "print(f\"  • Mean: {y.mean():.2f}\")\n",
    "print(f\"  • Std: {y.std():.2f}\")\n",
    "print(f\"  • Min: {y.min():.2f}\")\n