In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import scipy.stats as stats
import warnings
warnings.filterwarnings('ignore')
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
import matplotlib.pyplot as plt
import traceback



In [None]:
# NOTE(review): absolute, machine-specific path. The notebook only runs where this
# exact folder exists; consider a path relative to the project instead.
file_path = "C:/Users/Admin/Desktop/PROJECT WORK/DROUGHT-FORECASTING-IN-KENYA-USING-MACHINE-LEARNING/MLFile3.xlsx"

# sheet_name=None parses the workbook once and returns {sheet name: DataFrame}
# for every sheet. This avoids leaving an unclosed pd.ExcelFile handle behind and
# re-opening the file once per sheet.
sheet_dict = pd.read_excel(file_path, sheet_name=None)
sheet_names = list(sheet_dict)

# Summarise what was loaded instead of dumping every DataFrame in full.
print(f"Loaded {len(sheet_dict)} sheets from {file_path}")
for sheet, sheet_df in sheet_dict.items():
    print(f"  {sheet}: {sheet_df.shape[0]} rows x {sheet_df.shape[1]} columns")



In [None]:
#CORRELATION AND COMPOSITE INDICES
# For every station sheet: drop incomplete rows, correlate each candidate predictor
# with the station's rainfall series (Pearson), and keep only the predictors whose
# p-value does not exceed SIGNIFICANCE_LEVEL.
#
# Outputs:
#   filtered_sheet_dict[station] -> DataFrame with YEAR, rainfall and retained predictors
#   correlation_dict[station]    -> {predictor: {'correlation': r, 'p_value': p}}
#
# NOTE(review): the correlations are computed on all available years, including the
# years later held out for testing - confirm this is acceptable for the study design.

SIGNIFICANCE_LEVEL = 0.05  # a predictor is retained when its p-value is <= this

filtered_sheet_dict = {}
correlation_dict = {}

for station, df in sheet_dict.items():
    print(f"\nProcessing station: {station}")
    print(f"Original columns: {df.columns.tolist()}")

    # dropna() returns a new frame, so the sheet stored in sheet_dict is left untouched.
    df_copy = df.dropna()

    if df_copy.empty:
        print(f"No data left for {station} after dropping missing values. Skipping...")
        continue

    year_col = "YEAR"
    rainfall_col = station  # the rainfall column is looked up under the sheet's own name

    # Fail early with one clear message: a missing YEAR column would otherwise raise
    # KeyError when the filtered frame is built, and a missing rainfall column would
    # produce one error line per predictor.
    missing_cols = [col for col in (year_col, rainfall_col) if col not in df_copy.columns]
    if missing_cols:
        print(f"Required column(s) {missing_cols} not found for {station}. Skipping...")
        continue

    print(f"Year column: {year_col}")
    print(f"Rainfall column: {rainfall_col}")

    sst_cols = [col for col in df_copy.columns if col not in [year_col, rainfall_col]]
    print(f"SST columns: {sst_cols}")

    print(f"\nCorrelation results for {station}:")

    retained_cols = []
    station_correlations = {}  # correlation statistics of every predictor that could be tested

    for col in sst_cols:
        try:
            corr, p_value = stats.pearsonr(df_copy[col], df_copy[rainfall_col])
            print(f"{col}: r={corr:.2f}, p={p_value:.3f}")
            station_correlations[col] = {'correlation': corr, 'p_value': p_value}

            # A NaN p-value (e.g. constant column) compares False and is therefore dropped.
            if p_value <= SIGNIFICANCE_LEVEL:
                retained_cols.append(col)
            else:
                print(f"Dropped {col} from {station} (p={p_value:.3f})")
        except Exception as e:
            # Best effort: a predictor that cannot be correlated is reported and skipped.
            print(f"Error processing column {col}: {str(e)}")

    correlation_dict[station] = station_correlations

    print(f"Retained {len(retained_cols)} predictors out of {len(sst_cols)} original variables")

    if not retained_cols:
        print(f"No significant predictors found for {station}. Skipping this station...")
        continue

    # Keep only YEAR, the rainfall column, and significant SST predictors
    filtered_df = df_copy[[year_col, rainfall_col] + retained_cols]
    filtered_sheet_dict[station] = filtered_df

    print(f"Final data for {station}:")
    display(filtered_df.head())
    print(f"Shape: {filtered_df.shape} (rows, columns)")
    print(f"Columns: {filtered_df.columns.tolist()}")
    print("-" * 60)

# Cell 4: Create Weighted SST Composite Index
# This code creates weights based on correlation values and calculates a composite SST index

import pandas as pd
import numpy as np

# Per-station frames: YEAR, rainfall, the weighted composite, then the raw predictors
composite_df_dict = {}

for station, filtered_df in filtered_sheet_dict.items():
    print(f"\nProcessing station: {station}")

    # Correlation statistics recorded earlier for this station's predictors
    station_corr = correlation_dict[station]

    # Every column other than the year and the station's own rainfall is a predictor
    year_col = "YEAR"
    rainfall_col = station
    sst_cols = [col for col in filtered_df.columns if col not in [year_col, rainfall_col]]

    if len(sst_cols) == 0:
        print(f"No significant SST predictors for {station}. Skipping composite index creation.")
        continue

    print(f"Creating weights for significant SST columns: {sst_cols}")

    # |r| per predictor and their total, which serves as the normalising constant
    abs_corrs = {col: abs(station_corr[col]['correlation']) for col in sst_cols}
    sum_abs_corr = sum(abs_corrs.values())

    # A predictor's weight is its share of the total absolute correlation
    weights = {col: abs_corr / sum_abs_corr for col, abs_corr in abs_corrs.items()}

    # Report each weight next to the signed correlation it was derived from
    print(f"Weights for {station}:")
    for col in weights:
        corr = station_corr[col]['correlation']
        weight = weights[col]
        print(f"  {col}: correlation = {corr:.4f}, weight = {weight:.4f}")

    # Start from YEAR and rainfall only; the composite becomes the third column
    df_composite = filtered_df[[year_col, rainfall_col]].copy()

    # Accumulate the weighted predictors one by one, in column order
    composite = 0
    for col in sst_cols:
        composite = composite + filtered_df[col] * weights[col]
    df_composite['SST_COMPOSITE'] = composite

    # Append the individual predictors after the composite for reference
    df_composite = pd.concat([df_composite, filtered_df[sst_cols]], axis=1)

    print(f"Composite index created for {station}")
    print("First few rows of the dataframe with composite index:")
    print(df_composite.head())

    composite_df_dict[station] = df_composite

    # How strongly does the composite itself track rainfall?
    corr, p_value = stats.pearsonr(df_composite['SST_COMPOSITE'], df_composite[rainfall_col])
    print(f"Correlation between composite SST index and {station} rainfall: r={corr:.4f}, p={p_value:.4f}")

    print("-" * 60)

print("\nSummary of composite indices:")
for station in composite_df_dict:
    df = composite_df_dict[station]
    n_years, n_columns = df.shape
    print(f"{station}: {n_years} years, {n_columns-1} variables (including composite index)")

# The composite_df_dict now contains dataframes for each station with:
# - YEAR
# - Station rainfall
# - SST_COMPOSITE (weighted index)
# - Individual significant SST predictors

In [None]:
#MODEL
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt

# Years flagged as El Niño / La Niña events when the ENSO indicator features are built
el_nino_years = [1961, 1997, 2013, 2019, 2023]
la_nina_years = [1988, 1998, 2007, 2010, 2016, 2020]

# One record per (station, model type) pair; stations come from the composite data
model_configs = [
    dict(Station=name, Model=kind)
    for name in composite_df_dict
    for kind in ("Random Forest", "Gradient Boosting", "SVR", "Neural Network")
]
model_df = pd.DataFrame(model_configs)

# Filled by the training loop: results_dict[station][model name] -> metrics and predictions
results_dict = {}

# Function to add additional features
def engineer_features(df):
    """Return a YEAR-sorted copy of `df` with lagged 3-year rainfall statistics.

    The rainfall series is taken to be the second column of the (re-indexed) frame.
    When `df` has more than three rows, two columns are added:

    * ``rolling_mean_3yr`` - mean of up to three consecutive rows, shifted down one row
    * ``rolling_std_3yr``  - standard deviation over the same window, shifted down one row

    Gaps left by the shift (and by single-value windows) are filled with the mean /
    standard deviation of the whole rainfall column. Shorter frames are returned
    sorted but otherwise unchanged. The input frame is never modified.
    """
    frame = df.copy()

    # YEAR may arrive as the index; the sort below needs it as a regular column
    if 'YEAR' in frame.index.names:
        frame = frame.reset_index()
    frame = frame.sort_values('YEAR')

    rainfall_col = frame.columns[1]  # Station name

    if len(df) <= 3:
        return frame

    rainfall = frame[rainfall_col]
    window = rainfall.rolling(window=3, min_periods=1)

    # shift(1) makes each row see only the rows that precede it
    lagged_mean = window.mean().shift(1)
    lagged_std = window.std().shift(1)

    frame['rolling_mean_3yr'] = lagged_mean.fillna(rainfall.mean())
    frame['rolling_std_3yr'] = lagged_std.fillna(rainfall.std())
    return frame

# Upper bound on the number of predictors handed to each model
MAX_FEATURES = 6
# Features that must survive selection whenever they exist in the data
REQUIRED_FEATURES = ['SST_COMPOSITE', 'el_nino_weight', 'la_nina_weight']


def _build_pipeline(model_name):
    """Return an unfitted scaler + regressor Pipeline for `model_name`, or None if the name is unknown."""
    if model_name == "Random Forest":
        return Pipeline([
            ('scaler', RobustScaler()),
            ('model', RandomForestRegressor(
                n_estimators=100,
                max_depth=5,
                min_samples_leaf=3,
                min_samples_split=5,
                random_state=42
            ))
        ])
    if model_name == "Gradient Boosting":
        return Pipeline([
            ('scaler', StandardScaler()),
            ('model', GradientBoostingRegressor(
                loss='huber',
                n_estimators=100,
                learning_rate=0.01,
                max_depth=3,
                min_samples_leaf=3,
                subsample=0.8,
                random_state=42
            ))
        ])
    if model_name == "SVR":
        return Pipeline([
            ('scaler', StandardScaler()),
            ('model', SVR(
                kernel='rbf',
                C=1.0,
                epsilon=0.1,
                gamma='scale'
            ))
        ])
    if model_name == "Neural Network":
        return Pipeline([
            ('scaler', StandardScaler()),
            ('model', MLPRegressor(
                hidden_layer_sizes=(50, 25),
                activation='relu',
                solver='adam',
                alpha=0.01,
                max_iter=2000,
                early_stopping=True,
                validation_fraction=0.1,
                random_state=42
            ))
        ])
    return None


# Train every configured (station, model) pair on a chronological
# train / validation / test split and record the test-set results in results_dict.
for _, row in model_df.iterrows():
    station = row["Station"]
    model_name = row["Model"]

    if station not in composite_df_dict:
        print(f"Station {station} not found in composite data. Skipping...")
        continue

    df = composite_df_dict[station].copy()
    print(f"\nWorking with {station} data from {df['YEAR'].min()} to {df['YEAR'].max()}")
    print(f"Total data points: {len(df)}")

    # ENSO indicators: 0/1 event flags, a 0/1/2 strength code for selected recent
    # events, and a combined weight = flag * (1 + strength).
    df['is_el_nino'] = df['YEAR'].apply(lambda x: 1 if x in el_nino_years else 0)
    df['is_la_nina'] = df['YEAR'].apply(lambda x: 1 if x in la_nina_years else 0)
    df['recent_strong_el_nino'] = df['YEAR'].apply(lambda x: 2 if x in [2019, 2023] else (1 if x == 2013 else 0))
    df['recent_strong_la_nina'] = df['YEAR'].apply(lambda x: 2 if x in [2016, 2020] else (1 if x == 2010 else 0))
    df['el_nino_weight'] = df['is_el_nino'] * (1 + df['recent_strong_el_nino'] * 1.0)
    df['la_nina_weight'] = df['is_la_nina'] * (1 + df['recent_strong_la_nina'] * 1.0)

    # Add engineered features
    df = engineer_features(df)
    df = df.sort_values('YEAR')

    # Define features and target
    target_col = station  # Station name is the target (rainfall)
    years = df['YEAR'].values

    # Use all columns except YEAR and the target (rainfall) column
    feature_columns = [col for col in df.columns if col != 'YEAR' and col != target_col]
    print(f"Features used: {', '.join(feature_columns)}")

    X = df[feature_columns]
    y = df[target_col]

    if X.shape[1] == 0:
        print(f"No features available for {station}. Skipping model creation.")
        continue

    if len(X) < 5:
        print(f"Not enough data points for {station}. Skipping model creation.")
        continue

    # Chronological split boundaries (rows are sorted by YEAR); the test set is
    # whatever remains after the training and validation slices.
    train_size = 0.7
    val_size = 0.15
    test_size = 0.15
    train_end = int(len(X) * train_size)
    val_end = train_end + int(len(X) * val_size)

    # With very few rows the validation (or test) slice can be empty, which would
    # make the min()/max() reports and the metrics below fail.
    if val_end == train_end or val_end >= len(X):
        print(f"Not enough data points for {station} to form validation and test sets. Skipping model creation.")
        continue

    # Rank features with a Random Forest fitted on the TRAINING rows only, so the
    # validation and test years cannot influence which predictors are chosen.
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X.iloc[:train_end], y.iloc[:train_end])
    importances = pd.Series(rf.feature_importances_, index=X.columns)
    ranked_features = importances.sort_values(ascending=False).index.tolist()

    # Always keep the composite index and both ENSO weights; fill the remaining
    # slots with the highest-ranked other features. Forcing one required feature
    # in can therefore never evict another required feature.
    required = [name for name in REQUIRED_FEATURES if name in X.columns]
    optional = [name for name in ranked_features if name not in required]
    keep = set(required + optional[:max(MAX_FEATURES - len(required), 0)])
    top_features = [name for name in ranked_features if name in keep]

    print(f"Top features for {station}: {top_features}")
    X = X[top_features]

    print(f"Training samples: {train_end}")
    print(f"Validation samples: {val_end - train_end}")
    print(f"Testing samples: {len(X) - val_end}")

    X_train = X.iloc[:train_end]
    y_train = y.iloc[:train_end]
    years_train = years[:train_end]
    X_val = X.iloc[train_end:val_end]
    y_val = y.iloc[train_end:val_end]
    years_val = years[train_end:val_end]
    X_test = X.iloc[val_end:]
    y_test = y.iloc[val_end:]
    years_test = years[val_end:]

    print(f"Training years: {min(years_train)} to {max(years_train)}")
    print(f"Validation years: {min(years_val)} to {max(years_val)}")
    print(f"Testing years: {min(years_test)} to {max(years_test)}")

    # Configure the model based on the model type
    model = _build_pipeline(model_name)
    if model is None:
        print(f"Unknown model type: {model_name}. Skipping...")
        continue

    try:
        # Cross-validation on training data; never ask for more folds than samples
        kf = KFold(n_splits=min(5, len(X_train)), shuffle=True, random_state=42)
        cv_r2_scores = []
        for train_idx, val_idx in kf.split(X_train):
            X_cv_train, X_cv_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
            y_cv_train, y_cv_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
            model.fit(X_cv_train, y_cv_train)
            y_cv_pred = model.predict(X_cv_val)
            cv_r2 = r2_score(y_cv_val, y_cv_pred)
            cv_r2_scores.append(cv_r2)
        print(f"Cross-validation R² scores for {station} - {model_name}: {cv_r2_scores}")
        print(f"Average CV R²: {np.mean(cv_r2_scores):.3f}")

        # Train on full training set
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)
        y_test_pred = model.predict(X_test)

        # Metrics for all sets
        train_r2 = r2_score(y_train, y_train_pred)
        train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
        val_r2 = r2_score(y_val, y_val_pred)
        val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
        test_r2 = r2_score(y_test, y_test_pred)
        test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
        test_mae = mean_absolute_error(y_test, y_test_pred)

        # Calculate special metrics for drought events (values below zero)
        below_zero_indices = np.where(y_test < 0)[0]
        if len(below_zero_indices) > 0:
            below_zero_rmse = np.sqrt(mean_squared_error(y_test.iloc[below_zero_indices], y_test_pred[below_zero_indices]))
            print(f"  RMSE for values below zero: {below_zero_rmse:.3f}")
            print(f"  Number of below-zero actual values in test set: {len(below_zero_indices)}")
        else:
            below_zero_rmse = None
            print("  No below-zero values in test set to evaluate")

        # Store results; 'features' records the predictors this station's models
        # were trained on so the summary can report them per station.
        results_dict.setdefault(station, {})[model_name] = {
            'rmse': test_rmse,
            'r2': test_r2,
            'mae': test_mae,
            'below_zero_rmse': below_zero_rmse,
            'predictions': y_test_pred,
            'actuals': y_test.values,
            'years': years_test,
            'features': list(top_features)
        }

        print(f"\nResults for {station} using {model_name}:")
        print(f"  Train R² = {train_r2:.3f}, RMSE = {train_rmse:.3f}")
        print(f"  Validation R² = {val_r2:.3f}, RMSE = {val_rmse:.3f}")
        print(f"  Test R² = {test_r2:.3f}, RMSE = {test_rmse:.3f}, MAE = {test_mae:.3f}")

        # Generate plots: observed vs predicted on top, per-year error bars below
        plt.figure(figsize=(14, 8))
        plt.subplot(2, 1, 1)
        plt.plot(years_test, y_test.values, label="Observed", marker='o', color='blue')
        plt.plot(years_test, y_test_pred, label="Predicted", linestyle='--', marker='x', color='red')
        plt.axhline(y=0, color='black', linestyle='-', alpha=0.3)
        plt.title(f"{station} - {model_name}: Observed vs Predicted (Test Set)")
        plt.xlabel("Year")
        plt.ylabel(f"{target_col}")
        plt.legend()
        plt.grid(True)
        plt.xticks(years_test)
        if len(years_test) > 6:
            plt.xticks(rotation=45)

        plt.subplot(2, 1, 2)
        errors = y_test.values - y_test_pred
        plt.bar(years_test, errors, color='purple', alpha=0.6)
        plt.axhline(y=0, color='black', linestyle='-', alpha=0.3)
        plt.title("Prediction Errors (Actual - Predicted)")
        plt.xlabel("Year")
        plt.ylabel("Error")
        plt.grid(True, axis='y')
        plt.xticks(years_test)
        if len(years_test) > 6:
            plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

        # Scatter plot of actual vs predicted values with the 1:1 reference line
        plt.figure(figsize=(10, 6))
        plt.scatter(y_test.values, y_test_pred, alpha=0.6)
        min_val = min(min(y_test.values), min(y_test_pred))
        max_val = max(max(y_test.values), max(y_test_pred))
        plt.plot([min_val, max_val], [min_val, max_val], 'k--')
        plt.axvline(x=0, color='red', linestyle='--', alpha=0.3)
        plt.axhline(y=0, color='red', linestyle='--', alpha=0.3)
        plt.title(f"{station} - {model_name}: Actual vs Predicted Values")
        plt.xlabel("Actual Values")
        plt.ylabel("Predicted Values")
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    except Exception as e:
        # Best effort: one failing model must not stop the remaining configurations.
        print(f"Error training {model_name} for {station}: {str(e)}")

# Create summary table of results: one row per station, for the model with the
# highest test-set R².
summary_rows = []
for station, station_models in results_dict.items():
    if not station_models:
        continue

    best_model_name = max(station_models.keys(), key=lambda k: station_models[k]['r2'])
    best_model = station_models[best_model_name]

    # The key is always stored (possibly as None), so map None to 'N/A' explicitly.
    below_zero_rmse = best_model.get('below_zero_rmse')

    summary_rows.append({
        'Station': station,
        'Best Model': best_model_name,
        'R2': best_model['r2'],
        'RMSE': best_model['rmse'],
        'MAE': best_model['mae'],
        'Below Zero RMSE': 'N/A' if below_zero_rmse is None else below_zero_rmse,
        # Features recorded for THIS station, not whatever the last loop pass left behind
        'Features Used': ', '.join(best_model['features'])
    })

summary_df = pd.DataFrame(summary_rows)
print("\n" + "="*80)
print("Summary of Best Models for Each Station")
print("="*80)
print(summary_df.to_string(index=False))
print("="*80)

# Add a feature importance plot for the best models
# NOTE(review): the importances come from a fresh Random Forest fitted on every
# predictor column of composite_df_dict[station]; the pipelines trained earlier are
# not reused here.
print("\nFeature Importance for Best Models:")
for station, station_models in results_dict.items():
    if not station_models:
        continue

    best_model_name = max(station_models.keys(), key=lambda k: station_models[k]['r2'])

    # Stations won by SVR or the neural network get a notice instead of a chart
    if best_model_name in ("SVR", "Neural Network"):
        print(f"  {station} - {best_model_name}: Feature importance not available for this model type")
        continue

    # The (station, model) pair has to be one of the configured combinations
    model_dict = model_df[(model_df["Station"] == station) & (model_df["Model"] == best_model_name)]
    if model_dict.empty:
        continue

    try:
        # Predictors are every column except the year and the station's rainfall
        station_df = composite_df_dict[station]
        feature_columns = [col for col in station_df.columns if col not in ('YEAR', station)]

        X = station_df[feature_columns]
        y = station_df[station]
        rf = RandomForestRegressor(n_estimators=100, random_state=42)
        rf.fit(X, y)

        importances = pd.Series(rf.feature_importances_, index=feature_columns)

        # Horizontal bars, least important at the bottom
        plt.figure(figsize=(10, 6))
        importances.sort_values().plot(kind='barh')
        plt.title(f'Feature Importance for {station} using Random Forest')
        plt.xlabel('Importance')
        plt.tight_layout()
        plt.show()

        # Same numbers as text, most important first
        print(f"  {station} - Feature Importance:")
        ranked = importances.sort_values(ascending=False)
        for feature, importance in ranked.items():
            print(f"    {feature}: {importance:.4f}")
    except Exception as e:
        print(f"Error extracting feature importance for {station}: {str(e)}")

print("\nDrought Forecasting Model Training Complete!")



In [None]:
#MODEL SUMMARY
# For every station in results_dict pick the model with the highest test R²,
# collect one summary row, stash everything needed for forecasting in
# forecasting_dfs, and plot observed vs predicted values for the test years.

summary_columns = ['Station', 'Best Model', 'R2', 'RMSE', 'MAE', 'Features Used']
# Rows are collected in a list and turned into a DataFrame once, after the loop;
# growing a DataFrame with pd.concat on every pass is quadratic and degrades dtypes.
summary_records = []

forecasting_dfs = {}

print("\n" + "="*70)
print("BEST MODEL SELECTION SUMMARY")
print("="*70)

for station, models_dict in results_dict.items():
    print(f"\nAnalyzing models for station: {station}")

    best_model_name = None
    best_r2 = float('-inf')
    best_metrics = {}

    # Find the model with the highest R² value recorded in results_dict
    for model_name, metrics in models_dict.items():
        r2 = metrics['r2']
        print(f"  {model_name}: R² = {r2:.4f}, RMSE = {metrics['rmse']:.4f}")

        if r2 > best_r2:
            best_r2 = r2
            best_model_name = model_name
            best_metrics = metrics

    if best_model_name:
        try:
            print(f"\n✅ Best model for {station}: {best_model_name}")
            print(f"  R² = {best_r2:.4f}")
            print(f"  RMSE = {best_metrics['rmse']:.4f}")
            print(f"  MAE = {best_metrics['mae']:.4f}")

            # Get the correlation-filtered dataframe for this station
            station_df = filtered_sheet_dict[station].copy()

            # These are the significant predictors kept by the correlation filter.
            # NOTE(review): they are not necessarily the exact feature set the best
            # model was trained on - confirm which list forecasting should use.
            features_used = [col for col in station_df.columns if col not in ['YEAR', station]]
            features_str = ', '.join(features_used)

            # Add El Niño indicator for future forecasting
            # NOTE(review): this rebinds the notebook-level el_nino_years with a
            # different list than the one defined for model training (1961 is
            # absent here) - confirm that is intended.
            el_nino_years = [1997, 2013, 2019, 2023]
            station_df['is_el_nino'] = station_df['YEAR'].apply(lambda x: 1 if int(x) in el_nino_years else 0)

            summary_records.append({
                'Station': station,
                'Best Model': best_model_name,
                'R2': best_r2,
                'RMSE': best_metrics['rmse'],
                'MAE': best_metrics['mae'],
                'Features Used': features_str
            })

            # Store the dataframe together with the best model's test predictions
            forecasting_dfs[station] = {
                'df': station_df,
                'model_name': best_model_name,
                'features': features_used,
                'target': station,
                'years': best_metrics['years'],
                'actuals': best_metrics['actuals'],
                'predictions': best_metrics['predictions'],
                'r2': best_r2,
                'rmse': best_metrics['rmse'],
                'mae': best_metrics['mae']
            }

            years_test = best_metrics['years']
            y_test = best_metrics['actuals']
            y_pred = best_metrics['predictions']
            target_col = station

            print("Years in test set:", years_test)

            # When every test row carries the same year, plot against the row
            # position instead so the points do not collapse onto one x value.
            single_year = len(set(years_test)) == 1
            if single_year:
                print("WARNING: All test data is from the same year!")

            plt.figure(figsize=(12, 5))

            if single_year:
                x_vals = np.arange(len(y_test))
                plt.plot(x_vals, y_test, label="Observed", marker='o')
                plt.plot(x_vals, y_pred, label="Predicted", linestyle='--', marker='x')
                plt.xticks(x_vals, [years_test[0]] * len(y_test))
            else:
                plt.plot(years_test, y_test, label="Observed", marker='o')
                plt.plot(years_test, y_pred, label="Predicted", linestyle='--', marker='x')

            plt.title(f"{station} - {best_model_name}: Observed vs Predicted (Test Set)")
            plt.xlabel("Year")
            plt.ylabel(f"{target_col}")
            plt.legend()
            plt.grid(True)
            plt.tight_layout()
            plt.show()

        except Exception as e:
            # Best effort: report the station and carry on with the next one.
            print(f"Error handling best model for {station}: {str(e)}")

# Passing `columns` keeps the expected header even when no station produced a row.
best_models_summary = pd.DataFrame(summary_records, columns=summary_columns)

print("\n" + "="*70)
print("BEST MODELS SUMMARY")
print("="*70)
display(best_models_summary)

print("\nPrepared forecasting dataframes for these stations:")
for station in forecasting_dfs.keys():
    print(f"- {station}")

print("\nForecasting-ready dataframes and best models are stored in 'forecasting_dfs'")
print("You can use these in cell 6 for drought prediction and forecasting.")


In [None]:
#HITS AND MISSES
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import product

def calculate_dynamic_drought_threshold(
    data, 
    percentile=25,  
    mad_multiplier=5,  
    min_threshold=None  
):
    """
    Derive a drought threshold as a percentile of outlier-trimmed data.

    Values farther than ``mad_multiplier`` Median Absolute Deviations (MAD) from
    the median are discarded, then the requested percentile of what remains is
    returned.

    :param data: Sequence of numeric values (list, NumPy array or pandas Series)
    :param percentile: Percentile (0-100) of the trimmed data to use as threshold
    :param mad_multiplier: Width of the accepted band around the median, in MADs
    :param min_threshold: Optional lower bound applied to the computed threshold
    :return: The threshold value
    :raises ValueError: If ``data`` is empty
    """
    # Accept any array-like input; element-wise arithmetic needs an ndarray
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        raise ValueError("data must contain at least one value")

    median = np.median(values)
    mad = np.median(np.abs(values - median))

    # With MAD == 0 (more than half the values equal the median) the band has
    # zero width and would discard everything except the median itself, making
    # every percentile collapse to the median. Skip trimming in that case.
    if mad > 0:
        cleaned_data = values[np.abs(values - median) <= (mad_multiplier * mad)]
    else:
        cleaned_data = values

    # A zero/negative multiplier can still empty the band; fall back to all values
    if cleaned_data.size == 0:
        cleaned_data = values

    # Calculate threshold based on percentile of cleaned data
    threshold = np.percentile(cleaned_data, percentile)

    # Optional minimum threshold check
    if min_threshold is not None:
        threshold = max(threshold, min_threshold)

    return threshold

def _score_drought_predictions(actuals, predictions, drought_threshold):
    """Confusion counts and skill scores for 'value <= drought_threshold' events."""
    actual_droughts = np.asarray(actuals) <= drought_threshold
    predicted_droughts = np.asarray(predictions) <= drought_threshold

    true_positives = int(np.sum(actual_droughts & predicted_droughts))
    false_positives = int(np.sum(~actual_droughts & predicted_droughts))
    true_negatives = int(np.sum(~actual_droughts & ~predicted_droughts))
    false_negatives = int(np.sum(actual_droughts & ~predicted_droughts))

    n_droughts = true_positives + false_negatives
    n_non_droughts = false_positives + true_negatives
    f1_denominator = 2 * true_positives + false_positives + false_negatives

    # Every ratio falls back to 0 when its denominator is empty
    return {
        'true_positives': true_positives,
        'false_positives': false_positives,
        'true_negatives': true_negatives,
        'false_negatives': false_negatives,
        'drought_detection_rate': true_positives / n_droughts if n_droughts > 0 else 0,
        'false_alarm_rate': false_positives / n_non_droughts if n_non_droughts > 0 else 0,
        'f1_score': (2 * true_positives) / f1_denominator if f1_denominator > 0 else 0
    }


def comprehensive_drought_threshold_analysis(results_dict):
    """
    Explore multiple drought threshold configurations

    For each station, every (percentile, MAD multiplier) combination yields a
    drought threshold from the station's actual values. Each model's predictions
    are scored against that threshold and the model with the best F1 score is
    recorded. One F1 heatmap per station is drawn, and a summary table with each
    station's best configuration is printed.

    :param results_dict: {station: {model name: {'actuals': array, 'predictions': array, ...}}}
    :return: Tuple ``(comprehensive_results, summary_df)`` where
             ``comprehensive_results[station]`` is a list with one result dict per
             threshold configuration and ``summary_df`` holds one row per station
    """
    # Threshold exploration parameters
    percentile_options = [10, 20, 25, 30, 40]
    mad_multiplier_options = [3, 4, 5, 6]

    # Comprehensive results storage
    comprehensive_results = {}

    # Three heatmaps per row; at least the original 3x3 layout, with extra rows
    # (and figure height) added when there are more than nine stations.
    n_cols = 3
    n_rows = max(3, int(np.ceil(len(results_dict) / n_cols)))
    plt.figure(figsize=(20, 5 * n_rows))

    for panel, (station, models) in enumerate(results_dict.items(), start=1):
        if not models:
            # np.concatenate cannot handle an empty list of arrays
            print(f"No model results for {station}. Skipping threshold analysis.")
            continue

        # Collect all actual values (one copy per model)
        all_actuals = np.concatenate([models[model]['actuals'] for model in models])

        # Store results for this station
        station_results = []

        # Explore different threshold configurations
        for percentile, mad_multiplier in product(percentile_options, mad_multiplier_options):
            # Calculate dynamic threshold
            drought_threshold = calculate_dynamic_drought_threshold(
                all_actuals, 
                percentile=percentile, 
                mad_multiplier=mad_multiplier
            )

            # Evaluate across all models
            station_model_performances = [
                {
                    'model': model_name,
                    **_score_drought_predictions(
                        model_data['actuals'], model_data['predictions'], drought_threshold
                    )
                }
                for model_name, model_data in models.items()
            ]

            # Find best performing model for this threshold configuration
            best_model = max(station_model_performances, key=lambda x: x['f1_score'])

            # Store results
            station_results.append({
                'percentile': percentile,
                'mad_multiplier': mad_multiplier,
                'drought_threshold': drought_threshold,
                'best_model': best_model['model'],
                **best_model
            })

        # Store results for this station
        comprehensive_results[station] = station_results

        # Visualize F1 Scores for this station
        plt.subplot(n_rows, n_cols, panel)
        station_df = pd.DataFrame(station_results)

        # Pivot table for heatmap
        f1_pivot = station_df.pivot(
            index='percentile', 
            columns='mad_multiplier', 
            values='f1_score'
        )

        sns.heatmap(f1_pivot, annot=True, cmap='YlGnBu', fmt='.3f')
        plt.title(f'{station} - Threshold F1 Score Heatmap')
        plt.xlabel('MAD Multiplier')
        plt.ylabel('Percentile')

    plt.tight_layout()
    plt.show()

    # Create comprehensive summary
    summary_results = []
    for station, results in comprehensive_results.items():
        # Find overall best configuration
        best_config = max(results, key=lambda x: x['f1_score'])
        summary_results.append({
            'Station': station,
            'Best Percentile': best_config['percentile'],
            'Best MAD Multiplier': best_config['mad_multiplier'],
            'Drought Threshold': best_config['drought_threshold'],
            'Best Model': best_config['best_model'],
            'Best F1 Score': best_config['f1_score'],
            'Drought Detection Rate': best_config['drought_detection_rate'],
            'False Alarm Rate': best_config['false_alarm_rate']
        })

    # Convert to DataFrame and display
    summary_df = pd.DataFrame(summary_results)
    print("\nComprehensive Drought Threshold Analysis Summary:")
    print(summary_df.to_string(index=False))

    return comprehensive_results, summary_df

# Run the analysis
# Only the lookup of `results_dict` is guarded: a NameError raised inside the
# analysis itself must surface as a real error instead of being reported as
# "results dictionary not found".
try:
    results_dict
except NameError:
    print("\nResults dictionary not found. Ensure the model training script has been run first.")
else:
    # Explore and visualize drought threshold configurations
    comprehensive_results, summary_df = comprehensive_drought_threshold_analysis(results_dict)

    # Optional: Save results to CSV
    summary_df.to_csv('drought_threshold_exploration.csv', index=False)
    print("\nDetailed results saved to drought_threshold_exploration.csv")

In [None]:
#FUTURE PREDICTION

class StationDroughtAnalyzer:
    """
    Threshold-based drought analysis and simple trend forecasting for one station.

    The rainfall series is stored in chronological order. Drought thresholds are
    derived at construction time from the series mean and population standard
    deviation (``np.std`` default, ``ddof=0``).
    """

    def __init__(self, station_name, rainfall_data, years):
        """
        Initialize drought analyzer for a specific station

        :param station_name: Name of the weather station
        :param rainfall_data: Numpy array or list of rainfall values
        :param years: Corresponding years for rainfall data (same length as rainfall_data)
        :raises ValueError: If the inputs are empty or differ in length
        """
        # Copy and flatten so boolean masks and sorting below operate on 1-D data.
        rainfall = np.array(rainfall_data).ravel()
        year_values = np.array(years).ravel()

        if rainfall.size != year_values.size:
            raise ValueError(
                f"rainfall_data and years must have the same length "
                f"(got {rainfall.size} and {year_values.size})"
            )
        if rainfall.size == 0:
            raise ValueError("rainfall_data and years must not be empty")

        # Interval computation, "last year" lookup and the trend fit all assume
        # chronological order, so sort once here (e.g. a shuffled test split).
        # A stable sort keeps already-ordered input untouched.
        chronological = np.argsort(year_values, kind='stable')

        self.station_name = station_name
        self.rainfall_data = rainfall[chronological]
        self.years = year_values[chronological]

        # Compute station-specific drought thresholds
        self.compute_drought_thresholds()

    def compute_drought_thresholds(self):
        """
        Compute multiple drought thresholds based on statistical analysis

        Sets ``mean_rainfall``, ``median_rainfall``, ``std_rainfall`` and the
        ``thresholds`` dict (keys 'moderate', 'severe', 'extreme'), then prints
        the thresholds.
        """
        # Calculate various statistical measures
        self.mean_rainfall = np.mean(self.rainfall_data)
        self.median_rainfall = np.median(self.rainfall_data)
        self.std_rainfall = np.std(self.rainfall_data)

        # Define multiple drought severity levels
        self.thresholds = {
            'moderate': self.mean_rainfall - 0.5 * self.std_rainfall,  # 1st level: below mean by 0.5 standard deviations
            'severe': self.mean_rainfall - 1.0 * self.std_rainfall,    # 2nd level: below mean by 1 standard deviation
            'extreme': self.mean_rainfall - 1.5 * self.std_rainfall    # 3rd level: below mean by 1.5 standard deviations
        }

        print(f"\n{self.station_name} Drought Thresholds:")
        for severity, threshold in self.thresholds.items():
            print(f"  {severity.capitalize()} Drought: {threshold:.2f}")

    def analyze_historical_droughts(self, drought_level='moderate'):
        """
        Analyze historical droughts for a given severity level

        :param drought_level: Severity of drought to analyze ('moderate', 'severe', 'extreme')
        :return: Dictionary with keys 'drought_threshold', 'drought_years',
                 'drought_values', 'drought_frequency' (percent of years) and
                 'avg_drought_interval' (None when fewer than two droughts)
        :raises ValueError: If drought_level is not a known severity
        """
        # Validate drought level
        if drought_level not in self.thresholds:
            raise ValueError(f"Invalid drought level. Choose from {list(self.thresholds.keys())}")

        # Get the specific threshold
        drought_threshold = self.thresholds[drought_level]

        # Identify drought years (a value exactly on the threshold counts as drought)
        drought_mask = self.rainfall_data <= drought_threshold
        drought_years = self.years[drought_mask]
        drought_values = self.rainfall_data[drought_mask]

        # Compute drought frequency as a percentage of all recorded years
        drought_frequency = len(drought_years) / len(self.years) * 100

        # Mean gap between consecutive drought years; years are chronological,
        # so the differences are non-negative. Needs at least two droughts.
        avg_interval = None
        if len(drought_years) > 1:
            avg_interval = np.mean(np.diff(drought_years))

        # Prepare results dictionary
        results = {
            'drought_threshold': drought_threshold,
            'drought_years': drought_years.tolist(),
            'drought_values': drought_values.tolist(),
            'drought_frequency': drought_frequency,
            'avg_drought_interval': avg_interval
        }

        # Print analysis results
        print(f"\n{self.station_name} {drought_level.capitalize()} Drought Analysis:")
        print(f"  Drought Threshold: {drought_threshold:.2f}")
        print(f"  Drought Years: {drought_years}")
        print(f"  Drought Frequency: {drought_frequency:.1f}%")
        if avg_interval is not None:
            print(f"  Average Drought Interval: {avg_interval:.1f} years")

        return results

    def forecast_droughts(self, drought_level='moderate', forecast_period=10, random_state=None):
        """
        Forecast future droughts using statistical projection

        A straight line is fitted to rainfall against calendar year, extended
        over the forecast years, and perturbed with Gaussian noise.

        :param drought_level: Severity of drought to predict
        :param forecast_period: Number of years to forecast
        :param random_state: Optional seed for the noise term. When None
                             (default) the global ``np.random`` state is used,
                             so results differ between calls unless that state
                             is seeded by the caller.
        :return: Dictionary with keys 'forecast_years', 'forecast_values',
                 'predicted_drought_years' and 'predicted_drought_values',
                 or None if the forecast failed (the error is printed)
        :raises KeyError: If drought_level is not a known severity
        """
        # Get the specific threshold for the drought level
        drought_threshold = self.thresholds[drought_level]

        try:
            # Regress on elapsed years rather than on the positional index so
            # that gaps in the record do not distort the slope. For a
            # contiguous record both give the same fit.
            first_year = self.years[0]
            elapsed = (self.years - first_year).astype(float)
            fit = stats.linregress(elapsed, self.rainfall_data)

            # Generate forecast years, continuing from the latest recorded year
            last_year = self.years[-1]
            forecast_years = np.arange(last_year + 1, last_year + forecast_period + 1)

            # Project future values based on trend
            forecast = fit.intercept + fit.slope * (forecast_years - first_year)

            # Add some randomness to make it more realistic
            # NOTE(review): the noise scale is the standard error of the fitted
            # slope, not the residual spread of rainfall around the trend.
            # Confirm this is the intended magnitude.
            rng = np.random if random_state is None else np.random.default_rng(random_state)
            forecast += rng.normal(0, fit.stderr, forecast_period)

            # Identify drought years in forecast
            forecast_drought_mask = forecast <= drought_threshold
            predicted_drought_years = forecast_years[forecast_drought_mask]
            predicted_drought_values = forecast[forecast_drought_mask]

            # Prepare forecast results
            forecast_results = {
                'forecast_years': forecast_years.tolist(),
                'forecast_values': forecast.tolist(),
                'predicted_drought_years': predicted_drought_years.tolist(),
                'predicted_drought_values': predicted_drought_values.tolist()
            }

            # Print forecast results
            print(f"\n{self.station_name} {drought_level.capitalize()} Drought Forecast:")
            print("Year | Forecasted Rainfall | Drought Status")
            print("-" * 45)
            for year, value in zip(forecast_years, forecast):
                status = "DROUGHT" if value <= drought_threshold else "Normal"
                print(f"{year} | {value:7.2f}        | {status}")

            return forecast_results

        except Exception as e:
            # Best effort: report the failure and let the caller continue with
            # the remaining severities/stations.
            print(f"\nError in forecast for {self.station_name}: {str(e)}")
            print(traceback.format_exc())
            return None

    def visualize_drought_analysis(self, historical_analysis, forecast_results=None, drought_level='moderate'):
        """
        Create a comprehensive visualization of drought analysis

        :param historical_analysis: Results from historical drought analysis
                                    (only 'drought_threshold' is read)
        :param forecast_results: Results from drought forecast (optional)
        :param drought_level: Severity of drought being analyzed (used in labels only)
        """
        drought_threshold = historical_analysis['drought_threshold']

        plt.figure(figsize=(12, 6))

        # Plot full rainfall data
        plt.plot(self.years, self.rainfall_data, 'b-', label='Historical Rainfall')

        # Highlight historical drought years
        drought_mask = self.rainfall_data <= drought_threshold
        plt.scatter(
            self.years[drought_mask],
            self.rainfall_data[drought_mask],
            color='red', s=50, label='Historical Droughts'
        )

        # Add drought threshold line
        plt.axhline(
            y=drought_threshold,
            color='r',
            linestyle='--',
            label=f'{drought_level.capitalize()} Drought Threshold'
        )

        # Plot forecast if available (None or empty results are skipped)
        if forecast_results:
            forecast_years = forecast_results['forecast_years']
            forecast_values = forecast_results['forecast_values']
            plt.plot(forecast_years, forecast_values, 'g--', label='Statistical Forecast')

            # Highlight predicted drought years
            if forecast_results['predicted_drought_years']:
                plt.scatter(
                    forecast_results['predicted_drought_years'],
                    forecast_results['predicted_drought_values'],
                    color='orange', marker='x', s=50,
                    label='Predicted Droughts'
                )

        plt.title(f"{self.station_name} - {drought_level.capitalize()} Drought Analysis")
        plt.xlabel("Year")
        plt.ylabel("Rainfall")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()

def perform_comprehensive_drought_analysis(results_dict):
    """
    Run historical analysis, forecast and plotting for every station.

    For each station the model entry with the highest 'r2' is selected, and
    its 'years' / 'actuals' series are fed to a StationDroughtAnalyzer. Each of
    the three severity levels is then analysed, forecast and visualised in turn.

    :param results_dict: Mapping station -> model name -> result dict; the
                         result dicts are read for 'r2', 'years' and 'actuals'
    :return: Mapping station -> severity -> {'historical': ..., 'forecast': ...}
    """
    severity_levels = ('moderate', 'severe', 'extreme')
    analysis_by_station = {}

    for station, model_results in results_dict.items():
        print(f"\n{'='*70}\n{station} COMPREHENSIVE DROUGHT ANALYSIS\n{'='*70}")

        # Pick the model with the highest r2 and use its stored series
        top_model = max(model_results, key=lambda name: model_results[name]['r2'])
        top_results = model_results[top_model]
        years, rainfall = top_results['years'], top_results['actuals']

        # One analyzer per station; thresholds are computed on construction
        analyzer = StationDroughtAnalyzer(station, rainfall, years)

        per_severity = {}
        for level in severity_levels:
            # Order matters for the printed/plotted output: history, forecast, figure
            past = analyzer.analyze_historical_droughts(drought_level=level)
            projected = analyzer.forecast_droughts(drought_level=level)
            analyzer.visualize_drought_analysis(past, projected, drought_level=level)

            per_severity[level] = {'historical': past, 'forecast': projected}

        analysis_by_station[station] = per_severity

    return analysis_by_station

# Run the comprehensive drought analysis
# Prints and plots results for every station and severity level as a side effect.
# NOTE(review): this rebinds `comprehensive_results`, which the threshold-analysis
# run earlier in the notebook also assigns; after this line the name refers to the
# value returned by perform_comprehensive_drought_analysis instead.
# There is no guard here for a missing `results_dict`, so this line raises
# NameError if the model-training cell has not been run.
comprehensive_results = perform_comprehensive_drought_analysis(results_dict)

In [None]:
# Show the threshold-analysis summary table.
# In the code visible here, `summary_df` is assigned only by the threshold-analysis
# run, so this raises NameError if that run did not complete.
print (summary_df)