In [2]:
!pip install numpy pandas xgboost
import pandas as pd
import numpy as np
import xgboost as xgb
import os
import datetime
import gzip



# ==========================================
# 0. CONFIGURATION AND HYPERPARAMETERS
# ==========================================

# --- File Paths (Updated to reference the .gz file) ---
CIMIS_GZ_FILE_NAME = 'cimis_all_stations.csv.gz' 
CIMIS_CSV_FILE_NAME = 'cimis_all_stations_unzipped.csv' # Temp file for unzipped data

# --- Model Configuration ---
CONFIG = {
    'horizon': [3, 6, 12, 24],
    'frost_threshold': 0.0,
    'FEATURES': [
        'air_temp_c', 'rel_hum_percent', 'dew_point_c', 'wind_speed_m_s',
        'hour_sin', 'hour_cos', 'temp_lag_1', 'temp_lag_3', 'temp_lag_6'
    ]
}

# --- Tuned Hyperparameters ---
# (TUNED_PARAMS remains the same)
TUNED_PARAMS = {
    3: {
        'clf': {'colsample_bytree': 0.99329, 'gamma': 2.33381, 'learning_rate': 0.18198, 'max_depth': 9, 'n_estimators': 70, 'subsample': 0.78019},
        'reg': {'alpha': 0.05808, 'colsample_bytree': 0.94647, 'lambda': 0.60111, 'learning_rate': 0.15161, 'max_depth': 8, 'n_estimators': 102, 'subsample': 0.98796}
    },
    6: {
        'clf': {'colsample_bytree': 0.84474, 'gamma': 0.69746, 'learning_rate': 0.06842, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.75298},
        'reg': {'alpha': 0.05808, 'colsample_bytree': 0.94647, 'lambda': 0.60111, 'learning_rate': 0.15161, 'max_depth': 8, 'n_estimators': 102, 'subsample': 0.98796}
    },
    12: {
        'clf': {'colsample_bytree': 0.84474, 'gamma': 0.69746, 'learning_rate': 0.06842, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.75298},
        'reg': {'alpha': 0.05808, 'colsample_bytree': 0.94647, 'lambda': 0.60111, 'learning_rate': 0.15161, 'max_depth': 8, 'n_estimators': 102, 'subsample': 0.98796}
    },
    24: {
        'clf': {'colsample_bytree': 0.84474, 'gamma': 0.69746, 'learning_rate': 0.06842, 'max_depth': 9, 'n_estimators': 100, 'subsample': 0.75298},
        'reg': {'alpha': 0.23089, 'colsample_bytree': 0.69641, 'lambda': 0.68326, 'learning_rate': 0.13199, 'max_depth': 9, 'n_estimators': 84, 'subsample': 0.96372}
    }
}

def sanitize_global_params(params):
    """Recursively checks all float values in the nested dicts and removes any NaNs."""
    cleaned_params = {}
    for k, v in params.items():
        if isinstance(v, dict):
            cleaned_params[k] = sanitize_global_params(v)
        elif isinstance(v, (int, float)):
            v_float = float(v)
            if np.isnan(v_float):
                print(f"  - ‚ö†Ô∏è Global Sanitizer: Dropping NaN parameter: {k}")
                continue
            cleaned_params[k] = v_float
        else:
            cleaned_params[k] = v
    return cleaned_params

TUNED_PARAMS = sanitize_global_params(TUNED_PARAMS)


# ==========================================
# 1. DATA PREPARATION FUNCTIONS (Updated Loading)
# ==========================================

def load_and_preprocess_data(gz_file_name, csv_file_name):
    """
    Loads, fills gaps, cleans, and adds necessary features to the raw CIMIS data.
    This version explicitly unzips the .gz file to a temporary .csv file first.
    """
    
    print("--- 1. Loading and initial cleanup ---")
    
    # --- Explicit Gzip Decompression ---
    print(f"  - üîÑ Decompressing {gz_file_name} to {csv_file_name}...")
    try:
        # 'rt' opens in read mode for text
        with gzip.open(gz_file_name, 'rt', encoding='utf-8') as f_in:
            with open(csv_file_name, 'w', encoding='utf-8') as f_out:
                f_out.write(f_in.read())
        print("  - ‚úÖ Decompression complete.")
        
    except FileNotFoundError:
        print(f"  - üî¥ Error: Compressed file not found at: {os.path.abspath(gz_file_name)}")
        return pd.DataFrame(), {}
    except Exception as e:
        print(f"  - üî¥ Error during decompression: {e}")
        return pd.DataFrame(), {}
        
    # --- Read the unzipped CSV file ---
    df = pd.read_csv(csv_file_name) 
    # Clean up the unzipped file immediately
    os.remove(csv_file_name)
    print(f"  - Initial DataFrame size: {len(df)} rows")

    # Define a comprehensive rename map for all relevant columns at once
    rename_map = {
        "Stn Id": "station_id",
        "Rel Hum (%)": "rel_hum_percent",
        "Dew Point (C)": "dew_point_c",
        "Wind Speed (m/s)": "wind_speed_m_s",
        "Air Temp (C)": "air_temp_c",
        "Stn Name": "station_name"
    }
    df = df.rename(columns=rename_map, errors='ignore')

    # Initial date parsing and cleanup
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce", utc=True)
    df = df.dropna(subset=["Date"]).sort_values("Date").reset_index(drop=True)

    # Create timestamp column
    hour_int = df["Hour (PST)"].astype(int)
    hour = hour_int // 100
    df["datetime"] = df["Date"].dt.floor("D") + pd.to_timedelta(hour, unit="h")

    # --- 2. Reindex to continuous hourly timestamps and fill gaps ---
    print("--- 2. Reindexing and filling hourly gaps ---")

    def reindex_station_to_hourly(group):
        current_station_id = group.name
        group = group.sort_values("datetime")
        full_idx = pd.date_range(start=group["datetime"].min(), end=group["datetime"].max(), freq="h")
        g = group.set_index("datetime").reindex(full_idx)
        g.index.name = "datetime"
        for col in ["station_name", "CIMIS Region"]:
            if col in group.columns and not group[col].isnull().all():
                g[col] = g[col].fillna(group[col].iloc[0])
        g['station_id'] = current_station_id
        return g

    df_full = (
        df.groupby("station_id", group_keys=False) 
        .apply(reindex_station_to_hourly, include_groups=False)
        .reset_index()
    )
    print(f"  - DataFrame size after reindexing: {len(df_full)} rows")

    # --- 3. Final Cleaning and Feature Engineering ---
    print("--- 3. Final cleaning and feature engineering ---")

    df_clean = df_full.copy()
    qc_cols = [c for c in df_clean.columns if "qc" in c.lower()]
    df_clean = df_clean.drop(columns=qc_cols, errors='ignore')

    for c in df_clean.columns:
        if c not in ["station_name", "CIMIS Region", "datetime"]:
            df_clean[c] = pd.to_numeric(df_clean[c], errors="coerce")

    numeric_cols = df_clean.select_dtypes(include="number").columns
    df_clean[numeric_cols] = df_clean[numeric_cols].fillna(df_clean[numeric_cols].mean())

    df_clean["month"] = df_clean["datetime"].dt.month
    df_clean["hour"] = df_clean["datetime"].dt.hour
    df_clean['hour_sin'] = np.sin(2 * np.pi * df_clean['datetime'].dt.hour / 24)
    df_clean['hour_cos'] = np.cos(2 * np.pi * df_clean['datetime'].dt.hour / 24)

    # --- 4. Generate Lag Features on Full Data ---
    print("--- 4. Generating lag features ---")
    df_clean = df_clean.sort_values(['station_id', 'datetime'])
    for lag in [1, 3, 6]:
        df_clean[f'temp_lag_{lag}'] = df_clean.groupby('station_id')['air_temp_c'].shift(lag)
        df_clean[f'dew_lag_{lag}'] = df_clean.groupby('station_id')['dew_point_c'].shift(lag)

    df_clean = df_clean.dropna(subset=['temp_lag_6', 'dew_lag_6']).reset_index(drop=True)
    print(f"  - DataFrame size after lag generation and dropping initial NaNs: {len(df_clean)} rows")

    # --- 5. Generate Targets (for model training) ---
    print("--- 5. Generating targets for model training ---")

    train_data_by_horizon = {}
    for h in CONFIG['horizon']:
        df_h = df_clean.copy()
        df_h = df_h.sort_values(['station_id', 'datetime'])
        indexer = pd.api.indexers.FixedForwardWindowIndexer(window_size=h) 

        df_h['y_temp'] = df_h.groupby('station_id')['air_temp_c'].transform(
             lambda x: x.rolling(window=indexer, min_periods=1).min()
        )
        df_h['y_event'] = (df_h['y_temp'] <= CONFIG['frost_threshold']).astype(int)

        df_h_processed = df_h.dropna(subset=['y_temp']).reset_index(drop=True)

        if df_h_processed.empty:
            print(f"  - ‚ö†Ô∏è Warning: No training data available for horizon {h} after dropping NaNs. Skipping this horizon.")
        else:
            train_data_by_horizon[h] = df_h_processed
            print(f"  - Processed DataFrame size for horizon {h}h: {len(df_h_processed)} rows")

    return df_clean, train_data_by_horizon

# ==========================================
# 2. MODEL TRAINING AND STORAGE
# ==========================================

def sanitize_params(params):
    """LOCAL CLEANUP FUNCTION: Ensures parameters are correctly formatted for XGBoost."""
    cleaned_params = {}
    INT_PARAMS = ['n_estimators', 'max_depth']
    for k, v in params.items():
        if isinstance(v, (int, float)):
            if np.isnan(float(v)):
                continue
            if k in INT_PARAMS:
                try:
                    cleaned_params[k] = int(v)
                except ValueError:
                    print(f"  - üî¥ Error: Could not convert '{k}' value '{v}' to integer. Dropping parameter.")
                    continue
            else:
                cleaned_params[k] = v
        else:
            cleaned_params[k] = v
    return cleaned_params


def train_final_models(train_data_by_horizon):
    """Trains and returns a dictionary of tuned XGBoost models for each horizon."""

    trained_models = {h: {'clf': None, 'reg': None} for h in CONFIG['horizon']}

    print("\n" + "="*50)
    print("INITIATING FINAL XGBOOST MODEL TRAINING")
    print("="*50)

    for h_current in CONFIG['horizon']:
        if h_current not in train_data_by_horizon or train_data_by_horizon[h_current].empty:
            print(f"Skipping training for horizon: {h_current} hours, no data available.")
            continue

        print(f"\nTraining for horizon: {h_current} hours...")

        train_h_targets = train_data_by_horizon[h_current]
        params_h = TUNED_PARAMS[h_current]
        features = CONFIG['FEATURES']

        base_score_clf = train_h_targets['y_event'].mean()

        # 1. Frost Classifier
        clf_params = sanitize_params(params_h['clf'].copy())
        clf_xgb = xgb.XGBClassifier(
            **clf_params,
            n_jobs=-1,
            random_state=42,
            # Note: use_label_encoder=False is standard practice in recent XGBoost versions
            use_label_encoder=False, 
            eval_metric='logloss',
            base_score=base_score_clf,
            verbosity=0
        )
        print("  - Training Classifier...")
        clf_xgb.fit(train_h_targets[features], train_h_targets['y_event'])
        trained_models[h_current]['clf'] = clf_xgb

        # 2. Temperature Regressor
        reg_params = sanitize_params(params_h['reg'].copy())
        reg_xgb = xgb.XGBRegressor(
            **reg_params, 
            n_jobs=-1, 
            random_state=42,
            verbosity=0
        )
        print("  - Training Regressor...")
        reg_xgb.fit(train_h_targets[features], train_h_targets['y_temp'])
        trained_models[h_current]['reg'] = reg_xgb

    print("\n‚úÖ All final models trained successfully.")
    return trained_models

# ==========================================
# 3. INTERACTIVE PREDICTION
# ==========================================

def interactive_prediction(df_full, trained_models):
    """Prompts user for input and predicts frost/temp for all horizons."""

    print("\n" + "="*50)
    print("INTERACTIVE WEATHER PREDICTOR")
    print("="*50)

    # Get available stations and date range
    stations = sorted(df_full['station_id'].unique().tolist())
    min_date = df_full['datetime'].min().strftime('%Y-%m-%d %H:%M')
    max_date = df_full['datetime'].max().strftime('%Y-%m-%d %H:%M')

    print(f"Available Stations: {stations}")
    print(f"Data range: {min_date} to {max_date}")

    # Loop for interactive prediction until user enters -1
    while True:
        try:
            # Check if there are any available stations
            if not stations:
                print("Error: No station data available after processing.")
                return

            input_station_id_raw = input(f"Enter Station ID ({stations[0]}-{stations[-1]}) or -1 to exit: ")
            input_station_id = int(input_station_id_raw)

            if input_station_id == -1:
                print("Exiting interactive prediction.")
                break

            if input_station_id not in stations:
                print("Invalid Station ID. Please choose from the available list.")
                continue

            input_date_str = input("Enter Date and Hour (YYYY-MM-DD HH): ")
            # Ensure input datetime is timezone-aware (UTC) for correct comparison
            input_datetime = pd.to_datetime(input_date_str, format='%Y-%m-%d %H', utc=True)

            if input_datetime < pd.to_datetime(min_date, utc=True) or input_datetime > pd.to_datetime(max_date, utc=True):
                 print(f"Date is outside the available range ({min_date} to {max_date}). Please try again.")
                 continue

        except ValueError:
            print("Invalid input format. Please follow the required format (e.g., 2023-01-15 05).")
            continue
        except Exception as e:
            print(f"An error occurred: {e}")
            break

        # --- 2. Extract Current Conditions and Lagged Features ---
        # Find the row in the full data matching the user's input
        input_row = df_full[
            (df_full['station_id'] == input_station_id) &
            (df_full['datetime'] == input_datetime)
        ]

        if input_row.empty:
            print(f"\nüî¥ Error: No data found for Station {input_station_id} at {input_date_str}.")
            continue

        # Extract the features needed for prediction
        input_data = input_row[CONFIG['FEATURES']].iloc[0]

        # --- 3. Display Current Conditions ---
        print("\n" + "="*50)
        print(f"CURRENT CONDITIONS FOR STATION {input_station_id} AT {input_datetime}:")
        print(f"  Air Temp:       {input_data['air_temp_c']:.2f} ¬∞C")
        print(f"  Relative Hum:   {input_data['rel_hum_percent']:.1f} %")
        print(f"  Dew Point:      {input_data['dew_point_c']:.2f} ¬∞C")
        print(f"  Wind Speed:     {input_data['wind_speed_m_s']:.2f} m/s")
        print(f"  --- Lagged Temp: {input_data['temp_lag_6']:.2f} ¬∞C (6h ago)")
        print("="*50)

        # --- 4. Prediction ---
        print("\nFORECASTED MINIMUM TEMPERATURE & FROST RISK:")

        # XGBoost models require a 2D array/DataFrame input
        X_predict = pd.DataFrame([input_data])

        for h in CONFIG['horizon']:
            if trained_models[h]['clf'] is None or trained_models[h]['reg'] is None:
                print(f"| H={h}h Forecast: No models trained for this horizon due to lack of data.")
                continue

            clf = trained_models[h]['clf']
            reg = trained_models[h]['reg']

            # Predict minimum temperature
            min_temp_pred = reg.predict(X_predict)[0]

            # Predict frost probability
            frost_prob = clf.predict_proba(X_predict)[:, 1][0]
            frost_risk = "HIGH RISK (Frost Likely)" if frost_prob > 0.25 else "LOW RISK (No Frost)"

            print(f"| H={h}h Forecast:")
            print(f"|   Min Temp: {min_temp_pred:.2f} ¬∞C")
            print(f"|   Frost Prob: {frost_prob:.2f} -> {frost_risk}")

        print("----------------------------------------------------")


# ==========================================
# 4. EXECUTION
# ==========================================

if __name__ == "__main__":
    
    # --- Step 1: Data Preparation ---
    print("\n" + "#"*60)
    print("PHASE 1: DATA LOADING AND PREPROCESSING")
    print("#"*60)
    df_full, train_data_by_horizon = load_and_preprocess_data(CIMIS_GZ_FILE_NAME, CIMIS_CSV_FILE_NAME)

    # Check for empty dataframes after loading
    if df_full.empty:
        print("Data loading failed. Exiting script.")
        exit()

    # --- Step 2: Model Training ---
    print("\n" + "#"*60)
    print("PHASE 2: MODEL TRAINING")
    print("#"*60)
    trained_models = train_final_models(train_data_by_horizon)

    # --- Step 3: Interactive Prediction ---
    print("\n" + "#"*60)
    print("PHASE 3: STARTING INTERACTIVE PREDICTION")
    print("#"*60)
    interactive_prediction(df_full, trained_models)


############################################################
PHASE 1: DATA LOADING AND PREPROCESSING
############################################################
--- 1. Loading and initial cleanup ---
  - üîÑ Decompressing cimis_all_stations.csv.gz to cimis_all_stations_unzipped.csv...
  - ‚úÖ Decompression complete.
  - Initial DataFrame size: 2367360 rows
--- 2. Reindexing and filling hourly gaps ---
  - DataFrame size after reindexing: 2367360 rows
--- 3. Final cleaning and feature engineering ---
--- 4. Generating lag features ---
  - DataFrame size after lag generation and dropping initial NaNs: 2367252 rows
--- 5. Generating targets for model training ---
  - Processed DataFrame size for horizon 3h: 2367252 rows
  - Processed DataFrame size for horizon 6h: 2367252 rows
  - Processed DataFrame size for horizon 12h: 2367252 rows
  - Processed DataFrame size for horizon 24h: 2367252 rows

############################################################
PHASE 2: MODEL TRAINING
########

Enter Station ID (2-206) or -1 to exit:  80
Enter Date and Hour (YYYY-MM-DD HH):  2013-02-01 03



CURRENT CONDITIONS FOR STATION 80 AT 2013-02-01 03:00:00+00:00:
  Air Temp:       3.60 ¬∞C
  Relative Hum:   94.0 %
  Dew Point:      2.80 ¬∞C
  Wind Speed:     0.50 m/s
  --- Lagged Temp: 8.70 ¬∞C (6h ago)

FORECASTED MINIMUM TEMPERATURE & FROST RISK:
| H=3h Forecast:
|   Min Temp: 2.99 ¬∞C
|   Frost Prob: 0.00 -> LOW RISK (No Frost)
| H=6h Forecast:
|   Min Temp: 2.50 ¬∞C
|   Frost Prob: 0.00 -> LOW RISK (No Frost)
| H=12h Forecast:
|   Min Temp: 2.53 ¬∞C
|   Frost Prob: 0.01 -> LOW RISK (No Frost)
| H=24h Forecast:
|   Min Temp: 2.49 ¬∞C
|   Frost Prob: 0.01 -> LOW RISK (No Frost)
----------------------------------------------------


Enter Station ID (2-206) or -1 to exit:  -1


Exiting interactive prediction.
