In [None]:
# Training the student model (XGBoost) on distilled knowledge.

In [None]:
#Imports

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load leaddata.csv from the working directory; it is joined with
# dataset1.csv on Date/Lat/Lon/Lead_Time further down.
df = pd.read_csv("leaddata.csv")

In [None]:
# Preview the first five rows (head() defaults to n=5).
df.head()

In [None]:
# Tidy the frame loaded from leaddata.csv. Every step is written so the cell
# can be re-run on an already-tidied frame without raising KeyError.

# Drop the 'Unnamed: 0' column when present (presumably a row index saved by an
# earlier to_csv -- confirm); errors='ignore' makes a repeat run a no-op.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# Move Date into the index unless a previous run already did so. A frame with
# no Date column at all still raises KeyError here.
if df.index.name != 'Date':
    df = df.set_index('Date')

# Parse the index labels into datetimes (the index name is preserved).
df.index = pd.to_datetime(df.index)
df.head(5)

In [None]:
# Load dataset1.csv from the working directory; it is the right-hand side of
# the join on Date/Lat/Lon/Lead_Time further down.
fdf = pd.read_csv("dataset1.csv")

In [None]:
# Preview the first five rows (head() defaults to n=5).
fdf.head()

In [None]:
# Tidy the frame loaded from dataset1.csv. Every step is written so the cell
# can be re-run on an already-tidied frame without raising KeyError.

# Drop the 'Unnamed: 0' column when present (presumably a row index saved by an
# earlier to_csv -- confirm); errors='ignore' makes a repeat run a no-op.
fdf = fdf.drop(columns=['Unnamed: 0'], errors='ignore')

# Move Date into the index unless a previous run already did so. A frame with
# no Date column at all still raises KeyError here.
if fdf.index.name != 'Date':
    fdf = fdf.set_index('Date')

# Parse the index labels into datetimes (the index name is preserved).
fdf.index = pd.to_datetime(fdf.index)
fdf.head(5)

In [None]:
# Columns carried by both inputs; the join suffixes them with _x (left) and
# _y (right).
shared_cols = ['Observed_Precip', 'Ensemble_Mean', 'Ensemble_Residual']

# Inner-join the two tables on their common keys. Date is the index of both
# frames at this point; pandas accepts index level names in `on`.
merged = (
    df.merge(fdf, on=["Date", "Lat", "Lon", "Lead_Time"], how="inner")
    # Discard the right-hand copies of the shared columns...
    .drop(columns=[name + "_y" for name in shared_cols])
    # ...and give the left-hand copies their plain names back.
    .rename(columns={name + "_x": name for name in shared_cols})
)

# Per-column NaN counts; only columns that actually have gaps are printed.
missing = merged.isnull().sum()
print("Missing values:\n", missing.loc[missing.gt(0)])

In [None]:
# Quick look at the first two joined rows.
merged.head(n=2)

In [None]:
#Modelling

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Every column whose name begins with CFST or GFST is used as a forecast input.
forecast_cols = list(
    filter(lambda name: name.startswith(("CFST", "GFST")), merged.columns)
)

# Full feature list: the forecast columns followed by the ensemble summary,
# seasonal and location/lead-time columns.
input_features = [
    *forecast_cols,
    "Ensemble_Mean",
    "Ensemble_Spread",
    "DOY",
    "Climatology",
    "Lat",
    "Lon",
    "Lead_Time",
]

# Column the student model is trained to reproduce.
target_col = "New_Residual"

In [None]:
# Keep only rows where the target and every input feature are present; the
# copy detaches the result from `merged` so later column writes are safe.
required_cols = [target_col, *input_features]
merged_clean = merged.dropna(subset=required_cols).copy()

# Feature matrix (n_rows, n_features) and target as a single column (n_rows, 1).
X = merged_clean.loc[:, input_features].values
y = merged_clean[target_col].values[:, None]

In [None]:
# Split first, then fit the target scaler on the training targets only, so the
# mean/std of the held-out targets never leak into what the model is trained on.
# The row partition is the same as splitting the scaled target would give: it
# depends only on the row count, test_size and random_state.
X_train, X_test, y_train_raw, y_test_raw = train_test_split(
    X, y, test_size=0.2, random_state=42
)

scaler_y = StandardScaler()
y_train = scaler_y.fit_transform(y_train_raw)
y_test = scaler_y.transform(y_test_raw)

# Whole target column on the training-fitted scale; kept because a later cell
# re-splits `y_scaled` to recover the row positions of the test set.
y_scaled = scaler_y.transform(y)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

In [None]:
# Collapse the (n, 1) target columns into independent 1-D copies for fitting.
y_train_flat, y_test_flat = (part.flatten() for part in (y_train, y_test))

In [None]:
# Hyperparameters of the student regressor, gathered in one place.
xgb_params = {
    "n_estimators": 500,
    "max_depth": 6,
    "learning_rate": 0.03,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,   # fixed seed for repeatable subsampling
    "n_jobs": -1,
    "verbosity": 1,
}

xgb_model = xgb.XGBRegressor(**xgb_params)

In [None]:
# Train the student on the training features and the flattened scaled target.
xgb_model.fit(X=X_train, y=y_train_flat)

In [None]:
# Predict on the held-out features; the output is on the scaled target scale.
y_pred_scaled = xgb_model.predict(X_test)

# The scaler expects a 2-D column, so add an axis, undo the scaling, then take
# the single column back out as an independent 1-D array.
pred_column = y_pred_scaled[:, None]
y_pred = scaler_y.inverse_transform(pred_column)[:, 0].copy()

In [None]:
# Position of the Ensemble_Mean column inside the feature matrix.
ens_index = input_features.index("Ensemble_Mean")

# Ensemble mean of every held-out row, read straight from X_test.
ens_mean_test = X_test[..., ens_index]

In [None]:
# Subtract the predicted residual from the ensemble mean of each test row.
# NOTE(review): the sign assumes New_Residual = Ensemble_Mean - target value;
# confirm against the notebook that builds New_Residual.
final_prediction = np.subtract(ens_mean_test, y_pred)

In [None]:
# Display the corrected predictions (ensemble mean minus predicted residual)
# for the held-out rows.
final_prediction

In [None]:
# Rebuild the arrays and repeat the 80/20 split, this time also carrying each
# row's position in merged_clean. The same row count, test_size and
# random_state as the earlier split are used, so the partition is identical and
# test_idx lines up with the predictions already made on X_test.
X = merged_clean.loc[:, input_features].values
y = merged_clean[target_col].values[:, None]

position_indices = np.arange(X.shape[0])

split_parts = train_test_split(
    X, y_scaled, position_indices, test_size=0.2, random_state=42
)
X_train, X_test, y_train, y_test, train_idx, test_idx = split_parts

In [None]:
# Build full-length NaN columns and fill in only the held-out positions, so
# training rows stay NaN in both new columns.
n_rows = len(merged_clean)

distilled_residual = np.full(n_rows, np.nan)
distilled_residual[test_idx] = y_pred

distilled_prediction = np.full(n_rows, np.nan)
distilled_prediction[test_idx] = final_prediction

merged_clean["Distilled_Residual"] = distilled_residual
merged_clean["Distilled_Prediction"] = distilled_prediction

In [None]:
# Attach the student's outputs to the full joined table.
key_cols = ["Date", "Lat", "Lon", "Lead_Time"]
distilled_cols = ["Distilled_Residual", "Distilled_Prediction"]

# Turn the Date index of merged_clean into an ordinary column so it can be
# selected alongside the other join keys.
merged_clean_reset = merged_clean.reset_index(drop=False)

# Merge. Any Distilled_* columns left by a previous run of this cell are
# dropped first; otherwise a re-run would create *_x / *_y duplicates and the
# later selection of "Distilled_Prediction" would fail. On a first run the
# drop is a no-op.
# NOTE(review): if the four keys do not identify rows uniquely, this left join
# duplicates rows -- confirm key uniqueness upstream.
merged = (
    merged
    .drop(columns=distilled_cols, errors="ignore")
    .merge(
        merged_clean_reset[key_cols + distilled_cols],
        on=key_cols,
        how="left"
    )
)

In [None]:
# Columns written to the final file; rows with a gap in any of them are
# dropped, which removes rows that never received a distilled prediction.
export_cols = [
    "Date",
    "Observed_Precip",
    "Ensemble_Mean",
    "New_Prediction",
    "Distilled_Prediction",
    "Lead_Time",
    "Lat",
    "Lon",
]
fdf = merged.loc[:, export_cols].dropna()

In [None]:
# Write the result next to the notebook. The row index is written as the first
# column as well (to_csv default index=True).
fdf.to_csv(path_or_buf='final.csv')

In [None]:
#End