In [189]:
# === Setup ===
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt

# Paths (adjust as needed)
TRAIN_PATH = "train.csv"
TEST_PATH = "test.csv"
SAMPLE_SUB_PATH = "SampleSubmission.csv"

assert os.path.exists(TRAIN_PATH), f"Missing: {TRAIN_PATH}"
assert os.path.exists(TEST_PATH), f"Missing: {TEST_PATH}"
assert os.path.exists(SAMPLE_SUB_PATH), f"Missing: {SAMPLE_SUB_PATH}"

train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
sample_sub = pd.read_csv(SAMPLE_SUB_PATH)

print("Train shape:", train.shape)
print("Test shape:", test.shape)
print("SampleSubmission columns:", list(sample_sub.columns))
display(sample_sub.head())

# Inspect columns
train.head()

Train shape: (10928, 12)
Test shape: (2732, 11)
SampleSubmission columns: ['ID', 'Target']


Unnamed: 0,ID,Target
0,ID_TxqKq_24,SMALLRAIN
1,ID_8PP4P_12,SMALLRAIN
2,ID_DkPcN_12,SMALLRAIN
3,ID_lxUih_24,SMALLRAIN
4,ID_33KM9_12,SMALLRAIN


Unnamed: 0,ID,user_id,confidence,predicted_intensity,community,district,prediction_time,indicator,indicator_description,time_observed,Target,forecast_length
0,ID_KwcTp_12,11,0.3,0.0,Tumfa,atiwa_west,2025-05-30 11:09:33,,,,MEDIUMRAIN,12
1,ID_K9vWT_12,17,0.3,0.0,Kwabeng,atiwa_west,2025-05-30 11:09:35,,,,HEAVYRAIN,12
2,ID_AIQg3_12,19,0.3,0.0,Akropong,atiwa_west,2025-05-30 11:09:47,,,,MEDIUMRAIN,12
3,ID_px4yf_12,23,0.3,0.0,Asamama,atiwa_west,2025-05-30 11:16:33,,,,HEAVYRAIN,12
4,ID_QYYmK_12,23,0.3,0.0,Asamama,atiwa_west,2025-05-30 11:16:55,,,,HEAVYRAIN,12


In [190]:
test.head()

Unnamed: 0,ID,user_id,confidence,predicted_intensity,community,district,prediction_time,indicator,indicator_description,time_observed,forecast_length
0,ID_SbTdy_24,77,0.3,0.0,ASSIN BROFOYEDUR,assin_fosu,2025-07-20 19:27:28,,,,24
1,ID_SBKYz_24,77,0.3,0.0,ASSIN BROFOYEDUR,assin_fosu,2025-07-20 19:27:29,,,,24
2,ID_fAimg_24,77,0.3,0.0,ASSIN BROFOYEDUR,assin_fosu,2025-07-20 19:27:30,,,,24
3,ID_2wBqC_24,77,0.3,0.0,ASSIN BROFOYEDUR,assin_fosu,2025-07-20 19:27:31,,,,24
4,ID_NItox_24,77,0.3,0.0,ASSIN BROFOYEDUR,assin_fosu,2025-07-20 19:27:32,,,,24


In [191]:
print('Train:', train.forecast_length.value_counts())
print('Test:',test.forecast_length.value_counts())

Train: forecast_length
24    6498
12    4430
Name: count, dtype: int64
Test: forecast_length
24    1395
12    1337
Name: count, dtype: int64


In [192]:
print(train.isnull().sum())
print(test.isnull().sum())

ID                           0
user_id                      0
confidence                   0
predicted_intensity          0
community                    0
district                     0
prediction_time              0
indicator                10425
indicator_description    10582
time_observed            10856
Target                       0
forecast_length              0
dtype: int64
ID                          0
user_id                     0
confidence                  0
predicted_intensity         0
community                   0
district                    0
prediction_time             0
indicator                2527
indicator_description    2527
time_observed            2527
forecast_length             0
dtype: int64


In [193]:
#check percentage of missing values in each column
print(train.isnull().mean().sort_values(ascending=False))
print(test.isnull().mean().sort_values(ascending=False))

time_observed            0.993411
indicator_description    0.968338
indicator                0.953971
ID                       0.000000
predicted_intensity      0.000000
confidence               0.000000
user_id                  0.000000
community                0.000000
prediction_time          0.000000
district                 0.000000
Target                   0.000000
forecast_length          0.000000
dtype: float64
indicator                0.924963
time_observed            0.924963
indicator_description    0.924963
confidence               0.000000
user_id                  0.000000
ID                       0.000000
predicted_intensity      0.000000
prediction_time          0.000000
district                 0.000000
community                0.000000
forecast_length          0.000000
dtype: float64


In [194]:
# filter columns with 90% or more missing values
threshold = 0.9
train_missing= train.loc[:, train.isnull().mean() > threshold]
test_missing= test.loc[:, test.isnull().mean() > threshold]

In [195]:
#show where train_missing and test_missing columns have values not null
train_missing[train_missing.notnull().any(axis=1)]
#print(test_missing[test_missing.notnull().any(axis=1)])

Unnamed: 0,indicator,indicator_description,time_observed
7,heat,,
9,heat,,
12,heat,,
13,heat,,
14,sun,,
...,...,...,...
10760,clouds,Cloud (probably cumulus) without visible sky,EARLY_MORNING
10769,wind,Light Wind,MORNING
10837,clouds,Cloud (probably cumulus) without visible sky,EVENING
10894,wind,Other,EVENING


In [196]:
# check all unique values in the train_missing and test_missing
for col in train_missing.columns:
    print(f"Column: {col}, Unique values: {train_missing[col].unique()}")
for col in test_missing.columns:
    print(f"Column: {col}, Unique values: {test_missing[col].unique()}")

Column: indicator, Unique values: [nan 'heat' 'sun' 'clouds' 'wind' 'dew' 'fog' 'star' 'thunder' 'moon'
 'lightning']
Column: indicator_description, Unique values: [nan 'Gathering of clouds in the east' 'Light fog'
 'Heavy Clouds (probably cumulus) reflecting rays of sunlight'
 'Sunny afternoon' 'Clouds moving South → East'
 'Fast-moving Cirrus clouds (white, visible at dawn)' 'Heated atmosphere'
 'Moon with rainbow-like rings (lunar halo)' 'Cold weather' 'High heat'
 'Clear sky and visible stars'
 'Cloud (probably cumulus) without visible sky' 'Moderate Wind'
 'High intensity sun' 'Sunny with clouds'
 'Specific 7 stars (aligned) in the west with clouds'
 'Sharp moving clouds (cirrus) with visible stars'
 'Lightning with thunder' 'Small size moon (crescent moon)'
 'Thunder sounds / clouds moving East → West'
 'High intensity sunlight with clear sky' 'Clear sky'
 'Full moon with rings around it (Lunar Halo)' 'Light Wind' 'Dew present'
 'Heavy fog, which greatly obscures vision'
 'Full m

In [197]:
#return difference in unque values between train_missing and test_missing
for col in train_missing.columns:
    if col in test_missing.columns:
        train_unique = set(train_missing[col].unique())
        test_unique = set(test_missing[col].unique())
        diff = train_unique - test_unique
        if diff:
            print(f"Column: {col}, Values in train but not in test: {diff}")

Column: indicator, Values in train but not in test: {'lightning'}
Column: indicator_description, Values in train but not in test: {'Full moon with rings of cloud (Lunar Halo)', 'Full moon with rings around it (Lunar Halo)', 'High heat', 'High intensity sunlight with clear sky', 'Heated atmosphere', 'Lightning with thunder', 'Specific 7 stars (aligned) in the west with clouds'}


In [198]:
#return difference in unque values between train_missing and test_missing
for col in train_missing.columns:
    if col in test_missing.columns:
        train_unique = set(train_missing[col].unique())
        test_unique = set(test_missing[col].unique())
        diff = test_unique -train_unique
        if diff:
            print(f"Column: {col}, Values in test but not in train: {diff}")

Column: indicator_description, Values in test but not in train: {'Heavy Wind', 'Crescent moon from west'}


In [199]:
#check for mismatch in unique values between train and test in all categorial columns
categorical_cols = train.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if col in test.columns:
        train_unique = set(train[col].unique())
        test_unique = set(test[col].unique())
        diff = train_unique - test_unique
        if diff:
            print(f"Column: {col}, Values in train but not in test: {diff}")
        diff = test_unique - train_unique
        if diff:
            print(f"Column: {col}, Values in test but not in train: {diff}")

Column: ID, Values in train but not in test: {'ID_blZZp_12', 'ID_BMU5r_12', 'ID_g4NQt_12', 'ID_iuF0R_12', 'ID_YFIDE_24', 'ID_fgRxU_24', 'ID_Vxz3d_12', 'ID_ZJxjG_24', 'ID_j3qKc_24', 'ID_riLRc_12', 'ID_3ZxLk_12', 'ID_Eg6Mp_24', 'ID_kcocj_24', 'ID_FuGti_24', 'ID_k792w_24', 'ID_fnHx2_24', 'ID_f5NlO_24', 'ID_6pdLu_24', 'ID_TUVmi_24', 'ID_faCAv_12', 'ID_9x10k_12', 'ID_n8iiZ_24', 'ID_TKkwO_24', 'ID_jLWGH_12', 'ID_kwN12_24', 'ID_IIRmm_24', 'ID_DookJ_24', 'ID_VLpC3_24', 'ID_9KYsE_24', 'ID_dbx2W_24', 'ID_kLqBs_12', 'ID_Lv9C5_12', 'ID_9v7jF_24', 'ID_4qbuq_24', 'ID_be9Aj_12', 'ID_FJWnH_24', 'ID_lItv2_24', 'ID_JDrVf_24', 'ID_M4ez1_12', 'ID_5Kcnm_24', 'ID_NDBXl_12', 'ID_PhCcH_24', 'ID_sBZJ7_24', 'ID_Z0NRA_24', 'ID_RxqwF_24', 'ID_WU6TN_24', 'ID_Ezk6U_24', 'ID_ktX8g_24', 'ID_1HGlT_24', 'ID_pKSkS_24', 'ID_TT0zz_24', 'ID_1bohr_24', 'ID_rR0kR_24', 'ID_aO0Tg_12', 'ID_Kzli4_24', 'ID_1JwQV_12', 'ID_HKzoN_24', 'ID_a3lvg_24', 'ID_WRrin_12', 'ID_qmsTo_24', 'ID_oylco_24', 'ID_gVqoJ_12', 'ID_Cmdz5_12', 'ID_NEab5

In [200]:
#join the test and trainset to ensure uniform preprocessing
ntrain = train.shape[0]
ntest = test.shape[0]
y = train['Target']
encode_all_data = pd.concat((train, test)).reset_index(drop=True)
encode_all_data.drop(['Target'], axis=1, inplace=True)
print("all_data size is : {}".format(encode_all_data.shape))

all_data size is : (13660, 11)


In [201]:
encode_all_data.dtypes

ID                        object
user_id                    int64
confidence               float64
predicted_intensity      float64
community                 object
district                  object
prediction_time           object
indicator                 object
indicator_description     object
time_observed             object
forecast_length            int64
dtype: object

In [202]:
# fill all missing values with 'Missing' for categorical
for col in encode_all_data.select_dtypes(include=['object']).columns:
    encode_all_data[col] = encode_all_data[col].fillna('Missing')

In [203]:
encode_all_data.isnull().sum()

ID                       0
user_id                  0
confidence               0
predicted_intensity      0
community                0
district                 0
prediction_time          0
indicator                0
indicator_description    0
time_observed            0
forecast_length          0
dtype: int64

In [204]:
# # one-hot encode Indicator, indicator description, time observed columns, use drop first method
# categorical_features = ['indicator', 'indicator_description', 'time_observed', 'community', 'district']
# categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False) # sparse_output=False for a regular array

# # Fit AND Transform the data in one step
# encoded_array = categorical_transformer.fit_transform(encode_all_data[categorical_features])

# # Now, let's create a DataFrame from the encoded array
# # Get the feature names after one-hot encoding
# feature_names = categorical_transformer.get_feature_names_out(categorical_features)
# encoded_df = pd.DataFrame(encoded_array, columns=feature_names, index=encode_all_data.index)

# # Finally, combine the encoded features with the original non-categorical data
# # Drop the original categorical columns from the original DataFrame
# all_data_processed = encode_all_data.drop(columns=categorical_features)
# # Join the encoded DataFrame
# all_data_processed = pd.concat([all_data_processed, encoded_df], axis=1)

# all_data_processed.head()

In [205]:
# one-hot encode Indicator, indicator description, time observed columns, use drop first method
categorical_features = ['indicator', 'indicator_description', 'time_observed', 'community', 'district']
def one_hot_encode_dataframe_simple(df, categorical_columns):
    """Simplified version that only returns the processed DataFrame."""
    df_processed = df.copy()
    encoder = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
    
    encoded_array = encoder.fit_transform(df_processed[categorical_columns])
    encoded_df = pd.DataFrame(
        encoded_array, 
        columns=encoder.get_feature_names_out(categorical_columns), 
        index=df_processed.index
    )
    
    df_processed = df_processed.drop(columns=categorical_columns)
    return pd.concat([df_processed, encoded_df], axis=1)

all_data_processed = one_hot_encode_dataframe_simple(encode_all_data, categorical_features)
all_data_processed.head()

Unnamed: 0,ID,user_id,confidence,predicted_intensity,prediction_time,forecast_length,indicator_clouds,indicator_dew,indicator_fog,indicator_heat,...,community_akwaduuso,community_assin mesre nyame,community_asunafo,community_jimiso,community_mouso,community_odumasi,community_odumasi.1,community_odumasi Adansi,district_atiwa_west,district_obuasi_east
0,ID_KwcTp_12,11,0.3,0.0,2025-05-30 11:09:33,12,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,ID_K9vWT_12,17,0.3,0.0,2025-05-30 11:09:35,12,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,ID_AIQg3_12,19,0.3,0.0,2025-05-30 11:09:47,12,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,ID_px4yf_12,23,0.3,0.0,2025-05-30 11:16:33,12,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,ID_QYYmK_12,23,0.3,0.0,2025-05-30 11:16:55,12,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [206]:
def parse_time_features(df, time_col='prediction_time'):
    """
    Extracts comprehensive time features from a datetime column for weather forecasting in Ghana.
    Includes month, week, seasons, and other relevant temporal features.
    
    Parameters:
    -----------
    df : pandas.DataFrame
        The input DataFrame containing the datetime column.
    time_col : str, default='prediction_time'
        The name of the datetime column to parse.
        
    Returns:
    --------
    pandas.DataFrame
        The DataFrame with added time feature columns.
    """
    
    df = df.copy()
    
    if time_col in df.columns:
        # Convert to datetime with dayfirst format common in many regions
        dt = pd.to_datetime(df[time_col].astype(str), dayfirst=True, errors='coerce')
        
        # Basic time features
        # df['pred_hour'] = dt.dt.hour
        # df['pred_dow'] = dt.dt.dayofweek  # Monday=0, Sunday=6
        df['pred_day'] = dt.dt.day
        df['pred_month'] = dt.dt.month
        # df['pred_year'] = dt.dt.year
        df['pred_week'] = dt.dt.isocalendar().week  # ISO week number
        df['pred_quarter'] = dt.dt.quarter
        
        # Date as string (optional - keep if needed for grouping)
        df['pred_date'] = dt.dt.date.astype('str')
        
        # Time of day categories (useful for weather patterns)
        df['time_of_day'] = pd.cut(dt.dt.hour, 
                                  bins=[-1, 6, 12, 18, 24], 
                                  labels=['Night', 'Morning', 'Afternoon', 'Evening'])
        
        # Ghana Seasons (Based on rainfall patterns)
        def get_ghana_season(month):
            if month in [3, 4, 5, 6]:  # Major rainy season: March-June
                return 'Major_Rainy'
            elif month in [7, 8]:       # Minor dry season: July-August
                return 'Minor_Dry'
            elif month in [9, 10, 11]:  # Minor rainy season: September-November
                return 'Minor_Rainy'
            else:                       # Major dry season: December-February
                return 'Major_Dry'
        
        df['ghana_season'] = dt.dt.month.map(get_ghana_season)
        
        # Harmattan season (Dry, dusty wind from Sahara - typically Dec-Feb)
        df['is_harmattan'] = dt.dt.month.isin([12, 1, 2]).astype(int)
        
        # Cyclical features using sine/cosine encoding
        df['hour_sin'] = np.sin(2 * np.pi * dt.dt.hour / 24)
        df['hour_cos'] = np.cos(2 * np.pi * dt.dt.hour / 24)
        df['month_sin'] = np.sin(2 * np.pi * dt.dt.month / 12)
        df['month_cos'] = np.cos(2 * np.pi * dt.dt.month / 12)
        
        # Weather-relevant binary features
        df['is_daytime'] = ((dt.dt.hour >= 6) & (dt.dt.hour <= 18)).astype(int)
        df['is_weekend'] = (dt.dt.dayofweek >= 5).astype(int)  # 5=Sat, 6=Sun
        
        # Seasonal periods for agricultural planning
        df['planting_season'] = ((dt.dt.month >= 3) & (dt.dt.month <= 4)).astype(int)  # Main planting: Mar-Apr
        df['harvest_season'] = ((dt.dt.month >= 8) & (dt.dt.month <= 9)).astype(int)   # Main harvest: Aug-Sep
        
    return df

# Usage example:
all_data_processed = parse_time_features(all_data_processed)

# Optional: If you want to see what features were created
print("Time features created:")
time_features = [col for col in all_data_processed.columns if col.startswith('pred_') or 
                 col in ['time_of_day', 'ghana_season', 'is_harmattan', 'hour_sin', 
                         'hour_cos', 'month_sin', 'month_cos', 'is_daytime', 
                         'is_weekend', 'planting_season', 'harvest_season']]
print(time_features)

Time features created:
['pred_day', 'pred_month', 'pred_week', 'pred_quarter', 'pred_date', 'time_of_day', 'ghana_season', 'is_harmattan', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'is_daytime', 'is_weekend', 'planting_season', 'harvest_season']


  dt = pd.to_datetime(df[time_col].astype(str), dayfirst=True, errors='coerce')


In [207]:
all_data_processed.head()

Unnamed: 0,ID,user_id,confidence,predicted_intensity,prediction_time,forecast_length,indicator_clouds,indicator_dew,indicator_fog,indicator_heat,...,ghana_season,is_harmattan,hour_sin,hour_cos,month_sin,month_cos,is_daytime,is_weekend,planting_season,harvest_season
0,ID_KwcTp_12,11,0.3,0.0,2025-05-30 11:09:33,12,0.0,0.0,0.0,0.0,...,Major_Rainy,0,0.258819,-0.965926,0.5,-0.866025,1,0,0,0
1,ID_K9vWT_12,17,0.3,0.0,2025-05-30 11:09:35,12,0.0,0.0,0.0,0.0,...,Major_Rainy,0,0.258819,-0.965926,0.5,-0.866025,1,0,0,0
2,ID_AIQg3_12,19,0.3,0.0,2025-05-30 11:09:47,12,0.0,0.0,0.0,0.0,...,Major_Rainy,0,0.258819,-0.965926,0.5,-0.866025,1,0,0,0
3,ID_px4yf_12,23,0.3,0.0,2025-05-30 11:16:33,12,0.0,0.0,0.0,0.0,...,Major_Rainy,0,0.258819,-0.965926,0.5,-0.866025,1,0,0,0
4,ID_QYYmK_12,23,0.3,0.0,2025-05-30 11:16:55,12,0.0,0.0,0.0,0.0,...,Major_Rainy,0,0.258819,-0.965926,0.5,-0.866025,1,0,0,0


In [208]:
# drop prediction_time column as we have extracted all features from it
all_data_processed.drop(['prediction_time'], axis=1, inplace=True)

In [209]:
# filter for all categorial columns
categorical_cols = [col for col in all_data_processed.select_dtypes(include=['object']).columns]

In [210]:
categorical_cols

['ID', 'pred_date', 'ghana_season']

In [211]:
categorical_features = ['ghana_season', 'time_of_day']
all_data_processed = one_hot_encode_dataframe_simple(all_data_processed, categorical_features)

In [212]:
all_data_processed.head()

Unnamed: 0,ID,user_id,confidence,predicted_intensity,forecast_length,indicator_clouds,indicator_dew,indicator_fog,indicator_heat,indicator_lightning,...,month_sin,month_cos,is_daytime,is_weekend,planting_season,harvest_season,ghana_season_Minor_Dry,time_of_day_Evening,time_of_day_Morning,time_of_day_Night
0,ID_KwcTp_12,11,0.3,0.0,12,0.0,0.0,0.0,0.0,0.0,...,0.5,-0.866025,1,0,0,0,0.0,0.0,1.0,0.0
1,ID_K9vWT_12,17,0.3,0.0,12,0.0,0.0,0.0,0.0,0.0,...,0.5,-0.866025,1,0,0,0,0.0,0.0,1.0,0.0
2,ID_AIQg3_12,19,0.3,0.0,12,0.0,0.0,0.0,0.0,0.0,...,0.5,-0.866025,1,0,0,0,0.0,0.0,1.0,0.0
3,ID_px4yf_12,23,0.3,0.0,12,0.0,0.0,0.0,0.0,0.0,...,0.5,-0.866025,1,0,0,0,0.0,0.0,1.0,0.0
4,ID_QYYmK_12,23,0.3,0.0,12,0.0,0.0,0.0,0.0,0.0,...,0.5,-0.866025,1,0,0,0,0.0,0.0,1.0,0.0


In [213]:
# filter for all categorial columns
[col for col in all_data_processed.select_dtypes(include=['object']).columns]

['ID', 'pred_date']

In [214]:
all_data_processed.time_of_day_Evening

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
13655    1.0
13656    1.0
13657    1.0
13658    1.0
13659    1.0
Name: time_of_day_Evening, Length: 13660, dtype: float64

In [215]:
# separate back into train and test sets
train_processed = all_data_processed.iloc[:ntrain, :].copy()
test_processed = all_data_processed.iloc[ntrain:, :].copy()

In [216]:
train_processed.shape, test_processed.shape

((10928, 116), (2732, 116))

In [217]:
train.shape, test.shape

((10928, 12), (2732, 11))

In [None]:
model = RandomForestClassifier(
    n_estimators=300,
    n_jobs=-1,
    random_state=42
)

pipe = Pipeline(steps = [('model', model)])

X = train_processed.drop(columns=['pred_date', 'ID'], errors='ignore')
y = y

In [219]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1_scores = cross_val_score(pipe, X, y, cv=skf, scoring='f1_macro', n_jobs=-1)
acc_scores = cross_val_score(pipe, X, y, cv=skf, scoring='accuracy', n_jobs=-1)

print(f"CV Macro F1: {f1_scores.mean():.4f} ± {f1_scores.std():.4f}")
print(f"CV Accuracy: {acc_scores.mean():.4f} ± {acc_scores.std():.4f}")

CV Macro F1: 0.9730 ± 0.0048
CV Accuracy: 0.9938 ± 0.0007


In [226]:
ID_COL = "ID"
def conform_to_sample(sample_df: pd.DataFrame, pred_df: pd.DataFrame, id_col: str = "id") -> pd.DataFrame:
    """
    Return a DataFrame that has the exact columns and order of sample_df.
    - Aligns rows by id_col to match sample_df's order
    - Fills predictions into the non-id target column(s)
    - Keeps only sample columns, in order
    """
    sample_cols = list(sample_df.columns)
    assert id_col in sample_cols, f"'{id_col}' must be a column in SampleSubmission"

    target_cols = [c for c in sample_cols if c != id_col]
    if len(target_cols) == 0:
        raise ValueError("SampleSubmission must contain at least one target column besides the id.")

    merged = sample_df[[id_col]].merge(pred_df, on=id_col, how="left")

    for tcol in target_cols:
        if tcol in pred_df.columns:
            merged[tcol] = merged[tcol]
        else:
            pred_only = [c for c in pred_df.columns if c != id_col]
            if len(pred_only) == 1:
                merged[tcol] = merged[pred_only[0]]
            else:
                raise ValueError(f"Cannot map predictions to sample target column '{tcol}'. Provide a column named '{tcol}'.")

    return merged[sample_cols]

pipe.fit(X, y)

X_test = test_processed.drop(columns=['pred_date', 'ID'], errors='ignore')
test_pred = pipe.predict(X_test)

pred_df = pd.DataFrame({ID_COL: test_processed[ID_COL].values, 'rain_type': test_pred})

submission = conform_to_sample(sample_sub, pred_df, id_col=ID_COL)

save_path = "submission_1.csv"
submission.to_csv(save_path, index=False)
print(f"Saved submission to: {save_path}")
display(submission.head())

# Sanity checks
assert list(submission.columns) == list(sample_sub.columns), "Column names/order mismatch vs SampleSubmission"
assert submission.shape[0] == sample_sub.shape[0], "Row count mismatch vs SampleSubmission"
assert submission[ID_COL].equals(sample_sub[ID_COL]), "ID ordering mismatch vs SampleSubmission"

Saved submission to: submission_1.csv


Unnamed: 0,ID,Target
0,ID_TxqKq_24,NORAIN
1,ID_8PP4P_12,NORAIN
2,ID_DkPcN_12,NORAIN
3,ID_lxUih_24,NORAIN
4,ID_33KM9_12,NORAIN
