In [12]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

print("XGBoost version:", xgb.__version__)  # should be 3.0.2

# --- Feature Engineering ---
def engineer_features_pruned(df, n_components=5, n_properties=10):
    """Return ``df`` extended with a pruned set of engineered blend features.

    The input frame is only read, never mutated; a new frame is returned with
    the original columns followed by the engineered ones, in this order:

    1. ``Component{c}_Property{p}_weighted`` -- each property multiplied by
       its component's fraction (component-major order).
    2. ``Component{i}_fraction_x_Component{j}_fraction`` -- product of every
       unordered pair of component fractions (``i < j``).
    3. ``Component{c}_prop_mean`` / ``Component{c}_prop_min`` -- row-wise mean
       and minimum over each component's properties.

    Args:
        df: DataFrame holding ``Component{c}_fraction`` and
            ``Component{c}_Property{p}`` columns for every
            ``c`` in ``1..n_components`` and ``p`` in ``1..n_properties``.
        n_components: Number of components to process (default 5).
        n_properties: Number of properties per component (default 10).

    Returns:
        A new DataFrame sharing ``df``'s index.

    Raises:
        KeyError: If any required column is absent; the message lists all of
            the missing column names.
    """
    components = range(1, n_components + 1)
    properties = range(1, n_properties + 1)

    frac_cols = {comp: f'Component{comp}_fraction' for comp in components}
    prop_cols = {
        comp: [f'Component{comp}_Property{prop}' for prop in properties]
        for comp in components
    }

    # Fail early with the full list of absent columns instead of stopping at
    # whichever lookup happens to come first.
    required = list(frac_cols.values())
    for comp in components:
        required.extend(prop_cols[comp])
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(
            f"engineer_features_pruned: missing required columns: {missing}"
        )

    # Collect new columns in a dict and attach them with a single concat,
    # rather than inserting them into the frame one at a time.
    new_cols = {}

    # Fraction-weighted component properties.
    for comp in components:
        fraction = df[frac_cols[comp]]
        for prop_col in prop_cols[comp]:
            new_cols[f'{prop_col}_weighted'] = df[prop_col] * fraction

    # Pairwise fraction interactions (multiplicative only, i < j).
    for i in components:
        for j in range(i + 1, n_components + 1):
            new_cols[f'{frac_cols[i]}_x_{frac_cols[j]}'] = (
                df[frac_cols[i]] * df[frac_cols[j]]
            )

    # Per-component summary statistics: only mean and min are kept.
    for comp in components:
        comp_props = df[prop_cols[comp]]
        new_cols[f'Component{comp}_prop_mean'] = comp_props.mean(axis=1)
        new_cols[f'Component{comp}_prop_min'] = comp_props.min(axis=1)

    return pd.concat([df, pd.DataFrame(new_cols, index=df.index)], axis=1)

# --- Load Data ---
# Single source of truth for the dataset location; edit this one line to
# point the notebook at a different machine or directory.
DATA_DIR = '/Users/xDAyN/Desktop/cs project/Project Shell.ai Hackathon/dataset'

X = pd.read_csv(f'{DATA_DIR}/X_train.csv')            # training features
y = pd.read_csv(f'{DATA_DIR}/y_train.csv')            # training targets (one column per target)
X_test = pd.read_csv(f'{DATA_DIR}/X_test.csv')        # features to predict on

# --- Feature Engineering ---
# Run the same feature pipeline over the training and the test frame.
X_fe, X_test_fe = (engineer_features_pruned(frame) for frame in (X, X_test))

# --- Split ---
# Hold out 20% of the rows for validation; the seed is fixed so the split
# is reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_fe,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Train Model ---
target_index = 4  # zero-based column position in y of the target being fitted
print(f"\nTraining model for target {target_index + 1} / {y.shape[1]}")

# Hyperparameters gathered in one place; passed unchanged to XGBRegressor.
xgb_params = {
    'max_depth': 5,
    'learning_rate': 0.01,
    'n_estimators': 10000,       # upper bound on boosting rounds
    'reg_alpha': 0,
    'reg_lambda': 5.1,
    'eval_metric': 'mae',
    'verbosity': 1,
    'random_state': 42,
    'early_stopping_rounds': 25,
}
model = xgb.XGBRegressor(**xgb_params)

# The validation split doubles as the evaluation set monitored during fitting.
train_target = y_train.iloc[:, target_index]
val_target = y_val.iloc[:, target_index]

model.fit(
    X_train,
    train_target,
    eval_set=[(X_val, val_target)],
    verbose=50,  # log the eval metric every 50 rounds
)

# --- Validation MAE Tracking ---
# Pull the per-round MAE recorded for the first (only) eval set.
evals_result = model.evals_result()
val_mae_list = evals_result['validation_0']['mae']

print("\nValidation MAE per iteration:")
best_mae = float('inf')
for round_no, round_mae in enumerate(val_mae_list, start=1):
    # Only rounds that set a new best (strictly lower) MAE are reported.
    if not round_mae < best_mae:
        continue
    best_mae = round_mae
    print(f"  Iteration {round_no}: MAE = {round_mae:.5f}")

# --- Evaluate ---
# Score the fitted model on both splits with MAPE for the selected target.
train_preds = model.predict(X_train)
val_preds = model.predict(X_val)

actual_train = y_train.iloc[:, target_index]
actual_val = y_val.iloc[:, target_index]

train_mape = mean_absolute_percentage_error(actual_train, train_preds)
val_mape = mean_absolute_percentage_error(actual_val, val_preds)

print(f"\nTrain MAPE: {train_mape:.4f}")
print(f"Validation MAPE: {val_mape:.4f}")

# --- Test Predictions ---
test_preds = model.predict(X_test_fe)

# --- Save Test Predictions ---
# Targets are numbered from 1 in column and file names.
target_number = target_index + 1
prediction_file = f'target_{target_number}_test_predictions.csv'

output_df = pd.DataFrame(
    test_preds,
    columns=[f'Target{target_number}_prediction'],
)
output_df.to_csv(prediction_file, index=False)
print(f"Test predictions saved to '{prediction_file}'")


XGBoost version: 3.0.2

Training model for target 5 / 10
[0]	validation_0-mae:0.77693
[50]	validation_0-mae:0.48559
[100]	validation_0-mae:0.30597
[150]	validation_0-mae:0.19685
[200]	validation_0-mae:0.13160
[250]	validation_0-mae:0.09152
[300]	validation_0-mae:0.06697
[350]	validation_0-mae:0.05362
[400]	validation_0-mae:0.04610
[450]	validation_0-mae:0.04134
[500]	validation_0-mae:0.03821
[550]	validation_0-mae:0.03640
[600]	validation_0-mae:0.03582
[650]	validation_0-mae:0.03549
[700]	validation_0-mae:0.03524
[750]	validation_0-mae:0.03506
[800]	validation_0-mae:0.03495
[850]	validation_0-mae:0.03489
[900]	validation_0-mae:0.03482
[950]	validation_0-mae:0.03474
[1000]	validation_0-mae:0.03464
[1050]	validation_0-mae:0.03452
[1100]	validation_0-mae:0.03440
[1150]	validation_0-mae:0.03435
[1173]	validation_0-mae:0.03437

Validation MAE per iteration:
  Iteration 1: MAE = 0.77693
  Iteration 2: MAE = 0.76967
  Iteration 3: MAE = 0.76255
  Iteration 4: MAE = 0.75540
  Iteration 5: MAE 