In [44]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

# ---- 1. Feature Engineering Function ----
def engineer_features_no_pca(df):
    """Return a new frame holding ``df`` plus hand-crafted blend features.

    For every component ``c`` in 1..5 the input must provide a
    ``Component{c}_fraction`` column and ``Component{c}_Property{p}``
    columns for ``p`` in 1..10. The input frame itself is not modified.

    Columns appended, in this order:

    * ``Component{c}_Property{p}_weighted``: property value times the
      fraction of its component.
    * ``Component{i}_fraction_x_Component{j}_fraction`` and
      ``Component{i}_fraction_div_Component{j}_fraction`` for every pair
      ``i < j``: product and ratio of the two fractions (the ratio's
      denominator is offset by 1e-8).
    * ``Total_fraction`` and ``Component{c}_fraction_to_total``.
    * ``Component{c}_prop_mean/std/min/max/range``: row-wise statistics
      over the ten properties of one component.
    * ``Component{c}_fraction_squared``.

    Args:
        df: pandas DataFrame containing the columns described above.

    Returns:
        A new DataFrame: the original columns followed by the engineered ones.

    Raises:
        KeyError: if one of the expected columns is missing from ``df``.
    """
    n_components = 5
    n_properties = 10
    eps = 1e-8  # offset added to denominators so a zero fraction cannot divide by zero

    frac_names = {c: f'Component{c}_fraction' for c in range(1, n_components + 1)}
    new_cols = {}

    # Fraction-weighted properties.
    for comp in range(1, n_components + 1):
        frac_col = frac_names[comp]
        for prop in range(1, n_properties + 1):
            prop_col = f'Component{comp}_Property{prop}'
            new_cols[f'{prop_col}_weighted'] = df[prop_col] * df[frac_col]

    # Pairwise fraction interactions. The products are computed once here;
    # the original recomputed the identical columns in a second loop.
    for i in range(1, n_components + 1):
        frac_i = frac_names[i]
        for j in range(i + 1, n_components + 1):
            frac_j = frac_names[j]
            new_cols[f'{frac_i}_x_{frac_j}'] = df[frac_i] * df[frac_j]
            new_cols[f'{frac_i}_div_{frac_j}'] = df[frac_i] / (df[frac_j] + eps)

    # Share of each component in the row total.
    total_fraction = sum(df[name] for name in frac_names.values())
    new_cols['Total_fraction'] = total_fraction
    for frac_col in frac_names.values():
        new_cols[f'{frac_col}_to_total'] = df[frac_col] / (total_fraction + eps)

    # Row-wise summary statistics over one component's properties.
    for comp in range(1, n_components + 1):
        props = [f'Component{comp}_Property{p}' for p in range(1, n_properties + 1)]
        block = df[props]
        prop_min = block.min(axis=1)
        prop_max = block.max(axis=1)
        new_cols[f'Component{comp}_prop_mean'] = block.mean(axis=1)
        new_cols[f'Component{comp}_prop_std'] = block.std(axis=1)
        new_cols[f'Component{comp}_prop_min'] = prop_min
        new_cols[f'Component{comp}_prop_max'] = prop_max
        new_cols[f'Component{comp}_prop_range'] = prop_max - prop_min

    # Squared fractions.
    for frac_col in frac_names.values():
        new_cols[f'{frac_col}_squared'] = df[frac_col] ** 2

    # Build all new columns in one frame and attach them with a single concat
    # rather than inserting column by column. concat returns a new object, so
    # no up-front copy of the input is needed.
    new_features_df = pd.DataFrame(new_cols, index=df.index)
    return pd.concat([df, new_features_df], axis=1)

# ---- 2. Define Top 20 Features (based on total gain) ----
# Column subset fed to the model. The list order is also the column order of
# the training matrix, so keep it stable.
TOP_20_FEATURES = [
    'Component2_fraction_div_Component5_fraction',
    'Component4_fraction_x_Component5_fraction',
    'Component3_fraction_x_Component5_fraction',
    'Component5_fraction',
    'Component1_fraction_div_Component5_fraction',
    'Component3_Property2_weighted',
    'Component4_Property2_weighted',
    'Component5_Property2_weighted',
    'Component4_Property2',
    'Component5_Property2',
    'Component1_Property2_weighted',
    'Component3_fraction_x_Component4_fraction',
    'Component2_fraction_div_Component3_fraction',
    'Component2_Property2_weighted',
    'Component3_fraction',
    'Component3_Property2',
    'Component1_fraction_x_Component2_fraction',
    'Component2_fraction_div_Component4_fraction',
    'Component2_fraction',
    'Component1_Property2',
]

# ---- 3. Load and Engineer Data ----
# Dataset directory, defined once so the script can be repointed at another
# copy of the data by editing a single line.
DATA_DIR = '/Users/xDAyN/Desktop/cs project/Project Shell.ai Hackathon/dataset'

X = pd.read_csv(f'{DATA_DIR}/X_train.csv')
y = pd.read_csv(f'{DATA_DIR}/y_train.csv')
X_test = pd.read_csv(f'{DATA_DIR}/X_test.csv')

# Apply the same feature engineering to the training and test inputs so both
# frames expose the same engineered columns.
X_fe = engineer_features_no_pca(X)
X_test_fe = engineer_features_no_pca(X_test)

# Only use top 20 features (selecting by list also fixes the column order;
# a missing name raises KeyError here rather than at fit time).
X_fe = X_fe[TOP_20_FEATURES]
X_test_fe = X_test_fe[TOP_20_FEATURES]

# ---- 4. Train-Validation Split ----
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_fe,
    y,
    test_size=0.2,
    random_state=42,
)

# ---- 5. Train for Target 2 (index 1) ----
target_index = 1  # zero-based column position of the target inside y
print(f"Training model for target {target_index + 1}")

# Hyper-parameters gathered in one mapping. n_estimators is only an upper
# bound: early_stopping_rounds ends training once the validation metric has
# not improved for 25 consecutive rounds.
xgb_params = {
    'max_depth': 3,
    'n_estimators': 100000,
    'learning_rate': 0.5,
    'reg_alpha': 0,
    'reg_lambda': 5.1,
    'eval_metric': 'mae',
    'random_state': 42,
    'verbosity': 1,
    'early_stopping_rounds': 25,
}
model = xgb.XGBRegressor(**xgb_params)

# The validation fold is the early-stopping monitor; the metric is printed
# every 50 boosting rounds.
y_train_target = y_train.iloc[:, target_index]
y_val_target = y_val.iloc[:, target_index]
model.fit(
    X_train,
    y_train_target,
    eval_set=[(X_val, y_val_target)],
    verbose=50,
)

# ---- 6. Evaluate and Save Predictions ----
# One-based target number used in every message, column name and file name,
# so changing target_index cannot leave a stale "target_2" label behind.
target_number = target_index + 1

train_preds = model.predict(X_train)
val_preds = model.predict(X_val)
test_preds = model.predict(X_test_fe)

# NOTE(review): the validation fold was also the early-stopping monitor during
# fit (and the monitored metric there is MAE, not MAPE), so this validation
# score is not an untouched hold-out estimate.
train_mape = mean_absolute_percentage_error(y_train.iloc[:, target_index], train_preds)
val_mape = mean_absolute_percentage_error(y_val.iloc[:, target_index], val_preds)

print(f"\nTrain MAPE for target {target_number}: {train_mape:.4f}")
print(f"Validation MAPE for target {target_number}: {val_mape:.4f}")

# Save test predictions; the file name follows the target being modelled.
output_path = f'target_{target_number}_test_predictions.csv'
output_df = pd.DataFrame(test_preds, columns=[f'Target{target_number}_prediction'])
output_df.to_csv(output_path, index=False)
print(f"Test predictions saved to '{output_path}'")


Training model for target 2
[0]	validation_0-mae:0.56564
[50]	validation_0-mae:0.17298
[100]	validation_0-mae:0.15633
[150]	validation_0-mae:0.15044
[200]	validation_0-mae:0.14632
[250]	validation_0-mae:0.14472
[300]	validation_0-mae:0.14366
[350]	validation_0-mae:0.14250
[400]	validation_0-mae:0.14168
[450]	validation_0-mae:0.14124
[500]	validation_0-mae:0.14055
[531]	validation_0-mae:0.14047

Train MAPE for target 2: 0.0528
Validation MAPE for target 2: 0.5550
Test predictions saved to 'target_2_test_predictions.csv'
