In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

# 1. Load full dataset
# Read the training features, training targets and test features (in that
# order) from their CSV files.
X, y, X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/Users/xDAyN/Desktop/cs project/Project Shell.ai Hackathon/dataset/X_train.csv',
        '/Users/xDAyN/Desktop/cs project/Project Shell.ai Hackathon/dataset/y_train.csv',
        '/Users/xDAyN/Desktop/cs project/Project Shell.ai Hackathon/dataset/X_test.csv',
    )
)

# 2. Feature engineering: generate blend estimates
def generate_blend_estimates(df, n_properties=10, n_components=5):
    """Build fraction-weighted property estimates for each blend.

    For every property ``p`` the estimate is the sum over components ``i`` of
    ``Component{i}_Property{p} * Component{i}_fraction``.

    Args:
        df: DataFrame holding ``Component{i}_fraction`` and
            ``Component{i}_Property{p}`` columns for every ``i`` in
            ``1..n_components`` and ``p`` in ``1..n_properties``.
        n_properties: Number of blend properties to estimate (default 10).
        n_components: Number of components in each blend (default 5).

    Returns:
        A DataFrame indexed like ``df`` with one ``BlendProperty{p}_est``
        column per property, in ascending property order.

    Raises:
        KeyError: If a required component column is missing from ``df``.
    """
    estimates = {}
    for prop in range(1, n_properties + 1):
        # Components are accumulated in ascending order, starting from 0.
        estimates[f'BlendProperty{prop}_est'] = sum(
            df[f'Component{i}_Property{prop}'] * df[f'Component{i}_fraction']
            for i in range(1, n_components + 1)
        )
    # Build the frame in one step and pin it to the input's index so the
    # result lines up with ``df`` when the two are concatenated column-wise.
    return pd.DataFrame(estimates, index=df.index)

# Derive the blend-estimate features for the training and test frames alike.
X_blend_estimates, X_test_blend_estimates = (
    generate_blend_estimates(frame) for frame in (X, X_test)
)

# 3. Concatenate blend estimates to original features
# Column-wise concat: original columns first, engineered estimates after.
X_final, X_test_final = (
    pd.concat(parts, axis=1)
    for parts in (
        [X, X_blend_estimates],
        [X_test, X_test_blend_estimates],
    )
)

# 4. Split train into train + validation
# 80/20 split with a fixed seed so the partition is reproducible.
split_parts = train_test_split(X_final, y, test_size=0.2, random_state=42)
X_train_split, X_val_split, y_train_split, y_val_split = split_parts

# 5. Train one XGBoost model per target with your given parameters
# Every target column gets its own regressor built from the same settings.
xgb_params = dict(
    max_depth=3,
    n_estimators=3000,
    learning_rate=0.075,
    reg_alpha=1.0,
    reg_lambda=5.0,
    early_stopping_rounds=25,
    eval_metric='mae',
    random_state=42,
)
n_targets = y.shape[1]
models = []
for target_idx in range(n_targets):
    print(f"Training model for target {target_idx + 1} / {n_targets}")
    target_train = y_train_split.iloc[:, target_idx]
    target_val = y_val_split.iloc[:, target_idx]
    model = xgb.XGBRegressor(**xgb_params)
    # The validation split is supplied as the evaluation set for this fit.
    model.fit(
        X_train_split,
        target_train,
        eval_set=[(X_val_split, target_val)],
        verbose=False,
    )
    models.append(model)

# 6. Predict on train and validation
# Collect each model's predictions, then place them side by side so that
# column k holds the predictions of models[k].
per_model_train = [est.predict(X_train_split) for est in models]
per_model_val = [est.predict(X_val_split) for est in models]
train_preds = np.column_stack(per_model_train)
val_preds = np.column_stack(per_model_val)

# 7. Evaluate
# A single MAPE figure per split, computed across all target columns.
# NOTE(review): the validation split was also the eval_set during fitting,
# so the validation score may be somewhat optimistic.
train_mape = mean_absolute_percentage_error(y_train_split, train_preds)
val_mape = mean_absolute_percentage_error(y_val_split, val_preds)

print(f"Train MAPE: {train_mape:.4f}")
print(f"Validation MAPE: {val_mape:.4f}")

# 8. Predict on test set
# One prediction column per trained model, in the order the models were fit.
test_preds = np.column_stack([m.predict(X_test_final) for m in models])

# 9. Prepare submission DataFrame
# Prefer the real IDs from test.csv when that file provides an 'ID' column.
try:
    test_ids = pd.read_csv('/Users/xDAyN/Desktop/cs project/Project Shell.ai Hackathon/dataset/test.csv', usecols=['ID'])
except (OSError, ValueError):
    # OSError: test.csv is missing or unreadable. ValueError: pandas raises it
    # when 'ID' is not among the file's columns (and for empty/unparseable
    # files). Anything else (e.g. KeyboardInterrupt, NameError) propagates.
    # Fall back to a positional 0-based ID column.
    test_ids = pd.DataFrame({'ID': range(len(X_test_final))})

submission = test_ids.copy()
# Derive the column count from the predictions instead of hard-coding 10 so
# this stays consistent with however many target models were trained.
for i in range(test_preds.shape[1]):
    submission[f'BlendProperty{i+1}'] = test_preds[:, i]
# NOTE(review): the recorded cell output reports that 'submission.csv' was
# created, but no write call is visible in this cell — confirm whether a
# `submission.to_csv(...)` step was dropped.




Training model for target 1 / 10
Training model for target 2 / 10
Training model for target 3 / 10
Training model for target 4 / 10
Training model for target 5 / 10
Training model for target 6 / 10
Training model for target 7 / 10
Training model for target 8 / 10
Training model for target 9 / 10
Training model for target 10 / 10
Train MAPE: 0.4330
Validation MAPE: 0.8526
✅ Submission file 'submission.csv' created successfully.
