In [22]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

# Feature engineering function (full feature set, no PCA step)
def engineer_features_no_pca(df, *, n_components=5, n_properties=10):
    """Return ``df`` extended with engineered component/fraction features.

    The input frame is not modified; a new frame holding the original
    columns followed by the engineered ones is returned.

    Expected input columns (``c`` in ``1..n_components``, ``p`` in
    ``1..n_properties``):

    * ``Component{c}_fraction``
    * ``Component{c}_Property{p}``

    Engineered columns, in the order they are appended:

    1. ``Component{c}_Property{p}_weighted`` - property times the fraction
       of the same component.
    2. For every pair ``i < j`` of fraction columns:
       ``{frac_i}_x_{frac_j}`` (product) and ``{frac_i}_div_{frac_j}``
       (ratio, with ``1e-8`` added to the denominator).
    3. ``Total_fraction`` - sum of all fraction columns.
    4. ``Component{c}_fraction_to_total`` - fraction divided by
       ``Total_fraction + 1e-8``.
    5. ``Component{c}_prop_mean/std/min/max/range`` - row-wise statistics
       over that component's property columns.
    6. ``Component{c}_fraction_squared``.

    Args:
        df: DataFrame containing the fraction and property columns above.
        n_components: Number of components to process (default 5).
        n_properties: Number of properties per component (default 10).

    Returns:
        A new DataFrame: the columns of ``df`` plus the engineered columns.

    Raises:
        ValueError: If ``n_components`` or ``n_properties`` is less than 1.
        KeyError: If any required column is missing from ``df``.
    """
    if n_components < 1 or n_properties < 1:
        raise ValueError("n_components and n_properties must both be >= 1")

    components = range(1, n_components + 1)
    frac_cols = [f'Component{comp}_fraction' for comp in components]
    prop_cols = {
        comp: [f'Component{comp}_Property{prop}'
               for prop in range(1, n_properties + 1)]
        for comp in components
    }

    # Fail early, reporting every missing column at once instead of only the
    # first one a later lookup happens to hit.
    required = frac_cols + [col for cols in prop_cols.values() for col in cols]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required columns: {missing}")

    # Collect everything in a dict and concatenate once at the end; adding
    # columns to the frame one by one would fragment it.
    new_cols = {}

    # Weighted component-property features
    for comp, frac_col in zip(components, frac_cols):
        for prop_col in prop_cols[comp]:
            new_cols[f'{prop_col}_weighted'] = df[prop_col] * df[frac_col]

    # Pairwise interaction features between fractions. The products double as
    # the cross-term polynomial features, so they are computed only once.
    for pos, frac_i in enumerate(frac_cols):
        for frac_j in frac_cols[pos + 1:]:
            new_cols[f'{frac_i}_x_{frac_j}'] = df[frac_i] * df[frac_j]
            # Small epsilon guards against division by a zero fraction.
            new_cols[f'{frac_i}_div_{frac_j}'] = df[frac_i] / (df[frac_j] + 1e-8)

    # Ratios to total fraction
    total_fraction = sum(df[frac_col] for frac_col in frac_cols)
    new_cols['Total_fraction'] = total_fraction
    for frac_col in frac_cols:
        new_cols[f'{frac_col}_to_total'] = df[frac_col] / (total_fraction + 1e-8)

    # Row-wise aggregate stats over each component's properties
    for comp in components:
        props = df[prop_cols[comp]]
        prop_min = props.min(axis=1)
        prop_max = props.max(axis=1)
        new_cols[f'Component{comp}_prop_mean'] = props.mean(axis=1)
        new_cols[f'Component{comp}_prop_std'] = props.std(axis=1)
        new_cols[f'Component{comp}_prop_min'] = prop_min
        new_cols[f'Component{comp}_prop_max'] = prop_max
        new_cols[f'Component{comp}_prop_range'] = prop_max - prop_min

    # Polynomial features for component fractions (squares)
    for frac_col in frac_cols:
        new_cols[f'{frac_col}_squared'] = df[frac_col] ** 2

    new_features_df = pd.DataFrame(new_cols, index=df.index)
    # pd.concat builds a new frame, so the caller's df is left untouched.
    return pd.concat([df, new_features_df], axis=1)

# Read the training features (X) and the training targets (y) from disk
X = pd.read_csv(
    '/Users/xDAyN/Desktop/cs project/Project Shell.ai Hackathon/dataset/X_train.csv'
)
y = pd.read_csv(
    '/Users/xDAyN/Desktop/cs project/Project Shell.ai Hackathon/dataset/y_train.csv'
)

# Extend the raw feature columns with the engineered ones
X_fe = engineer_features_no_pca(X)

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# repeatable between runs
X_train, X_val, y_train, y_val = train_test_split(
    X_fe,
    y,
    test_size=0.2,
    random_state=42,
)

# Zero-based column position of the target being modelled (5 selects Target 6)
target_index = 5

# Regressor settings, gathered in one mapping and unpacked into the constructor
xgb_params = {
    'max_depth': 1,
    'n_estimators': 100000,
    'learning_rate': 0.05,
    'reg_alpha': 1.5,
    'reg_lambda': 9,
    'eval_metric': 'mae',
    'random_state': 42,
    'early_stopping_rounds': 50,
    'verbosity': 1,
}
model = xgb.XGBRegressor(**xgb_params)

print(f"Training model for target {target_index + 1}...")

# Fit on the training rows for the selected target column. The validation
# split is the only evaluation set, and its MAE is logged every 50 rounds.
model.fit(
    X_train,
    y_train.iloc[:, target_index],
    eval_set=[(X_val, y_val.iloc[:, target_index])],
    verbose=50,
)

# Score both splits with the fitted model
train_preds = model.predict(X_train)
val_preds = model.predict(X_val)

# Pull the modelled target column out of each split once
y_train_target = y_train.iloc[:, target_index]
y_val_target = y_val.iloc[:, target_index]

# Absolute and percentage errors on the training and validation rows
train_mae = mean_absolute_error(y_train_target, train_preds)
val_mae = mean_absolute_error(y_val_target, val_preds)
train_mape = mean_absolute_percentage_error(y_train_target, train_preds)
val_mape = mean_absolute_percentage_error(y_val_target, val_preds)

print(f"\nTrain MAE: {train_mae:.5f}")
print(f"Validation MAE: {val_mae:.5f}")
print(f"Train MAPE: {train_mape:.5f}")
print(f"Validation MAPE: {val_mape:.5f}")

# Per-feature total gain reported by the underlying booster
booster = model.get_booster()
importance = booster.get_score(importance_type='total_gain')

# One row per (feature, total_gain) pair, largest gain first
importance_df = pd.DataFrame(
    list(importance.items()),
    columns=['feature', 'total_gain'],
)
importance_df = importance_df.sort_values(by='total_gain', ascending=False)

print("\nTop 20 features by total gain:")
top_features = importance_df.head(20).reset_index(drop=True)
print(top_features)

# Load the test features and apply the same feature engineering that was
# applied to the training features (assumes X_test.csv has the same raw
# columns as X_train.csv - verify against the dataset).
X_test = pd.read_csv('/Users/xDAyN/Desktop/cs project/Project Shell.ai Hackathon/dataset/X_test.csv')
X_test_fe = engineer_features_no_pca(X_test)

# NOTE: the model is intentionally NOT re-fitted here. A second, verbatim copy
# of the fit / metrics / feature-importance code used to follow; it re-ran the
# same training on the same data with the same hyper-parameters and printed
# the same report again. The already-fitted `model` and the metrics computed
# above are reused as-is.

# --- Predict on test set and save ---
test_preds = model.predict(X_test_fe)

# Single-column frame whose header carries the one-based target number
test_output = pd.DataFrame({f'Target{target_index + 1}_prediction': test_preds})

# Written relative to the current working directory, without the row index
test_output.to_csv(
    f'test_predictions_target_{target_index + 1}.csv',
    index=False,
)

print(f"\nTest predictions saved to 'test_predictions_target_{target_index + 1}.csv'")




Training model for target 6...
[0]	validation_0-mae:0.77556
[50]	validation_0-mae:0.55456
[100]	validation_0-mae:0.44945
[150]	validation_0-mae:0.37960
[200]	validation_0-mae:0.33237
[250]	validation_0-mae:0.29789
[300]	validation_0-mae:0.27230
[350]	validation_0-mae:0.25118
[400]	validation_0-mae:0.23385
[450]	validation_0-mae:0.21856
[500]	validation_0-mae:0.20576
[550]	validation_0-mae:0.19542
[600]	validation_0-mae:0.18699
[650]	validation_0-mae:0.17987
[700]	validation_0-mae:0.17364
[750]	validation_0-mae:0.16873
[800]	validation_0-mae:0.16418
[850]	validation_0-mae:0.16002
[900]	validation_0-mae:0.15637
[950]	validation_0-mae:0.15323
[1000]	validation_0-mae:0.15041
[1050]	validation_0-mae:0.14767
[1100]	validation_0-mae:0.14513
[1150]	validation_0-mae:0.14294
[1200]	validation_0-mae:0.14075
[1250]	validation_0-mae:0.13864
[1300]	validation_0-mae:0.13665
[1350]	validation_0-mae:0.13474
[1400]	validation_0-mae:0.13288
[1450]	validation_0-mae:0.13122
[1500]	validation_0-mae:0.12962
