In [19]:
# 📌 Step 1: Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer
import xgboost as xgb
import optuna
import warnings
warnings.filterwarnings("ignore")


In [20]:
# 📌 Step 2: Load the Data
train = pd.read_csv("Train.csv")
test = pd.read_csv("Test.csv")
submission = pd.read_csv("Submission.csv")

In [21]:
# 📌 Step 3: Data Preprocessing
X = train.drop(columns=['ai_tool_usage_hours'])
y = train['ai_tool_usage_hours']

# Impute missing target values
y_imputed = y.copy()
y_imputed.fillna(y.median(), inplace=True)

# Impute missing feature values
feature_imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(feature_imputer.fit_transform(X), columns=X.columns)
X_test = pd.DataFrame(feature_imputer.transform(test.drop(columns=['ai_tool_usage_hours'])), columns=X.columns)


In [22]:
# 📌 Step 4: Hyperparameter Tuning with Optuna

def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'random_state': 42
    }

    model = xgb.XGBRegressor(**params)
    score = cross_val_score(model, X_imputed, y_imputed, cv=5, scoring='r2').mean()
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

print("Best trial:", study.best_trial.params)


[I 2025-06-19 14:01:11,708] A new study created in memory with name: no-name-20652349-8cfa-47a2-8e04-2140d1f7a398
[I 2025-06-19 14:01:12,332] Trial 0 finished with value: -0.002447942520892221 and parameters: {'n_estimators': 262, 'max_depth': 10, 'learning_rate': 0.21520434503635755, 'subsample': 0.9785883704205058, 'colsample_bytree': 0.8076481789286394, 'gamma': 1.2406882391262446, 'reg_alpha': 4.833418507731632, 'reg_lambda': 4.632645255956838}. Best is trial 0 with value: -0.002447942520892221.
[I 2025-06-19 14:01:13,698] Trial 1 finished with value: -0.0038296949030781403 and parameters: {'n_estimators': 544, 'max_depth': 3, 'learning_rate': 0.29479635889385253, 'subsample': 0.5508867134309476, 'colsample_bytree': 0.9846667005040717, 'gamma': 2.7919787117379915, 'reg_alpha': 2.0760782980911685, 'reg_lambda': 3.7830669196578297}. Best is trial 0 with value: -0.002447942520892221.
[I 2025-06-19 14:01:16,469] Trial 2 finished with value: -0.004778175304096477 and parameters: {'n_est

Best trial: {'n_estimators': 252, 'max_depth': 10, 'learning_rate': 0.20853509753515673, 'subsample': 0.9843519621152413, 'colsample_bytree': 0.7827487173710872, 'gamma': 4.586140275397418, 'reg_alpha': 2.663534652958474, 'reg_lambda': 4.961169134262857}


In [24]:
# 📌 Step 5: Train Final Model with Best Parameters
best_params = study.best_trial.params
final_model = xgb.XGBRegressor(**best_params)
final_model.fit(X_imputed, y_imputed)


In [25]:
# 📌 Step 6: Predict Test Set
predictions = final_model.predict(X_test)
submission['ai_tool_usage_hours'] = predictions
submission.to_csv("final_submission.csv", index=False)

print("✅ Best model trained and 'final_submission.csv' created successfully!")

✅ Best model trained and 'final_submission.csv' created successfully!
