In [181]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
from sklearn.model_selection import cross_val_score
import optuna

In [182]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [183]:
class OutlierFixer(BaseEstimator, TransformerMixin):
    def __init__(self, method='iqr', factor=1.5):
        self.method = method
        self.factor = factor

    def fit(self, X, y=None):
        self.feature_names_in_ = X.columns if isinstance(X, pd.DataFrame) else [f"feature_{i}" for i in range(X.shape[1])]
        return self

    def transform(self, X, y=None):
        X = pd.DataFrame(X, columns=self.feature_names_in_)
        if self.method == 'iqr':
            for col in X.columns:
                Q1 = X[col].quantile(0.25)
                Q3 = X[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - self.factor * IQR
                upper_bound = Q3 + self.factor * IQR
                median = X[col].median()
                X[col] = X[col].mask((X[col] < lower_bound) | (X[col] > upper_bound), median)
        return X

In [184]:
X = train_data.drop(columns=['Y', 'X1', 'X2', 'X3', 'X5', 'X10'])
y = train_data['Y']

categorical_columns = X.select_dtypes(include=['object']).columns
numerical_columns = X.select_dtypes(exclude=['object']).columns

In [185]:
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('outlier_removal', OutlierFixer(method='iqr', factor=1.5)),
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=3, include_bias=False))
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_columns),
    ('cat', categorical_pipeline, categorical_columns)
])

In [186]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

final_params = {
    'n_estimators': 800,
    'learning_rate': 0.0135,
    'max_depth': 2,
    'subsample': 0.7,
    'colsample_bytree': 0.6,
    'reg_alpha': 4.77e-05,
    'reg_lambda': 0.16,
    'random_state': 42,
    'n_jobs': -1
}

final_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(**final_params))
])

final_model.fit(X, y)

val_mae = mean_absolute_error(y_val, final_model.predict(X_val))
print(f"Fixed parameters: {final_params}")
print(f"Mean Absolute Error (MAE) with fixed parameters: {val_mae:.4f}")

submission = pd.DataFrame({
    'row_id': test_data.index,
    'Y': final_model.predict(test_data.drop(columns=['X1', 'X2', 'X3', 'X5', 'X10']))
})
submission.to_csv('submission_XGBoost.csv', index=False)


Fixed parameters: {'n_estimators': 800, 'learning_rate': 0.0135, 'max_depth': 2, 'subsample': 0.7, 'colsample_bytree': 0.6, 'reg_alpha': 4.77e-05, 'reg_lambda': 0.16, 'random_state': 42, 'n_jobs': -1}
Mean Absolute Error (MAE) with fixed parameters: 0.4030
