In [3]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
import pandas as pd

# Load the dataset
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Custom Transformer: PrintDataTransformer
class PrintDataTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, message="Data after transformation"):  # Fixed __init__ method
        self.message = message

    def fit(self, X, y=None):
        X = pd.DataFrame(X)
        print(X.describe())  # Print summary stats instead of all rows
        return self

    def transform(self, X, y=None):
        return X

# Custom Transformer: OutlierRemover
class OutlierRemover(BaseEstimator, TransformerMixin):
    def __init__(self, method='iqr', factor=1.5):  # Fixed __init__ method
        self.method = method
        self.factor = factor

    def fit(self, X, y=None):
        self.feature_names_in_ = X.columns if isinstance(X, pd.DataFrame) else [f"feature_{i}" for i in range(X.shape[1])]
        return self

    def transform(self, X, y=None):
        X = pd.DataFrame(X, columns=self.feature_names_in_)
        if self.method == 'iqr':
            for col in X.columns:
                Q1 = X[col].quantile(0.25)
                Q3 = X[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - self.factor * IQR
                upper_bound = Q3 + self.factor * IQR
                X[col] = X[col].mask((X[col] < lower_bound) | (X[col] > upper_bound), np.nan)
        return X

# Prepare the data
X = train_data.drop(columns=['Y', 'X1', 'X2', 'X3', 'X5', 'X8', 'X10'])
y = train_data['Y']

categorical_columns = X.select_dtypes(include=['object']).columns
numerical_columns = X.select_dtypes(exclude=['object']).columns

# Define the pipelines
numerical_pipeline = Pipeline(steps=[
    ('outlier_removal', OutlierRemover(method='iqr', factor=1.5)),
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=3, include_bias=False))
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_columns),
    ('cat', categorical_pipeline, categorical_columns)
])

# Model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Split the data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Grid Search for hyperparameter tuning
param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [None, 10, 20],
    'regressor__min_samples_split': [2, 5],
    'regressor__min_samples_leaf': [1, 2]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {-grid_search.best_score_:.4f}")

# Refit the model with the best parameters
model.set_params(**grid_search.best_params_)
model.fit(X_train, y_train)

# Evaluate on validation set
val_predictions = model.predict(X_val)
mae = mean_absolute_error(y_val, val_predictions)
print(f"Mean Absolute Error (MAE) after tuning: {mae:.4f}")

# Predict on test set
test_data_prepared = test_data.drop(columns=["X1", "X2", "X3", "X5", "X8", "X10"])
test_predictions = model.predict(test_data_prepared)

# Save predictions to a CSV file
submission = pd.DataFrame({'row_id': test_data.index, 'Y': test_predictions})
submission.to_csv('submission_hgb.csv', index=False)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters: {'regressor__max_depth': 10, 'regressor__min_samples_leaf': 2, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 200}
Best cross-validation score: 0.4133
Mean Absolute Error (MAE) after tuning: 0.4231
