In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import cross_val_score

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

from xgboost import XGBRegressor

In [2]:
train_data = pd.read_csv('/kaggle/input/30-days-of-ml/train.csv', index_col='id')
test_data = pd.read_csv('/kaggle/input/30-days-of-ml/test.csv', index_col='id')

In [3]:
categorical_cols = [col for col in train_data.columns if 'cat' in col]
continuous_cols = [col for col in train_data.columns if 'cont' in col]

In [4]:
X = train_data.drop(['target'], axis=1)
y = train_data['target']

Removing columns based on the coefficient values from the [Data Preprocessing notebook](https://www.kaggle.com/aniketsharma00411/30-days-of-ml-data-analysis-and-preprocessing#Finding-correlation-between-features-and-target).

In [5]:
to_remove = ['cont1', 'cat4', 'cat7']
X = X.drop(to_remove, axis=1)

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

In [7]:
# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [8]:
# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Training Final Model

In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ('continuous', numerical_transformer, list(set(continuous_cols)-set(to_remove))),
        ('categorical', categorical_transformer, list(set(categorical_cols)-set(to_remove)))
    ])

model= XGBRegressor(
    n_estimators=1000,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.1,
    max_depth=3,
    booster='gbtree',
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    reg_lambda=0.0009,
    reg_alpha=23,
    n_jobs=4,
    random_state=42
)

pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

In [10]:
# from sklearn.model_selection import cross_val_score

# # Multiply by -1 since sklearn calculates *negative* MAE
# scores = -1 * cross_val_score(pipeline, X, y,
#                               cv=5,
#                               scoring='neg_mean_absolute_error')

# print("MAE scores:\n", scores)
# print("Average MAE score (across experiments):")
# print(scores.mean())

In [11]:
pipeline.fit(X, y)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('continuous',
                                                  SimpleImputer(strategy='constant'),
                                                  ['cont9', 'cont3', 'cont6',
                                                   'cont10', 'cont2', 'cont11',
                                                   'cont4', 'cont12', 'cont8',
                                                   'cont7', 'cont5', 'cont13',
                                                   'cont0']),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder(handle_unknown='ign...
      

In [12]:
# from sklearn.metrics import mean_absolute_error
# predictions = pipeline.predict(X_val)
# mean_squared_error(y_train, pipeline.predict(X_train), squared=True)
# print("Mean Absolute Error: " + str(mean_squared_error(y_val, predictions, squared=True)))
# print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_val)))

In [13]:
final_predictions = pd.DataFrame(zip(test_data.index, pipeline.predict(test_data.drop(to_remove, axis=1))), columns=['id', 'target'])

In [14]:
final_predictions.to_csv('/kaggle/working/submission.csv', index=False)