In [1]:
'''
Main module for preprocessing.
'''
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

PATH = '../data/train.csv'

def encoding_values(df):
    '''
    encode categorical data to float
    Parameters:
        dataframe (pandas.DataFrame): DataFrame on which to operate
    Returns:
        pandas.DataFrame
    '''
    df_objects = (df.dtypes=='object')
    object_cols = list(df_objects[df_objects].index)
    ordinal_encoder = OrdinalEncoder()
    df[object_cols] = ordinal_encoder.fit_transform(df[object_cols])
    return df

def impute_values(df):
    '''
    Impute missing values in features
    Parameters:
        df (pandas.DataFrame): Dataframe on which to operate
    Returns:
        pandas.DataFrame
    '''
    imputer = SimpleImputer()
    imputer.fit(df)
    imputed_df = pd.DataFrame(imputer.transform(df))
    imputed_df.columns = df.columns
    return imputed_df

def scaling_values(df):
    '''
    Scaling features
    Parameters:
        df (pandas.DataFrame): Dataframe on which to operate
    Returns:
        pandas.DataFrame
    '''

    scaler = MinMaxScaler()
    x_train = df.drop(['Id', 'SalePrice'], axis=1)
    scaler.fit(x_train)
    scaled_data = scaler.transform(x_train)
    scaled_data = pd.DataFrame(scaled_data, columns=x_train.columns)
    scaled_data.insert(loc=0, column='Id', value=df['Id'])
    scaled_data['SalePrice'] = df['SalePrice']
    return scaled_data

def apply_preprocessing(df):
    '''
    Applying data cleaning functions to data sets
    Paramters:
        dataframe (pandas.DataFrame): Dataframe on which to operate
    Retruns:
        pandas.DataFrame
    '''
    # since some columns have to much NaN we will drop them here
    df = df.drop(columns=['Alley', 'PoolQC', 'Fence', 'MiscFeature'])
    df = encoding_values(df)
    df = impute_values(df)
    df = scaling_values(df)
    return df

def get_df():
    '''
    Sharing dataframe after aplying preprocessing
    Returns:
        pandas.DataFrame
    '''
    df = pd.read_csv(PATH)
    df = apply_preprocessing(df)
    return df


In [2]:
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

LABEL = 'SalePrice'

df = get_df()

X = df.drop(['SalePrice', 'Id'], axis=1)
y = df[LABEL]

k_fold = KFold(
n_splits=5,
shuffle=True,
random_state=42
)

scores = []

In [24]:
scores = []
for train_index, test_index in k_fold.split(X):
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]
    # {'depth': 6, 'iterations': 100, 'learning_rate': 0.1}
    # 26948.184676065393
    # 27983
    # 27144
    # 26974
    # 26815 depth = 7
    # 26750 iterations = 1250
    # 26719 iterations = 1500
    reg = CatBoostRegressor(depth=7, iterations=1500, learning_rate = 0.025, logging_level='Silent')
    # reg = CatBoostRegressor(logging_level='Silent')
    reg.fit(X_train, y_train)

    y_predict = reg.predict(X_test)

    acc_score = np.sqrt(mean_squared_error(y_test, y_predict))
        
    print(acc_score)

    scores.append(acc_score)

print()
print("Average:", np.sum(scores)/5)
print("Std:", round(np.std(scores), 3))

26806.4949386533
23292.420516271683
37325.670510667245
26786.487463836613
19385.578002562106

Average: 26719.33028639819
Std: 5967.147


In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

x_reg = CatBoostRegressor()

parameters = {'depth'         : [5,25, 50, 100],
            'learning_rate' : [0.01, 0.025, 0.05, 0.1],
            'iterations'    : [30, 50, 100, 500, 1000],
            'logging_level': ['Silent']
}

grid = GridSearchCV(estimator=x_reg, param_grid = parameters, cv = 5, n_jobs=-1)
grid.fit(X_train, y_train)

300 fits failed out of a total of 400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
300 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/lib/python3.9/site-packages/catboost/core.py", line 5590, in fit
    return self._fit(X, y, cat_features, None, None, None, sample_weight, None, None, None, None, baseline,
  File "/opt/conda/lib/python3.9/site-packages/catboost/core.py", line 2262, in _fit
    train_params = self._prepare_train_params(
  File "/opt/conda/lib/python3.9/site-packages/catboost/core.py", line 2194, in _prepare_trai

In [10]:
    print(" Results from Grid Search " )
    print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
    print("\n The best score across ALL searched params:\n", grid.best_score_)
    print("\n The best parameters across ALL searched params:\n", grid.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 <catboost.core.CatBoostRegressor object at 0x7f62f7abd070>

 The best score across ALL searched params:
 0.9062573423693564

 The best parameters across ALL searched params:
 {'depth': 5, 'iterations': 1000, 'learning_rate': 0.025, 'logging_level': 'Silent'}
7:	learn: 51099.6870006	total: 1.59s	remaining: 18.3s
8:	learn: 48639.4946803	total: 1.81s	remaining: 18.3s
9:	learn: 46360.1761728	total: 1.89s	remaining: 17s
10:	learn: 44330.6691140	total: 2.05s	remaining: 16.6s
11:	learn: 42657.6929351	total: 2.3s	remaining: 16.9s
12:	learn: 40905.7242840	total: 2.43s	remaining: 16.3s
13:	learn: 39380.4424769	total: 2.75s	remaining: 16.9s
14:	learn: 37826.9809032	total: 3.04s	remaining: 17.2s
15:	learn: 36463.6268669	total: 3.49s	remaining: 18.3s
16:	learn: 35106.7314604	total: 3.8s	remaining: 18.6s
17:	learn: 34020.9323356	total: 4.22s	remaining: 19.2s
18:	learn: 33042.1688418	total: 4.36s	remaining: 18.6s
19:	learn: 