In [None]:
!pip install pycaret

In [None]:
pip install catboost

In [None]:
pip install optuna

In [4]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsRegressor
import scipy.stats
from sklearn.preprocessing import StandardScaler
from pycaret.regression import setup, compare_models
from sklearn.model_selection import KFold, cross_val_score

from catboost import CatBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

import optuna

In [5]:
train0 = pd.read_csv('train.csv')
test0 = pd.read_csv('test.csv')

In [6]:
target = train0['SalePrice']
test_ids = test0['Id']

train1 = train0.drop(['Id', 'SalePrice'], axis=1)
test1 = test0.drop('Id', axis=1)

data1 = pd.concat([train1, test1], axis=0).reset_index(drop=True)
data1

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RL,64.0,10475,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2010,WD,Normal
1,90,RL,,18890,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,Gar2,8300,8,2007,WD,Normal
2,80,RL,,21453,Pave,,IR1,Low,AllPub,CulDSac,...,0,0,,,,0,10,2006,WD,Normal
3,20,RL,60.0,9600,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2010,WD,Normal
4,30,RM,60.0,8967,Pave,,Reg,Lvl,AllPub,Corner,...,0,0,,,,0,11,2007,WD,Abnorml
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,60,RL,,14364,Pave,,IR1,Low,AllPub,Inside,...,154,0,,,,0,4,2007,WD,Normal
2926,50,RM,50.0,6125,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,3,2007,CWD,Normal
2927,20,RL,,8246,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,5,2010,WD,Normal
2928,20,RL,75.0,10650,Pave,,Reg,Lvl,AllPub,Corner,...,0,0,,MnPrv,,0,2,2010,WD,Normal


In [7]:
data2 = data1.copy()

In [8]:
data2['MSSubClass'] = data2['MSSubClass'].astype(str)

En el caso de las variables cualitativas donde haya valores nulos existen dos posibilidades: una ha sido rellenar los nulos con la palabra "None" y otra ha sido rellenarlos con la moda.

Lo que se hace es rellenar estos valores nulos con "None". Por ejemplo, si hay un nulo en la columna "PoolQC" significa que no hay piscina por tanto se imputa un "None". En estos casos, se entiende el valor de "NA" como un nulo pero realmente es el verdadero valor que tiene la variable.

Para otras columnas se ha decicido rellenar los valores nulos con la moda de cada una de las columnas, es decir, por el valor más frecuente.

In [9]:
# Impute using a constant value
for column in [
    'Alley',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
    'FireplaceQu',
    'GarageType',
    'GarageFinish',
    'GarageQual',
    'GarageCond',
    'PoolQC',
    'Fence',
    'MiscFeature'
]:
    data2[column] = data2[column].fillna("None")

# Impute using the column mode
for column in [
    'MSZoning',
    'Utilities',
    'Exterior1st',
    'Exterior2nd',
    'MasVnrType',
    'Electrical',
    'KitchenQual',
    'Functional',
    'SaleType'
]:
    data2[column] = data2[column].fillna(data2[column].mode()[0])

In [10]:
data3 = data2.copy()

En el caso de las variables numéricas se ha optado, debido que sobre lo que se está trabajando son casa y suelen compartir características, por rellenar los valores nulos con valores de observaciones cercanas mediante el algortimo de KNN, es decir, con las casas más parecidas.

In [11]:
def knn_impute(df, na_target):
    df = df.copy()

    numeric_df = df.select_dtypes(np.number)
    non_na_columns = numeric_df.loc[: ,numeric_df.isna().sum() == 0].columns

    y_train = numeric_df.loc[numeric_df[na_target].isna() == False, na_target]
    X_train = numeric_df.loc[numeric_df[na_target].isna() == False, non_na_columns]
    X_test = numeric_df.loc[numeric_df[na_target].isna() == True, non_na_columns]

    knn = KNeighborsRegressor()
    knn.fit(X_train, y_train)

    y_pred = knn.predict(X_test)

    df.loc[df[na_target].isna() == True, na_target] = y_pred

    return df

In [12]:
for column in [
    'LotFrontage',
    'MasVnrArea',
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'BsmtFullBath',
    'BsmtHalfBath',
    'GarageYrBlt',
    'GarageCars',
    'GarageArea'
]:
    data3 = knn_impute(data3, column)

In [13]:
data4 = data3.copy()

Se ha decidido crear algunas variables nuevas que puedan aportar información al modelo como combinación de otras, mediante sumas y multiplicaciones

In [14]:
data4["SqFtPerRoom"] = data4["GrLivArea"] / (data4["TotRmsAbvGrd"] + data4["FullBath"] + data4["HalfBath"] + data4["KitchenAbvGr"])

data4['Total_Home_Quality'] = data4['OverallQual'] + data4['OverallCond']

data4['Total_Bathrooms'] = (data4['FullBath'] + (0.5 * data4['HalfBath']) + data4['BsmtFullBath'] + (0.5 * data4['BsmtHalfBath']))

data4["HighQualSF"] = data4["1stFlrSF"] + data4["2ndFlrSF"]

In [15]:
data5 = data4.copy()

Para las variables numéricas, se comprueba si alguna de ellas está sesgada (skewed) y en ese caso se el aplica la función logarítmica para corregir esa distribución.

In [16]:
skew_df = pd.DataFrame(data5.select_dtypes(np.number).columns, columns=['Feature'])
skew_df['Skew'] = skew_df['Feature'].apply(lambda feature: scipy.stats.skew(data5[feature]))
skew_df['Absolute Skew'] = skew_df['Skew'].apply(abs)
skew_df['Skewed'] = skew_df['Absolute Skew'].apply(lambda x: True if x >= 0.5 else False)
skew_df

Unnamed: 0,Feature,Skew,Absolute Skew,Skewed
0,LotFrontage,1.333981,1.333981,True
1,LotArea,12.770448,12.770448,True
2,OverallQual,0.19152,0.19152,False
3,OverallCond,0.561348,0.561348,True
4,YearBuilt,-0.598164,0.598164,True
5,YearRemodAdd,-0.445806,0.445806,False
6,MasVnrArea,2.600227,2.600227,True
7,BsmtFinSF1,1.425643,1.425643,True
8,BsmtFinSF2,4.129423,4.129423,True
9,BsmtUnfSF,0.918231,0.918231,True


In [17]:
for column in skew_df.query("Skewed == True")['Feature'].values:
    data5[column] = np.log1p(data5[column])

Para los números de mes lo que se decide hacer es convertirlos en una función coseno ya que los meses de un año es algo cíclico, es decir, empiezan siempre en 1 y terminan en 12. De esta forma podemos conseguir que la distancia entre el mes 12 y el 1 sea la misma que entre el 11 y el 12, o entre el 1 y el 2.

Para conseguir el valor de "0.5326" lo que se ha hecho ha sido en la web de Geogebra probar cuál es el número "X" que permitía que el valor de cos(X) fuese -1, y se ha considerado poner el "-" delante simulando que el valor de la función coseno fuese la temperatura, de esta forma conseguimos que la "temperatura" en el mes 6 valiese 1 (calor) y que en el mes 12 fuese -1 (frío).

In [18]:
data5['MoSold'] = (-np.cos(0.5236 * data5['MoSold']))

In [19]:
data6 = data5.copy()

In [20]:
data6 = pd.get_dummies(data6)

In [21]:
data7 = data6.copy()

In [22]:
scaler = StandardScaler()
scaler.fit(data7)

data7 = pd.DataFrame(scaler.transform(data7), index=data7.index, columns=data7.columns)

In [23]:
data8 = data7.copy()

In [24]:
train_final = data8.loc[:train0.index.max(), :].copy()
test_final = data8.loc[train0.index.max() + 1:, :].reset_index(drop=True).copy()

In [25]:
setup(data=pd.concat([train_final, target], axis = 1), target='SalePrice')

Unnamed: 0,Description,Value
0,Session id,2943
1,Target,SalePrice
2,Target type,Regression
3,Original data shape,"(2302, 321)"
4,Transformed data shape,"(2302, 321)"
5,Transformed train set shape,"(1611, 321)"
6,Transformed test set shape,"(691, 321)"
7,Numeric features,320
8,Preprocess,True
9,Imputation type,simple


<pycaret.regression.oop.RegressionExperiment at 0x7ead16784b80>

In [26]:
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,21964.5091,1180087901.094,33875.056,0.7992,0.1932,0.1857,8.534
lightgbm,Light Gradient Boosting Machine,23215.699,1290328572.6504,35621.7323,0.7809,0.2034,0.1982,2.79
gbr,Gradient Boosting Regressor,22845.0869,1275404924.7545,35293.081,0.7806,0.2036,0.1964,1.335
rf,Random Forest Regressor,23605.1639,1309213714.8808,35820.5615,0.7764,0.2076,0.1984,4.667
en,Elastic Net,23116.7594,1357359974.4,36354.3926,0.772,0.2042,0.1819,0.341
br,Bayesian Ridge,23544.8461,1427139552.0,37284.5273,0.7596,0.2087,0.1789,0.166
xgboost,Extreme Gradient Boosting,24477.3986,1428969580.8,37456.7191,0.7569,0.214,0.2144,0.889
llar,Lasso Least Angle Regression,23493.3398,1446361676.8,37450.4199,0.7564,0.2105,0.168,0.158
et,Extra Trees Regressor,24358.1126,1481943835.0886,37867.6114,0.7477,0.2132,0.2106,4.943
omp,Orthogonal Matching Pursuit,23875.8352,1503418745.6,38233.3709,0.7447,0.2348,0.1786,0.106


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

<catboost.core.CatBoostRegressor at 0x7eace414b010>

Se escoge el modelo con el mejor rendimiento, es decir, el modelo con un menor valor de RMSE: CatBoost Regressor.

Se podrían haber cogido más modelos y probar cuál es el que finalmente ofrece mejores resultados, se ha decidido solo probar con el CatBoost

In [27]:
modelos = {
    "catboost":CatBoostRegressor(verbose=0)
}

In [28]:
for name, model in modelos.items():
    model.fit(train_final, target)
    print(name + " trained.")

catboost trained.


In [29]:
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK

Se construyen los Grids donde más adelante cada uno de los modelos iterará hasta encontrar la mejor combinación de hiperparámetros.

In [30]:
catboost_params = {
    'iterations': hp.randint('iterations',100,1000),
    'learning_rate': hp.uniform('learning_rate', 0.1, 0.5),
    'depth': hp.randint('depth',4,10),
    'l2_leaf_reg': hp.randint('l2_leaf_reg',1,10),
    'eval_metric':'RMSE',
    'early_stopping_rounds': 200,
    'random_seed': 42
}

In [31]:
models = {
    "catboost": CatBoostRegressor(**catboost_params, verbose=0)
}

In [32]:
SAMPLE_RATE = 0.4
RANDOM_SEED = 1
EARLY_STOPPING_ROUND = 100

In [33]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [34]:
X_train, X_valid, y_train, y_valid = train_test_split(train_final, target, test_size=0.2, random_state=RANDOM_SEED)

In [35]:
X_train, X_eval, y_train, y_eval = train_test_split(X_train, y_train, test_size=0.1, random_state=RANDOM_SEED)

In [36]:
def objective(trial):
    param = {}
    param['learning_rate'] = trial.suggest_discrete_uniform("learning_rate", 0.001, 0.02, 0.001)
    param['depth'] = trial.suggest_int('depth', 9, 15)
    param['l2_leaf_reg'] = trial.suggest_discrete_uniform('l2_leaf_reg', 1.0, 5.5, 0.5)
    param['min_child_samples'] = trial.suggest_categorical('min_child_samples', [1, 4, 8, 16, 32])
    param['grow_policy'] = 'Depthwise'
    param['iterations'] = 10000
    param['use_best_model'] = True
    param['eval_metric'] = 'RMSE'
    param['od_type'] = 'iter'
    param['od_wait'] = 20
    param['random_state'] = RANDOM_SEED
    param['logging_level'] = 'Silent'

    regressor = CatBoostRegressor(**param)

    regressor.fit(X_train.copy(), y_train.copy(),
                  eval_set=[(X_eval.copy(), y_eval.copy())],
                  early_stopping_rounds=EARLY_STOPPING_ROUND)
    loss = mean_squared_error(y_valid, regressor.predict(X_valid.copy()))
    return loss


In [37]:
%%time
study = optuna.create_study(study_name=f'catboost-seed{RANDOM_SEED}')
study.optimize(objective, n_trials=100, n_jobs=-1, timeout=24000)

[I 2024-04-25 08:27:46,283] A new study created in memory with name: catboost-seed1
[I 2024-04-25 08:28:31,596] Trial 1 finished with value: 1069578775.1892498 and parameters: {'learning_rate': 0.018000000000000002, 'depth': 9, 'l2_leaf_reg': 2.0, 'min_child_samples': 1}. Best is trial 1 with value: 1069578775.1892498.
[I 2024-04-25 08:29:03,217] Trial 2 finished with value: 1061374288.1601657 and parameters: {'learning_rate': 0.006, 'depth': 10, 'l2_leaf_reg': 2.5, 'min_child_samples': 32}. Best is trial 2 with value: 1061374288.1601657.
[I 2024-04-25 08:36:52,250] Trial 3 finished with value: 1048700588.9287137 and parameters: {'learning_rate': 0.003, 'depth': 14, 'l2_leaf_reg': 3.0, 'min_child_samples': 4}. Best is trial 3 with value: 1048700588.9287137.
[I 2024-04-25 08:38:29,167] Trial 4 finished with value: 1077231620.9332566 and parameters: {'learning_rate': 0.008, 'depth': 13, 'l2_leaf_reg': 3.0, 'min_child_samples': 8}. Best is trial 3 with value: 1048700588.9287137.
[I 2024-0

CPU times: user 3h 38min 36s, sys: 5min 39s, total: 3h 44min 15s
Wall time: 2h 7min 21s


In [39]:
optimized_regressor = CatBoostRegressor(learning_rate=0.012,
                                        depth=11,
                                        l2_leaf_reg=5.5,
                                        min_child_samples=1,
                                        grow_policy='Depthwise',
                                        iterations=10000,
                                        use_best_model=True,
                                        eval_metric='RMSE',
                                        od_type='iter',
                                        od_wait=20,
                                        random_state=RANDOM_SEED,
                                        logging_level='Silent')
optimized_regressor.fit(X_train.copy(), y_train.copy(),
                        eval_set=[(X_eval.copy(), y_eval.copy())],
                        early_stopping_rounds=EARLY_STOPPING_ROUND)
pred_train = optimized_regressor.predict(X_train.copy())
pred_valid = optimized_regressor.predict(X_valid.copy())

In [40]:
sample_sub = pd.read_csv('sample_submission.csv')

In [41]:
sample_sub['target'] = optimized_regressor.predict(test_final)

In [42]:
sample_sub = sample_sub.drop('SalePrice', axis=1)

In [43]:
sample_sub.rename(columns={'target': 'SalePrice'}, inplace=True)

In [46]:
sample_sub.set_index('ID', inplace=True)

In [47]:
sample_sub

Unnamed: 0_level_0,SalePrice
ID,Unnamed: 1_level_1
2303,177731.137679
2304,143634.795195
2305,411025.173892
2306,305680.326638
2307,110996.348050
...,...
2926,254618.064020
2927,131220.310063
2928,148977.213454
2929,137109.251610


In [48]:
sample_sub.to_csv('late_submission.csv')