# House Prices: Advanced Regression Techniques

Predict sales prices and practice feature engineering, RFs, and gradient boosting

<img src="housesbanner.png"  align="left" style="width: 600px;"/>

<span style="font-family: Courier New; background-color: #ffcc5c; color: #000000; padding: 3px; ">Performed by:</span> Nicolás D'Alessandro

___

## 00 - Load the required libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import reutils as k

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
import category_encoders
from category_encoders.target_encoder import TargetEncoder
from category_encoders.one_hot import OneHotEncoder
from category_encoders.ordinal import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from category_encoders.cat_boost import CatBoostEncoder
from catboost import CatBoostRegressor

_____

## 00 - Create the required libraries

In [24]:
def MAE_dataset_lm(X_train, X_valid, y_train, y_valid):
    '''
            This function displays the MAE for the given datasets:
        >>> Parameter 1: [DataFrame] X_train.
        >>> Parameter 2: [DataFrame] X_valid.
        >>> Parameter 3: [DataFrame] y_train.
        >>> Parameter 4: [DataFrame] y_valid.
        ------------------------------------------------
        <<< Return: MAE on top of the given predictions. -
        '''
    
    lm = linear_model.LinearRegression()
    lm.fit(X_train, y_train)
    preds_lm = lm.predict(X_valid)
    
    print('\033[1m' + 'MAE from Imputation:'+ '\033[0m')
    
    return mean_absolute_error(y_valid, preds_lm)


def MAE_dataset_rf(X_train, X_valid, y_train, y_valid):
    '''
            This function displays the MAE for the given datasets:
        >>> Parameter 1: [DataFrame] X_train.
        >>> Parameter 2: [DataFrame] X_valid.
        >>> Parameter 3: [DataFrame] y_train.
        >>> Parameter 4: [DataFrame] y_valid.
        ------------------------------------------------
        <<< Return: MAE on top of the given predictions. -
        '''
    rf = RandomForestRegressor(n_estimators=100, random_state=0)
    rf.fit(X_train, y_train)
    preds_rf = rf.predict(X_valid)
    
    print('\033[1m' + 'MAE from Imputation:'+ '\033[0m')
    
    return mean_absolute_error(y_valid, preds_rf)

def MAE_dataset_XGB(X_train, X_valid, y_train, y_valid):
    '''
            This function displays the MAE for the given datasets:
        >>> Parameter 1: [DataFrame] X_train
        >>> Parameter 2: [DataFrame] X_valid.
        >>> Parameter 3: [DataFrame] y_train.
        >>> Parameter 4: [DataFrame] y_valid.
        ------------------------------------------------
        <<< Return: MAE on top of the given predictions. -
    '''
    XGB = XGBRegressor(n_estimators=1000, learning_rate=0.05, silent=True)
    XGB.fit(X_train, y_train)
    preds_XGB = XGB.predict(X_valid)
    
    print('\033[1m' + 'MAE from Imputation:'+ '\033[0m')
    
    return mean_absolute_error(y_valid, preds_XGB)

def MAE_dataset_cbr(X_train, X_valid, y_train, y_valid):
    '''
            This function displays the MAE for the given datasets:
        >>> Parameter 1: [DataFrame] X_train
        >>> Parameter 2: [DataFrame] X_valid.
        >>> Parameter 3: [DataFrame] y_train.
        >>> Parameter 4: [DataFrame] y_valid.
        ------------------------------------------------
        <<< Return: MAE on top of the given predictions. -
    '''
    cbr = CatBoostRegressor(verbose=False)
    cbr.fit(X_train, y_train)
    preds_cbr = cbr.predict(X_valid)
    
    print('\033[1m' + 'MAE from Imputation:'+ '\033[0m')
    
    return mean_absolute_error(y_valid, preds_cbr)

## 01 - Fetch dataset

In [25]:
df = pd.read_csv('full.csv')

In [26]:
train = df[df['is_train']==1]
test = df[df['is_train']==0]

## 02 - Train test split

In [61]:
X_train, X_test, y_train, y_test = train_test_split(train.drop(columns='saleprice'),
                                                      train.saleprice,random_state=0)

In [62]:
X_train.shape

(1095, 76)

In [63]:
X_test.shape

(365, 76)

## 03 - Create & Evaluate Models

### 03.1 - Lasso

In [92]:
clf = Lasso()

In [96]:
clf.get_params()

{'alpha': 1.0,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': 1000,
 'normalize': False,
 'positive': False,
 'precompute': False,
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'warm_start': False}

In [97]:
clf = Lasso()

cbe = CatBoostEncoder()
ohe = OneHotEncoder()


encodings = Pipeline([('ohe',ohe),
                      ('cbe',cbe)
                     ])

models = Pipeline([('clf', clf)
                  ])

pipe_clf = Pipeline([('encodings', encodings),
                     ('models', models) 
                    ])

pipeline_grid = {
    "clf__alpha":np.linspace(0.0001,1,20),
        }

In [100]:
np.linspace(0.0001,1,20)

array([1.00000000e-04, 5.27263158e-02, 1.05352632e-01, 1.57978947e-01,
       2.10605263e-01, 2.63231579e-01, 3.15857895e-01, 3.68484211e-01,
       4.21110526e-01, 4.73736842e-01, 5.26363158e-01, 5.78989474e-01,
       6.31615789e-01, 6.84242105e-01, 7.36868421e-01, 7.89494737e-01,
       8.42121053e-01, 8.94747368e-01, 9.47373684e-01, 1.00000000e+00])

In [98]:
pipe_cv = GridSearchCV(pipe_cbr, param_grid=pipeline_grid, cv=3)

In [99]:
pipe_cv.fit(X_train, y_train)

ValueError: Invalid parameter clf for estimator Pipeline(memory=None,
         steps=[('encodings',
                 Pipeline(memory=None,
                          steps=[('ohe',
                                  OneHotEncoder(categorical_features=None,
                                                categories=None, drop=None,
                                                dtype=<class 'numpy.float64'>,
                                                handle_unknown='error',
                                                n_values=None, sparse=True))],
                          verbose=False)),
                ('models',
                 Pipeline(memory=None,
                          steps=[('cbr',
                                  <catboost.core.CatBoostRegressor object at 0x1345f1f90>)],
                          verbose=False))],
         verbose=False). Check the list of available parameters with `estimator.get_params().keys()`.

In [90]:
pd.DataFrame(pipe_cv.cv_results_)

AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'

In [91]:
best_model = pipe_cv.best_estimator_.fit(X_train, y_train)

AttributeError: 'GridSearchCV' object has no attribute 'best_estimator_'

In [None]:
preds_cbr = best_model.predict(X_valid)

### 03.2 - Catboost regressor

In [71]:
cbr = CatBoostRegressor(verbose=False)

#cbe = CatBoostEncoder()
ohe = OneHotEncoder()


encodings = Pipeline([('ohe',ohe)#,
                      #('cbe',cbe)
                     ])

models = Pipeline([('cbr', cbr)
                  ])

pipe_cbr = Pipeline([('encodings', encodings),
                     ('models', models) 
                    ])

In [72]:
pipe_cbr.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('encodings',
                 Pipeline(memory=None,
                          steps=[('ohe',
                                  OneHotEncoder(categorical_features=None,
                                                categories=None, drop=None,
                                                dtype=<class 'numpy.float64'>,
                                                handle_unknown='error',
                                                n_values=None, sparse=True))],
                          verbose=False)),
                ('models',
                 Pipeline(memory=None,
                          steps=[('cbr',
                                  <catboost.core.CatBoostRegressor object at 0x12fc5d950>)],
                          verbose=False))],
         verbose=False)

In [73]:
pipe_cbr.score(X_test, y_test)

ValueError: Found unknown categories [495, 525, 526, 551, 575, 576, 581, 661, 694, 696, 702, 703, 713, 753, 755, 760, 770, 778, 779, 783, 793, 812, 825, 842, 846, 861, 865, 886, 897, 914, 915, 920, 929, 930, 951, 998, 1003, 1029, 1051, 1061, 1063, 1099, 1131, 1138, 1152, 1156, 1217, 1228, 1240, 1241, 1251, 1272, 1334, 1349, 1357, 1375, 1402, 1411, 1419, 1425, 1426, 1429, 1432, 1444, 1454, 1464, 1465, 1472, 1478, 1490, 1521, 1523, 1525, 1548, 1553, 1561, 1569, 1571, 1574, 1586, 1625, 1640, 1659, 1679, 1690, 1698, 1702, 1704, 1719, 1752, 1787, 1792, 1828, 1836, 1838, 1842, 1888, 1980, 2046, 2076, 2084, 2121, 2136, 2158, 2259, 2392, 2402, 2411, 2515, 4692] in column 0 during transform

In [74]:
MSE = mean_squared_error(preds_cbr, y_valid, squared=True)
RMSE = mean_squared_error(preds_cbr, y_valid, squared=False)

NameError: name 'mean_squared_error' is not defined

In [75]:
mean_squared_error(np.log(preds_cbr), np.log(y_valid), squared=False)

NameError: name 'mean_squared_error' is not defined

# Predictions

In [None]:
sample = pd.read_csv('submission.csv')

In [None]:
sample['saleprice'] = cbr.predict(test)

In [None]:
sample.to_csv('30Jansubm_2.csv',index=False)

___