# House Prices: Advanced Regression Techniques

Predict sales prices and practice feature engineering, RFs, and gradient boosting

<img src="housesbanner.png"  align="left" style="width: 600px;"/>

<span style="font-family: Courier New; background-color: #ffcc5c; color: #000000; padding: 3px; ">Performed by:</span> Nicolás D'Alessandro

___

## 00 - Load the required libraries

In [1]:
import warnings
# Install a blanket 'ignore' filter so warnings raised from here on are not shown.
warnings.filterwarnings('ignore')

In [10]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import category_encoders
from category_encoders.target_encoder import TargetEncoder
from category_encoders.one_hot import OneHotEncoder
from category_encoders.ordinal import OrdinalEncoder
from catboost import CatBoostRegressor
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
# Kept after the category_encoders import of the same name, as in the original
# cell: the scikit-learn class is the one left bound to `OneHotEncoder`.
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

# Presumably project-local helper modules; kept last.
import kbasics as k
import TypeSelector as t

_____

## 00 - Define the required helper functions

In [3]:
def MAE_dataset_lm(X_train, X_valid, y_train, y_valid):
    '''
    Fit a plain linear regression and score it on the validation split.

    Args:
        X_train: Training features passed to ``LinearRegression.fit``.
        X_valid: Validation features the fitted model predicts on.
        y_train: Training target.
        y_valid: Validation target the predictions are compared against.

    Returns:
        The mean absolute error between ``y_valid`` and the predictions.
    '''
    regressor = linear_model.LinearRegression()
    regressor.fit(X_train, y_train)
    validation_preds = regressor.predict(X_valid)

    # Only the bold label is printed; the score itself is the return value.
    banner = '\033[1m' + 'MAE from Imputation:' + '\033[0m'
    print(banner)

    return mean_absolute_error(y_true=y_valid, y_pred=validation_preds)


def MAE_dataset_rf(X_train, X_valid, y_train, y_valid):
    '''
    Fit a random-forest regressor and score it on the validation split.

    The forest uses 100 trees and a fixed ``random_state`` of 0.

    Args:
        X_train: Training features.
        X_valid: Validation features the fitted forest predicts on.
        y_train: Training target.
        y_valid: Validation target the predictions are compared against.

    Returns:
        The mean absolute error between ``y_valid`` and the predictions.
    '''
    forest = RandomForestRegressor(random_state=0, n_estimators=100)
    forest.fit(X_train, y_train)
    validation_preds = forest.predict(X_valid)

    # Only the bold label is printed; the score itself is the return value.
    banner = '\033[1m' + 'MAE from Imputation:' + '\033[0m'
    print(banner)

    score = mean_absolute_error(y_valid, validation_preds)
    return score

def MAE_dataset_XGB(X_train, X_valid, y_train, y_valid):
    '''
    Fit an XGBoost regressor and score it on the validation split.

    The model uses 1000 boosting rounds with a learning rate of 0.05.

    Args:
        X_train: Training features.
        X_valid: Validation features the fitted model predicts on.
        y_train: Training target.
        y_valid: Validation target the predictions are compared against.

    Returns:
        The mean absolute error between ``y_valid`` and the predictions.
    '''
    # `verbosity=0` replaces the deprecated `silent=True`: recent XGBoost
    # releases no longer honour `silent` as the quiet switch.
    XGB = XGBRegressor(n_estimators=1000, learning_rate=0.05, verbosity=0)
    XGB.fit(X_train, y_train)
    preds_XGB = XGB.predict(X_valid)

    # Only the bold label is printed; the score itself is the return value.
    print('\033[1m' + 'MAE from Imputation:' + '\033[0m')

    return mean_absolute_error(y_valid, preds_XGB)

def MAE_dataset_cbr(X_train, X_valid, y_train, y_valid):
    '''
    Fit a CatBoost regressor and score it on the validation split.

    The model is built with ``verbose=False`` and otherwise default settings.

    Args:
        X_train: Training features.
        X_valid: Validation features the fitted model predicts on.
        y_train: Training target.
        y_valid: Validation target the predictions are compared against.

    Returns:
        The mean absolute error between ``y_valid`` and the predictions.
    '''
    booster = CatBoostRegressor(verbose=False)
    booster.fit(X_train, y_train)
    validation_preds = booster.predict(X_valid)

    # Only the bold label is printed; the score itself is the return value.
    banner = '\033[1m' + 'MAE from Imputation:' + '\033[0m'
    print(banner)

    return mean_absolute_error(y_true=y_valid, y_pred=validation_preds)

## 01 - Fetch dataset

In [4]:
# Read 'full.csv' into a single DataFrame; it is split into train and test
# rows on its `is_train` column in the next cell.
df = pd.read_csv('full.csv')

In [5]:
# Rows flagged is_train == 1 form the training frame, rows flagged 0 the test frame.
train = df[df['is_train'].eq(1)]
test = df[df['is_train'].eq(0)]

## 02 - Train / validation split

In [6]:
# Hold out part of the labelled rows for validation; `saleprice` is the target
# and every other column is kept as a feature. The fixed seed makes the split
# reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    train.drop(columns='saleprice'),
    train['saleprice'],
    random_state=0,
)

In [7]:
# (rows, columns) of the training features.
X_train.shape

(1095, 76)

In [8]:
# (rows, columns) of the validation features.
X_valid.shape

(365, 76)

## 03 - Create & Evaluate Models

In [17]:
# Column groups handed to the encoders built in the next cell.
# NOTE(review): 'street' appears in both lists, so it is passed to more than
# one encoder in sequence -- confirm the later encoder still finds the column
# after an earlier one has transformed it.
cols_ohe = ['street','utilities']
cols_te = ['street','fence']

In [24]:
# Categorical encoders, each restricted to its own column list.
ohe = t.Encodings(encoding_type="OneHotEncoder", columns=cols_ohe)
cbe = t.Encodings(encoding_type="CatBoostEncoder", columns=cols_te)
# NOTE(review): weight-of-evidence encoding normally expects a binary target,
# while `saleprice` is continuous -- confirm `t.Encodings` supports this.
woe = t.Encodings(encoding_type="WOEEncoder", columns=cols_ohe)

clf = Lasso()
# NOTE(review): `scaler` is not referenced by any pipeline step in this cell.
scaler = StandardScaler()

# The three encoders run one after another as a single pipeline stage.
encodings = Pipeline(steps=[
    ('ohe', ohe),
    ('cbe', cbe),
    ('woe', woe),
])

# Encode, keep only the numeric columns, then fit the Lasso model.
pipe_clf = Pipeline(steps=[
    ('encodings', encodings),
    ('selector', t.TypeSelector(np.number)),
    ('clf', clf),
])

# 20 evenly spaced Lasso alphas between 0.0001 and 2.
pipeline_grid = {"clf__alpha": np.linspace(0.0001, 2, num=20)}

In [22]:
# 3-fold grid search over `pipeline_grid`, using all available cores.
pipe_cv = GridSearchCV(
    estimator=pipe_clf,
    param_grid=pipeline_grid,
    cv=3,
    n_jobs=-1,
)

In [25]:
# Run the grid search. The following cells read `cv_results_` and
# `best_estimator_`, which only exist once the search has been fitted.
pipe_cv.fit(X_train, y_train)

In [None]:
# Show the grid-search results (`cv_results_`) as a table.
pd.DataFrame(pipe_cv.cv_results_)

In [None]:
# GridSearchCV keeps its default refit=True here, so `best_estimator_` has
# already been refitted on the full training data passed to `pipe_cv.fit`;
# fitting it a second time would only repeat that work.
best_model = pipe_cv.best_estimator_

In [None]:
# Validation-set predictions from the selected pipeline.
# NOTE(review): despite the `cbr` suffix, `best_model` comes from the Lasso
# grid search, not from a CatBoost model.
preds_cbr = best_model.predict(X_valid)

In [None]:
# Arguments follow scikit-learn's (y_true, y_pred) order. RMSE is taken as the
# square root of MSE instead of passing `squared=`, which newer scikit-learn
# releases no longer accept.
MSE = mean_squared_error(y_valid, preds_cbr)
RMSE = np.sqrt(MSE)

In [None]:
# Display the validation mean squared error.
MSE 

In [None]:
# Display the validation root mean squared error.
RMSE

In [None]:
# Root mean squared error between the logs of the actual and predicted prices.
# The square root is taken explicitly because newer scikit-learn releases no
# longer accept `squared=`.
# NOTE(review): np.log returns NaN for non-positive predictions -- confirm the
# model cannot produce them on this data.
np.sqrt(mean_squared_error(np.log(y_valid), np.log(preds_cbr)))

## 04 - Predictions

In [None]:
# Read 'submission.csv'; its `saleprice` column is overwritten with the model's
# predictions in the next cell.
sample = pd.read_csv('submission.csv')

In [None]:
# Predict the test rows with the fitted `best_model`. The name `cbr` is only a
# local variable inside MAE_dataset_cbr and is not defined at notebook level.
# The target column is dropped so the features match those used for training.
sample['saleprice'] = best_model.predict(test.drop(columns='saleprice'))

In [None]:
# Write the submission file without the DataFrame index.
sample.to_csv('30Jansubm_2.csv',index=False)

___