## Libraries

In [1]:
import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("max_colwidth", None)
import numpy as np
import math
import warnings
warnings.filterwarnings("ignore")
from dython.nominal import associations 

# Scikit Learn 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.kernel_ridge import KernelRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

## Text Parser

In [2]:
# Parse data_description.txt into {feature name: [documented category codes]}.
# A feature with no documented codes keeps its [np.nan] placeholder.
with open('data_description.txt', 'r') as text_file:

    # 'Id' and 'SalePrice' are seeded by hand so they come first in the dict
    text_features_dict = {
        'Id': [np.nan],
        'SalePrice': [np.nan]
    }

    for line in text_file:
        if ':' in line and line[0] != ' ':
            # Header line ("<feature>: <description>") opens a new feature entry
            text_column = line.partition(':')[0]
            # These two headers are completed to 'BedroomAbvGr' / 'KitchenAbvGr'
            if text_column in ('Bedroom', 'Kitchen'):
                text_column += 'AbvGr'
            text_features_dict[text_column] = [np.nan]
        elif line.strip() != '':
            # Value line: the code is everything before the first tab.
            # split() returns the whole line when no tab is present; slicing
            # with find()'s -1 result would drop the last character instead.
            text_value = line.split('\t', 1)[0].strip()
            text_features_dict[text_column].append(text_value)
            # Once a real code has been recorded the placeholder is dropped
            if np.nan in text_features_dict[text_column]:
                text_features_dict[text_column].remove(np.nan)

del text_file, line, text_column, text_value

In [3]:
text_features_dict

{'Id': [nan],
 'SalePrice': [nan],
 'MSSubClass': ['20',
  '30',
  '40',
  '45',
  '50',
  '60',
  '70',
  '75',
  '80',
  '85',
  '90',
  '120',
  '150',
  '160',
  '180',
  '190'],
 'MSZoning': ['A', 'C', 'FV', 'I', 'RH', 'RL', 'RP', 'RM'],
 'LotFrontage': [nan],
 'LotArea': [nan],
 'Street': ['Grvl', 'Pave'],
 'Alley': ['Grvl', 'Pave', 'NA'],
 'LotShape': ['Reg', 'IR1', 'IR2', 'IR3'],
 'LandContour': ['Lvl', 'Bnk', 'HLS', 'Low'],
 'Utilities': ['AllPub', 'NoSewr', 'NoSeWa', 'ELO'],
 'LotConfig': ['Inside', 'Corner', 'CulDSac', 'FR2', 'FR3'],
 'LandSlope': ['Gtl', 'Mod', 'Sev'],
 'Neighborhood': ['Blmngtn',
  'Blueste',
  'BrDale',
  'BrkSide',
  'ClearCr',
  'CollgCr',
  'Crawfor',
  'Edwards',
  'Gilbert',
  'IDOTRR',
  'MeadowV',
  'Mitchel',
  'Names',
  'NoRidge',
  'NPkVill',
  'NridgHt',
  'NWAmes',
  'OldTown',
  'SWISU',
  'Sawyer',
  'SawyerW',
  'Somerst',
  'StoneBr',
  'Timber',
  'Veenker'],
 'Condition1': ['Artery',
  'Feedr',
  'Norm',
  'RRNn',
  'RRAn',
  'PosN',
  

## Reading the datasets

In [4]:
# Load the three CSV files from the working directory in one pass
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


## Creating feature lists

In [5]:
# Sort every parsed feature into exactly one of three buckets:
#   numerical      -> the value list still holds the np.nan placeholder
#   special cases  -> the documented codes include the literal 'NA'
#   categorical    -> everything else
numerical_features_lst, categorical_features_lst, special_cases_features_lst = [], [], []

for key, value in text_features_dict.items():
    if np.nan in value:
        bucket = numerical_features_lst
    elif 'NA' in value:
        bucket = special_cases_features_lst
    else:
        bucket = categorical_features_lst
    bucket.append(key)

del key, value, bucket
len(numerical_features_lst), len(categorical_features_lst), len(special_cases_features_lst)

(35, 32, 14)

## Handling missing values

In [6]:
def missing_values(train=None, test=None):
    """Summarise missing values per column for a train/test pair.

    Parameters
    ----------
    train, test : pandas.DataFrame, optional
        Frames to inspect. When omitted, the notebook-level ``train`` / ``test``
        variables are looked up at call time, so the summary reflects their
        current binding rather than whatever they pointed to when this
        function was defined.

    Returns
    -------
    pandas.DataFrame
        One row per column with the null counts ('Train', 'Test') and the
        null fraction of each frame rounded to two decimals
        ('% Train', '% Test').
    """
    # Resolve defaults lazily: the parameters shadow the globals of the same name
    if train is None:
        train = globals()['train']
    if test is None:
        test = globals()['test']

    train_null = train.isnull().sum()
    test_null = test.isnull().sum()
    return pd.DataFrame({
        'Train': train_null,
        'Test': test_null,
        '% Train': (train_null / len(train)).round(2),
        '% Test': (test_null / len(test)).round(2)
    })

In [7]:
missing_values(train[special_cases_features_lst], test[special_cases_features_lst]).query("Train > 0 or Test > 0")

Unnamed: 0,Train,Test,% Train,% Test
Alley,1369,1352,0.94,0.93
BsmtQual,37,44,0.03,0.03
BsmtCond,37,45,0.03,0.03
BsmtExposure,38,44,0.03,0.03
BsmtFinType1,37,42,0.03,0.03
BsmtFinType2,38,42,0.03,0.03
FireplaceQu,690,730,0.47,0.5
GarageType,81,76,0.06,0.05
GarageFinish,81,78,0.06,0.05
GarageQual,81,78,0.06,0.05


In [8]:
# For the "special case" features a missing entry is recorded as the literal
# 'NA' category; show the distinct values before and after for both datasets.
for column in special_cases_features_lst:
    print(f'\n******* {column} *******')
    for split_label, frame in (('--- TRAIN ---', train), ('--- TEST ---', test)):
        print(split_label)
        print('Before:', frame[column].unique(), end='\n')
        frame[column] = frame[column].fillna('NA')
        print('After:', frame[column].unique(), end='\n\n')

del column, split_label, frame


******* Alley *******
--- TRAIN ---
Before: [nan 'Grvl' 'Pave']
After: ['NA' 'Grvl' 'Pave']

--- TEST ---
Before: [nan 'Pave' 'Grvl']
After: ['NA' 'Pave' 'Grvl']


******* BsmtQual *******
--- TRAIN ---
Before: ['Gd' 'TA' 'Ex' nan 'Fa']
After: ['Gd' 'TA' 'Ex' 'NA' 'Fa']

--- TEST ---
Before: ['TA' 'Gd' 'Ex' 'Fa' nan]
After: ['TA' 'Gd' 'Ex' 'Fa' 'NA']


******* BsmtCond *******
--- TRAIN ---
Before: ['TA' 'Gd' nan 'Fa' 'Po']
After: ['TA' 'Gd' 'NA' 'Fa' 'Po']

--- TEST ---
Before: ['TA' 'Po' 'Fa' 'Gd' nan]
After: ['TA' 'Po' 'Fa' 'Gd' 'NA']


******* BsmtExposure *******
--- TRAIN ---
Before: ['No' 'Gd' 'Mn' 'Av' nan]
After: ['No' 'Gd' 'Mn' 'Av' 'NA']

--- TEST ---
Before: ['No' 'Gd' 'Mn' 'Av' nan]
After: ['No' 'Gd' 'Mn' 'Av' 'NA']


******* BsmtFinType1 *******
--- TRAIN ---
Before: ['GLQ' 'ALQ' 'Unf' 'Rec' 'BLQ' nan 'LwQ']
After: ['GLQ' 'ALQ' 'Unf' 'Rec' 'BLQ' 'NA' 'LwQ']

--- TEST ---
Before: ['Rec' 'ALQ' 'GLQ' 'Unf' 'BLQ' 'LwQ' nan]
After: ['Rec' 'ALQ' 'GLQ' 'Unf' 'BLQ' 'LwQ' 'NA']



In [9]:
missing_values(train[special_cases_features_lst], test[special_cases_features_lst]).query("Train > 0 or Test > 0")

Unnamed: 0,Train,Test,% Train,% Test


In [10]:
missing_values(train[categorical_features_lst], test[categorical_features_lst]).query("Train > 0 or Test > 0")

Unnamed: 0,Train,Test,% Train,% Test
MSZoning,0,4,0.0,0.0
Utilities,0,2,0.0,0.0
Exterior1st,0,1,0.0,0.0
Exterior2nd,0,1,0.0,0.0
MasVnrType,8,16,0.01,0.01
Electrical,1,0,0.0,0.0
KitchenQual,0,1,0.0,0.0
Functional,0,2,0.0,0.0
SaleType,0,1,0.0,0.0


In [11]:
# Fill the remaining categorical gaps with the most frequent value of the same
# column, computed separately inside each dataset.
for header, frame in (('******* TRAIN *******', train), ('\n******* TEST *******', test)):
    print(header)
    for column in categorical_features_lst:
        # isna() detects NaN for every dtype; a membership test such as
        # `np.nan in list(values)` depends on object identity and misses
        # NaN stored in numeric columns.
        if frame[column].isna().any():
            print(f'\n{column}:')
            print('Before:', frame[column].unique(), end='\n')
            frame[column] = frame[column].fillna(frame[column].mode()[0])
            print('After:', frame[column].unique(), end='\n')

del column, header, frame

******* TRAIN *******

MasVnrType:
Before: ['BrkFace' 'None' 'Stone' 'BrkCmn' nan]
After: ['BrkFace' 'None' 'Stone' 'BrkCmn']

Electrical:
Before: ['SBrkr' 'FuseF' 'FuseA' 'FuseP' 'Mix' nan]
After: ['SBrkr' 'FuseF' 'FuseA' 'FuseP' 'Mix']

******* TEST *******

MSZoning:
Before: ['RH' 'RL' 'RM' 'FV' 'C (all)' nan]
After: ['RH' 'RL' 'RM' 'FV' 'C (all)']

Utilities:
Before: ['AllPub' nan]
After: ['AllPub']

Exterior1st:
Before: ['VinylSd' 'Wd Sdng' 'HdBoard' 'Plywood' 'MetalSd' 'CemntBd' 'WdShing'
 'BrkFace' 'AsbShng' 'BrkComm' 'Stucco' 'AsphShn' nan 'CBlock']
After: ['VinylSd' 'Wd Sdng' 'HdBoard' 'Plywood' 'MetalSd' 'CemntBd' 'WdShing'
 'BrkFace' 'AsbShng' 'BrkComm' 'Stucco' 'AsphShn' 'CBlock']

Exterior2nd:
Before: ['VinylSd' 'Wd Sdng' 'HdBoard' 'Plywood' 'MetalSd' 'Brk Cmn' 'CmentBd'
 'ImStucc' 'Wd Shng' 'AsbShng' 'Stucco' 'CBlock' 'BrkFace' 'AsphShn' nan
 'Stone']
After: ['VinylSd' 'Wd Sdng' 'HdBoard' 'Plywood' 'MetalSd' 'Brk Cmn' 'CmentBd'
 'ImStucc' 'Wd Shng' 'AsbShng' 'Stucco' 'CBl

In [12]:
# Any column containing the literal string 'None' gets it rewritten as
# 'No <column name>'.
for banner, frame in (("\n********* TRAIN *********", train),
                      ("\n********* TEST  *********", test)):
    print(banner)
    for column in frame.columns:
        if any(entry == 'None' for entry in frame[column].unique()):
            print(f'\n{column} has None values')
            frame[column] = frame[column].mask(frame[column] == 'None', f'No {column}')
            print('REPLACED:', frame[column].unique())

del column, banner, frame


********* TRAIN *********

MasVnrType has None values
REPLACED: ['BrkFace' 'No MasVnrType' 'Stone' 'BrkCmn']

********* TEST  *********

MasVnrType has None values
REPLACED: ['No MasVnrType' 'BrkFace' 'Stone' 'BrkCmn']


In [13]:
missing_values(train[categorical_features_lst], test[categorical_features_lst]).query("Train > 0 or Test > 0")

Unnamed: 0,Train,Test,% Train,% Test


In [14]:
missing_values(train[numerical_features_lst[2:]], test[numerical_features_lst[2:]]).query("Train > 0 or Test > 0")

Unnamed: 0,Train,Test,% Train,% Test
LotFrontage,259,227,0.18,0.16
MasVnrArea,8,15,0.01,0.01
BsmtFinSF1,0,1,0.0,0.0
BsmtFinSF2,0,1,0.0,0.0
BsmtUnfSF,0,1,0.0,0.0
TotalBsmtSF,0,1,0.0,0.0
BsmtFullBath,0,2,0.0,0.0
BsmtHalfBath,0,2,0.0,0.0
GarageYrBlt,81,78,0.06,0.05
GarageCars,0,1,0.0,0.0


In [15]:
def compare_columns(dataset, missing_column, compare_column):
    """Return the rows where `missing_column` is null, restricted to
    `missing_column` and `compare_column` (in that order)."""
    rows_with_gap = dataset[missing_column].isna()
    return dataset.loc[rows_with_gap, [missing_column, compare_column]]

In [16]:
compare_columns(train, 'LotFrontage', 'Street')

Unnamed: 0,LotFrontage,Street
7,,Pave
12,,Pave
14,,Pave
16,,Pave
24,,Pave
31,,Pave
42,,Pave
43,,Pave
50,,Pave
64,,Pave


In [17]:
# Compare the overall LotFrontage mean with the mean over paved-street rows.
# Only the numeric column is selected before averaging: taking .mean() of a
# frame that still contains the string column 'Street' raises on pandas >= 2.0.
print('\n*** TRAIN mean ***')
print('LotFrontage:', float(train['LotFrontage'].mean()))
print('LotFrontage (Pave Street):', float(train.loc[train['Street'] == 'Pave', 'LotFrontage'].mean()))
print('\n*** TEST mean ***')
print('LotFrontage:', float(test['LotFrontage'].mean()))
print('LotFrontage (Pave Street):', float(test.loc[test['Street'] == 'Pave', 'LotFrontage'].mean()))


*** TRAIN mean ***
LotFrontage: 70.04995836802665
LotFrontage (Pave Street): 69.98578595317726

*** TEST mean ***
LotFrontage: 68.58035714285714
LotFrontage (Pave Street): 68.48899755501222


In [18]:
# Fill LotFrontage gaps with the column mean rounded down to a whole number.
# Each dataset is filled from its own mean.
for lot_frame in (train, test):
    floored_mean = float(math.floor(lot_frame['LotFrontage'].mean()))
    lot_frame['LotFrontage'] = lot_frame['LotFrontage'].fillna(floored_mean)

del lot_frame, floored_mean

In [19]:
compare_columns(train, 'MasVnrArea', 'MasVnrType')

Unnamed: 0,MasVnrArea,MasVnrType
234,,No MasVnrType
529,,No MasVnrType
650,,No MasVnrType
936,,No MasVnrType
973,,No MasVnrType
977,,No MasVnrType
1243,,No MasVnrType
1278,,No MasVnrType


In [20]:
# A missing veneer area is recorded as 0.0 in both datasets
train.loc[train['MasVnrArea'].isna(), 'MasVnrArea'] = 0.0
test.loc[test['MasVnrArea'].isna(), 'MasVnrArea'] = 0.0

In [21]:
compare_columns(train, 'GarageYrBlt', 'GarageCond')

Unnamed: 0,GarageYrBlt,GarageCond
39,,
48,,
78,,
88,,
89,,
99,,
108,,
125,,
127,,
140,,


In [22]:
# Columns whose name mentions a year ("year" or "yr", case-insensitive)
time_columns = [
    name for name in train.columns
    if any(token in name.lower() for token in ("year", "yr"))
]

print(f"Missing data in Year columns:\n{train[time_columns].isnull().sum()}")

Missing data in Year columns:
YearBuilt        0
YearRemodAdd     0
GarageYrBlt     81
YrSold           0
dtype: int64


In [23]:
# Map each train row that lacks a garage year to the year the house was built.
# (The former leading expression only built a frame that was discarded, so it
# has been dropped.)
null_indeces_train = train[train["GarageYrBlt"].isna()].index
garage_yearbuilt_train = train.loc[null_indeces_train, 'YearBuilt'].values
garage_dict_train = dict(zip(null_indeces_train, garage_yearbuilt_train))
garage_dict_train

{39: 1955,
 48: 1920,
 78: 1968,
 88: 1915,
 89: 1994,
 99: 1959,
 108: 1919,
 125: 1935,
 127: 1930,
 140: 1971,
 148: 2004,
 155: 1924,
 163: 1956,
 165: 1940,
 198: 1912,
 210: 1925,
 241: 1945,
 250: 1940,
 287: 1971,
 291: 1912,
 307: 1920,
 375: 1922,
 386: 1910,
 393: 1941,
 431: 1920,
 434: 1972,
 441: 1955,
 464: 1978,
 495: 1920,
 520: 1900,
 528: 1920,
 533: 1946,
 535: 1910,
 562: 1940,
 582: 1990,
 613: 2007,
 614: 1972,
 620: 1914,
 635: 1914,
 636: 1936,
 638: 1910,
 649: 1970,
 705: 1930,
 710: 1935,
 738: 1987,
 750: 1910,
 784: 1914,
 826: 1924,
 843: 1961,
 921: 1900,
 942: 1977,
 954: 1975,
 960: 1958,
 968: 1910,
 970: 1949,
 976: 1923,
 1009: 1926,
 1011: 1965,
 1030: 1916,
 1038: 1970,
 1096: 1914,
 1123: 1947,
 1131: 1991,
 1137: 1875,
 1143: 1959,
 1173: 1946,
 1179: 1954,
 1218: 1947,
 1219: 1971,
 1234: 1911,
 1257: 1922,
 1283: 1971,
 1323: 1940,
 1325: 1922,
 1326: 1931,
 1337: 1941,
 1349: 1872,
 1407: 1985,
 1449: 1970,
 1450: 1974,
 1453: 2006}

In [24]:
# Walk only the rows still missing a garage year and copy in the value
# prepared in garage_dict_train, logging each replacement.
for i in train.index[train['GarageYrBlt'].isna().to_numpy()]:
    train.loc[i, 'GarageYrBlt'] = garage_dict_train[i]
    print(f"Index {i} replaced with {train.loc[i, 'GarageYrBlt']}")

Index 39 replaced with 1955.0
Index 48 replaced with 1920.0
Index 78 replaced with 1968.0
Index 88 replaced with 1915.0
Index 89 replaced with 1994.0
Index 99 replaced with 1959.0
Index 108 replaced with 1919.0
Index 125 replaced with 1935.0
Index 127 replaced with 1930.0
Index 140 replaced with 1971.0
Index 148 replaced with 2004.0
Index 155 replaced with 1924.0
Index 163 replaced with 1956.0
Index 165 replaced with 1940.0
Index 198 replaced with 1912.0
Index 210 replaced with 1925.0
Index 241 replaced with 1945.0
Index 250 replaced with 1940.0
Index 287 replaced with 1971.0
Index 291 replaced with 1912.0
Index 307 replaced with 1920.0
Index 375 replaced with 1922.0
Index 386 replaced with 1910.0
Index 393 replaced with 1941.0
Index 431 replaced with 1920.0
Index 434 replaced with 1972.0
Index 441 replaced with 1955.0
Index 464 replaced with 1978.0
Index 495 replaced with 1920.0
Index 520 replaced with 1900.0
Index 528 replaced with 1920.0
Index 533 replaced with 1946.0
Index 535 repl

In [25]:
train[['GarageYrBlt']][pd.isna(train["GarageYrBlt"])]

Unnamed: 0,GarageYrBlt


In [26]:
test[['GarageYrBlt']][pd.isna(test["GarageYrBlt"])]

Unnamed: 0,GarageYrBlt
53,
71,
79,
92,
96,
98,
100,
130,
133,
134,


In [27]:
# Same treatment for the test set: rows without a garage year receive the
# year the house was built. (The former leading expression only built a frame
# that was discarded, so it has been dropped.)
null_indeces_test = test[test["GarageYrBlt"].isna()].index
garage_yearbuilt_test = test.loc[null_indeces_test, 'YearBuilt'].values
garage_dict_test = dict(zip(null_indeces_test, garage_yearbuilt_test))
# The dict was just built from the rows that are currently null, so iterating
# it visits exactly those rows in index order.
for i in garage_dict_test:
    test.loc[i, 'GarageYrBlt'] = garage_dict_test[i]
    print(f"Index {i} replaced with {test.loc[i, 'GarageYrBlt']}")

Index 53 replaced with 1962.0
Index 71 replaced with 1920.0
Index 79 replaced with 1910.0
Index 92 replaced with 1912.0
Index 96 replaced with 1915.0
Index 98 replaced with 1907.0
Index 100 replaced with 1967.0
Index 130 replaced with 1958.0
Index 133 replaced with 1967.0
Index 134 replaced with 1931.0
Index 154 replaced with 1970.0
Index 155 replaced with 1970.0
Index 257 replaced with 2004.0
Index 261 replaced with 2004.0
Index 327 replaced with 1940.0
Index 348 replaced with 1910.0
Index 350 replaced with 1920.0
Index 351 replaced with 1910.0
Index 359 replaced with 1910.0
Index 362 replaced with 1900.0
Index 371 replaced with 1922.0
Index 374 replaced with 1902.0
Index 376 replaced with 1923.0
Index 379 replaced with 1987.0
Index 387 replaced with 1947.0
Index 433 replaced with 1959.0
Index 550 replaced with 2005.0
Index 621 replaced with 1961.0
Index 630 replaced with 1910.0
Index 633 replaced with 1920.0
Index 636 replaced with 1890.0
Index 639 replaced with 1949.0
Index 644 repl

In [28]:
test[['GarageYrBlt']][pd.isna(test["GarageYrBlt"])]

Unnamed: 0,GarageYrBlt


In [29]:
missing_values(train[numerical_features_lst[2:]], test[numerical_features_lst[2:]]).query("Train > 0 or Test > 0")

Unnamed: 0,Train,Test,% Train,% Test
BsmtFinSF1,0,1,0.0,0.0
BsmtFinSF2,0,1,0.0,0.0
BsmtUnfSF,0,1,0.0,0.0
TotalBsmtSF,0,1,0.0,0.0
BsmtFullBath,0,2,0.0,0.0
BsmtHalfBath,0,2,0.0,0.0
GarageCars,0,1,0.0,0.0
GarageArea,0,1,0.0,0.0


In [30]:
compare_columns(test, 'BsmtFinSF1', 'BsmtCond')

Unnamed: 0,BsmtFinSF1,BsmtCond
660,,


In [31]:
compare_columns(test, 'BsmtFinSF2', 'BsmtCond')

Unnamed: 0,BsmtFinSF2,BsmtCond
660,,


In [32]:
compare_columns(test, 'BsmtUnfSF', 'BsmtCond')

Unnamed: 0,BsmtUnfSF,BsmtCond
660,,


In [33]:
compare_columns(test, 'TotalBsmtSF', 'BsmtCond')

Unnamed: 0,TotalBsmtSF,BsmtCond
660,,


In [34]:
compare_columns(test, 'BsmtFullBath', 'BsmtCond')

Unnamed: 0,BsmtFullBath,BsmtCond
660,,
728,,


In [35]:
compare_columns(test, 'BsmtHalfBath', 'BsmtCond')

Unnamed: 0,BsmtHalfBath,BsmtCond
660,,
728,,


In [36]:
compare_columns(test, 'GarageCars', 'GarageCond')

Unnamed: 0,GarageCars,GarageCond
1116,,


In [37]:
compare_columns(test, 'GarageArea', 'GarageCond')

Unnamed: 0,GarageArea,GarageCond
1116,,


In [38]:
# Numerical columns of the test set that still contain gaps. The first two
# entries of numerical_features_lst ('Id', 'SalePrice') are skipped, as in the
# surrounding cells.
missing_numerical_columns_test = [
    name for name in numerical_features_lst[2:] if test[name].isna().any()
]

# Fill all of them with 0.0 in one assignment. No loop variable is left behind,
# so the cell can be re-run after the gaps are gone without the trailing `del`
# failing on a name that was never bound.
if missing_numerical_columns_test:
    test[missing_numerical_columns_test] = test[missing_numerical_columns_test].fillna(0.0)

del missing_numerical_columns_test

In [39]:
missing_values(train[numerical_features_lst[2:]], test[numerical_features_lst[2:]]).query("Train > 0 or Test > 0")

Unnamed: 0,Train,Test,% Train,% Test


In [40]:
# Final sanity check: is any cell still null in either dataset?
print('Missing values in train set: ', train.isna().to_numpy().any())
print('Missing values in test set: ', test.isna().to_numpy().any())

Missing values in train set:  False
Missing values in test set:  False


## Checking the data types

In [41]:
list(train[categorical_features_lst].select_dtypes(include="number").columns)

['MSSubClass', 'OverallQual', 'OverallCond']

In [42]:
# Categorical features stored as numbers are cast to object in both datasets,
# then the resulting dtypes are reported.
for column in train[categorical_features_lst].select_dtypes(include="number").columns.tolist():
    train[column], test[column] = train[column].astype(object), test[column].astype(object)
    print(f"Train {column} --> {train[column].dtype}")
    print(f"Test {column} --> {test[column].dtype}")

Train MSSubClass --> object
Test MSSubClass --> object
Train OverallQual --> object
Test OverallQual --> object
Train OverallCond --> object
Test OverallCond --> object


In [43]:
def check_data_types(display=False):
    """List the columns shared by `train` and `test` whose dtypes differ.

    Reads the notebook-level `train` and `test` frames.

    Parameters
    ----------
    display : bool, default False
        When True, print the train and test dtype of every mismatching column.

    Returns
    -------
    list
        Names of the mismatching columns, in the order they appear in
        `train.columns`. A fixed order keeps the output reproducible between
        kernel restarts; iterating a set intersection does not.
    """
    different_type_columns = []
    for column in train.columns:
        # Only columns present in both frames can be compared
        if column not in test.columns:
            continue
        train_type = train[column].dtype
        test_type = test[column].dtype
        if train_type != test_type:
            if display:
                print(f'\n*** {column} ***\nTrain: {train_type}\nTest: {test_type}')
            different_type_columns.append(column)

    return different_type_columns

In [44]:
check_data_types(display=True)


*** BsmtFinSF2 ***
Train: int64
Test: float64

*** BsmtFinSF1 ***
Train: int64
Test: float64

*** BsmtUnfSF ***
Train: int64
Test: float64

*** TotalBsmtSF ***
Train: int64
Test: float64

*** GarageArea ***
Train: int64
Test: float64

*** GarageCars ***
Train: int64
Test: float64

*** BsmtFullBath ***
Train: int64
Test: float64

*** BsmtHalfBath ***
Train: int64
Test: float64


['BsmtFinSF2',
 'BsmtFinSF1',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'GarageArea',
 'GarageCars',
 'BsmtFullBath',
 'BsmtHalfBath']

In [45]:
# Cast every mismatching test column to the dtype of its train counterpart.
# Using the train dtype instead of a hard-coded 'int64' keeps the alignment
# correct for any kind of mismatch that check_data_types() reports.
for column in check_data_types():
    test[column] = test[column].astype(train[column].dtype)

## Feature Selection

In [46]:
# Pairwise association matrix over every train column, computed without plotting
corr_features_df = associations(train, compute_only=True)["corr"]
# Take the last row of the matrix, keep entries above 0.5 in absolute value in
# its 'SalePrice' column, and drop the final remaining entry.
corr_features_df.abs().tail(1).T.query('SalePrice > 0.5').iloc[:-1]

Unnamed: 0,SalePrice
Neighborhood,0.73863
OverallQual,0.790982
YearBuilt,0.522897
YearRemodAdd,0.507101
ExterQual,0.690933
Foundation,0.506328
BsmtQual,0.681905
TotalBsmtSF,0.613581
1stFlrSF,0.605852
GrLivArea,0.708624


In [47]:
# Names of the features selected by the same filter as the previous cell
important_features_lst = corr_features_df.abs().tail(1).T.query('SalePrice > 0.5').iloc[:-1].index.tolist()

## Creating dummies

In [48]:
# One-hot encode the selected features, dropping the first level of each
train_features = pd.get_dummies(train[important_features_lst], drop_first=True)
test_features = pd.get_dummies(test[important_features_lst], drop_first=True)

# The two frames are encoded independently, so a category seen in only one of
# them would yield different columns. Force the test matrix onto the train
# layout: extra columns are dropped, missing ones are added as all-zero, and
# the column order follows train.
test_features = test_features.reindex(columns=train_features.columns, fill_value=0)

train_features.head()

Unnamed: 0,YearBuilt,YearRemodAdd,TotalBsmtSF,1stFlrSF,GrLivArea,FullBath,TotRmsAbvGrd,GarageYrBlt,GarageCars,GarageArea,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,OverallQual_2,OverallQual_3,OverallQual_4,OverallQual_5,OverallQual_6,OverallQual_7,OverallQual_8,OverallQual_9,OverallQual_10,ExterQual_Fa,ExterQual_Gd,ExterQual_TA,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_NA,BsmtQual_TA,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_NA,FireplaceQu_Po,FireplaceQu_TA,GarageFinish_NA,GarageFinish_RFn,GarageFinish_Unf
0,2003,2003,856,856,1710,2,8,2003.0,2,548,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0
1,1976,1976,1262,1262,1262,2,6,1976.0,2,460,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0
2,2001,2002,920,920,1786,2,6,2001.0,2,608,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0
3,1915,1970,756,961,1717,1,7,1998.0,3,642,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,1
4,2000,2000,1145,1145,2198,2,9,2000.0,3,836,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0


## Dataset Splitting

In [49]:
# Work with the natural log of the sale prices
train_labels_log = np.log(train.SalePrice)
# NOTE(review): sample_submission prices are presumably placeholders rather than
# true labels — confirm before treating them as validation targets.
submission_labels_log = np.log(sample_submission['SalePrice'])

print(f"Train:\nFeatures --> {train_features.shape}\nLabels --> {train_labels_log.shape}", end='\n\n')
print(f"Validation:\nFeatures --> {test_features.shape}\nLabels --> {submission_labels_log.shape}")

Train:
Features --> (1460, 66)
Labels --> (1460,)

Validation:
Features --> (1459, 66)
Labels --> (1459,)


In [50]:
# for column in list(set(test_features.columns) - set(train_features.columns)): #['MSSubClass_150']
#     test_features_dummies.drop([column], axis=1, inplace=True)
#     del column

# for column in list(set(train_features.columns) - set(test_features.columns)):
#     test_features_dummies[column] = 0
#     del column

# print(f"Train:\nFeatures --> {train_features.shape}\nLabels --> {train_labels_log.shape}", end='\n'*2)
# print(f"Validation:\nFeatures --> {test_features.shape}\nLabels --> {submission_labels_log.shape}")

In [51]:
# Keep 80% of the labelled rows for fitting and hold out the rest for
# validation; the fixed random_state makes the split repeatable.
x_train, x_validation, y_train, y_validation = train_test_split(
    train_features,
    train_labels_log,
    train_size=0.8,
    random_state=3,
)

print(f"Train:\nFeatures --> {x_train.shape}\nLabels --> {y_train.shape}", end='\n\n')
print(f"Validation:\nFeatures --> {x_validation.shape}\nLabels --> {y_validation.shape}")

Train:
Features --> (1168, 66)
Labels --> (1168,)

Validation:
Features --> (292, 66)
Labels --> (292,)


## Feature Scaling

In [52]:
# Numeric columns among the selected features; these are the ones to standardise
columns_to_scale_lst = train[important_features_lst].select_dtypes(include='number').columns.tolist()
columns_to_scale_lst

['YearBuilt',
 'YearRemodAdd',
 'TotalBsmtSF',
 '1stFlrSF',
 'GrLivArea',
 'FullBath',
 'TotRmsAbvGrd',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea']

In [53]:
scaler = StandardScaler()

# Cast the columns to scale to float first: writing standardised floats into
# integer columns is an incompatible-dtype assignment in recent pandas.
# astype() also returns fresh frames, so the assignments below do not write
# into the objects handed back by train_test_split.
x_train = x_train.astype({name: 'float64' for name in columns_to_scale_lst})
x_validation = x_validation.astype({name: 'float64' for name in columns_to_scale_lst})

# Select by name instead of by position so the cell does not depend on the
# numeric columns being the leading columns of the design matrix. The scaler
# is fitted on the training split only and then applied to the validation split.
x_train[columns_to_scale_lst] = scaler.fit_transform(x_train[columns_to_scale_lst])
x_validation[columns_to_scale_lst] = scaler.transform(x_validation[columns_to_scale_lst])
x_train.head()

Unnamed: 0,YearBuilt,YearRemodAdd,TotalBsmtSF,1stFlrSF,GrLivArea,FullBath,TotRmsAbvGrd,GarageYrBlt,GarageCars,GarageArea,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,OverallQual_2,OverallQual_3,OverallQual_4,OverallQual_5,OverallQual_6,OverallQual_7,OverallQual_8,OverallQual_9,OverallQual_10,ExterQual_Fa,ExterQual_Gd,ExterQual_TA,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_NA,BsmtQual_TA,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_NA,FireplaceQu_Po,FireplaceQu_TA,GarageFinish_NA,GarageFinish_RFn,GarageFinish_Unf
727,1.180562,1.072775,0.42405,0.177191,-0.536522,0.792494,-0.299419,1.156579,0.30314,0.754233,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0
328,-1.846616,0.441075,-0.49454,0.738908,1.214983,0.792494,2.19414,-1.764785,0.30314,-0.159982,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,1
958,1.047499,0.878406,0.672638,0.454094,-0.332633,0.792494,-0.299419,1.00482,0.30314,0.2708,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0
126,0.149325,-0.433587,0.059455,-0.545392,-1.068576,0.792494,-0.922809,0.018385,0.30314,-0.164768,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0
1102,-0.382926,0.829814,-0.101535,-0.081251,-0.726819,-1.026038,-0.299419,-0.626591,-1.053439,-1.05505,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0


## Building Machine Learning Models

### Grid Search parameters

In [54]:
# Hyper-parameter grids, keyed by model display name. Each entry starts with
# only a "param_grid"; the model cells below add their results to the same dict.
models = {
    # No tunable parameters: a placeholder list stands in for the grid
    "Linear Regression": {"param_grid": ["No parameters"]},

    "Ridge": {"param_grid": {
        'alpha': [0.001, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08,
                  1, 2, 3, 5, 8, 10, 20, 50, 100, 1000],
    }},

    "Lasso": {"param_grid": {
        'alpha': [0.001, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08,
                  1, 2, 3, 5, 8, 10, 20, 50, 100, 1000],
    }},

    "Elastic Net": {"param_grid": {
        'alpha': [0.001, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08,
                  1, 2, 3, 5, 8, 10, 20, 50, 100, 1000],
        'l1_ratio': np.arange(0.0, 1.0, 0.1),
    }},

    "Support Vector Regression": {"param_grid": {
        'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'gamma': ['scale', 'auto'],
        'C': [1, 10, 100],
        'epsilon': [0.01, 0.1, 1, 10],
    }},

    "Decision Tree Regressor": {"param_grid": {
        'max_depth': [*range(2, 10)],
        'splitter': ['best', 'random'],
        'min_samples_leaf': [*range(1, 10)],
        'max_leaf_nodes': [*range(5, 20)],
    }},

    "Random Forest Regressor": {"param_grid": {
        'n_estimators': [*range(100, 200, 10)],
        'max_depth': [*range(4, 7)],
        'min_samples_split': [*range(2, 4)],
    }},

    "LGBM Regressor": {"param_grid": {
        'num_leaves': [7, 14, 21, 28, 31, 50],
        'learning_rate': [0.1, 0.03, 0.003],
        'max_depth': [-1, 3, 5],
        'n_estimators': [50, 100, 200, 500],
    }},

    "AdaBoost Regressor": {"param_grid": {
        'n_estimators': [*range(100, 1000, 100)],
        'learning_rate': [0.001, 0.01, 0.1, 1, 10],
    }},

    "CatBoost Regressor": {"param_grid": {
        'learning_rate': [0.03, 0.1],
        'depth': [4, 6, 10],
        'l2_leaf_reg': [1, 3, 5, 7, 9],
        'verbose': [False],
    }},

    "XGB Regressor": {"param_grid": {
        'n_estimators': [*range(500, 1000, 100)],
        'learning_rate': [0.001, 0.01, 0.1],
    }},

    "Gradient Boosting Regressor": {"param_grid": {
        'learning_rate': [0.01, 0.02, 0.03, 0.04],
        'subsample': [0.9, 0.5, 0.2, 0.1],
        'n_estimators': [100, 500, 1000, 1500],
        'max_depth': [4, 6, 8, 10],
    }},
}

### Linear Regression

In [55]:
# Fit a plain linear regression on the training split and record its
# validation metrics in the shared `models` registry.
linear_regression = LinearRegression()
current_model = "Linear Regression"

linear_regression.fit(x_train, y_train)

# Keep the raw model output for the RMSE and exponentiate only the stored
# predictions (presumably the target is log(SalePrice) -- TODO confirm).
# This avoids the lossy np.log(np.exp(...)) round trip.
raw_predictions = linear_regression.predict(x_validation)
models[current_model]["Predictions"] = np.exp(raw_predictions)
# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
models[current_model]["RMSE"] = np.sqrt(mean_squared_error(y_validation, raw_predictions))
models[current_model]["Score"] = linear_regression.score(x_validation, y_validation)
# No grid search is run for this model, so there is nothing to report here.
models[current_model]["Best parameters"] = "No best parameters"

print(f"Model: {current_model}")
print(f'Score: {round(models[current_model]["Score"] * 100, 2)}%')
print(f'RMSE: {round(models[current_model]["RMSE"], 3)}')
print(f"Best parameters:\n {models[current_model]['Best parameters']}")

del current_model, raw_predictions

Model: Linear Regression
Score: 76.08%
RMSE: 0.176
Best parameters:
 No best parameters


### Ridge

In [56]:
# Grid-search Ridge with 5-fold CV on the training split and record the
# refit best estimator's validation metrics in the `models` registry.
ridge = Ridge()
current_model = "Ridge"

grid_search_ridge = GridSearchCV(ridge, models[current_model]["param_grid"], cv = 5)
grid_search_ridge.fit(x_train, y_train)

# Keep the raw model output for the RMSE and exponentiate only the stored
# predictions (presumably the target is log(SalePrice) -- TODO confirm).
raw_predictions = grid_search_ridge.predict(x_validation)
models[current_model]["Predictions"] = np.exp(raw_predictions)
# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
models[current_model]["RMSE"] = np.sqrt(mean_squared_error(y_validation, raw_predictions))
models[current_model]["Score"] = grid_search_ridge.score(x_validation, y_validation)
models[current_model]["Best parameters"] = grid_search_ridge.best_params_

print(f"Model: {current_model}")
print(f'Score: {round(models[current_model]["Score"] * 100, 2)}%')
print(f'RMSE: {round(models[current_model]["RMSE"], 3)}')
print(f"Best parameters:\n {models[current_model]['Best parameters']}")

del current_model, raw_predictions

Model: Ridge
Score: 75.18%
RMSE: 0.18
Best parameters:
 {'alpha': 2}


### Lasso

In [57]:
# Grid-search Lasso with 5-fold CV on the training split and record the
# refit best estimator's validation metrics in the `models` registry.
lasso = Lasso()
current_model = "Lasso"

grid_search_lasso = GridSearchCV(lasso, models[current_model]["param_grid"], cv = 5)
grid_search_lasso.fit(x_train, y_train)

# Keep the raw model output for the RMSE and exponentiate only the stored
# predictions (presumably the target is log(SalePrice) -- TODO confirm).
raw_predictions = grid_search_lasso.predict(x_validation)
models[current_model]["Predictions"] = np.exp(raw_predictions)
# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
models[current_model]["RMSE"] = np.sqrt(mean_squared_error(y_validation, raw_predictions))
models[current_model]["Score"] = grid_search_lasso.score(x_validation, y_validation)
models[current_model]["Best parameters"] = grid_search_lasso.best_params_

print(f"Model: {current_model}")
print(f'Score: {round(models[current_model]["Score"] * 100, 2)}%')
print(f'RMSE: {round(models[current_model]["RMSE"], 3)}')
print(f"Best parameters:\n {models[current_model]['Best parameters']}")

del current_model, raw_predictions

Model: Lasso
Score: 72.15%
RMSE: 0.19
Best parameters:
 {'alpha': 0.001}


### Elastic Net

In [58]:
# Grid-search Elastic Net with 5-fold CV on the training split and record the
# refit best estimator's validation metrics in the `models` registry.
elastic_net = ElasticNet()
current_model = "Elastic Net"

grid_search_elastic_net = GridSearchCV(elastic_net, models[current_model]["param_grid"], cv = 5)
grid_search_elastic_net.fit(x_train, y_train)

# Keep the raw model output for the RMSE and exponentiate only the stored
# predictions (presumably the target is log(SalePrice) -- TODO confirm).
raw_predictions = grid_search_elastic_net.predict(x_validation)
models[current_model]["Predictions"] = np.exp(raw_predictions)
# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
models[current_model]["RMSE"] = np.sqrt(mean_squared_error(y_validation, raw_predictions))
models[current_model]["Score"] = grid_search_elastic_net.score(x_validation, y_validation)
models[current_model]["Best parameters"] = grid_search_elastic_net.best_params_

print(f"Model: {current_model}")
print(f'Score: {round(models[current_model]["Score"] * 100, 2)}%')
print(f'RMSE: {round(models[current_model]["RMSE"], 3)}')
print(f"Best parameters:\n {models[current_model]['Best parameters']}")

del current_model, raw_predictions

Model: Elastic Net
Score: 75.19%
RMSE: 0.18
Best parameters:
 {'alpha': 0.001, 'l1_ratio': 0.1}


### Support Vector Regression

In [59]:
# Grid-search SVR with 5-fold CV on the training split and record the
# refit best estimator's validation metrics in the `models` registry.
svr = SVR()
current_model = "Support Vector Regression"

grid_search_svr = GridSearchCV(svr, models[current_model]["param_grid"], cv = 5)
grid_search_svr.fit(x_train, y_train)

# Keep the raw model output for the RMSE and exponentiate only the stored
# predictions (presumably the target is log(SalePrice) -- TODO confirm).
raw_predictions = grid_search_svr.predict(x_validation)
models[current_model]["Predictions"] = np.exp(raw_predictions)
# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
models[current_model]["RMSE"] = np.sqrt(mean_squared_error(y_validation, raw_predictions))
models[current_model]["Score"] = grid_search_svr.score(x_validation, y_validation)
models[current_model]["Best parameters"] = grid_search_svr.best_params_

print(f"Model: {current_model}")
print(f'Score: {round(models[current_model]["Score"] * 100, 2)}%')
print(f'RMSE: {round(models[current_model]["RMSE"], 3)}')
print(f"Best parameters:\n {models[current_model]['Best parameters']}")

del current_model, raw_predictions

Model: Support Vector Regression
Score: 89.6%
RMSE: 0.116
Best parameters:
 {'C': 1, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}


### LGBM Regressor

In [60]:
# Grid-search LightGBM with 5-fold CV on the training split and record the
# refit best estimator's validation metrics in the `models` registry.
# (LGBMRegressor is already imported in the notebook's first cell, so the
# duplicate mid-notebook import was dropped.)
lgbm_reg = LGBMRegressor(random_state = 0)
current_model = "LGBM Regressor"

grid_search_lgbm = GridSearchCV(lgbm_reg, models[current_model]["param_grid"], cv = 5)
grid_search_lgbm.fit(x_train, y_train)

# Keep the raw model output for the RMSE and exponentiate only the stored
# predictions (presumably the target is log(SalePrice) -- TODO confirm).
raw_predictions = grid_search_lgbm.predict(x_validation)
models[current_model]["Predictions"] = np.exp(raw_predictions)
# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
models[current_model]["RMSE"] = np.sqrt(mean_squared_error(y_validation, raw_predictions))
models[current_model]["Score"] = grid_search_lgbm.score(x_validation, y_validation)
models[current_model]["Best parameters"] = grid_search_lgbm.best_params_

print(f"Model: {current_model}")
print(f'Score: {round(models[current_model]["Score"] * 100, 2)}%')
print(f'RMSE: {round(models[current_model]["RMSE"], 3)}')
print(f"Best parameters:\n {models[current_model]['Best parameters']}")

del current_model, raw_predictions

Model: LGBM Regressor
Score: 83.52%
RMSE: 0.146
Best parameters:
 {'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 500, 'num_leaves': 7}


### CatBoost Regressor

In [61]:
# Grid-search CatBoost with 5-fold CV on the training split and record the
# refit best estimator's validation metrics in the `models` registry.
catboost_reg = CatBoostRegressor(random_state = 0)
current_model = "CatBoost Regressor"

grid_search_catboost = GridSearchCV(catboost_reg, models[current_model]["param_grid"], cv = 5)
grid_search_catboost.fit(x_train, y_train)

# Keep the raw model output for the RMSE and exponentiate only the stored
# predictions (presumably the target is log(SalePrice) -- TODO confirm).
raw_predictions = grid_search_catboost.predict(x_validation)
models[current_model]["Predictions"] = np.exp(raw_predictions)
# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
models[current_model]["RMSE"] = np.sqrt(mean_squared_error(y_validation, raw_predictions))
models[current_model]["Score"] = grid_search_catboost.score(x_validation, y_validation)
models[current_model]["Best parameters"] = grid_search_catboost.best_params_

print(f"Model: {current_model}")
print(f'Score: {round(models[current_model]["Score"] * 100, 2)}%')
print(f'RMSE: {round(models[current_model]["RMSE"], 3)}')
print(f"Best parameters:\n {models[current_model]['Best parameters']}")

del current_model, raw_predictions

Model: CatBoost Regressor
Score: 87.62%
RMSE: 0.127
Best parameters:
 {'depth': 4, 'l2_leaf_reg': 5, 'learning_rate': 0.03, 'verbose': False}


### Decision Tree Regressor

In [62]:
# Grid-search a decision tree with 5-fold CV on the training split and record
# the refit best estimator's validation metrics in the `models` registry.
decision_tree_reg = DecisionTreeRegressor(random_state = 0)
current_model = "Decision Tree Regressor"

grid_search_dtr = GridSearchCV(decision_tree_reg, models[current_model]["param_grid"], cv = 5)
grid_search_dtr.fit(x_train, y_train)

# Keep the raw model output for the RMSE and exponentiate only the stored
# predictions (presumably the target is log(SalePrice) -- TODO confirm).
raw_predictions = grid_search_dtr.predict(x_validation)
models[current_model]["Predictions"] = np.exp(raw_predictions)
# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
models[current_model]["RMSE"] = np.sqrt(mean_squared_error(y_validation, raw_predictions))
models[current_model]["Score"] = grid_search_dtr.score(x_validation, y_validation)
models[current_model]["Best parameters"] = grid_search_dtr.best_params_

print(f"Model: {current_model}")
print(f'Score: {round(models[current_model]["Score"] * 100, 2)}%')
print(f'RMSE: {round(models[current_model]["RMSE"], 3)}')
print(f"Best parameters:\n {models[current_model]['Best parameters']}")

del current_model, raw_predictions

Model: Decision Tree Regressor
Score: 69.75%
RMSE: 0.198
Best parameters:
 {'max_depth': 6, 'max_leaf_nodes': 18, 'min_samples_leaf': 4, 'splitter': 'best'}


### Random Forest Regressor

In [63]:
# Grid-search a random forest with 5-fold CV on the training split and record
# the refit best estimator's validation metrics in the `models` registry.
# random_state is fixed (like the other stochastic ensembles in this notebook)
# so the selected parameters and metrics are reproducible across re-runs.
random_forest_reg = RandomForestRegressor(random_state = 0)
current_model = "Random Forest Regressor"

grid_search_rfr = GridSearchCV(random_forest_reg, models[current_model]["param_grid"], cv = 5)
grid_search_rfr.fit(x_train, y_train)

# Keep the raw model output for the RMSE and exponentiate only the stored
# predictions (presumably the target is log(SalePrice) -- TODO confirm).
raw_predictions = grid_search_rfr.predict(x_validation)
models[current_model]["Predictions"] = np.exp(raw_predictions)
# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
models[current_model]["RMSE"] = np.sqrt(mean_squared_error(y_validation, raw_predictions))
models[current_model]["Score"] = grid_search_rfr.score(x_validation, y_validation)
models[current_model]["Best parameters"] = grid_search_rfr.best_params_

print(f"Model: {current_model}")
print(f'Score: {round(models[current_model]["Score"] * 100, 2)}%')
print(f'RMSE: {round(models[current_model]["RMSE"], 3)}')
print(f"Best parameters:\n {models[current_model]['Best parameters']}")

del current_model, raw_predictions

Model: Random Forest Regressor
Score: 83.99%
RMSE: 0.144
Best parameters:
 {'max_depth': 6, 'min_samples_split': 2, 'n_estimators': 170}


### XGB Regressor

In [64]:
# Grid-search XGBoost with 5-fold CV on the training split and record the
# refit best estimator's validation metrics in the `models` registry.
# The seed is stated explicitly for consistency with the other seeded ensembles.
xgb_reg = XGBRegressor(random_state = 0)
current_model = "XGB Regressor"

grid_search_xgb = GridSearchCV(xgb_reg, models[current_model]["param_grid"], cv = 5)
grid_search_xgb.fit(x_train, y_train)

# Keep the raw model output for the RMSE and exponentiate only the stored
# predictions (presumably the target is log(SalePrice) -- TODO confirm).
raw_predictions = grid_search_xgb.predict(x_validation)
models[current_model]["Predictions"] = np.exp(raw_predictions)
# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
models[current_model]["RMSE"] = np.sqrt(mean_squared_error(y_validation, raw_predictions))
models[current_model]["Score"] = grid_search_xgb.score(x_validation, y_validation)
models[current_model]["Best parameters"] = grid_search_xgb.best_params_

print(f"Model: {current_model}")
print(f'Score: {round(models[current_model]["Score"] * 100, 2)}%')
print(f'RMSE: {round(models[current_model]["RMSE"], 3)}')
print(f"Best parameters:\n {models[current_model]['Best parameters']}")

del current_model, raw_predictions

Model: XGB Regressor
Score: 86.68%
RMSE: 0.132
Best parameters:
 {'learning_rate': 0.01, 'n_estimators': 900}


### AdaBoost Regressor

In [67]:
# Grid-search AdaBoost with 5-fold CV on the training split and record the
# refit best estimator's validation metrics in the `models` registry.
ada_boost_reg = AdaBoostRegressor(random_state = 0)
current_model = "AdaBoost Regressor"

grid_search_abr = GridSearchCV(ada_boost_reg, models[current_model]["param_grid"], cv = 5)
grid_search_abr.fit(x_train, y_train)

# Keep the raw model output for the RMSE and exponentiate only the stored
# predictions (presumably the target is log(SalePrice) -- TODO confirm).
raw_predictions = grid_search_abr.predict(x_validation)
models[current_model]["Predictions"] = np.exp(raw_predictions)
# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
models[current_model]["RMSE"] = np.sqrt(mean_squared_error(y_validation, raw_predictions))
models[current_model]["Score"] = grid_search_abr.score(x_validation, y_validation)
models[current_model]["Best parameters"] = grid_search_abr.best_params_

print(f"Model: {current_model}")
print(f'Score: {round(models[current_model]["Score"] * 100, 2)}%')
print(f'RMSE: {round(models[current_model]["RMSE"], 3)}')
print(f"Best parameters:\n {models[current_model]['Best parameters']}")

del current_model, raw_predictions

Model: AdaBoost Regressor
Score: 76.36%
RMSE: 0.175
Best parameters:
 {'learning_rate': 0.01, 'n_estimators': 900}


### Gradient Boosting Regressor

In [68]:
# Grid-search gradient boosting with 5-fold CV on the training split and record
# the refit best estimator's validation metrics in the `models` registry.
gradient_boosting_reg = GradientBoostingRegressor(random_state=0)
current_model = "Gradient Boosting Regressor"

grid_search_grbr = GridSearchCV(gradient_boosting_reg, models[current_model]["param_grid"], cv = 5)
grid_search_grbr.fit(x_train, y_train)

# Keep the raw model output for the RMSE and exponentiate only the stored
# predictions (presumably the target is log(SalePrice) -- TODO confirm).
raw_predictions = grid_search_grbr.predict(x_validation)
models[current_model]["Predictions"] = np.exp(raw_predictions)
# np.sqrt(MSE) replaces mean_squared_error(..., squared=False): the `squared`
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6.
models[current_model]["RMSE"] = np.sqrt(mean_squared_error(y_validation, raw_predictions))
models[current_model]["Score"] = grid_search_grbr.score(x_validation, y_validation)
models[current_model]["Best parameters"] = grid_search_grbr.best_params_

print(f"Model: {current_model}")
print(f'Score: {round(models[current_model]["Score"] * 100, 2)}%')
print(f'RMSE: {round(models[current_model]["RMSE"], 3)}')
print(f"Best parameters:\n {models[current_model]['Best parameters']}")

del current_model, raw_predictions

Model: Gradient Boosting Regressor
Score: 85.0%
RMSE: 0.14
Best parameters:
 {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 500, 'subsample': 0.1}


## Evaluation

In [69]:
# Summary table: one row per model (in registry order), one column per
# recorded validation metric.
models_performace = pd.DataFrame(
    {
        metric: [model_record[metric] for model_record in models.values()]
        for metric in ("Score", "RMSE", "Best parameters")
    },
    index=models.keys(),
)
models_performace

Unnamed: 0,Score,RMSE,Best parameters
Linear Regression,0.760831,0.176482,No best parameters
Ridge,0.751824,0.179775,{'alpha': 2}
Lasso,0.721544,0.190426,{'alpha': 0.001}
Elastic Net,0.751913,0.179742,"{'alpha': 0.001, 'l1_ratio': 0.1}"
Support Vector Regression,0.895961,0.116398,"{'C': 1, 'epsilon': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}"
Decision Tree Regressor,0.697525,0.198469,"{'max_depth': 6, 'max_leaf_nodes': 18, 'min_samples_leaf': 4, 'splitter': 'best'}"
Random Forest Regressor,0.839888,0.144398,"{'max_depth': 6, 'min_samples_split': 2, 'n_estimators': 170}"
LGBM Regressor,0.835195,0.146499,"{'learning_rate': 0.03, 'max_depth': 3, 'n_estimators': 500, 'num_leaves': 7}"
AdaBoost Regressor,0.763586,0.175463,"{'learning_rate': 0.01, 'n_estimators': 900}"
CatBoost Regressor,0.876206,0.126969,"{'depth': 4, 'l2_leaf_reg': 5, 'learning_rate': 0.03, 'verbose': False}"


## Predictions

In [70]:
# Side-by-side validation predictions: each model's rounded prediction vector
# is stacked as a row, then transposed so that every model becomes a column.
# (The original wrapped each vector in a one-entry dict only to index straight
# back into it; the vectors are now used directly, with the same result.)
models_predictions = pd.DataFrame(
    [model_values["Predictions"].round(3) for model_values in models.values()]
).T
models_predictions.columns = list(models.keys())
models_predictions

Unnamed: 0,Linear Regression,Ridge,Lasso,Elastic Net,Support Vector Regression,Decision Tree Regressor,Random Forest Regressor,LGBM Regressor,AdaBoost Regressor,CatBoost Regressor,XGB Regressor,Gradient Boosting Regressor
0,97535.417,96838.681,102310.867,97665.922,108270.858,128899.078,102923.114,107281.606,110033.599,97932.315,90532.101562,89485.455
1,141207.34,140941.171,139290.744,140960.56,139706.246,104150.698,119355.516,137578.155,124707.182,137965.642,137645.46875,140580.996
2,194705.933,193975.723,192461.254,194044.088,194271.243,199552.079,192503.862,194806.593,209957.014,195392.013,192827.0625,190963.298
3,175307.032,176973.868,188322.182,178474.312,190275.004,270887.948,187863.885,183980.521,182983.878,182585.823,205293.875,184970.828
4,264187.985,264159.304,275992.581,263801.9,279016.27,242447.575,251604.108,268472.783,235222.255,252979.178,279951.25,284821.113
5,69967.745,70480.353,72934.858,70711.717,80051.082,73481.921,88249.669,71277.249,95990.358,72940.913,70388.023438,71799.306
6,154827.278,154721.555,152617.379,154740.112,157334.193,128899.078,149085.582,152395.056,126504.557,157128.499,152895.296875,158487.831
7,123214.179,122494.908,118042.118,122125.925,119751.019,171133.833,144808.376,127708.022,146548.186,125792.012,130009.523438,125313.568
8,117293.455,118611.081,124168.557,118799.937,123841.644,125433.587,121069.081,124112.972,112485.136,129704.726,127043.15625,126751.712
9,106607.854,105015.422,106311.05,105107.214,116058.739,128899.078,111946.087,120459.014,112438.11,113570.639,120630.460938,110400.214


### Best models prediction

In [71]:
# Only the first len(columns_to_scale_lst) columns are scaled; this assumes
# those leading columns are the ones named in columns_to_scale_lst -- TODO confirm.
scaled_column_count = len(columns_to_scale_lst)

# The scaler is re-fitted here on the copied training features ...
train_features_to_predict = train_features.copy()
train_features_to_predict.iloc[:, :scaled_column_count] = scaler.fit_transform(
    train_features_to_predict.iloc[:, :scaled_column_count]
)

# ... and the test features are transformed with those fitted statistics.
test_features_to_predict = test_features.copy()
test_features_to_predict.iloc[:, :scaled_column_count] = scaler.transform(
    test_features_to_predict.iloc[:, :scaled_column_count]
)
test_features_to_predict.head()

Unnamed: 0,YearBuilt,YearRemodAdd,TotalBsmtSF,1stFlrSF,GrLivArea,FullBath,TotRmsAbvGrd,GarageYrBlt,GarageCars,GarageArea,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,OverallQual_2,OverallQual_3,OverallQual_4,OverallQual_5,OverallQual_6,OverallQual_7,OverallQual_8,OverallQual_9,OverallQual_10,ExterQual_Fa,ExterQual_Gd,ExterQual_TA,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_NA,BsmtQual_TA,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_NA,FireplaceQu_Po,FireplaceQu_TA,GarageFinish_NA,GarageFinish_RFn,GarageFinish_Unf
0,-0.340077,-1.15638,-0.400017,-0.689929,-1.179256,-1.026041,-0.93413,-0.589691,-1.026858,1.202536,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1
1,-0.43944,-1.30174,0.619239,0.430511,-0.354966,-1.026041,-0.318683,-0.703769,-1.026858,-0.753188,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,1
2,0.852269,0.6364,-0.295127,-0.607125,0.216136,0.789741,-0.318683,0.779249,0.311725,0.042202,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0
3,0.88539,0.6364,-0.299687,-0.6123,0.168544,0.789741,0.296763,0.817275,0.311725,-0.013943,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0
4,0.686666,0.345679,0.507509,0.303718,-0.448246,0.789741,-0.93413,0.589118,0.311725,0.154492,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0


In [72]:
# Refit SVR (the best scorer in the evaluation table) on the full training set.
# Its hyper-parameters are read from the grid-search result instead of being
# hand-copied from a printed output, so they cannot drift on a re-run.
best_model = SVR(**models["Support Vector Regression"]["Best parameters"])
# The model is trained on log(SalePrice); predictions are exponentiated back to prices.
best_model.fit(train_features_to_predict, np.log(train.SalePrice))
best_model_predictions = np.exp(best_model.predict(test_features_to_predict))
pd.DataFrame({"Predictions":best_model_predictions}, index=sample_submission.Id)

Unnamed: 0_level_0,Predictions
Id,Unnamed: 1_level_1
1461,127274.729968
1462,145936.361934
1463,185000.517959
1464,192288.560874
1465,194436.895047
1466,181126.405926
1467,170157.625043
1468,176513.534646
1469,183599.909016
1470,123389.220235


### Submission

In [74]:
# Build the submission frame (ids from the sample submission, prices from the
# refit best model) and write it without the index column.
submission_columns = {
    "Id": sample_submission.Id,
    "SalePrice": best_model_predictions,
}
output = pd.DataFrame(submission_columns)
output.to_csv("submission.csv", index=False)