In [7]:
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, BaggingRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Raise pandas' display limits so the wide/long listings in this notebook
# (transposed head, dtypes, per-column missing ratios) are not truncated.
pd.options.display.max_columns=1000
pd.options.display.max_rows=1000
pd.options.display.max_categories=1000

In [8]:
# Directory that holds train.csv. Set the HOUSE_PRICES_DIR environment variable to point
# at your own copy of the data; the fallback is the original, machine-specific location.
DATA_DIR = os.environ.get(
    'HOUSE_PRICES_DIR',
    '/Users/darshak.shah/Downloads/house-prices-advanced-regression-techniques',
)
df_raw = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))

In [9]:
# Look at the first few sale prices before any transformation is applied.
df_raw['SalePrice'].head()

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

In [10]:
# Preview log(1 + SalePrice); nothing is assigned here, so df_raw is left unchanged.
np.log1p(df_raw['SalePrice']).head()

0    12.247699
1    12.109016
2    12.317171
3    11.849405
4    12.429220
Name: SalePrice, dtype: float64

In [11]:
# Overwrite the target with log(1 + SalePrice); every error reported further down is
# therefore measured on the log-price scale.
# NOTE(review): this cell is not idempotent -- running it a second time applies the log
# again. Re-run the read_csv cell first if this cell has to be re-executed.
df_raw['SalePrice'] = np.log1p(df_raw['SalePrice'])

In [12]:
# Transpose the preview so that each column of the wide frame is listed as a row.
df_raw.head().T

Unnamed: 0,0,1,2,3,4
Id,1,2,3,4,5
MSSubClass,60,20,60,70,60
MSZoning,RL,RL,RL,RL,RL
LotFrontage,65,80,68,60,84
LotArea,8450,9600,11250,9550,14260
Street,Pave,Pave,Pave,Pave,Pave
Alley,,,,,
LotShape,Reg,Reg,IR1,IR1,IR1
LandContour,Lvl,Lvl,Lvl,Lvl,Lvl
Utilities,AllPub,AllPub,AllPub,AllPub,AllPub


In [13]:
# Column dtypes: the non-numeric (object) columns are the ones numericalize() has to encode.
df_raw.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
Alley             object
LotShape          object
LandContour       object
Utilities         object
LotConfig         object
LandSlope         object
Neighborhood      object
Condition1        object
Condition2        object
BldgType          object
HouseStyle        object
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
RoofStyle         object
RoofMatl          object
Exterior1st       object
Exterior2nd       object
MasVnrType        object
MasVnrArea       float64
ExterQual         object
ExterCond         object
Foundation        object
BsmtQual          object
BsmtCond          object
BsmtExposure      object
BsmtFinType1      object
BsmtFinSF1         int64
BsmtFinType2      object
BsmtFinSF2         int64
BsmtUnfSF          int64
TotalBsmtSF        int64
Heating           object


In [14]:
# Fraction of missing values in each column (mean of the boolean null mask).
df_raw.isna().mean()

Id               0.000000
MSSubClass       0.000000
MSZoning         0.000000
LotFrontage      0.177397
LotArea          0.000000
Street           0.000000
Alley            0.937671
LotShape         0.000000
LandContour      0.000000
Utilities        0.000000
LotConfig        0.000000
LandSlope        0.000000
Neighborhood     0.000000
Condition1       0.000000
Condition2       0.000000
BldgType         0.000000
HouseStyle       0.000000
OverallQual      0.000000
OverallCond      0.000000
YearBuilt        0.000000
YearRemodAdd     0.000000
RoofStyle        0.000000
RoofMatl         0.000000
Exterior1st      0.000000
Exterior2nd      0.000000
MasVnrType       0.005479
MasVnrArea       0.005479
ExterQual        0.000000
ExterCond        0.000000
Foundation       0.000000
BsmtQual         0.025342
BsmtCond         0.025342
BsmtExposure     0.026027
BsmtFinType1     0.025342
BsmtFinSF1       0.000000
BsmtFinType2     0.026027
BsmtFinSF2       0.000000
BsmtUnfSF        0.000000
TotalBsmtSF 

In [15]:
def fix_missing(df, col, name, na_dict):
    """Fill missing values of one numeric column of ``df`` in place.

    Non-numeric columns are ignored. A numeric column is processed when it
    contains at least one null or when ``name`` already has an entry in
    ``na_dict``. Processing adds a boolean ``<name>_na`` indicator column,
    fills the nulls, and records the fill value in ``na_dict``.

    Parameters
    ----------
    df : DataFrame that is modified in place.
    col : the column's values (a Series); read for nulls and the median.
    name : label of the column inside ``df``.
    na_dict : dict mapping column name -> fill value. An existing entry takes
        precedence over the column median. The dict is updated in place.

    Returns
    -------
    The same ``na_dict`` object that was passed in.
    """
    if not is_numeric_dtype(col):
        return na_dict

    missing = pd.isnull(col)
    already_known = name in na_dict
    if not (already_known or missing.sum()):
        return na_dict

    df[name + '_na'] = missing
    if already_known:
        fill_value = na_dict[name]
    else:
        fill_value = col.median()
    df[name] = col.fillna(fill_value)
    na_dict[name] = fill_value
    return na_dict

In [16]:
def numericalize(df, col, name, max_n_cat):
    """Replace a non-numeric column of ``df`` with integer category codes, in place.

    Numeric columns are left alone. A non-numeric column is encoded when
    ``max_n_cat`` is None or when it has more than ``max_n_cat`` distinct
    categories; otherwise it is left untouched.

    Codes are shifted by one, so a missing value (pandas code -1) becomes 0 and
    real categories start at 1.

    Parameters
    ----------
    df : DataFrame that is modified in place.
    col : the column's values (a Series).
    name : label of the column inside ``df``.
    max_n_cat : int or None, cardinality threshold described above.
    """
    if is_numeric_dtype(col):
        return
    # Build the categorical view once. This also works for plain object/string
    # columns, which have no ``.cat`` accessor, and keeps the declared
    # categories of columns that already have a category dtype.
    categorical = pd.Categorical(col)
    if max_n_cat is None or len(categorical.categories) > max_n_cat:
        df[name] = categorical.codes + 1

In [17]:
def proc_df(df, y_fld=None, na_dict=None, skip_flds=None, max_n_cat=None):
    """Turn a raw DataFrame into an all-numeric feature frame plus target.

    Works on a copy of ``df``; the input frame and the ``skip_flds`` /
    ``na_dict`` arguments are never modified.

    Steps, in order:
      1. Extract the target column ``y_fld`` (category-coded first if it is
         not numeric) and drop it together with ``skip_flds``.
      2. Fill missing numeric values via ``fix_missing``, which adds
         ``<name>_na`` indicator columns.
      3. Integer-encode non-numeric columns via ``numericalize``.
      4. One-hot encode whatever non-numeric columns remain
         (``pd.get_dummies`` with ``dummy_na=True``).

    Parameters
    ----------
    df : input DataFrame.
    y_fld : name of the target column, or None when there is no target.
    na_dict : optional dict of column name -> fill value to reuse.
    skip_flds : optional column label or iterable of labels to drop.
    max_n_cat : passed to ``numericalize``; None encodes every non-numeric
        column as integer codes.

    Returns
    -------
    list ``[features, y, na_dict]``: the processed DataFrame, the target
    values (None when ``y_fld`` is None) and the fill values that were used.
    """
    # Always work on a private list: the original ``skip_flds += [y_fld]``
    # appended to the caller's list and failed for tuples.
    if skip_flds is None:
        skip_flds = []
    elif isinstance(skip_flds, str):
        skip_flds = [skip_flds]
    else:
        skip_flds = list(skip_flds)
    df = df.copy()

    if y_fld is None:
        y = None
    else:
        if not is_numeric_dtype(df[y_fld]):
            df[y_fld] = pd.Categorical(df[y_fld]).codes
        y = df[y_fld].values
        skip_flds.append(y_fld)
    df.drop(skip_flds, axis=1, inplace=True)

    na_dict = {} if na_dict is None else na_dict.copy()
    na_dict_initial = na_dict.copy()
    # Iterate over a snapshot: fix_missing inserts ``<name>_na`` columns into
    # ``df`` while the loop is running.
    for n, c in list(df.items()):
        na_dict = fix_missing(df, c, n, na_dict)
    if len(na_dict_initial) > 0:
        # A fill dict was supplied: drop the indicator columns of any column
        # that was not part of it (presumably so the output has the same
        # columns as the frame the dict was built from -- confirm with callers).
        newly_filled = set(na_dict.keys()) - set(na_dict_initial.keys())
        df.drop([a + '_na' for a in newly_filled], axis=1, inplace=True)

    for n, c in list(df.items()):
        numericalize(df, c, n, max_n_cat)
    df = pd.get_dummies(df, dummy_na=True)
    return [df, y, na_dict]

In [20]:
# Build the model matrix: SalePrice is split off as the target y, Id is dropped, and
# nas records the fill value used for each numeric column that had missing values.
df, y, nas = proc_df(df_raw, y_fld='SalePrice', skip_flds=['Id'])

In [21]:
# Hold out 20% of the rows for validation; random_state fixes the shuffle so the
# split (and hence the scores below) is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(df, y, test_size=0.20, random_state=42)

In [22]:
def rmse(x, y):
    """Return the root-mean-squared difference between ``x`` and ``y``.

    Both arguments must support elementwise subtraction, and the squared
    difference must expose ``.mean()`` (e.g. NumPy arrays).
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

def print_score(m):
    """Print a fitted model's scores on the notebook's train/validation split.

    Reads the globals ``X_train``, ``y_train``, ``X_valid`` and ``y_valid``.
    The printed list is, in order:
    [train RMSE, valid RMSE, train ``m.score``, valid ``m.score``],
    followed by ``m.oob_score_`` when the model exposes that attribute.
    For scikit-learn regressors ``score`` is the R^2 coefficient.
    """
    splits = ((X_train, y_train), (X_valid, y_valid))
    scores = []
    for features, target in splits:
        scores.append(rmse(m.predict(features), target))
    for features, target in splits:
        scores.append(m.score(features, target))
    if hasattr(m, 'oob_score_'):
        scores.append(m.oob_score_)
    print(scores)

In [23]:
# Baseline random forest with default hyperparameters; the seed is fixed for
# reproducibility and n_jobs=-1 lets scikit-learn use every available core.
m = RandomForestRegressor(random_state = 33, n_jobs=-1)
%time m.fit(X_train, y_train)
# Prints [train RMSE, valid RMSE, train score, valid score] on the log-price scale.
print_score(m)



CPU times: user 278 ms, sys: 9.76 ms, total: 288 ms
Wall time: 134 ms
[0.07090620246850389, 0.15265031559969, 0.9670190945684138, 0.8751299747733363]


In [24]:
# Plain linear regression on the same features, as a reference point for the
# tree ensembles in the neighbouring cells.
m = LinearRegression()
%time m.fit(X_train, y_train)
print_score(m)

CPU times: user 126 ms, sys: 16.1 ms, total: 143 ms
Wall time: 165 ms
[0.1251558369627743, 0.16355588031263046, 0.897246567361794, 0.856650853269142]


In [25]:
# Extra-trees ensemble with default hyperparameters and a fixed seed.
# NOTE(review): in the recorded run the training RMSE is ~1e-5 against ~0.156 on
# validation, i.e. the default trees reproduce the training targets almost exactly.
m = ExtraTreesRegressor(random_state = 33, n_jobs=-1)
%time m.fit(X_train, y_train)
print_score(m)



CPU times: user 220 ms, sys: 5.89 ms, total: 226 ms
Wall time: 124 ms
[9.745099143420879e-06, 0.15582568303169486, 0.9999999993770307, 0.869880955693509]


In [26]:
# Bagging ensemble around scikit-learn's default base estimator, fixed seed.
# NOTE(review): the recorded wall time (5.37 s) is far above the CPU time (154 ms);
# presumably overhead from starting parallel workers with n_jobs=-1 -- confirm.
m = BaggingRegressor(random_state = 33, n_jobs=-1)
%time m.fit(X_train, y_train)
print_score(m)

CPU times: user 84.4 ms, sys: 69.7 ms, total: 154 ms
Wall time: 5.37 s
[0.06903936438238106, 0.15623291864366734, 0.9687328937041296, 0.8691999594853789]


In [27]:
# Gradient boosting with default hyperparameters and a fixed seed (no n_jobs
# argument is passed here).
# In the recorded run this has the lowest validation RMSE (~0.142) of the six models.
m = GradientBoostingRegressor(random_state = 33)
%time m.fit(X_train, y_train)
print_score(m)

CPU times: user 392 ms, sys: 9.2 ms, total: 402 ms
Wall time: 406 ms
[0.07642927611508575, 0.1415361814320951, 0.9616810487914588, 0.8926510662322463]


In [29]:
# AdaBoost with default hyperparameters and a fixed seed.
# In the recorded run this has the highest validation RMSE (~0.184) of the six models.
m = AdaBoostRegressor(random_state = 33)
%time m.fit(X_train, y_train)
print_score(m)

CPU times: user 300 ms, sys: 19.1 ms, total: 319 ms
Wall time: 320 ms
[0.14594402431289205, 0.18415258435315512, 0.8602773685370386, 0.8182734329829319]
