In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats


import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error


In [2]:
#Importing dataset

train = pd.read_csv('zillow/train.csv')

#Test for evaluation
test = pd.read_csv('zillow/test.csv')

In [3]:
#Fixing typo

train = train.replace({"Exterior2nd":{"CmentBd":"CemntBd"}})

In [4]:
#Saving Id for further refference
Id_train =train['Id']
Id_test = test['Id']
#Dropping Id from dataset
train.drop('Id', axis=1, inplace=True)  #For feature engeneering
test.drop('Id', axis=1, inplace = True)

In [5]:
#Outlier removal

print("Shape before removal ", train.shape)
#train.drop(train[(train['OverallQual']<5) & (train['SalePrice']>200000)].index, inplace=True)
train.drop(train[(train['GrLivArea']>2500) & (train['SalePrice']<300000)].index, inplace=True)
print("Shape after removal ", train.shape)

Shape before removal  (1460, 80)
Shape after removal  (1427, 80)


In [6]:
#Saving SalePrice
train['SalePrice'] = np.log1p(train['SalePrice'])
y = train['SalePrice'].reset_index(drop=True)

In [7]:
#Concatenate train and test 
df = pd.concat([train, test]).reset_index(drop=True)

In [8]:
#Remove SalePrice from df
df = df.drop(['SalePrice'], axis=1)

In [9]:
#Check shape
df.shape

(2886, 79)

In [10]:
#Function that split dataset to numerical and textual and check for missing values

def missing_numcat(data, categorical=True):
    """This function splits dataset into numerical and textual portion of the data and make dataframe with missing values"""
    if categorical:
        categorical_text = data.select_dtypes(include='object')
        categorical_cnt = categorical_text.count()
        cat_with_missing = categorical_cnt[categorical_cnt.values < len(data)]
        cat_percent =  (len(data) - cat_with_missing.values) / len(data)
        missing_categorical =pd.DataFrame({'values': cat_with_missing,
                                  'NaN vals':(len(data) - cat_with_missing.values), '(%) NaN':cat_percent})
        return missing_categorical
    else:
        numerical_data = data._get_numeric_data()
        numerical_cnt = numerical_data.count()
        num_with_missing = numerical_cnt[numerical_cnt.values < len(data)]
        num_percent =  (len(data) - num_with_missing.values) / len(data)
        missing_numerical =pd.DataFrame({'values': num_with_missing,
                                          'NaN vals':(len(data) - num_with_missing.values), '(%) NaN':num_percent}) 
        return missing_numerical
    
    
    
#Function that fixes skewnes
from scipy.stats import skew
def fix_skew(c):
    """This function takes only numeric portion of the dataset and fix skewnes"""
    numeric_feats = c.dtypes[c.dtypes != "object"].index

    skewed_feats = c[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index

    c[skewed_feats] = np.log1p(c[skewed_feats])
    return c 

In [11]:
missing_numcat(df, categorical=True) #Before

Unnamed: 0,values,NaN vals,(%) NaN
Alley,193,2693,0.933125
BsmtCond,2804,82,0.028413
BsmtExposure,2804,82,0.028413
BsmtFinType1,2807,79,0.027374
BsmtFinType2,2806,80,0.02772
BsmtQual,2805,81,0.028067
Electrical,2885,1,0.000347
Exterior1st,2885,1,0.000347
Exterior2nd,2885,1,0.000347
Fence,563,2323,0.80492


In [12]:
missing_numcat(df, categorical=False)

Unnamed: 0,values,NaN vals,(%) NaN
BsmtFinSF1,2885,1,0.000347
BsmtFinSF2,2885,1,0.000347
BsmtFullBath,2884,2,0.000693
BsmtHalfBath,2884,2,0.000693
BsmtUnfSF,2885,1,0.000347
GarageArea,2885,1,0.000347
GarageCars,2885,1,0.000347
GarageYrBlt,2729,157,0.054401
LotFrontage,2407,479,0.165974
MasVnrArea,2864,22,0.007623


In [13]:
df['MSSubClass'] = df['MSSubClass'].apply(str)
df['YrSold'] = df['YrSold'].astype(str)
df['MoSold'] = df['MoSold'].astype(str)
df['Functional'] = df['Functional'].fillna('Typ') 
df['Electrical'] = df['Electrical'].fillna("SBrkr") 
df['KitchenQual'] = df['KitchenQual'].fillna("TA") 
df["PoolQC"] = df["PoolQC"].fillna("None")
df['Exterior1st'] = df['Exterior1st'].fillna(df['Exterior1st'].mode()[0]) 
df['Exterior2nd'] = df['Exterior2nd'].fillna(df['Exterior2nd'].mode()[0])
df['SaleType'] = df['SaleType'].fillna(df['SaleType'].mode()[0])

In [14]:
#Garage
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)
df['GarageArea'] = df['GarageArea'].fillna(0)
df['GarageCars'] = df['GarageCars'].fillna(0)
df['GarageType'] = df['GarageType'].fillna("None")
df['GarageFinish'] = df['GarageFinish'].fillna("None")
df['GarageQual'] = df['GarageQual'].fillna("None")
df['GarageCond'] = df['GarageCond'].fillna("None")

In [15]:
#Basement
df['BsmtQual'] = df['BsmtQual'].fillna("None")
df['BsmtCond'] = df['BsmtCond'].fillna("None")
df['BsmtExposure'] = df['BsmtExposure'].fillna("None")
df['BsmtFinType1'] = df['BsmtFinType1'].fillna("None")
df['BsmtFinType2'] = df['BsmtFinType2'].fillna("None")

In [16]:
#MSzoning
df['MSZoning'] = df.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))

In [17]:
#Fill none with categorical
objects = []
for i in df.columns:
    if df[i].dtype == object:
        objects.append(i)
df.update(df[objects].fillna('None'))

In [18]:
df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

In [19]:
#Fill Zero with numerical
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics = []
for i in df.columns:
    if df[i].dtype in numeric_dtypes:
        numerics.append(i)

        df.update(df[numerics].fillna(0))

In [20]:
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
#Fixing skewnes with numerical
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics2 = []
for i in df.columns:
    if df[i].dtype in numeric_dtypes:
        numerics2.append(i)
skew_features = df[numerics2].apply(lambda x: skew(x)).sort_values(ascending=False)

high_skew = skew_features[skew_features > 0.5]
skew_index = high_skew.index

for i in skew_index:
    df[i] = boxcox1p(df[i], boxcox_normmax(df[i] + 1))

In [21]:
#Function that fixes skewnes
from scipy.stats import skew
def fix_skew(c):
    """This function takes only numeric portion of the dataset and fix skewnes"""
    numeric_feats = c.dtypes[c.dtypes != "object"].index

    skewed_feats = c[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.5]
    skewed_feats = skewed_feats.index

    c[skewed_feats] = np.log1p(c[skewed_feats])
    return c 

df = fix_skew(df)

#Print evaluation results
from sklearn.metrics import mean_squared_error, mean_squared_log_error
def evaluate(X_train, y_train, X_test,y_test, clf):
    clf_predict_train = clf.predict(X_train)
    clf_predict_test = clf.predict(X_test)
    clfmse_train = mean_squared_error(np.expm1(y_train), np.expm1(clf_predict_train))**0.5
    clfmse_test = mean_squared_error(np.expm1(y_test), np.expm1(clf_predict_test))**0.5
    clfrmse_train = mean_squared_log_error(np.expm1(y_train), np.expm1(clf_predict_train))**0.5
    clfrmse_test = mean_squared_log_error(np.expm1(y_test), np.expm1(clf_predict_test))**0.5
    print("Evaluate---","***", "----")
    print("RMSE-train",clfmse_train)
    print("RMSE-test", clfmse_test)
    print(("RMSE-test - RMSE-Train", (clfmse_train - clfmse_test)))
    print("RMSLE-train", clfrmse_train)
    print("RMSLE-test", clfrmse_test)

In [22]:
#Adding new features and drop non informative

df = df.drop(['Utilities', 'Street', 'PoolQC',], axis=1)

df['YrBltAndRemod']=df['YearBuilt']+df['YearRemodAdd']
df['TotalSF']=df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']

df['Total_sqr_footage'] = (df['BsmtFinSF1'] + df['BsmtFinSF2'] +
                                 df['1stFlrSF'] + df['2ndFlrSF'])

df['Total_Bathrooms'] = (df['FullBath'] + (0.5 * df['HalfBath']) +
                               df['BsmtFullBath'] + (0.5 * df['BsmtHalfBath']))

df['Total_porch_sf'] = (df['OpenPorchSF'] + df['3SsnPorch'] +
                              df['EnclosedPorch'] + df['ScreenPorch'] +
                              df['WoodDeckSF'])

In [23]:
df['haspool'] = df['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
df['has2ndfloor'] = df['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
df['hasgarage'] = df['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
df['hasbsmt'] = df['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
df['hasfireplace'] = df['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

In [24]:
#Check for missing values in categorical
missing_numcat(df, categorical=True)

Unnamed: 0,values,NaN vals,(%) NaN


In [25]:
#Check for missing values in numerical
missing_numcat(df, categorical=False)

Unnamed: 0,values,NaN vals,(%) NaN


In [26]:
#Check shape
print(df.shape)

(2886, 86)


In [27]:
#COnvert categorical values using dummy
#final = pd.get_dummies(df).reset_index(drop=True)
#final.shape
final = df

In [28]:
X = final.iloc[:len(y), :]
X_test = final.iloc[len(y):, :]
X.shape, y.shape, X_test.shape

((1427, 86), (1427,), (1459, 86))

In [29]:
outliers = [30, 88, 462, 631, 1322]
X = X.drop(X.index[outliers])
y = y.drop(y.index[outliers])

overfit = []
for i in X.columns:
    counts = X[i].value_counts()
    zeros = counts.iloc[0]
    if zeros / len(X) * 100 > 90:
        overfit.append(i)

overfit = list(overfit)
print(overfit)
#X = X.drop(overfit, axis=1)
#X_test = X_test.drop(overfit, axis=1)

['3SsnPorch', 'Alley', 'BsmtHalfBath', 'CentralAir', 'Condition2', 'Electrical', 'Functional', 'GarageCond', 'Heating', 'KitchenAbvGr', 'LandContour', 'LandSlope', 'LowQualFinSF', 'MiscFeature', 'MiscVal', 'PavedDrive', 'PoolArea', 'RoofMatl', 'ScreenPorch', 'haspool', 'hasgarage', 'hasbsmt']


In [30]:
X.shape, y.shape, X_test.shape

((1422, 86), (1422,), (1459, 86))

In [31]:
#For stratification
Z = pd.concat((X,y), axis=1)

In [32]:
#Grids
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

In [33]:
#Split  training dataset to train and test set
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.13, random_state=42)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, train_size=0.7, random_state=24)

for train_index, test_index in split.split(Z, Z['Neighborhood']):
    str_train = Z.iloc[train_index]
    str_test   = Z.iloc[test_index]
    

y_train = str_train['SalePrice']
y_ver  = str_test['SalePrice']
X_train = str_train.drop('SalePrice', axis = 1)
X_ver = str_test.drop('SalePrice', axis = 1)

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, RobustScaler, Normalizer
from sklearn.impute import SimpleImputer


housing_num = X_train._get_numeric_data()
housing_cat = X_train.select_dtypes('object')

In [35]:
num_pipeline = Pipeline([
    ('std_scal', RobustScaler()),
])

In [36]:
cat_pipeline = Pipeline([
    ('one_hen', OneHotEncoder(handle_unknown='ignore')),
])

In [37]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
num_attrs = housing_num.columns.tolist()
cat_attrs = housing_cat.columns.tolist()
full_pipeline = ColumnTransformer([
    
    ('numerical_pl', num_pipeline, num_attrs),
    ('categorical_pl', cat_pipeline, cat_attrs ),
])

In [38]:
X_train_prepared = full_pipeline.fit_transform(X_train)
X_ver_prepared = full_pipeline.transform(X_ver)
test_prepared = full_pipeline.transform(X_test)

In [39]:
from sklearn.linear_model import Ridge
rdg = RidgeCV()
rdg.fit(X_train_prepared, y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=None, fit_intercept=True,
        gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

In [40]:
#Ridge
evaluate(X_train_prepared, y_train, X_ver_prepared, y_ver,rdg)

Evaluate--- *** ----
RMSE-train 16837.398358967268
RMSE-test 22265.405322765517
('RMSE-test - RMSE-Train', -5428.006963798249)
RMSLE-train 0.09078240064805718
RMSLE-test 0.1100897745889497


In [48]:
elastic_net = ElasticNetCV(alphas=alphas_alt)
elastic_net.fit(X_train_prepared, y_train)

ElasticNetCV(alphas=[14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4,
                     15.5],
             copy_X=True, cv='warn', eps=0.001, fit_intercept=True,
             l1_ratio=0.5, max_iter=1000, n_alphas=100, n_jobs=None,
             normalize=False, positive=False, precompute='auto',
             random_state=None, selection='cyclic', tol=0.0001, verbose=0)

In [49]:
#Elastic nET
evaluate(X_train_prepared, y_train, X_ver_prepared, y_ver,elastic_net)

Evaluate--- *** ----
RMSE-train 81136.63176364661
RMSE-test 80186.79673322012
('RMSE-test - RMSE-Train', 949.8350304264895)
RMSLE-train 0.4026841070029064
RMSLE-test 0.3858349087330694


In [43]:
#LASSO
lasso = LassoCV()

lasso.fit(X_train_prepared, y_train)

#Lasso
evaluate(X_train_prepared, y_train, X_ver_prepared, y_ver,lasso)

Evaluate--- *** ----
RMSE-train 19682.997221295856
RMSE-test 23353.09224494613
('RMSE-test - RMSE-Train', -3670.095023650272)
RMSLE-train 0.10507958235184374
RMSLE-test 0.11275485668933945


In [44]:
#svr
svr = SVR()
svr.fit(X_train_prepared, y_train)

#SVR
evaluate(X_train_prepared, y_train, X_ver_prepared, y_ver,svr)

Evaluate--- *** ----
RMSE-train 18565.36841339645
RMSE-test 24305.294566492725
('RMSE-test - RMSE-Train', -5739.926153096276)
RMSLE-train 0.09791200902697306
RMSLE-test 0.11705475296214081


In [45]:
#GRADIENT BOOSTING REGRESSOR
gbr = GradientBoostingRegressor()
gbr.fit(X_train_prepared, y_train)
#gbr
evaluate(X_train_prepared, y_train, X_ver_prepared, y_ver,gbr)

Evaluate--- *** ----
RMSE-train 13485.46846114694
RMSE-test 27408.762373783175
('RMSE-test - RMSE-Train', -13923.293912636234)
RMSLE-train 0.07117758705505402
RMSLE-test 0.12313082105743182


In [46]:
rf_predict = elastic_net.predict(test_prepared)
rf_predict = np.expm1(rf_predict)
rf_predict

array([121150.47757047, 163082.19183483, 189819.87257633, ...,
       172190.88262456, 123968.16735733, 236788.34211046])

In [47]:
final_df= pd.DataFrame(data=zip(Id_test,rf_predict), columns=['Id', 'SalePrice'])
final_df.to_csv('zillow/sample_submission.csv', index=False, columns=['Id', 'SalePrice'])
pd.read_csv('zillow/sample_submission.csv')
final_df

Unnamed: 0,Id,SalePrice
0,1461,121150.477570
1,1462,163082.191835
2,1463,189819.872576
3,1464,206290.970656
4,1465,185347.801463
...,...,...
1454,2915,90359.799619
1455,2916,86500.664149
1456,2917,172190.882625
1457,2918,123968.167357
