In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats


import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error


In [2]:
#Importing dataset

train = pd.read_csv('zillow/train.csv')

#Test for evaluation
test = pd.read_csv('zillow/test.csv')

In [3]:
#Fixing typo

train = train.replace({"Exterior2nd":{"CmentBd":"CemntBd"}})
train = train[train.GrLivArea < 4500]

In [4]:
#Saving Id for further refference
Id_train =train['Id']
Id_test = test['Id']
#Dropping Id from dataset
train.drop('Id', axis=1, inplace=True)  #For feature engeneering
test.drop('Id', axis=1, inplace = True)

In [5]:
#Saving SalePrice
train['SalePrice'] = np.log1p(train['SalePrice'])
y = train['SalePrice'].reset_index(drop=True)

In [6]:
#Concatenate train and test 
df = pd.concat([train, test]).reset_index(drop=True)

In [7]:
#Remove SalePrice from df
df = df.drop(['SalePrice'], axis=1)

In [8]:
#Check shape
df.shape

(2917, 79)

In [9]:
#Function that split dataset to numerical and textual and check for missing values

def missing_numcat(data, categorical=True):
    """This function splits dataset into numerical and textual portion of the data and make dataframe with missing values"""
    if categorical:
        categorical_text = data.select_dtypes(include='object')
        categorical_cnt = categorical_text.count()
        cat_with_missing = categorical_cnt[categorical_cnt.values < len(data)]
        cat_percent =  (len(data) - cat_with_missing.values) / len(data)
        missing_categorical =pd.DataFrame({'values': cat_with_missing,
                                  'NaN vals':(len(data) - cat_with_missing.values), '(%) NaN':cat_percent})
        return missing_categorical
    else:
        numerical_data = data._get_numeric_data()
        numerical_cnt = numerical_data.count()
        num_with_missing = numerical_cnt[numerical_cnt.values < len(data)]
        num_percent =  (len(data) - num_with_missing.values) / len(data)
        missing_numerical =pd.DataFrame({'values': num_with_missing,
                                          'NaN vals':(len(data) - num_with_missing.values), '(%) NaN':num_percent}) 
        return missing_numerical
    
    
    
#Function that fixes skewnes
from scipy.stats import skew
def fix_skew(c):
    """This function takes only numeric portion of the dataset and fix skewnes"""
    numeric_feats = c.dtypes[c.dtypes != "object"].index

    skewed_feats = c[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index

    c[skewed_feats] = np.log1p(c[skewed_feats])
    return c 

In [10]:
missing_numcat(df, categorical=True) #Before

Unnamed: 0,values,NaN vals,(%) NaN
Alley,198,2719,0.932122
BsmtCond,2835,82,0.028111
BsmtExposure,2835,82,0.028111
BsmtFinType1,2838,79,0.027083
BsmtFinType2,2837,80,0.027425
BsmtQual,2836,81,0.027768
Electrical,2916,1,0.000343
Exterior1st,2916,1,0.000343
Exterior2nd,2916,1,0.000343
Fence,571,2346,0.804251


In [11]:
missing_numcat(df, categorical=False)

Unnamed: 0,values,NaN vals,(%) NaN
BsmtFinSF1,2916,1,0.000343
BsmtFinSF2,2916,1,0.000343
BsmtFullBath,2915,2,0.000686
BsmtHalfBath,2915,2,0.000686
BsmtUnfSF,2916,1,0.000343
GarageArea,2916,1,0.000343
GarageCars,2916,1,0.000343
GarageYrBlt,2758,159,0.054508
LotFrontage,2431,486,0.16661
MasVnrArea,2894,23,0.007885


In [12]:
df['MSSubClass'] = df['MSSubClass'].apply(str)
df['YrSold'] = df['YrSold'].astype(str)
df['MoSold'] = df['MoSold'].astype(str)
df['Functional'] = df['Functional'].fillna('Typ') 
df['Electrical'] = df['Electrical'].fillna("SBrkr") 
df['KitchenQual'] = df['KitchenQual'].fillna("TA") 
df["PoolQC"] = df["PoolQC"].fillna("None")
df['Exterior1st'] = df['Exterior1st'].fillna(df['Exterior1st'].mode()[0]) 
df['Exterior2nd'] = df['Exterior2nd'].fillna(df['Exterior2nd'].mode()[0])
df['SaleType'] = df['SaleType'].fillna(df['SaleType'].mode()[0])

In [13]:
#Garage
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)
df['GarageArea'] = df['GarageArea'].fillna(0)
df['GarageCars'] = df['GarageCars'].fillna(0)
df['GarageType'] = df['GarageType'].fillna("None")
df['GarageFinish'] = df['GarageFinish'].fillna("None")
df['GarageQual'] = df['GarageQual'].fillna("None")
df['GarageCond'] = df['GarageCond'].fillna("None")

In [14]:
#Basement
df['BsmtQual'] = df['BsmtQual'].fillna("None")
df['BsmtCond'] = df['BsmtCond'].fillna("None")
df['BsmtExposure'] = df['BsmtExposure'].fillna("None")
df['BsmtFinType1'] = df['BsmtFinSF1'].fillna("None")
df['BsmtFinType2'] = df['BsmtFinType2'].fillna("None")

In [15]:
#MSzoning
df['MSZoning'] = df.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))

In [16]:
#Fill none with categorical
objects = []
for i in df.columns:
    if df[i].dtype == object:
        objects.append(i)
df.update(df[objects].fillna('None'))

In [17]:
df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.mean()))

In [18]:
def nnous_sel(df):
    """This function dedect non informative features with more than 90 % zeros"""
    nn_cols = df.loc[:, (df.isin([' ','NULL',0]) | df.isnull()).mean() <= 0.9].columns.tolist()
    col_tdrop = set(df.columns.tolist()) - set(nn_cols)
    return df

In [19]:
#df = nnous_sel(df)

In [20]:
#Fill Zero with numerical
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics = []
for i in df.columns:
    if df[i].dtype in numeric_dtypes:
        numerics.append(i)

        df.update(df[numerics].fillna(0))

In [21]:
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
#Fixing skewnes with numerical
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics2 = []
for i in df.columns:
    if df[i].dtype in numeric_dtypes:
        numerics2.append(i)
skew_features = df[numerics2].apply(lambda x: skew(x)).sort_values(ascending=False)

high_skew = skew_features[skew_features > 0.6]
skew_index = high_skew.index

for i in skew_index:
    df[i] = boxcox1p(df[i], boxcox_normmax(df[i] + 1))

In [22]:
#Function that fixes skewnes
from scipy.stats import skew
def fix_skew(c):
    """This function takes only numeric portion of the dataset and fix skewnes"""
    numeric_feats = c.dtypes[c.dtypes != "object"].index

    skewed_feats = c[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.4]
    skewed_feats = skewed_feats.index

    c[skewed_feats] = np.log1p(c[skewed_feats])
    return c 

df = fix_skew(df)

In [23]:
#Adding new features and drop non informative

df = df.drop(['Utilities', 'Street', 'PoolQC',], axis=1)

df['YrBltAndRemod']=df['YearBuilt']+df['YearRemodAdd']
df['TotalSF']=df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']

df['Total_sqr_footage'] = (df['BsmtFinSF1'] + df['BsmtFinSF2'] +
                                 df['1stFlrSF'] + df['2ndFlrSF'])

df['Total_Bathrooms'] = (df['FullBath'] + (0.5 * df['HalfBath']) +
                               df['BsmtFullBath'] + (0.5 * df['BsmtHalfBath']))

df['Total_porch_sf'] = (df['OpenPorchSF'] + df['3SsnPorch'] +
                              df['EnclosedPorch'] + df['ScreenPorch'] +
                              df['WoodDeckSF'])

In [24]:
df['haspool'] = df['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
df['has2ndfloor'] = df['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
df['hasgarage'] = df['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
df['hasbsmt'] = df['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
df['hasfireplace'] = df['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

In [25]:
#Check for missing values in categorical
missing_numcat(df, categorical=True)

Unnamed: 0,values,NaN vals,(%) NaN


In [26]:
#Check for missing values in numerical
missing_numcat(df, categorical=False)

Unnamed: 0,values,NaN vals,(%) NaN


In [27]:
#Check shape
print(df.shape)

(2917, 86)


In [28]:
#COnvert categorical values using dummy
#final = pd.get_dummies(df).reset_index(drop=True)
#final.shape
final = df

In [29]:
X = final.iloc[:len(y), :]
X_test = final.iloc[len(y):, :]
X.shape, y.shape, X_test.shape

((1458, 86), (1458,), (1459, 86))

In [30]:
outliers = [30, 88, 462, 631, 1322]
X = X.drop(X.index[outliers])
y = y.drop(y.index[outliers])

overfit = []
for i in X.columns:
    counts = X[i].value_counts()
    zeros = counts.iloc[0]
    if zeros / len(X) * 100 > 90.00:
        overfit.append(i)

overfit = list(overfit)
X = X.drop(overfit, axis=1)
X_test = X_test.drop(overfit, axis=1)

In [31]:
X.shape, y.shape, X_test.shape

((1453, 65), (1453,), (1459, 65))

In [32]:
#For stratification
Z = pd.concat((X,y), axis=1)

In [33]:
#Grids
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

In [34]:
#Split  training dataset to train and test set
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.13, random_state=42)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.22, random_state=12)

for train_index, test_index in split.split(Z, Z['Neighborhood']):
    str_train = Z.iloc[train_index]
    str_test   = Z.iloc[test_index]
    

y_train = str_train['SalePrice']
y_ver  = str_test['SalePrice']
X_train = str_train.drop('SalePrice', axis = 1)
X_ver = str_test.drop('SalePrice', axis = 1)

In [35]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, RobustScaler, Normalizer
from sklearn.impute import SimpleImputer


housing_num = X_train._get_numeric_data()
housing_cat = X_train.select_dtypes('object')

In [36]:
num_pipeline = Pipeline([
    ('std_scal', RobustScaler(with_centering=True, with_scaling=True)),
])

In [37]:
cat_pipeline = Pipeline([
    ('one_hen', OneHotEncoder(handle_unknown='ignore')),
])

In [38]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
num_attrs = housing_num.columns.tolist()
cat_attrs = housing_cat.columns.tolist()
full_pipeline = ColumnTransformer([
    
    ('numerical_pl', num_pipeline, num_attrs),
    ('categorical_pl', cat_pipeline, cat_attrs ),
])

In [39]:
X_train_prepared = full_pipeline.fit_transform(X_train)
X_ver_prepared = full_pipeline.transform(X_ver)
test_prepared = full_pipeline.transform(X_test)

# Ridge

In [67]:
from sklearn.linear_model import Ridge
rdg = Ridge(alpha=0.01, max_iter=1e8)
rdg.fit(X_train_prepared, y_train)

Ridge(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=100000000.0,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [68]:
from sklearn.metrics import mean_squared_error, mean_squared_log_error
def evaluate(X_train, y_train, X_test,y_test, clf):
    clf_predict_train = clf.predict(X_train)
    clf_predict_test = clf.predict(X_test)
    clfmse_train = mean_squared_error(np.expm1(y_train), np.expm1(clf_predict_train))**0.5
    clfmse_test = mean_squared_error(np.expm1(y_test), np.expm1(clf_predict_test))**0.5
    clfrmse_train = mean_squared_log_error(np.expm1(y_train), np.expm1(clf_predict_train))**0.5
    clfrmse_test = mean_squared_log_error(np.expm1(y_test), np.expm1(clf_predict_test))**0.5
    print("Evaluate---","***", "----")
    print("RMSE-train",clfmse_train)
    print("RMSE-test", clfmse_test)
    print("RMSLE-train", clfrmse_train)
    print("RMSLE-test", clfrmse_test)

In [69]:
#Ridge
evaluate(X_train_prepared, y_train, X_ver_prepared, y_ver,rdg)

Evaluate--- *** ----
RMSE-train 15289.723190993534
RMSE-test 19323.71102404728
RMSLE-train 0.07734764150934674
RMSLE-test 0.10621138633971687


# Lasso

In [82]:
from sklearn import linear_model
lso = linear_model.Lasso(max_iter=1e7, alpha=0.0007, random_state=42, tol=0.0001)
lso.fit(X_train_prepared, y_train)

Lasso(alpha=0.0007, copy_X=True, fit_intercept=True, max_iter=10000000.0,
      normalize=False, positive=False, precompute=False, random_state=42,
      selection='cyclic', tol=0.0001, warm_start=False)

In [83]:
#Lasso
evaluate(X_train_prepared, y_train, X_ver_prepared, y_ver,lso)

Evaluate--- *** ----
RMSE-train 19358.337399031487
RMSE-test 18397.147802744126
RMSLE-train 0.09670662635888894
RMSLE-test 0.09798096117024632


In [45]:
rdg_predict = rdg.predict(test_prepared)

In [46]:
rdg_predict = np.expm1(rdg_predict)
rdg_predict

array([123562.11449056, 160152.6557215 , 184734.13775574, ...,
       168399.94638457, 116549.90489401, 215864.79942269])

In [70]:
final_df= pd.DataFrame(data=zip(Id_test,rdg_predict), columns=['Id', 'SalePrice'])
final_df.to_csv('zillow/sample_submission.csv', index=False, columns=['Id', 'SalePrice'])
pd.read_csv('zillow/sample_submission.csv')
final_df;

# ElasticNet

In [92]:
from sklearn.linear_model import ElasticNet

elnet = ElasticNet(alpha=0.001)
elnet.fit(X_train_prepared, y_train)

ElasticNet(alpha=0.001, copy_X=True, fit_intercept=True, l1_ratio=0.5,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [93]:
#ElasticNet
evaluate(X_train_prepared, y_train, X_ver_prepared, y_ver,elnet)

Evaluate--- *** ----
RMSE-train 19085.488870571226
RMSE-test 18324.810445624335
RMSLE-train 0.09506550288383965
RMSLE-test 0.09810800356691095


In [115]:
rdg_predict = rdg.predict(X_ver_prepared)
lasso_predict = lso.predict(X_ver_prepared)
el_predict = lso.predict(X_ver_prepared)

In [122]:
stacked = np.expm1(0.1*rdg_predict + 0.3*lasso_predict + 0.6*el_predict)

In [123]:
mean_squared_log_error(np.expm1(y_ver), stacked)**0.5

0.09808204341353054

In [112]:
X_train.shape

(1133, 65)

In [113]:
X_ver.shape

(320, 65)