In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats


import warnings
warnings.filterwarnings('ignore')


In [2]:
#Importing dataset

train = pd.read_csv('zillow/train.csv')

#Test for evaluation
test = pd.read_csv('zillow/test.csv')

In [3]:
#Outlier removal

print("Shape before removal ", train.shape)
#train.drop(train[(train['OverallQual']<5) & (train['SalePrice']>200000)].index, inplace=True)
train.drop(train[(train['GrLivArea']>3100) & (train['SalePrice']<300000)].index, inplace=True)
print("Shape after removal ", train.shape)

Shape before removal  (1460, 81)
Shape after removal  (1455, 81)


In [4]:
#Fixing typo

df_train = train.replace({"Exterior2nd":{"CmentBd":"CemntBd"}})

#Saving Id for further refference
Id = df_train['Id']

#Dropping Id from dataset
df_train.drop('Id', axis=1, inplace=True)  #For feature engeneering

In [5]:
#Function that split dataset to numerical and textual and check for missing values

def missing_numcat(data, categorical=True):
    """This function splits dataset into numerical and textual portion of the data and make dataframe with missing values"""
    if categorical:
        categorical_text = data.select_dtypes(include='object')
        categorical_cnt = categorical_text.count()
        cat_with_missing = categorical_cnt[categorical_cnt.values < len(data)]
        cat_percent =  (len(data) - cat_with_missing.values) / len(data)
        missing_categorical =pd.DataFrame({'values': cat_with_missing,
                                  'NaN vals':(len(data) - cat_with_missing.values), '(%) NaN':cat_percent})
        return missing_categorical
    else:
        numerical_data = data._get_numeric_data()
        numerical_cnt = numerical_data.count()
        num_with_missing = numerical_cnt[numerical_cnt.values < len(data)]
        num_percent =  (len(data) - num_with_missing.values) / len(data)
        missing_numerical =pd.DataFrame({'values': num_with_missing,
                                          'NaN vals':(len(data) - num_with_missing.values), '(%) NaN':num_percent}) 
        return missing_numerical
    
    
    
#Function that fixes skewnes
from scipy.stats import skew
def fix_skew(c):
    """This function takes only numeric portion of the dataset and fix skewnes"""
    numeric_feats = c.dtypes[c.dtypes != "object"].index

    skewed_feats = c[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index

    c[skewed_feats] = np.log1p(c[skewed_feats])
    return c    

In [6]:
cat_missing = missing_numcat(df_train, categorical=True)
num_missing = missing_numcat(df_train, categorical=False)

In [7]:
#Missing data on cathegorical /Training

cat_missing 

Unnamed: 0,values,NaN vals,(%) NaN
Alley,90,1365,0.938144
MasVnrType,1447,8,0.005498
BsmtQual,1418,37,0.02543
BsmtCond,1418,37,0.02543
BsmtExposure,1417,38,0.026117
BsmtFinType1,1418,37,0.02543
BsmtFinType2,1417,38,0.026117
Electrical,1454,1,0.000687
FireplaceQu,766,689,0.47354
GarageType,1375,80,0.054983


In [8]:
df_train.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [9]:
#According to the documentation these cathegorical features should have No feature as string instead nan values
df_train['Alley'] = df_train['Alley'].fillna('No Alley Access')
df_train['BsmtQual'] = df_train['BsmtQual'].fillna('No Basement')
df_train['BsmtCond'] = df_train['BsmtCond'].fillna('No Basement')
df_train['BsmtExposure'] = df_train['BsmtExposure'].fillna('No Basement')
df_train['BsmtFinType1'] = df_train['BsmtFinType1'].fillna('No Basement')
df_train['BsmtFinType2'] = df_train['BsmtFinType2'].fillna('No Basement')
df_train['FireplaceQu'] = df_train['FireplaceQu'].fillna('No Fireplace')
df_train['GarageType']  = df_train['GarageType'].fillna('No Garage')
df_train['GarageFinish'] = df_train['GarageFinish'].fillna('No Garage')
df_train['GarageQual'] = df_train['GarageQual'].fillna('No Garage')
df_train['GarageCond'] = df_train['GarageCond'].fillna('No Garage')
df_train['PoolQC'] = df_train['PoolQC'].fillna('No Pool')
df_train['Fence'] = df_train['Fence'].fillna('NoFence')
df_train['MiscFeature'] = df_train['MiscFeature'].fillna('No msc feature')
df_train['MasVnrType'] = df_train['MasVnrType'].fillna('No Veneer')

In [10]:
#Check after imputation
missing_numcat(df_train, categorical=True)

Unnamed: 0,values,NaN vals,(%) NaN
Electrical,1454,1,0.000687


In [11]:
#Check for missing values in  numerical portion of the data
num_missing

Unnamed: 0,values,NaN vals,(%) NaN
LotFrontage,1196,259,0.178007
MasVnrArea,1447,8,0.005498
GarageYrBlt,1375,80,0.054983


In [12]:
def nnous_sel(df):
    """This function dedect non informative features with more than 90 % zeros"""
    nn_cols = df.loc[:, (df.isin([' ','NULL',0]) | df.isnull()).mean() <= 0.9].columns.tolist()
    col_tdrop = set(df.columns.tolist()) - set(nn_cols)
    return df

In [13]:
df_train = df_train.drop(['Utilities', 'Street', 'PoolQC',], axis=1)

df_train['YrBltAndRemod']=df_train['YearBuilt']+df_train['YearRemodAdd']
df_train['TotalSF']=df_train['TotalBsmtSF'] + df_train['1stFlrSF'] + df_train['2ndFlrSF']

df_train['Total_sqr_footage'] = (df_train['BsmtFinSF1'] + df_train['BsmtFinSF2'] +
                                 df_train['1stFlrSF'] + df_train['2ndFlrSF'])

df_train['Total_Bathrooms'] = (df_train['FullBath'] + (0.5 * df_train['HalfBath']) +
                               df_train['BsmtFullBath'] + (0.5 * df_train['BsmtHalfBath']))

df_train['Total_porch_sf'] = (df_train['OpenPorchSF'] + df_train['3SsnPorch'] +
                              df_train['EnclosedPorch'] + df_train['ScreenPorch'] +
                              df_train['WoodDeckSF'])
df_train['AreaAboveGrade'] = df_train['1stFlrSF'] + df_train['2ndFlrSF']

In [14]:
test = test.drop(['Utilities', 'Street', 'PoolQC',], axis=1)

test['YrBltAndRemod']=test['YearBuilt']+test['YearRemodAdd']
test['TotalSF']=test['TotalBsmtSF'] + test['1stFlrSF'] + test['2ndFlrSF']

test['Total_sqr_footage'] = (test['BsmtFinSF1'] + test['BsmtFinSF2'] +
                                 test['1stFlrSF'] + test['2ndFlrSF'])

test['Total_Bathrooms'] = (test['FullBath'] + (0.5 * test['HalfBath']) +
                               test['BsmtFullBath'] + (0.5 * test['BsmtHalfBath']))

test['Total_porch_sf'] = (test['OpenPorchSF'] + test['3SsnPorch'] +
                              test['EnclosedPorch'] + test['ScreenPorch'] +
                              test['WoodDeckSF'])
test['AreaAboveGrade'] = test['1stFlrSF'] + test['2ndFlrSF']

In [15]:
df_train['MSSubClass'] = df_train['MSSubClass'].apply(str)
df_train['YrSold'] = df_train['YrSold'].astype(str)
df_train['MoSold'] = df_train['MoSold'].astype(str)

test['MSSubClass'] = test['MSSubClass'].apply(str)
test['YrSold'] = test['YrSold'].astype(str)
test['MoSold'] = test['MoSold'].astype(str)

In [16]:
df_train.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Alley', 'LotShape',
       'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond',
       'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
       'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
    

In [17]:
df_train['LowQualFinSF'].describe()

count    1455.000000
mean        5.241237
std        45.727997
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max       572.000000
Name: LowQualFinSF, dtype: float64

In [18]:
df_train = df_train.copy()
y = df_train.SalePrice  #Create target variable
X = df_train.drop(labels=['SalePrice'], axis=1).copy() #Cut SalePrice variable from training set

In [19]:
X = nnous_sel(X) #deleting non informative columns from training set

In [20]:
#Split  training dataset to train and test set
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.13, random_state=42)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.1, train_size=0.9, random_state=42)
for train_index, test_index in split.split(X, X['Neighborhood']):
    str_train = df_train.iloc[train_index]
    str_test   = df_train.iloc[test_index]

#OverallQual, TotRmsAbvGrd, BedroomAbvGr
y_train = str_train['SalePrice']
y_test  = str_test['SalePrice']
X_train = str_train.drop('SalePrice', axis = 1)
X_test = str_test.drop('SalePrice', axis = 1)

In [21]:
#Dealing with nan - Categorical
missing_cat_args = missing_numcat(X_train, categorical=True).index.tolist()
missing_cat_args

['Electrical']

In [22]:
#Dealing with nan - Numerical
missing_numcat(X_test, categorical=False)

Unnamed: 0,values,NaN vals,(%) NaN
LotFrontage,118,28,0.191781
GarageYrBlt,139,7,0.047945


In [23]:
from sklearn.base import TransformerMixin, BaseEstimator

class UniversalImputer(BaseEstimator, TransformerMixin):
    
    def __init__(self):
               """Impute missing values.

        If the Series is of dtype Object, then impute with the most frequent object.
        If the Series is not of dtype Object, then impute with the mean.  

        """
        
    def fit(self, X, y=None):
        
        self.fill = pd.Series([X[c].value_counts().index[0] if X[c].dtype == np.dtype('O') else X[c].ffill() for c in X],
        index = X.columns)
        #if X.dtype == np.dtype('O'):
        #    self.fill = X.value_counts().index()[0]
        #else:
        #    self.fill = X.mean()    
    
        return self
    
    def transform(self, X, y=None):
        return X.fillna(self.fill)
        

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, RobustScaler, Normalizer
from sklearn.impute import SimpleImputer


housing_num = X_train._get_numeric_data()
housing_cat = X_train.select_dtypes('object')

In [25]:
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scal', RobustScaler()),
])

In [26]:
cat_pipeline = Pipeline([
    ('imput_cat', UniversalImputer()),
    ('one_hen', OneHotEncoder(handle_unknown='ignore')),
])


In [27]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
num_attrs = housing_num.columns.tolist()
cat_attrs = housing_cat.columns.tolist()
full_pipeline = ColumnTransformer([
    
    ('numerical_pl', num_pipeline, num_attrs),
    ('categorical_pl', cat_pipeline, cat_attrs ),
])

In [28]:
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.transform(X_test)
test_prepared = full_pipeline.transform(nnous_sel(test))

In [29]:
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import cross_val_score
#Score Display
def display_scores(scores):
    """This function display scores , mean and standard deviation uses scores as input"""
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

In [30]:
def evaluate(X_train, y_train, X_test,y_test, clf):
    clf_predict_train = clf.predict(X_train)
    clf_predict_test = clf.predict(X_test)
    clfmse_train = mean_squared_error(y_train, clf_predict_train)**0.5
    clfmse_test = mean_squared_error(y_test, clf_predict_test)**0.5
    clfrmse_train = mean_squared_log_error(y_train, clf_predict_train)**0.5
    clfrmse_test = mean_squared_log_error(y_test, clf_predict_test)**0.5
    print("Evaluate---","***", "----")
    print("RMSE-train",clfmse_train)
    print("RMSE-test", clfmse_test)
    print("RMSLE-train", clfrmse_train)
    print("RMSLE-test", clfrmse_test)

In [31]:
from sklearn.model_selection import RandomizedSearchCV# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [32]:
#rf_random.best_params_

In [33]:
list({'n_estimators': 1400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 100,
 'bootstrap': True})

['n_estimators',
 'min_samples_split',
 'min_samples_leaf',
 'max_features',
 'max_depth',
 'bootstrap']

In [34]:
from sklearn.ensemble import RandomForestRegressor

rf_reg = RandomForestRegressor(n_estimators= 1400,
 min_samples_split=2,
 min_samples_leaf= 1,
 max_features= 'auto',
 max_depth= 100,
 bootstrap= True)
rf_reg.fit(X_train_prepared, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=100,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=1400,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [35]:
#Rndom forest
evaluate(X_train_prepared, y_train, X_test_prepared, y_test,rf_reg);

Evaluate--- *** ----
RMSE-train 9165.498249807917
RMSE-test 20345.95043943756
RMSLE-train 0.054590478855733565
RMSLE-test 0.10486828993680464


In [36]:
id_ = test['Id']
df_test = test.drop(['Id'], axis=1)

In [37]:
rf_predict = rf_reg.predict(test_prepared)

In [38]:
final_df= pd.DataFrame(data=zip(id_,rf_predict), columns=['Id', 'SalePrice'])
final_df.to_csv('zillow/sample_submission.csv', index=False, columns=['Id', 'SalePrice'])
pd.read_csv('zillow/sample_submission.csv')
final_df

Unnamed: 0,Id,SalePrice
0,1461,130036.839286
1,1462,165024.283571
2,1463,180521.097143
3,1464,186214.192143
4,1465,196810.023571
...,...,...
1454,2915,86402.027143
1455,2916,92355.686429
1456,2917,162486.857857
1457,2918,115319.778571


In [39]:
final_df['SalePrice'].describe()

count      1459.000000
mean     179821.277498
std       76989.977357
min       61253.430714
25%      130322.167500
50%      158661.420714
75%      210397.715714
max      645515.270000
Name: SalePrice, dtype: float64