In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats


import warnings
warnings.filterwarnings('ignore')

In [2]:
#Importing dataset

train = pd.read_csv('zillow/train.csv')
Id = train['Id']
train = train.drop('Id', axis=1)
#Test for evaluation
test = pd.read_csv('zillow/test.csv')
test_id = test['Id']
test = test.drop('Id', axis=1)

train_len= len(train)
test_len = len(test)
print(train.shape)
print(test.shape)
print(test_len)
1460+1459

(1460, 80)
(1459, 79)
1459


2919

# Data Cleaning

In [3]:
def nnous_sel(df):
    """This function dedect non informative features with more than 90 % zeros"""
    nn_cols = df.loc[:, (df.isin([' ','NULL',0]) | df.isnull()).mean() <= 0.9].columns.tolist()
    col_tdrop = set(df.columns.tolist()) - set(nn_cols)
    return list(col_tdrop)

In [4]:
#Combined train /test
df = pd.concat((train, test), axis=0, ignore_index=True).reset_index(drop=True)
print(df.shape)

(2919, 80)


In [5]:
#Fixing typo
df = df.replace({"Exterior2nd":{"CmentBd":"CemntBd"}})


##Dropping Id from dataset
df.drop(nnous_sel(df), axis=1, inplace=True)  #For removed non informative columns in combined dataset
print(df.shape)

(2919, 71)


In [6]:
#Function that split dataset to numerical and textual and check for missing values

def missing_numcat(data, categorical=True):
    """This function splits dataset into numerical and textual portion of the data and make dataframe with missing values"""
    if categorical:
        categorical_text = data.select_dtypes(include='object')
        categorical_cnt = categorical_text.count()
        cat_with_missing = categorical_cnt[categorical_cnt.values < len(data)]
        cat_percent =  (len(data) - cat_with_missing.values) / len(data)
        missing_categorical =pd.DataFrame({'values': cat_with_missing,
                                  'NaN vals':(len(data) - cat_with_missing.values), '(%) NaN':cat_percent})
        return missing_categorical
    else:
        numerical_data = data._get_numeric_data()
        numerical_cnt = numerical_data.count()
        num_with_missing = numerical_cnt[numerical_cnt.values < len(data)]
        num_percent =  (len(data) - num_with_missing.values) / len(data)
        missing_numerical =pd.DataFrame({'values': num_with_missing,
                                          'NaN vals':(len(data) - num_with_missing.values), '(%) NaN':num_percent}) 
        return missing_numerical

In [7]:
#According to the documentation these cathegorical features should have No feature as string instead nan values
df['BsmtQual'] = df['BsmtQual'].fillna('No Basement')
df['BsmtCond'] = df['BsmtCond'].fillna('No Basement')
df['BsmtExposure'] = df['BsmtExposure'].fillna('No Basement')
df['BsmtFinType1'] = df['BsmtFinType1'].fillna('No Basement')
df['BsmtFinType2'] = df['BsmtFinType2'].fillna('No Basement')
df['FireplaceQu'] = df['FireplaceQu'].fillna('No Fireplace')
df['GarageType']  = df['GarageType'].fillna('No Garage')
df['GarageFinish'] = df['GarageFinish'].fillna('No Garage')
df['GarageQual'] = df['GarageQual'].fillna('No Garage')
df['GarageCond'] = df['GarageCond'].fillna('No Garage')
df['Fence'] = df['Fence'].fillna('NoFence')
df['MasVnrType'] = df['MasVnrType'].fillna('No Veneer')

In [8]:
#Dealing with categorical missing data
missing_numcat(df, categorical=True)

Unnamed: 0,values,NaN vals,(%) NaN
Electrical,2918,1,0.000343
Exterior1st,2918,1,0.000343
Exterior2nd,2918,1,0.000343
Functional,2917,2,0.000685
KitchenQual,2918,1,0.000343
MSZoning,2915,4,0.00137
SaleType,2918,1,0.000343
Utilities,2917,2,0.000685


In [9]:
def ffier(df, cols, method = None):
    if method == 'ffill':
        for col in cols:
            df[col] = df[col].fillna(method = method)
    return df[col]

In [10]:
cls = missing_numcat(df, categorical=True).index.tolist()
ffier(df, cls, method = 'ffill');

In [11]:
missing_numcat(df, categorical=True)  #check for missing in numerical

Unnamed: 0,values,NaN vals,(%) NaN


In [12]:
cls_num = missing_numcat(df, categorical=False).index.tolist()
for column in cls_num:
    df[column] = df[column].fillna(df[column].mean())

In [13]:
missing_numcat(df, categorical=False)  #check for missing in categorical

Unnamed: 0,values,NaN vals,(%) NaN


In [14]:
from scipy.stats import skew
def fix_skew(c):
    numeric_feats = c.dtypes[c.dtypes != "object"].index

    skewed_feats = c[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index

    c[skewed_feats] = np.log1p(c[skewed_feats])
    return c

df = fix_skew(df)

In [15]:
#Splitting the data to origina state
train = df[0:train_len]
test  = df[test_len:]

In [16]:
print(train.shape)
print(test.shape)

(1460, 71)
(1460, 71)


In [17]:
#Outlier removal
print("Shape before removal ", train.shape)
#train.drop(train[(train['OverallQual']<5) & (train['SalePrice']>200000)].index, inplace=True)
train.drop(train[(train['GrLivArea']>3100) & (train['SalePrice']<300000)].index, inplace=True)
print("Shape after removal ", train.shape)
print(test.shape)

Shape before removal  (1460, 71)
Shape after removal  (1460, 71)
(1460, 71)


In [18]:
train = train.copy()
y = train.SalePrice  #Create target variable
X = train.drop(labels=['SalePrice'], axis=1).copy() #Cut SalePrice variable from training set

In [19]:
#Split  training dataset to train and test set
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.13, random_state=42)

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, train_size=0.8, random_state=42)
for train_index, test_index in split.split(X, X['Neighborhood']):
    str_train = train.iloc[train_index]
    str_test   = train.iloc[test_index]

#OverallQual, TotRmsAbvGrd, BedroomAbvGr
y_train = str_train['SalePrice']
y_test  = str_test['SalePrice']
X_train = str_train.drop('SalePrice', axis = 1)
X_test = str_test.drop('SalePrice', axis = 1)

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, RobustScaler, Normalizer, scale
from sklearn.impute import SimpleImputer


housing_num = X_train._get_numeric_data()
housing_cat = X_train.select_dtypes('object')

In [21]:
num_pipeline = Pipeline([
    ('std_scal', RobustScaler()),
])

cat_pipeline = Pipeline([
    ('one_hen', OneHotEncoder(handle_unknown='ignore')),
])

In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
num_attrs = housing_num.columns.tolist()
cat_attrs = housing_cat.columns.tolist()
full_pipeline = ColumnTransformer([
    ('numerical_pl', num_pipeline, num_attrs),
    ('categorical_pl', cat_pipeline, cat_attrs ),
])
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.transform(X_test)
test_prepared = full_pipeline.transform(test)

In [23]:
test_prepared.shape

(1460, 280)

In [24]:
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.model_selection import cross_val_score
#Score Display
def display_scores(scores):
    """This function display scores , mean and standard deviation uses scores as input"""
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())
    
def evaluate(X_train, y_train, X_test, y_test, clf):
    clf_predict_train = clf.predict(X_train)
    clf_predict_test = clf.predict(X_test)
    clfmse_train = (mean_squared_error(y_train, clf_predict_train)**0.5)
    clfmse_test = (mean_squared_error(y_test, clf_predict_test)**0.5)
    clfrmse_train = (mean_squared_log_error(y_train, clf_predict_train)**0.5)*10
    clfrmse_test = (mean_squared_log_error(y_test, clf_predict_test)**0.5)*10
    print("Evaluate---","***", "----")
    print("RMSE-train",clfmse_train )
    print("RMSE-test", clfmse_test)
    print("RMSLE-train", clfrmse_train)
    print("RMSLE-test", clfrmse_test)    

In [25]:
#Random Forest
from sklearn.ensemble import RandomForestRegressor

rf_reg = RandomForestRegressor(n_estimators=20,max_depth=20)
rf_reg.fit(X_train_prepared, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=20,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [26]:
evaluate(X_train_prepared, y_train, X_test_prepared, y_test,rf_reg)

Evaluate--- *** ----
RMSE-train 0.060256542254404966
RMSE-test 0.14694563844690922
RMSLE-train 0.04721350759671555
RMSLE-test 0.11247555868519894


In [27]:
rf_predict = rf_reg.predict(test_prepared)

In [28]:
final_df= pd.DataFrame(data=zip(test_id,rf_predict*10000), columns=['Id', 'SalePrice'])
final_df.to_csv('zillow/sample_submission.csv', index=False, columns=['Id', 'SalePrice'])
pd.read_csv('zillow/sample_submission.csv')

Unnamed: 0,Id,SalePrice
0,1461,118993.540141
1,1462,116853.174272
2,1463,119629.417884
3,1464,120766.632963
4,1465,121250.984986
...,...,...
1454,2915,113280.891414
1455,2916,113251.218588
1456,2917,113509.061175
1457,2918,119126.914613


In [29]:
final_df.shape

(1459, 2)