# Introduction

This notebook depicts the code used for the **Housing Pricing Competition** in  kaggle

In the following lines i will do this series of steps:

1. Read test and train data provided by kaggle
2. Explore said data
3. Build a Random Forest model with the data, and see how accurate it can get
4. Create a submission csv file and then upload it to kaggle

In [1]:
#Importing the libraries i think will be required
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error

In [2]:
#set my file names
test_file = 'test.csv'
train_file = 'train.csv'
submission_file = 'luisreyes_submission.csv'

In [3]:
#load data into pandas frame
train_df = pd.read_csv(train_file, index_col='Id')
                       
# Remove rows with missing target, separate target from predictors
train_df.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = train_df.SalePrice                     
train_df.drop(['SalePrice'], axis=1, inplace=True)
# Create X which will be our main source
X = train_df


### Basic Exploratory Analysis

In [4]:
#check data description
X.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [5]:
X.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,46.549315,...,472.980137,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753
std,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,161.319273,...,213.804841,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095
min,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,0.0,...,334.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0
50%,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,0.0,...,480.0,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,0.0,...,576.0,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1474.0,...,1418.0,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0


In [6]:
#quantify missing values
#check missing data
print(X.shape)

# Number of missing values in each column of training data
missing_val_count_by_column = (X.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

(1460, 79)
LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64


In [7]:
#divide our data into 60% training , 40 percent test
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.6, test_size=0.4,
                                                      random_state=0)

#After verifying that some columns are more than 70%empty we can drop(also drop id because it doesnt add anything valuable to the mix)
missing_values = ['Alley','PoolQC','Fence','MiscFeature','FireplaceQu']
X_train = X_train.drop(missing_values, axis=1)
X_valid = X_valid.drop(missing_values, axis=1)

In [8]:
# All categorical columns
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that can be safely label encoded
good_label_cols = [col for col in object_cols if 
                   set(X_train[col]) == set(X_valid[col])]
        
# Problematic columns that will be dropped from the dataset
bad_label_cols = list(set(object_cols)-set(good_label_cols))
        
print('Categorical columns that will be label encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

Categorical columns that will be label encoded: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'BldgType', 'HouseStyle', 'RoofStyle', 'MasVnrType', 'ExterQual', 'Foundation', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'CentralAir', 'KitchenQual', 'GarageFinish', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']

Categorical columns that will be dropped from the dataset: ['Electrical', 'Utilities', 'Heating', 'Neighborhood', 'ExterCond', 'Condition2', 'Exterior2nd', 'Exterior1st', 'Condition1', 'GarageType', 'Functional', 'RoofMatl', 'HeatingQC', 'GarageQual', 'BsmtCond']


In [9]:
from sklearn.preprocessing import LabelEncoder

# Drop categorical columns that will not be encoded
label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)


# Apply label encoder to each column with categorical data
label_encoder = LabelEncoder()
for col in good_label_cols:
    label_X_train[col] = label_encoder.fit_transform(X_train[col].astype(str))
    label_X_valid[col] = label_encoder.transform(X_valid[col].astype(str))

In [10]:
#verify categorical features are succesfully encoded
label_X_train[0:5]

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,LotConfig,LandSlope,BldgType,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1170,60,3,118.0,35760,1,0,3,1,0,0,...,76,0,0,0,0,0,7,2006,8,4
1147,20,3,,11200,1,3,3,4,0,0,...,26,0,0,0,0,0,5,2006,8,4
504,20,3,100.0,15602,1,0,3,4,0,0,...,54,0,0,161,0,0,3,2010,8,4
1450,180,4,21.0,1533,1,3,3,4,0,3,...,0,0,0,0,0,0,8,2006,8,0
1064,30,4,50.0,6000,1,3,3,4,0,0,...,120,0,0,0,0,0,7,2006,8,4


In [37]:
#using SimpleImputer we will handle missing values
imputation = SimpleImputer(strategy = 'mean') 
imputed_X_train = pd.DataFrame( imputation.fit_transform(label_X_train))
imputed_X_valid = pd.DataFrame(imputation.transform(label_X_valid))

#imputation removed column names; put them back
imputed_X_train.columns = label_X_train.columns
imputed_X_valid.columns = label_X_valid.columns

In [38]:
imputed_X_train[0:5]

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,LotConfig,LandSlope,BldgType,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60.0,3.0,118.0,35760.0,1.0,0.0,3.0,1.0,0.0,0.0,...,76.0,0.0,0.0,0.0,0.0,0.0,7.0,2006.0,8.0,4.0
1,20.0,3.0,69.783032,11200.0,1.0,3.0,3.0,4.0,0.0,0.0,...,26.0,0.0,0.0,0.0,0.0,0.0,5.0,2006.0,8.0,4.0
2,20.0,3.0,100.0,15602.0,1.0,0.0,3.0,4.0,0.0,0.0,...,54.0,0.0,0.0,161.0,0.0,0.0,3.0,2010.0,8.0,4.0
3,180.0,4.0,21.0,1533.0,1.0,3.0,3.0,4.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8.0,2006.0,8.0,0.0
4,30.0,4.0,50.0,6000.0,1.0,3.0,3.0,4.0,0.0,0.0,...,120.0,0.0,0.0,0.0,0.0,0.0,7.0,2006.0,8.0,4.0


### Model Building

We'll be using **Random Forest** to evaluate and predict the housing prices

Determining which features are important

In [39]:
# Create a random forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)

# Train the classifier
clf.fit(imputed_X_train, y_train)

feat_labels = imputed_X_train.columns
# Print the name and gini importance of each feature
for feature in zip(feat_labels, clf.feature_importances_):
    print(feature)

('MSSubClass', 0.014662137069875958)
('MSZoning', 0.010253263863283773)
('LotFrontage', 0.039303050257876955)
('LotArea', 0.044212523089688434)
('Street', 0.0008556791551346901)
('LotShape', 0.01142305288225821)
('LandContour', 0.006327925980328041)
('LotConfig', 0.012896253882171353)
('LandSlope', 0.003845439361677309)
('BldgType', 0.006032462092908377)
('HouseStyle', 0.012556046877990246)
('OverallQual', 0.020181167732250017)
('OverallCond', 0.018764826542341205)
('YearBuilt', 0.03553775751484402)
('YearRemodAdd', 0.031010777662893695)
('RoofStyle', 0.009544449224237276)
('MasVnrType', 0.013230615576547942)
('MasVnrArea', 0.02428939636169443)
('ExterQual', 0.008097270589854969)
('Foundation', 0.01002947221091087)
('BsmtQual', 0.010742540956072532)
('BsmtExposure', 0.014001552130702175)
('BsmtFinType1', 0.019323944985944277)
('BsmtFinSF1', 0.032506073376673)
('BsmtFinType2', 0.009477770481729677)
('BsmtFinSF2', 0.011225501192600835)
('BsmtUnfSF', 0.04123347395604518)
('TotalBsmtSF', 0

In [40]:
# Create a selector object that will use the random forest classifier to identify
# features that have an importance of more than 0.1
sfm = SelectFromModel(clf, threshold=0.02)

# Train the selector
sfm.fit(imputed_X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
        max_features=None, norm_order=1, prefit=False, threshold=0.02)

In [41]:
# Print the names of the most important features
for feature_list_index in sfm.get_support(indices=True):
    print(feat_labels[feature_list_index])

LotFrontage
LotArea
OverallQual
YearBuilt
YearRemodAdd
MasVnrArea
BsmtFinSF1
BsmtUnfSF
TotalBsmtSF
1stFlrSF
2ndFlrSF
GrLivArea
TotRmsAbvGrd
GarageYrBlt
GarageArea
WoodDeckSF
OpenPorchSF
MoSold
YrSold


In [42]:
# Transform the data to create a new dataset containing only the most important features
X_important_train = sfm.transform(imputed_X_train)
X_important_test = sfm.transform(imputed_X_valid)

In [56]:
X_important_train

array([[1.1800000e+02, 3.5760000e+04, 1.0000000e+01, ..., 7.6000000e+01,
        7.0000000e+00, 2.0060000e+03],
       [6.9783032e+01, 1.1200000e+04, 6.0000000e+00, ..., 2.6000000e+01,
        5.0000000e+00, 2.0060000e+03],
       [1.0000000e+02, 1.5602000e+04, 7.0000000e+00, ..., 5.4000000e+01,
        3.0000000e+00, 2.0100000e+03],
       ...,
       [6.8000000e+01, 8.9300000e+03, 6.0000000e+00, ..., 0.0000000e+00,
        4.0000000e+00, 2.0100000e+03],
       [6.9783032e+01, 3.1960000e+03, 7.0000000e+00, ..., 2.0000000e+01,
        1.0000000e+01, 2.0060000e+03],
       [5.8000000e+01, 1.6770000e+04, 7.0000000e+00, ..., 8.1000000e+01,
        6.0000000e+00, 2.0100000e+03]])

In [44]:
# Create a new random forest classifier for the most important features
clf_important = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)

# Train the new classifier on the new dataset containing the most important features
clf_important.fit(X_important_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [45]:
# Apply The Full Featured Classifier To The Test Data
y_important_pred = clf_important.predict(X_important_test)

# View The Accuracy Of Our Limited Feature (2 Features) Model
accuracy_score(y_valid, y_important_pred)

0.008561643835616438

In [46]:
# Apply The Full Featured Classifier To The Test Data
y_pred = clf.predict(imputed_X_valid)

# View The Accuracy Of Our Full Feature (4 Features) Model
accuracy_score(y_valid, y_pred)

0.005136986301369863

In [48]:
# function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [49]:
score_dataset(X_important_train,X_important_test,y_train,y_valid)

18886.896592465753

In [50]:
score_dataset(imputed_X_train,imputed_X_valid,y_train,y_valid)

17419.9236130137

Model building with test data

In [28]:
# To improve accuracy, create a new Random Forest model which you will train on all training data
forest_full_data = RandomForestRegressor(n_estimators=345,random_state=1)

# fit rf_model_on_full_data on all data from the training data
forest_full_data.fit(X,y)

test_df = pd.read_csv(test_file)

test_X = test_df[features]

# Imputation
my_imputer = SimpleImputer()
imputed_X_test = pd.DataFrame(my_imputer.fit_transform(test_X))

# Imputation removed column names; put them back
imputed_X_test.columns = train_X.columns

test_prediction = forest_full_data.predict(imputed_X_test)
test_prediction

array([127777.14782609, 153264.63768116, 167241.72753623, ...,
       180380.06956522, 111491.88405797, 236521.00289855])

In [26]:
#saves submission csv for competition
output = pd.DataFrame({'Id': test_df.Id,
                       'SalePrice': test_prediction})
output.to_csv('submission.csv', index=False)