In [2]:
# importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [3]:
# importing datasets
train_df = pd.read_csv("dataset/train.csv")
train_df.drop(['Id'], axis=1, inplace=True)
print(train_df.shape)
train_df.head()

(1460, 80)


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


## Data Preprocessing

In [4]:
# finding missing values..
train_df.isnull().sum().sort_values(ascending=False).head(10)

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageCond        81
GarageFinish      81
GarageQual        81
dtype: int64

In [5]:
# Dropping top 4 columns as more than 50% of the data are missing...
train_df.drop(labels=['PoolQC', 'MiscFeature', 'Alley', 'Fence'], axis=1, inplace=True)

In [6]:
# getting information of each columns to handle missing values
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 76 columns):
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-null object
Exterior2nd      1460 non-null object
MasVnrType       1452 no

In [7]:
# separating categorical and quantitative dtypes..
# and fixxing missing categorical features with mode value and quantitative features with mean value
quantitative_features = []
categorical_features = []

for col in train_df.columns:
    if train_df[col].dtype == 'object':
        categorical_features.append(col)
        train_df[col].fillna(value=train_df[col].mode()[0], inplace=True)
    else:
        quantitative_features.append(col)
        train_df[col].fillna(value=train_df[col].mean(), inplace=True)
        
# print('C Feature: \n', categorical_features)
# print('Q Feature: \n', quantitative_features)

In [8]:
train_df.shape

(1460, 76)

## Handeling categorical features..

In [63]:
# imorting precessed test dataset inorder to implement OneHotEncoder

test_df=pd.read_csv("2_formulated_test.csv")
test_id = test_df['Id']
test_df.drop('Id', inplace=True, axis=1)
test_df.shape

(1459, 75)

In [10]:
# concating train and test dataset as there may be some categorical features in test dataset that are missing in train dataset
temp_df = pd.concat([train_df, test_df], axis=0, ignore_index=True)

In [11]:
temp_df.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,BedroomAbvGr,BldgType,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,...,SaleType,ScreenPorch,Street,TotRmsAbvGrd,TotalBsmtSF,Utilities,WoodDeckSF,YearBuilt,YearRemodAdd,YrSold
0,856,854,0,3,1Fam,TA,No,706.0,0.0,GLQ,...,WD,0,Pave,8,856.0,AllPub,0,2003,2003,2008
1,1262,0,0,3,1Fam,TA,Gd,978.0,0.0,ALQ,...,WD,0,Pave,6,1262.0,AllPub,298,1976,1976,2007
2,920,866,0,3,1Fam,TA,Mn,486.0,0.0,GLQ,...,WD,0,Pave,6,920.0,AllPub,0,2001,2002,2008
3,961,756,0,3,1Fam,Gd,No,216.0,0.0,ALQ,...,WD,0,Pave,7,756.0,AllPub,0,1915,1970,2006
4,1145,1053,0,4,1Fam,TA,Av,655.0,0.0,GLQ,...,WD,0,Pave,9,1145.0,AllPub,192,2000,2000,2008


In [12]:
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder()

for feature in categorical_features:    
    # creating dummy of each categorical feature
    dummy_df = pd.DataFrame(onehotencoder.fit_transform(temp_df[[feature]]).toarray(), columns=onehotencoder.categories_)

    # droping original column 
    temp_df.drop(feature, axis=1, inplace=True)
    
    # concatinating dummy with original dataframe
    temp_df = pd.concat([temp_df, dummy_df], axis=1)

In [13]:
# number of features increased as each categorical value is encoded.. 
temp_df.shape

(2919, 276)

In [14]:
# removing any duplicated columns.
temp_df = temp_df.loc[:, ~temp_df.columns.duplicated()]

In [15]:
# splitting original train and test dataset

# train set contains all the value of saleprice whereas test set contains missing values for saleprice
train_df = temp_df[temp_df['SalePrice'].notnull()]
test_df = temp_df[temp_df['SalePrice'].isnull()]


In [33]:
X = train_df.drop('SalePrice', axis=1).values
y = train_df['SalePrice'].values


## Creating Model

In [48]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=500)
regressor.fit(X, y)
print("Score :", regressor.score(X, y))


# from sklearn.model_selection import GridSearchCV
# parameters = {'n_estimators':[50, 100, 250, 500, 1000]}
# gridsearch = GridSearchCV(estimator=regressor, param_grid=parameters, n_jobs=-1, cv=10)
# gridsearch.fit(X, y)

Score : 0.9815958582126525


In [None]:
# using regressor in test dataset

In [58]:
X_test = test_df.drop('SalePrice', axis=1).values
X_test 

array([[8.960e+02, 0.000e+00, 0.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [1.329e+03, 0.000e+00, 0.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [9.280e+02, 7.010e+02, 0.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       ...,
       [1.224e+03, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,
        0.000e+00],
       [9.700e+02, 0.000e+00, 0.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [9.960e+02, 1.004e+03, 0.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00]])

In [59]:
y_pred = regressor.predict(X_test)

In [92]:
op_df = pd.concat([pd.DataFrame(test_id), pd.DataFrame(y_pred, columns=['SalePrice'])], axis=1)

In [93]:
op_df.to_csv('2_submission.csv', index=False)