In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Feature Engineering

#### Handling missing data

In [4]:
#Getting the number of null items on each column
null_count = data.isnull().sum()
null_count.head(40)

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
Alley           1369
LotShape           0
LandContour        0
Utilities          0
LotConfig          0
LandSlope          0
Neighborhood       0
Condition1         0
Condition2         0
BldgType           0
HouseStyle         0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
RoofStyle          0
RoofMatl           0
Exterior1st        0
Exterior2nd        0
MasVnrType         8
MasVnrArea         8
ExterQual          0
ExterCond          0
Foundation         0
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinSF1         0
BsmtFinType2      38
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
Heating            0
dtype: int64

In [5]:
#The number of null items on the last 41 columns
null_count.tail(41)

HeatingQC           0
CentralAir          0
Electrical          1
1stFlrSF            0
2ndFlrSF            0
LowQualFinSF        0
GrLivArea           0
BsmtFullBath        0
BsmtHalfBath        0
FullBath            0
HalfBath            0
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         0
TotRmsAbvGrd        0
Functional          0
Fireplaces          0
FireplaceQu       690
GarageType         81
GarageYrBlt        81
GarageFinish       81
GarageCars          0
GarageArea          0
GarageQual         81
GarageCond         81
PavedDrive          0
WoodDeckSF          0
OpenPorchSF         0
EnclosedPorch       0
3SsnPorch           0
ScreenPorch         0
PoolArea            0
PoolQC           1453
Fence            1179
MiscFeature      1406
MiscVal             0
MoSold              0
YrSold              0
SaleType            0
SaleCondition       0
SalePrice           0
dtype: int64

In [6]:
#The non-null count and their data types
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [7]:
#I am going to drop the columns with a very large number of missing(null) values
data.drop(['Alley', 'PoolQC', 'Fence', 'MiscFeature'], axis=1, inplace=True)

In [8]:
#Confirm that the above columns have been dropped
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 77 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [9]:
#Handle the remaining columns that have any missing data
#Since lot frontage has float data-type we can get the mean of the column and use that to fill in the miissing values
data['LotFrontage'] = data['LotFrontage'].fillna(data['LotFrontage'].mean())

In [10]:
#The other columns have an object data-type and therefore the mean strategy won't work on them
#I chose to use mode instead
data['FireplaceQu']= data['FireplaceQu'].fillna(data['FireplaceQu'].mode()[0])
data['BsmtQual']= data['BsmtQual'].fillna(data['BsmtQual'].mode()[0])
data['BsmtCond']= data['BsmtCond'].fillna(data['BsmtCond'].mode()[0])
data['BsmtExposure']= data['BsmtExposure'].fillna(data['BsmtExposure'].mode()[0])
data['BsmtFinType1']= data['BsmtFinType1'].fillna(data['BsmtFinType1'].mode()[0])
data['BsmtFinType2']= data['BsmtFinType2'].fillna(data['BsmtFinType2'].mode()[0])

In [11]:
data['GarageYrBlt']= data['GarageYrBlt'].fillna(data['GarageYrBlt'].mode()[0])
data['GarageType']= data['GarageType'].fillna(data['GarageType'].mode()[0])
data['GarageFinish']= data['GarageFinish'].fillna(data['GarageFinish'].mode()[0])
data['GarageQual']= data['GarageQual'].fillna(data['GarageQual'].mode()[0])
data['GarageCond']= data['GarageCond'].fillna(data['GarageCond'].mode()[0])

In [12]:
data['MasVnrType'] = data['MasVnrType'].fillna(data['MasVnrType'].mode()[0])
data['MasVnrArea'] = data['MasVnrArea'].fillna(data['MasVnrArea'].mode()[0])

In [13]:
data['Electrical'] = data['Electrical'].fillna(data['Electrical'].mode()[0])

#### Handling text/object data

In [14]:
#Find the unique values in each column
for column in data.columns:
    print(column, ':', len(data[column].unique()), 'labels')

Id : 1460 labels
MSSubClass : 15 labels
MSZoning : 5 labels
LotFrontage : 111 labels
LotArea : 1073 labels
Street : 2 labels
LotShape : 4 labels
LandContour : 4 labels
Utilities : 2 labels
LotConfig : 5 labels
LandSlope : 3 labels
Neighborhood : 25 labels
Condition1 : 9 labels
Condition2 : 8 labels
BldgType : 5 labels
HouseStyle : 8 labels
OverallQual : 10 labels
OverallCond : 9 labels
YearBuilt : 112 labels
YearRemodAdd : 61 labels
RoofStyle : 6 labels
RoofMatl : 8 labels
Exterior1st : 15 labels
Exterior2nd : 16 labels
MasVnrType : 4 labels
MasVnrArea : 327 labels
ExterQual : 4 labels
ExterCond : 5 labels
Foundation : 6 labels
BsmtQual : 4 labels
BsmtCond : 4 labels
BsmtExposure : 4 labels
BsmtFinType1 : 6 labels
BsmtFinSF1 : 637 labels
BsmtFinType2 : 6 labels
BsmtFinSF2 : 144 labels
BsmtUnfSF : 780 labels
TotalBsmtSF : 721 labels
Heating : 6 labels
HeatingQC : 5 labels
CentralAir : 2 labels
Electrical : 5 labels
1stFlrSF : 753 labels
2ndFlrSF : 417 labels
LowQualFinSF : 24 labels
GrL

In [15]:
#Use of frequencymapping to handle text data
df_frequency_map = data.MSSubClass.value_counts().to_dict()

In [16]:
data1 = data.copy()
data1.MSSubClass = data1.MSSubClass.map(df_frequency_map)

In [17]:
data1.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,299,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,2,536,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,3,299,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,4,60,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,5,299,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [18]:
#All the columns containing object or text data type
object_columns = ['MSZoning','Street','LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
                  'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle','RoofStyle',
                  'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType','ExterQual', 'ExterCond',
                  'Foundation', 'BsmtQual', 'BsmtCond','BsmtExposure', 'BsmtFinType1','BsmtFinType2','Heating',
                  'HeatingQC','CentralAir', 'Electrical','KitchenQual','Functional','FireplaceQu', 'GarageType',
                  'GarageFinish','GarageQual', 'GarageCond','PavedDrive','SaleType','SaleCondition']

In [19]:
for column in data1.columns:
    for value in data1.columns:
        if data1[value].dtype == 'object':
            df_map = data1[column].value_counts().to_dict()
            data1[column] = data1[column].map(df_map)
            
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 77 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Id             1460 non-null   int64
 1   MSSubClass     1460 non-null   int64
 2   MSZoning       1460 non-null   int64
 3   LotFrontage    1460 non-null   int64
 4   LotArea        1460 non-null   int64
 5   Street         1460 non-null   int64
 6   LotShape       1460 non-null   int64
 7   LandContour    1460 non-null   int64
 8   Utilities      1460 non-null   int64
 9   LotConfig      1460 non-null   int64
 10  LandSlope      1460 non-null   int64
 11  Neighborhood   1460 non-null   int64
 12  Condition1     1460 non-null   int64
 13  Condition2     1460 non-null   int64
 14  BldgType       1460 non-null   int64
 15  HouseStyle     1460 non-null   int64
 16  OverallQual    1460 non-null   int64
 17  OverallCond    1460 non-null   int64
 18  YearBuilt      1460 non-null   int64
 19  YearRe

## Machine Learning model

In [20]:
data1['SalePrice'] = data['SalePrice']
data1.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1460,299,1151,44,12,1454,925,1311,1459,1052,...,1252,1436,1344,1453,1408,52,304,1267,1198,208500
1,1460,536,1151,69,24,1454,925,1311,1459,47,...,1252,1436,1344,1453,1408,204,329,1267,1198,181500
2,1460,299,1151,114,60,1454,484,1311,1459,1052,...,1252,1436,1344,1453,1408,63,304,1267,1198,223500
3,1460,60,1151,143,912,1454,484,1311,1459,263,...,74,1436,1344,1453,1408,52,314,1267,101,140000
4,1460,299,1151,90,912,1454,484,1311,1459,47,...,1252,1436,1344,1453,1408,59,304,1267,1198,250000
5,1460,144,1151,40,72,1454,484,1311,1459,1052,...,1252,15,1344,1453,5,89,338,1267,1198,143000
6,1460,536,1151,53,198,1454,925,1311,1459,1052,...,1252,1436,1344,1453,1408,122,329,1267,1198,307000
7,1460,299,1151,259,198,1454,484,1311,1459,263,...,27,1436,1344,1453,12,79,338,1267,1198,200000
8,1460,144,218,90,32,1454,925,1311,1459,1052,...,54,1436,1344,1453,1408,141,304,1267,101,129900
9,1460,30,1151,114,198,1454,925,1311,1459,263,...,1252,1436,1344,1453,1408,58,304,1267,1198,118000


In [21]:
from sklearn.linear_model import LinearRegression

#Creating a linear regression object
model = LinearRegression()

#Loading in the test data
test_data = pd.read_csv('test.csv')

x_data = data1.drop('SalePrice', axis = 1)   #Feature Matrix
y_data = data1["SalePrice"]                  #Target Array


In [22]:
from sklearn.naive_bayes import GaussianNB
model1 = GaussianNB()
model1.fit(x_data, y_data)

#Modifying the test data
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [23]:
#Drop the columns with a lot of missing values
test_data.drop(['MiscFeature', 'PoolQC', 'FireplaceQu', 'Alley'], axis=1, inplace=True )

In [24]:
#Fill all the missing data in the test data csv
for column in test_data.columns:
    for value in test_data.columns:
        if test_data[value].dtype == 'object':
            test_data[column] = test_data[column].fillna(test_data[column].mode()[0])
        

In [25]:
#Convert all object/text data in test data to integers using frequency count
for column in test_data.columns:
    for value in test_data.columns:
        if test_data[value].dtype == 'object':
            df1_map = test_data[column].value_counts().to_dict()
            test_data[column] = test_data[column].map(df1_map)
            
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 76 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   Id             1459 non-null   int64
 1   MSSubClass     1459 non-null   int64
 2   MSZoning       1459 non-null   int64
 3   LotFrontage    1459 non-null   int64
 4   LotArea        1459 non-null   int64
 5   Street         1459 non-null   int64
 6   LotShape       1459 non-null   int64
 7   LandContour    1459 non-null   int64
 8   Utilities      1459 non-null   int64
 9   LotConfig      1459 non-null   int64
 10  LandSlope      1459 non-null   int64
 11  Neighborhood   1459 non-null   int64
 12  Condition1     1459 non-null   int64
 13  Condition2     1459 non-null   int64
 14  BldgType       1459 non-null   int64
 15  HouseStyle     1459 non-null   int64
 16  OverallQual    1459 non-null   int64
 17  OverallCond    1459 non-null   int64
 18  YearBuilt      1459 non-null   int64
 19  YearRe

In [26]:
output = model1.predict(test_data)