# Project 2 - Ames Housing Data and Kaggle Challenge (03_Data_Cleaning_Encoding_Train_Test_Set)

## Library Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import r2_score

%matplotlib inline 

## Merging both test and train datasets

In [2]:
# keep_default_na=False: the cleaned files use the literal string 'None' as a genuine
# category level (e.g. "no basement"). Recent pandas versions treat 'None' as a missing
# value by default, which would silently reintroduce nulls on a fresh re-run.
# The null check further down confirms these files contain no real missing values,
# so disabling the default NA markers does not change any other column.
train_clean = pd.read_csv('../datasets/train_clean.csv', keep_default_na=False)
test_clean = pd.read_csv('../datasets/test_clean.csv', keep_default_na=False)

print(f"Train dataset: {train_clean.shape}")
print(f"Test dataset: {test_clean.shape}")

Train dataset: (1937, 80)
Test dataset: (878, 79)


- The additional column in the train dataset is the sale price

##### Storing training dataset sale price in a separate set

In [3]:
train_saleprice = train_clean[['saleprice']]
train_saleprice.head()

Unnamed: 0,saleprice
0,130500
1,220000
2,109000
3,174000
4,138500


##### Dropping training dataset sale price

In [4]:
# Dropping saleprice column to be merged back on later after encoding clean train and test data together
train_clean.drop(columns='saleprice', inplace=True)

In [5]:
print(f"Train dataset: {train_clean.shape}")
print(f"Test dataset: {test_clean.shape}")

Train dataset: (1937, 79)
Test dataset: (878, 79)


- Both Train and Test datasets have the same number of matching columns

##### Concatenating both training and test datasets

In [6]:
merged = pd.concat([train_clean, test_clean], axis=0, ignore_index=True)

In [7]:
merged.shape

(2815, 79)

#### Getting null values and checking if any null values are left within all columns for merged training and test datasets

In [8]:
# Count nulls per column, keep only the columns that actually contain nulls,
# and list them alphabetically by column name (a single sort is sufficient).
merged_null = (
    merged.isnull().sum()
    .loc[lambda counts: counts > 0]
    .to_frame('null_count')
    .sort_index()
)

# An empty frame here means no null values are left
merged_null

Unnamed: 0,null_count


#### Writing cleaned merged dataset of training and test datasets into a csv file to datasets folder

In [9]:
merged.to_csv('../datasets/merged_clean.csv', index=False)

## Encoding Categorical Data

##### Loading in merged dataset

In [10]:
# keep_default_na=False: 'None' is a real category level in this dataset; recent pandas
# versions would otherwise parse it as NaN. The file was written from a frame with no
# null values, so no genuine missing markers are lost by disabling the defaults.
merged = pd.read_csv('../datasets/merged_clean.csv', keep_default_na=False)

In [11]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2815 entries, 0 to 2814
Data columns (total 79 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               2815 non-null   int64  
 1   pid              2815 non-null   int64  
 2   ms_subclass      2815 non-null   int64  
 3   ms_zoning        2815 non-null   object 
 4   lot_frontage     2815 non-null   float64
 5   lot_area         2815 non-null   int64  
 6   street           2815 non-null   object 
 7   lot_shape        2815 non-null   object 
 8   land_contour     2815 non-null   object 
 9   utilities        2815 non-null   object 
 10  lot_config       2815 non-null   object 
 11  land_slope       2815 non-null   object 
 12  neighborhood     2815 non-null   object 
 13  condition_1      2815 non-null   object 
 14  condition_2      2815 non-null   object 
 15  bldg_type        2815 non-null   object 
 16  house_style      2815 non-null   object 
 17  overall_qual  

##### Listing out all categorical columns for referencing in encoding process

In [12]:
# Names of all object-dtype (string-valued) columns, in their original column order
cat_column = merged.select_dtypes(include=object).columns.tolist()
cat_column

['ms_zoning',
 'street',
 'lot_shape',
 'land_contour',
 'utilities',
 'lot_config',
 'land_slope',
 'neighborhood',
 'condition_1',
 'condition_2',
 'bldg_type',
 'house_style',
 'roof_style',
 'roof_matl',
 'exterior_1st',
 'exterior_2nd',
 'mas_vnr_type',
 'exter_qual',
 'exter_cond',
 'foundation',
 'bsmt_qual',
 'bsmt_cond',
 'bsmt_exposure',
 'bsmtfin_type_1',
 'bsmtfin_type_2',
 'heating',
 'heating_qc',
 'central_air',
 'electrical',
 'kitchen_qual',
 'functional',
 'fireplace_qu',
 'garage_type',
 'garage_finish',
 'garage_qual',
 'garage_cond',
 'paved_drive',
 'pool_qc',
 'fence',
 'misc_feature',
 'sale_type']

##### Checking merged data has no null values for all columns

In [13]:
# Count nulls per column, keep only the columns that actually contain nulls,
# and list them alphabetically by column name (a single sort is sufficient).
merged_null = (
    merged.isnull().sum()
    .loc[lambda counts: counts > 0]
    .to_frame('null_count')
    .sort_index()
)

# An empty frame here means no null values are left
merged_null

Unnamed: 0,null_count


#### Arranging categorical data into integer encoding for ordinal variables and one-hot encoding for nominal variables

##### Applying one-hot encoding to ms_zoning column

In [14]:
# Applying one-hot encoding
merged = pd.get_dummies(merged, columns=['ms_zoning'], prefix='zoning', drop_first=True)

In [15]:
# Checking columns left in dataset
merged.shape

(2815, 84)

In [16]:
merged.head()

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,lot_config,...,misc_val,mo_sold,yr_sold,sale_type,zoning_C,zoning_FV,zoning_I,zoning_RH,zoning_RL,zoning_RM
0,109,533352170,60,0.0,13517,Pave,IR1,Lvl,AllPub,CulDSac,...,0,3,2010,WD,0,0,0,0,1,0
1,544,531379050,60,43.0,11492,Pave,IR1,Lvl,AllPub,CulDSac,...,0,4,2009,WD,0,0,0,0,1,0
2,153,535304180,20,68.0,7922,Pave,Reg,Lvl,AllPub,Inside,...,0,1,2010,WD,0,0,0,0,1,0
3,318,916386060,60,73.0,9802,Pave,Reg,Lvl,AllPub,Inside,...,0,4,2010,WD,0,0,0,0,1,0
4,255,906425045,50,82.0,14235,Pave,IR1,Lvl,AllPub,Inside,...,0,3,2010,WD,0,0,0,0,1,0


##### Applying one-hot encoding to street column

In [17]:
# Checking categorical variables
merged.groupby('street').count()['id']

street
Grvl      10
Pave    2805
Name: id, dtype: int64

In [18]:
# Applying one-hot encoding
merged = pd.get_dummies(merged, columns=['street'], prefix='street', drop_first=True)

In [19]:
# Checking columns left in dataset
merged.shape

(2815, 84)

In [20]:
merged.head()

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,lot_shape,land_contour,utilities,lot_config,land_slope,...,mo_sold,yr_sold,sale_type,zoning_C,zoning_FV,zoning_I,zoning_RH,zoning_RL,zoning_RM,street_Pave
0,109,533352170,60,0.0,13517,IR1,Lvl,AllPub,CulDSac,Gtl,...,3,2010,WD,0,0,0,0,1,0,1
1,544,531379050,60,43.0,11492,IR1,Lvl,AllPub,CulDSac,Gtl,...,4,2009,WD,0,0,0,0,1,0,1
2,153,535304180,20,68.0,7922,Reg,Lvl,AllPub,Inside,Gtl,...,1,2010,WD,0,0,0,0,1,0,1
3,318,916386060,60,73.0,9802,Reg,Lvl,AllPub,Inside,Gtl,...,4,2010,WD,0,0,0,0,1,0,1
4,255,906425045,50,82.0,14235,IR1,Lvl,AllPub,Inside,Gtl,...,3,2010,WD,0,0,0,0,1,0,1


##### Applying integer encoding to lot_shape column

In [21]:
# Checking categorical variables
merged.groupby('lot_shape').count()['id']

lot_shape
IR1     962
IR2      76
IR3      16
Reg    1761
Name: id, dtype: int64

- Function to encode categorical variables into ordinal variables

In [22]:
def lot_shape_enc(var):
    """Translate a lot_shape label into its ordinal integer code.

    'IR3' -> 0, 'IR2' -> 1, 'IR1' -> 2, 'Reg' -> 3.
    Any other value yields None.
    """
    codes = {'IR3': 0, 'IR2': 1, 'IR1': 2, 'Reg': 3}
    return codes.get(var)
    
# Applying integer encoding function to categorical data
merged['lot_shape'] = merged['lot_shape'].map(lot_shape_enc)

In [23]:
# Checking columns left in dataset
merged.shape

(2815, 84)

- Checking integer encoded variables

In [24]:
merged.groupby('lot_shape').count()['id']

lot_shape
0      16
1      76
2     962
3    1761
Name: id, dtype: int64

In [25]:
merged.head()

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,lot_shape,land_contour,utilities,lot_config,land_slope,...,mo_sold,yr_sold,sale_type,zoning_C,zoning_FV,zoning_I,zoning_RH,zoning_RL,zoning_RM,street_Pave
0,109,533352170,60,0.0,13517,2,Lvl,AllPub,CulDSac,Gtl,...,3,2010,WD,0,0,0,0,1,0,1
1,544,531379050,60,43.0,11492,2,Lvl,AllPub,CulDSac,Gtl,...,4,2009,WD,0,0,0,0,1,0,1
2,153,535304180,20,68.0,7922,3,Lvl,AllPub,Inside,Gtl,...,1,2010,WD,0,0,0,0,1,0,1
3,318,916386060,60,73.0,9802,3,Lvl,AllPub,Inside,Gtl,...,4,2010,WD,0,0,0,0,1,0,1
4,255,906425045,50,82.0,14235,2,Lvl,AllPub,Inside,Gtl,...,3,2010,WD,0,0,0,0,1,0,1


##### Applying one-hot encoding to land_contour column

In [26]:
# Checking categorical variables
merged.groupby('land_contour').count()['id']

land_contour
Bnk     108
HLS     117
Low      56
Lvl    2534
Name: id, dtype: int64

In [27]:
# Applying one-hot encoding
merged = pd.get_dummies(merged, columns=['land_contour'], prefix='land_contour', drop_first=True)

In [28]:
# Checking columns left in dataset
merged.shape

(2815, 86)

In [29]:
merged.head()

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,lot_shape,utilities,lot_config,land_slope,neighborhood,...,zoning_C,zoning_FV,zoning_I,zoning_RH,zoning_RL,zoning_RM,street_Pave,land_contour_HLS,land_contour_Low,land_contour_Lvl
0,109,533352170,60,0.0,13517,2,AllPub,CulDSac,Gtl,Sawyer,...,0,0,0,0,1,0,1,0,0,1
1,544,531379050,60,43.0,11492,2,AllPub,CulDSac,Gtl,SawyerW,...,0,0,0,0,1,0,1,0,0,1
2,153,535304180,20,68.0,7922,3,AllPub,Inside,Gtl,NAmes,...,0,0,0,0,1,0,1,0,0,1
3,318,916386060,60,73.0,9802,3,AllPub,Inside,Gtl,Timber,...,0,0,0,0,1,0,1,0,0,1
4,255,906425045,50,82.0,14235,2,AllPub,Inside,Gtl,SawyerW,...,0,0,0,0,1,0,1,0,0,1


##### Applying integer encoding to utilities column

In [30]:
# Checking categorical variables
merged.groupby('utilities').count()['id']

utilities
AllPub    2812
NoSeWa       1
NoSewr       2
Name: id, dtype: int64

- Function to encode categorical variables into ordinal values

In [31]:
def util_enc(var):
    """Translate a utilities label into its ordinal integer code.

    'ELO' -> 0, 'NoSeWa' -> 1, 'NoSewr' -> 2, 'AllPub' -> 3.
    Any other value yields None.
    """
    ranking = ('ELO', 'NoSeWa', 'NoSewr', 'AllPub')
    # The position in the ranking tuple is the ordinal code
    return ranking.index(var) if var in ranking else None
# Applying integer encoding function to categorical data 
merged['utilities'] = merged['utilities'].map(util_enc)

In [32]:
merged.shape

(2815, 86)

- Checking integer encoded variables

In [33]:
merged.groupby('utilities').count()['id']

utilities
1       1
2       2
3    2812
Name: id, dtype: int64

In [34]:
merged.head()

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,lot_shape,utilities,lot_config,land_slope,neighborhood,...,zoning_C,zoning_FV,zoning_I,zoning_RH,zoning_RL,zoning_RM,street_Pave,land_contour_HLS,land_contour_Low,land_contour_Lvl
0,109,533352170,60,0.0,13517,2,3,CulDSac,Gtl,Sawyer,...,0,0,0,0,1,0,1,0,0,1
1,544,531379050,60,43.0,11492,2,3,CulDSac,Gtl,SawyerW,...,0,0,0,0,1,0,1,0,0,1
2,153,535304180,20,68.0,7922,3,3,Inside,Gtl,NAmes,...,0,0,0,0,1,0,1,0,0,1
3,318,916386060,60,73.0,9802,3,3,Inside,Gtl,Timber,...,0,0,0,0,1,0,1,0,0,1
4,255,906425045,50,82.0,14235,2,3,Inside,Gtl,SawyerW,...,0,0,0,0,1,0,1,0,0,1


##### Applying one-hot encoding to lot_config column

In [35]:
# Checking categorical variables
merged.groupby('lot_config').count()['id']

lot_config
Corner      494
CulDSac     178
FR2          81
FR3          14
Inside     2048
Name: id, dtype: int64

In [36]:
# Applying one-hot encoding
merged = pd.get_dummies(merged, columns=['lot_config'], prefix='lot_config', drop_first=True)
merged.head()

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,lot_shape,utilities,land_slope,neighborhood,condition_1,...,zoning_RL,zoning_RM,street_Pave,land_contour_HLS,land_contour_Low,land_contour_Lvl,lot_config_CulDSac,lot_config_FR2,lot_config_FR3,lot_config_Inside
0,109,533352170,60,0.0,13517,2,3,Gtl,Sawyer,RRAe,...,1,0,1,0,0,1,1,0,0,0
1,544,531379050,60,43.0,11492,2,3,Gtl,SawyerW,Norm,...,1,0,1,0,0,1,1,0,0,0
2,153,535304180,20,68.0,7922,3,3,Gtl,NAmes,Norm,...,1,0,1,0,0,1,0,0,0,1
3,318,916386060,60,73.0,9802,3,3,Gtl,Timber,Norm,...,1,0,1,0,0,1,0,0,0,1
4,255,906425045,50,82.0,14235,2,3,Gtl,SawyerW,Norm,...,1,0,1,0,0,1,0,0,0,1


In [37]:
# Checking columns left in dataset
merged.shape

(2815, 89)

##### Applying integer encoding to land_slope column

In [38]:
# Checking categorical variables
merged.groupby('land_slope').count()['id']

land_slope
Gtl    2678
Mod     122
Sev      15
Name: id, dtype: int64

- Function to encode categorical variables into ordinal values

In [39]:
def land_slope_enc(var):
    """Translate a land_slope label into its ordinal integer code.

    'Sev' -> 0, 'Mod' -> 1, 'Gtl' -> 2.
    Any other value yields None.
    """
    codes = {'Sev': 0, 'Mod': 1, 'Gtl': 2}
    return codes.get(var)
# Applying integer encoding function to categorical data 
merged['land_slope'] = merged['land_slope'].map(land_slope_enc)

In [40]:
merged.shape

(2815, 89)

- Checking integer encoded variables

In [41]:
merged.groupby('land_slope').count()['id']

land_slope
0      15
1     122
2    2678
Name: id, dtype: int64

In [42]:
merged.head()

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,lot_shape,utilities,land_slope,neighborhood,condition_1,...,zoning_RL,zoning_RM,street_Pave,land_contour_HLS,land_contour_Low,land_contour_Lvl,lot_config_CulDSac,lot_config_FR2,lot_config_FR3,lot_config_Inside
0,109,533352170,60,0.0,13517,2,3,2,Sawyer,RRAe,...,1,0,1,0,0,1,1,0,0,0
1,544,531379050,60,43.0,11492,2,3,2,SawyerW,Norm,...,1,0,1,0,0,1,1,0,0,0
2,153,535304180,20,68.0,7922,3,3,2,NAmes,Norm,...,1,0,1,0,0,1,0,0,0,1
3,318,916386060,60,73.0,9802,3,3,2,Timber,Norm,...,1,0,1,0,0,1,0,0,0,1
4,255,906425045,50,82.0,14235,2,3,2,SawyerW,Norm,...,1,0,1,0,0,1,0,0,0,1


##### Applying one-hot encoding to neighborhood column

In [43]:
# Checking categorical variables
merged.groupby('neighborhood').count()['id']

neighborhood
Blmngtn     28
Blueste     10
BrDale      29
BrkSide     99
ClearCr     43
CollgCr    265
Crawfor    103
Edwards    163
Gilbert    165
Greens       8
GrnHill      2
IDOTRR      74
Landmrk      1
MeadowV     30
Mitchel    111
NAmes      435
NPkVill     23
NWAmes     131
NoRidge     71
NridgHt    166
OldTown    220
SWISU       43
Sawyer     147
SawyerW    119
Somerst    182
StoneBr     51
Timber      72
Veenker     24
Name: id, dtype: int64

In [44]:
# Applying one-hot encoding
merged = pd.get_dummies(merged, columns=['neighborhood'], prefix='neighborhood', drop_first=True)
merged.head()

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,lot_shape,utilities,land_slope,condition_1,condition_2,...,neighborhood_NoRidge,neighborhood_NridgHt,neighborhood_OldTown,neighborhood_SWISU,neighborhood_Sawyer,neighborhood_SawyerW,neighborhood_Somerst,neighborhood_StoneBr,neighborhood_Timber,neighborhood_Veenker
0,109,533352170,60,0.0,13517,2,3,2,RRAe,Norm,...,0,0,0,0,1,0,0,0,0,0
1,544,531379050,60,43.0,11492,2,3,2,Norm,Norm,...,0,0,0,0,0,1,0,0,0,0
2,153,535304180,20,68.0,7922,3,3,2,Norm,Norm,...,0,0,0,0,0,0,0,0,0,0
3,318,916386060,60,73.0,9802,3,3,2,Norm,Norm,...,0,0,0,0,0,0,0,0,1,0
4,255,906425045,50,82.0,14235,2,3,2,Norm,Norm,...,0,0,0,0,0,1,0,0,0,0


In [45]:
# Checking columns left in dataset
merged.shape

(2815, 115)

##### Applying one-hot encoding to condition_1 column

In [46]:
# Checking categorical variables
merged.groupby('condition_1').count()['id']

condition_1
Artery      84
Feedr      151
Norm      2430
PosA        20
PosN        38
RRAe        27
RRAn        50
RRNe         6
RRNn         9
Name: id, dtype: int64

In [47]:
# Applying one-hot encoding
merged = pd.get_dummies(merged, columns=['condition_1'], prefix='condition_1', drop_first=True)

In [48]:
# Checking status of implemented encoded columns
merged.iloc[:,99:].head()

Unnamed: 0,neighborhood_MeadowV,neighborhood_Mitchel,neighborhood_NAmes,neighborhood_NPkVill,neighborhood_NWAmes,neighborhood_NoRidge,neighborhood_NridgHt,neighborhood_OldTown,neighborhood_SWISU,neighborhood_Sawyer,...,neighborhood_Timber,neighborhood_Veenker,condition_1_Feedr,condition_1_Norm,condition_1_PosA,condition_1_PosN,condition_1_RRAe,condition_1_RRAn,condition_1_RRNe,condition_1_RRNn
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [49]:
# Checking columns left in dataset
merged.shape 

(2815, 122)

##### Applying one-hot encoding to condition_2 column

In [50]:
# Checking categorical variables
merged.groupby('condition_2').count()['id']

condition_2
Artery       4
Feedr       12
Norm      2788
PosA         4
PosN         3
RRAe         1
RRAn         1
RRNn         2
Name: id, dtype: int64

In [51]:
# Applying one-hot encoding
merged = pd.get_dummies(merged, columns=['condition_2'], prefix='condition_2', drop_first=True)

In [52]:
# Checking status of implemented encoded columns
merged.iloc[:,99:].head()

Unnamed: 0,neighborhood_Mitchel,neighborhood_NAmes,neighborhood_NPkVill,neighborhood_NWAmes,neighborhood_NoRidge,neighborhood_NridgHt,neighborhood_OldTown,neighborhood_SWISU,neighborhood_Sawyer,neighborhood_SawyerW,...,condition_1_RRAn,condition_1_RRNe,condition_1_RRNn,condition_2_Feedr,condition_2_Norm,condition_2_PosA,condition_2_PosN,condition_2_RRAe,condition_2_RRAn,condition_2_RRNn
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0


In [53]:
# Checking columns left in dataset
merged.shape 

(2815, 128)

##### Applying one-hot encoding to remaining categorical columns

In [54]:
# One-hot encode each remaining nominal column, dropping the first level of each.
# Columns are encoded one at a time in this order so the dummy columns are appended
# to the frame in the same order as before.
# NOTE: every prefix is the column's own name. The exterior_2nd dummies were previously
# prefixed 'ext2exterior_2ndnd' (a paste typo); they are now 'exterior_2nd_*', so any
# later code that hard-codes the old names must be updated.
nominal_columns = [
    'bldg_type',
    'house_style',
    'roof_style',
    'roof_matl',
    'exterior_1st',
    'exterior_2nd',
    'mas_vnr_type',
    'foundation',
    'heating',
    'central_air',
    'garage_type',
    'misc_feature',
    'sale_type',
]

for column in nominal_columns:
    merged = pd.get_dummies(merged, columns=[column], prefix=column, drop_first=True)


In [55]:
# Checking status of implemented encoded columns
merged.iloc[:,99:].head()

Unnamed: 0,neighborhood_Veenker,condition_1_Feedr,condition_1_Norm,condition_1_PosA,condition_1_PosN,condition_1_RRAe,condition_1_RRAn,condition_1_RRNe,condition_1_RRNn,condition_2_Feedr,...,misc_feature_TenC,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_VWD,sale_type_WD
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [56]:
# Checking columns left in dataset
merged.shape 

(2815, 204)

##### Applying integer encoding to exter_qual column

In [57]:
# Checking categorical variables
merged.groupby('exter_qual').count()['id']

exter_qual
Ex     106
Fa      24
Gd     983
TA    1702
Name: id, dtype: int64

- Function to encode categorical variables into ordinal values

In [58]:
def exter_qual_enc(var):
    """Translate an exter_qual label into its ordinal integer code.

    'Po' -> 0, 'Fa' -> 1, 'TA' -> 2, 'Gd' -> 3, 'Ex' -> 4.
    Any other value yields None.
    """
    ranking = ('Po', 'Fa', 'TA', 'Gd', 'Ex')
    # The position in the ranking tuple is the ordinal code
    return ranking.index(var) if var in ranking else None

# Applying integer encoding function to categorical data 
merged['exter_qual'] = merged['exter_qual'].map(exter_qual_enc)

In [59]:
merged.shape

(2815, 204)

- Checking integer encoded variables

In [60]:
merged.groupby('exter_qual').count()['id']

exter_qual
1      24
2    1702
3     983
4     106
Name: id, dtype: int64

In [61]:
merged.head()

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,lot_shape,utilities,land_slope,overall_qual,overall_cond,...,misc_feature_TenC,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_VWD,sale_type_WD
0,109,533352170,60,0.0,13517,2,3,2,6,8,...,0,0,0,0,0,0,0,0,0,1
1,544,531379050,60,43.0,11492,2,3,2,7,5,...,0,0,0,0,0,0,0,0,0,1
2,153,535304180,20,68.0,7922,3,3,2,5,7,...,0,0,0,0,0,0,0,0,0,1
3,318,916386060,60,73.0,9802,3,3,2,5,5,...,0,0,0,0,0,0,0,0,0,1
4,255,906425045,50,82.0,14235,2,3,2,6,8,...,0,0,0,0,0,0,0,0,0,1


##### Applying integer encoding to exter_cond column

In [62]:
# Checking categorical variables
merged.groupby('exter_cond').count()['id']

exter_cond
Ex      11
Fa      51
Gd     287
Po       2
TA    2464
Name: id, dtype: int64

- Function to encode categorical variables into ordinal values

In [63]:
def exter_cond_enc(var):
    """Translate an exter_cond label into its ordinal integer code.

    'Po' -> 0, 'Fa' -> 1, 'TA' -> 2, 'Gd' -> 3, 'Ex' -> 4.
    Any other value yields None.
    """
    codes = {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}
    return codes.get(var)

# Applying integer encoding function to categorical data 
merged['exter_cond'] = merged['exter_cond'].map(exter_cond_enc)

In [64]:
merged.shape

(2815, 204)

- Checking integer encoded variables

In [65]:
merged.groupby('exter_cond').count()['id']

exter_cond
0       2
1      51
2    2464
3     287
4      11
Name: id, dtype: int64

##### Applying integer encoding to bsmt_qual column

In [66]:
# Checking categorical variables
merged.groupby('bsmt_qual').count()['id']

bsmt_qual
Ex       257
Fa        78
Gd      1194
None      71
Po         2
TA      1213
Name: id, dtype: int64

- Function to encode categorical variables into ordinal values

In [67]:
def bsmt_qual_enc(var):
    """Translate a bsmt_qual label into its ordinal integer code.

    'None' -> 0, 'Po' -> 1, 'Fa' -> 2, 'TA' -> 3, 'Gd' -> 4, 'Ex' -> 5.
    Any other value yields None.
    """
    ranking = ('None', 'Po', 'Fa', 'TA', 'Gd', 'Ex')
    # The position in the ranking tuple is the ordinal code
    return ranking.index(var) if var in ranking else None
    
# Applying integer encoding function to categorical data 
merged['bsmt_qual'] = merged['bsmt_qual'].map(bsmt_qual_enc)

In [68]:
merged.shape

(2815, 204)

- Checking integer encoded variables

In [69]:
merged.groupby('bsmt_qual').count()['id']

bsmt_qual
0      71
1       2
2      78
3    1213
4    1194
5     257
Name: id, dtype: int64

##### Applying integer encoding to bsmt_cond column

In [70]:
# Checking categorical variables
merged.groupby('bsmt_cond').count()['id']

bsmt_cond
Ex         3
Fa        92
Gd       117
None      71
Po         2
TA      2530
Name: id, dtype: int64

- Function to encode categorical variables into ordinal values

In [71]:
def bsmt_cond_enc(var):
    """Translate a bsmt_cond label into its ordinal integer code.

    'None' -> 0, 'Po' -> 1, 'Fa' -> 2, 'TA' -> 3, 'Gd' -> 4, 'Ex' -> 5.
    Any other value yields None.
    """
    codes = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    return codes.get(var)
    
# Applying integer encoding function to categorical data 
merged['bsmt_cond'] = merged['bsmt_cond'].map(bsmt_cond_enc)

In [72]:
merged.shape

(2815, 204)

- Checking integer encoded variables

In [73]:
merged.groupby('bsmt_cond').count()['id']

bsmt_cond
0      71
1       2
2      92
3    2530
4     117
5       3
Name: id, dtype: int64

##### Applying integer encoding to bsmt_exposure column

In [74]:
# Checking categorical variables
merged.groupby('bsmt_exposure').count()['id']

bsmt_exposure
Av       406
Gd       275
Mn       234
No      1826
None      74
Name: id, dtype: int64

- Function to encode categorical variables into ordinal values

In [75]:
def bsmt_exposure_enc(var):
    """Translate a bsmt_exposure label into its ordinal integer code.

    'None' -> 0, 'No' -> 1, 'Mn' -> 2, 'Av' -> 3, 'Gd' -> 4.
    Any other value yields None.
    """
    ranking = ('None', 'No', 'Mn', 'Av', 'Gd')
    # The position in the ranking tuple is the ordinal code
    return ranking.index(var) if var in ranking else None
    
# Applying integer encoding function to categorical data 
merged['bsmt_exposure'] = merged['bsmt_exposure'].map(bsmt_exposure_enc)

In [76]:
merged.shape

(2815, 204)

- Checking integer encoded variables

In [77]:
merged.groupby('bsmt_exposure').count()['id']

bsmt_exposure
0      74
1    1826
2     234
3     406
4     275
Name: id, dtype: int64

##### Applying integer encoding to bsmtfin_type_1 column

In [78]:
# Checking categorical variables
merged.groupby('bsmtfin_type_1').count()['id']

bsmtfin_type_1
ALQ     415
BLQ     263
GLQ     842
LwQ     150
None     71
Rec     282
Unf     792
Name: id, dtype: int64

- Function to encode categorical variables into ordinal values

In [79]:
def bsmtfin_type_1_enc(var):
    """Translate a bsmtfin_type_1 label into its ordinal integer code.

    'None' -> 0, 'Unf' -> 1, 'LwQ' -> 2, 'Rec' -> 3, 'BLQ' -> 4,
    'ALQ' -> 5, 'GLQ' -> 6. Any other value yields None.
    """
    codes = {
        'None': 0,
        'Unf': 1,
        'LwQ': 2,
        'Rec': 3,
        'BLQ': 4,
        'ALQ': 5,
        'GLQ': 6,
    }
    return codes.get(var)
    
# Applying integer encoding function to categorical data 
merged['bsmtfin_type_1'] = merged['bsmtfin_type_1'].map(bsmtfin_type_1_enc)

In [80]:
merged.shape

(2815, 204)

- Checking integer encoded variables

In [81]:
merged.groupby('bsmtfin_type_1').count()['id']

bsmtfin_type_1
0     71
1    792
2    150
3    282
4    263
5    415
6    842
Name: id, dtype: int64

##### Applying integer encoding to bsmtfin_type_2 column

In [82]:
# Checking categorical variables
merged.groupby('bsmtfin_type_2').count()['id']

bsmtfin_type_2
ALQ       52
BLQ       67
GLQ       33
LwQ       89
None      72
Rec      105
Unf     2397
Name: id, dtype: int64

- Function to encode categorical variables into ordinal values

In [83]:
def bsmtfin_type_2_enc(var):
    """Translate a bsmtfin_type_2 label into its ordinal integer code.

    'None' -> 0, 'Unf' -> 1, 'LwQ' -> 2, 'Rec' -> 3, 'BLQ' -> 4,
    'ALQ' -> 5, 'GLQ' -> 6. Any other value yields None.
    """
    ranking = ('None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ')
    # The position in the ranking tuple is the ordinal code
    return ranking.index(var) if var in ranking else None
    
# Applying integer encoding function to categorical data 
merged['bsmtfin_type_2'] = merged['bsmtfin_type_2'].map(bsmtfin_type_2_enc)

In [84]:
merged.shape

(2815, 204)

- Checking integer encoded variables

In [85]:
merged.groupby('bsmtfin_type_2').count()['id']

bsmtfin_type_2
0      72
1    2397
2      89
3     105
4      67
5      52
6      33
Name: id, dtype: int64

##### Applying integer encoding to heating_qc column

In [86]:
# Checking categorical variables
merged.groupby('heating_qc').count()['id']

heating_qc
Ex    1461
Fa      78
Gd     455
Po       3
TA     818
Name: id, dtype: int64

- Function to encode categorical variables into ordinal values

In [87]:
def heating_qc_enc(var):
    """Translate a heating_qc label into its ordinal integer code.

    'Po' -> 0, 'Fa' -> 1, 'TA' -> 2, 'Gd' -> 3, 'Ex' -> 4.
    Any other value yields None.
    """
    codes = {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}
    return codes.get(var)
    
# Applying integer encoding function to categorical data 
merged['heating_qc'] = merged['heating_qc'].map(heating_qc_enc)

In [88]:
merged.shape

(2815, 204)

- Checking integer encoded variables

In [89]:
merged.groupby('heating_qc').count()['id']

heating_qc
0       3
1      78
2     818
3     455
4    1461
Name: id, dtype: int64

##### Applying integer encoding to electrical column

In [90]:
# Checking categorical variables
merged.groupby('electrical').count()['id']

electrical
FuseA     171
FuseF      43
FuseP       5
Mix         1
SBrkr    2595
Name: id, dtype: int64

- Function to encode categorical variables into ordinal values

In [91]:
def electrical_enc(var):
    """Translate an electrical label into its ordinal integer code.

    'Mix' -> 0, 'FuseP' -> 1, 'FuseF' -> 2, 'FuseA' -> 3, 'SBrkr' -> 4.
    Any other value yields None.
    """
    ranking = ('Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr')
    # The position in the ranking tuple is the ordinal code
    return ranking.index(var) if var in ranking else None
    
# Applying integer encoding function to categorical data 
merged['electrical'] = merged['electrical'].map(electrical_enc)

In [92]:
merged.shape

(2815, 204)

- Checking integer encoded variables

In [93]:
merged.groupby('electrical').count()['id']

electrical
0       1
1       5
2      43
3     171
4    2595
Name: id, dtype: int64

##### Applying integer encoding to kitchen_qual column

In [94]:
# Checking categorical variables
merged.groupby('kitchen_qual').count()['id']

kitchen_qual
Ex     203
Fa      53
Gd    1145
Po       1
TA    1413
Name: id, dtype: int64

- Function to encode categorical variables into ordinal values

In [95]:
def kitchen_qual_enc(var):
    """Translate a kitchen_qual label into its ordinal integer code.

    'Po' -> 0, 'Fa' -> 1, 'TA' -> 2, 'Gd' -> 3, 'Ex' -> 4.
    Any other value yields None.
    """
    codes = {'Po': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}
    return codes.get(var)
    
# Applying integer encoding function to categorical data 
merged['kitchen_qual'] = merged['kitchen_qual'].map(kitchen_qual_enc)

In [96]:
merged.shape

(2815, 204)

- Checking integer encoded variables

In [97]:
merged.groupby('kitchen_qual').count()['id']

kitchen_qual
0       1
1      53
2    1413
3    1145
4     203
Name: id, dtype: int64

##### Applying integer encoding to functional column

In [98]:
# Checking categorical variables
merged.groupby('functional').count()['id']

functional
Maj1      17
Maj2       7
Min1      64
Min2      67
Mod       32
Sal        2
Sev        1
Typ     2625
Name: id, dtype: int64

- Function to encode categorical variables into ordinal values

In [99]:
def functional_enc(var):
    """Translate a functional label into its ordinal integer code.

    'Sal' -> 0, 'Sev' -> 1, 'Maj2' -> 2, 'Maj1' -> 3, 'Mod' -> 4,
    'Min2' -> 5, 'Min1' -> 6, 'Typ' -> 7. Any other value yields None.
    """
    ranking = ('Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ')
    # The position in the ranking tuple is the ordinal code
    return ranking.index(var) if var in ranking else None
    
# Applying integer encoding function to categorical data 
merged['functional'] = merged['functional'].map(functional_enc)

In [100]:
merged.shape

(2815, 204)

- Checking integer encoded variables

In [101]:
merged.groupby('functional').count()['id']

functional
0       2
1       1
2       7
3      17
4      32
5      67
6      64
7    2625
Name: id, dtype: int64

##### Applying integer encoding to fireplace_qu column

In [102]:
# Checking categorical variables
merged.groupby('fireplace_qu').count()['id']

fireplace_qu
Ex        43
Fa        75
Gd       737
None    1317
Po        45
TA       598
Name: id, dtype: int64

- Function to encode categorical variables into ordinal values

In [103]:
def fireplace_qu_enc(var):
    """Translate a fireplace_qu label into its ordinal integer code.

    'None' -> 0, 'Po' -> 1, 'Fa' -> 2, 'TA' -> 3, 'Gd' -> 4, 'Ex' -> 5.
    Any other value yields None.
    """
    codes = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    return codes.get(var)
    
# Applying integer encoding function to categorical data 
merged['fireplace_qu'] = merged['fireplace_qu'].map(fireplace_qu_enc)

In [104]:
merged.shape

(2815, 204)

- Checking integer encoded variables

In [105]:
merged.groupby('fireplace_qu').count()['id']

fireplace_qu
0    1317
1      45
2      75
3     598
4     737
5      43
Name: id, dtype: int64

##### Applying integer encoding to garage_finish column

In [106]:
# Checking categorical variables
merged.groupby('garage_finish').count()['id']

garage_finish
Fin      727
None      45
RFn      812
Unf     1231
Name: id, dtype: int64

- Function to encode categorical variables into ordinal values

In [107]:
def garage_finish_enc(var):
    """Translate a garage_finish label into its ordinal integer code.

    'None' -> 0, 'Unf' -> 1, 'RFn' -> 2, 'Fin' -> 3.
    Any other value yields None.
    """
    ranking = ('None', 'Unf', 'RFn', 'Fin')
    # The position in the ranking tuple is the ordinal code
    return ranking.index(var) if var in ranking else None
    
# Applying integer encoding function to categorical data 
merged['garage_finish'] = merged['garage_finish'].map(garage_finish_enc)

In [108]:
merged.shape

(2815, 204)

- Checking integer encoded variables

In [109]:
merged.groupby('garage_finish').count()['id']

garage_finish
0      45
1    1231
2     812
3     727
Name: id, dtype: int64

##### Applying integer encoding to garage_qual column

In [110]:
# Checking categorical variables
merged.groupby('garage_qual').count()['id']

garage_qual
Ex         3
Fa       124
Gd        24
None      45
Po         5
TA      2614
Name: id, dtype: int64

- Function to encode categorical variables into ordinal values

In [111]:
def garage_qual_enc(var):
    """Translate one garage_qual category label into its ordinal integer code.

    Parameters
    ----------
    var : object
        A single value taken from the garage_qual column.

    Returns
    -------
    int or None
        0 for 'None', then 1 to 5 for 'Po', 'Fa', 'TA', 'Gd', 'Ex' in that
        order; None for any value that is not one of these six labels.
    """
    ordinal_codes = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    # dict.get yields None for any label outside the table
    return ordinal_codes.get(var)
    
# Applying integer encoding function to categorical data.
# Guarded on dtype so that re-running this cell is a no-op: mapping the
# already-encoded integers through garage_qual_enc would return None for
# every value and wipe the column to NaN.
if not pd.api.types.is_numeric_dtype(merged['garage_qual']):
    merged['garage_qual'] = merged['garage_qual'].map(garage_qual_enc)

In [112]:
merged.shape

(2815, 204)

- Checking integer encoded variables

In [113]:
# Row count per encoded level, computed on the id column only
merged.groupby('garage_qual')['id'].count()

garage_qual
0      45
1       5
2     124
3    2614
4      24
5       3
Name: id, dtype: int64

##### Applying integer encoding to garage_cond column

In [114]:
# Checking categorical variables (row count per category, taken from the id column)
merged.groupby('garage_cond')['id'].count()

garage_cond
Ex         3
Fa        74
Gd        15
None      45
Po        14
TA      2664
Name: id, dtype: int64

- Function to encode categorical variables into ordinal values

In [115]:
def garage_cond_enc(var):
    """Translate one garage_cond category label into its ordinal integer code.

    Parameters
    ----------
    var : object
        A single value taken from the garage_cond column.

    Returns
    -------
    int or None
        0 for 'None', then 1 to 5 for 'Po', 'Fa', 'TA', 'Gd', 'Ex' in that
        order; None for any value that is not one of these six labels.
    """
    ordinal_codes = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
    # dict.get yields None for any label outside the table
    return ordinal_codes.get(var)
    
# Applying integer encoding function to categorical data.
# Guarded on dtype so that re-running this cell is a no-op: mapping the
# already-encoded integers through garage_cond_enc would return None for
# every value and wipe the column to NaN.
if not pd.api.types.is_numeric_dtype(merged['garage_cond']):
    merged['garage_cond'] = merged['garage_cond'].map(garage_cond_enc)

In [116]:
merged.shape

(2815, 204)

- Checking integer encoded variables

In [117]:
# Row count per encoded level, computed on the id column only
merged.groupby('garage_cond')['id'].count()

garage_cond
0      45
1      14
2      74
3    2664
4      15
5       3
Name: id, dtype: int64

##### Applying integer encoding to paved_drive column

In [118]:
# Checking categorical variables (row count per category, taken from the id column)
merged.groupby('paved_drive')['id'].count()

paved_drive
N     164
P      59
Y    2592
Name: id, dtype: int64

- Function to encode categorical variables into ordinal values

In [119]:
def paved_drive_enc(var):
    """Translate one paved_drive category label into its ordinal integer code.

    Parameters
    ----------
    var : object
        A single value taken from the paved_drive column.

    Returns
    -------
    int or None
        0 for 'N', 1 for 'P', 2 for 'Y';
        None for any value that is not one of these three labels.
    """
    ordinal_codes = {'N': 0, 'P': 1, 'Y': 2}
    # dict.get yields None for any label outside the table
    return ordinal_codes.get(var)
    
# Applying integer encoding function to categorical data.
# Guarded on dtype so that re-running this cell is a no-op: mapping the
# already-encoded integers through paved_drive_enc would return None for
# every value and wipe the column to NaN.
if not pd.api.types.is_numeric_dtype(merged['paved_drive']):
    merged['paved_drive'] = merged['paved_drive'].map(paved_drive_enc)

In [120]:
merged.shape

(2815, 204)

- Checking integer encoded variables

In [121]:
# Row count per encoded level, computed on the id column only
merged.groupby('paved_drive')['id'].count()

paved_drive
0     164
1      59
2    2592
Name: id, dtype: int64

##### Applying integer encoding to pool_qc column

In [122]:
# Checking categorical variables (row count per category, taken from the id column)
merged.groupby('pool_qc')['id'].count()

pool_qc
Ex         4
Fa         2
Gd         4
None    2802
TA         3
Name: id, dtype: int64

- Function to encode categorical variables into ordinal values

In [123]:
def pool_qc_enc(var):
    """Translate one pool_qc category label into its ordinal integer code.

    Parameters
    ----------
    var : object
        A single value taken from the pool_qc column.

    Returns
    -------
    int or None
        0 for 'None', then 1 to 4 for 'Fa', 'TA', 'Gd', 'Ex' in that order;
        None for any value that is not one of these five labels.
    """
    ordinal_codes = {'None': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4}
    # dict.get yields None for any label outside the table
    return ordinal_codes.get(var)
    
# Applying integer encoding function to categorical data.
# Guarded on dtype so that re-running this cell is a no-op: mapping the
# already-encoded integers through pool_qc_enc would return None for
# every value and wipe the column to NaN.
if not pd.api.types.is_numeric_dtype(merged['pool_qc']):
    merged['pool_qc'] = merged['pool_qc'].map(pool_qc_enc)

In [124]:
merged.shape

(2815, 204)

- Checking integer encoded variables

In [125]:
# Row count per encoded level, computed on the id column only
merged.groupby('pool_qc')['id'].count()

pool_qc
0    2802
1       2
2       3
3       4
4       4
Name: id, dtype: int64

##### Applying integer encoding to fence column

In [126]:
# Checking categorical variables (row count per category, taken from the id column)
merged.groupby('fence')['id'].count()

fence
GdPrv     117
GdWo      105
MnPrv     317
MnWw       12
None     2264
Name: id, dtype: int64

- Function to encode categorical variables into ordinal values

In [127]:
def fence_enc(var):
    """Translate one fence category label into its ordinal integer code.

    Parameters
    ----------
    var : object
        A single value taken from the fence column.

    Returns
    -------
    int or None
        0 for 'None', then 1 to 4 for 'MnWw', 'GdWo', 'MnPrv', 'GdPrv' in
        that order; None for any value that is not one of these five labels.
    """
    ordinal_codes = {'None': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4}
    # dict.get yields None for any label outside the table
    return ordinal_codes.get(var)

# Applying integer encoding function to categorical data.
# Guarded on dtype so that re-running this cell is a no-op: mapping the
# already-encoded integers through fence_enc would return None for
# every value and wipe the column to NaN.
if not pd.api.types.is_numeric_dtype(merged['fence']):
    merged['fence'] = merged['fence'].map(fence_enc)

In [128]:
merged.shape

(2815, 204)

- Checking integer encoded variables

In [129]:
# Row count per encoded level, computed on the id column only
merged.groupby('fence')['id'].count()

fence
0    2264
1      12
2     105
3     317
4     117
Name: id, dtype: int64

##### Checking that the merged dataset does not have any categorical data

In [130]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2815 entries, 0 to 2814
Columns: 204 entries, id to sale_type_WD 
dtypes: float64(11), int64(48), uint8(145)
memory usage: 1.7 MB


##### Checking that the float columns contain only continuous variables

In [131]:
# Names of all float64 columns, in column order
merged.select_dtypes(include='float64').columns.tolist()

['lot_frontage',
 'mas_vnr_area',
 'bsmtfin_sf_1',
 'bsmtfin_sf_2',
 'bsmt_unf_sf',
 'total_bsmt_sf',
 'bsmt_full_bath',
 'bsmt_half_bath',
 'garage_yr_blt',
 'garage_cars',
 'garage_area']

- It can be observed that garage_yr_blt should be an integer value, since it represents a year

In [132]:
merged['garage_yr_blt'].head()

0    1976.0
1    1997.0
2    1953.0
3    2007.0
4    1957.0
Name: garage_yr_blt, dtype: float64

##### Converting garage_yr_blt datatype into int

In [133]:
# Cast garage_yr_blt from float64 to a 64-bit integer column.
# NOTE(review): an integer cast cannot represent NaN, so this presumes the
# column has no missing values at this point - confirm against the cleaning steps.
merged['garage_yr_blt'] = merged['garage_yr_blt'].astype(np.int64)

In [134]:
# Checking garage_yr_blt has been converted to integer value
merged['garage_yr_blt'].dtype

dtype('int64')

In [135]:
merged['garage_yr_blt'].head()

0    1976
1    1997
2    1953
3    2007
4    1957
Name: garage_yr_blt, dtype: int64

In [136]:
merged.shape

(2815, 204)

In [137]:
# Getting the total null values for all columns in the merged dataset,
# keeping only the columns that contain nulls, ordered by column name
merged_null = (
    merged.isnull()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_index()
    .to_frame(name='null_count')
)

# Checking there are no null values
merged_null

Unnamed: 0,null_count


#### Writing cleaned and encoded merged dataset into a csv file to datasets folder

In [138]:
# index=False keeps the row index out of the file, so only the data columns are written
merged.to_csv('../datasets/merged_clean_enc.csv', index=False)

In [139]:
# Checking number of rows and cols for cleaned train and test datasets
print(f"Train dataset: {train_clean.shape}")
print(f"Test dataset: {test_clean.shape}")

Train dataset: (1937, 79)
Test dataset: (878, 79)


### Splitting the cleaned and encoded merged dataset back into training and test sets

In [140]:
# Splitting into train set.
# The training rows were concatenated first, so they occupy the first
# len(train_clean) positions of merged; deriving the boundary from train_clean
# avoids a hard-coded row count that would silently misalign if the data changes.
n_train = len(train_clean)
train_enc = merged.iloc[:n_train, :]

# Checking number of rows and cols of train encoded set
train_enc.shape

(1937, 204)

In [141]:
# Splitting into test set: every row of merged after the first len(train_clean)
# training rows (boundary derived from train_clean instead of a hard-coded count).
# reset_index(drop=True) returns a new frame renumbered from 0, so the slice of
# merged is not modified in place.
test_enc = merged.iloc[len(train_clean):, :].reset_index(drop=True)

# Checking number of rows and cols of test encoded set
test_enc.shape

(878, 204)

##### Checking status of split encoded train and test datasets

In [142]:
train_enc.head()

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,lot_shape,utilities,land_slope,overall_qual,overall_cond,...,misc_feature_TenC,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_VWD,sale_type_WD
0,109,533352170,60,0.0,13517,2,3,2,6,8,...,0,0,0,0,0,0,0,0,0,1
1,544,531379050,60,43.0,11492,2,3,2,7,5,...,0,0,0,0,0,0,0,0,0,1
2,153,535304180,20,68.0,7922,3,3,2,5,7,...,0,0,0,0,0,0,0,0,0,1
3,318,916386060,60,73.0,9802,3,3,2,5,5,...,0,0,0,0,0,0,0,0,0,1
4,255,906425045,50,82.0,14235,2,3,2,6,8,...,0,0,0,0,0,0,0,0,0,1


In [143]:
test_enc.head()

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,lot_shape,utilities,land_slope,overall_qual,overall_cond,...,misc_feature_TenC,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_VWD,sale_type_WD
0,2658,902301120,190,69.0,9142,3,3,2,6,8,...,0,0,0,0,0,0,0,0,0,1
1,2718,905108090,90,0.0,9662,2,3,2,5,4,...,0,0,0,0,0,0,0,0,0,1
2,2414,528218130,60,58.0,17104,2,3,2,7,5,...,0,0,0,0,0,0,1,0,0,0
3,1989,902207150,30,60.0,8520,3,3,2,5,6,...,0,0,0,0,0,0,0,0,0,1
4,625,535105100,20,0.0,9500,2,3,2,6,5,...,0,0,0,0,0,0,0,0,0,1


In [144]:
test_clean.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,...,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type
0,2658,902301120,190,RM,69.0,9142,Pave,Reg,Lvl,AllPub,...,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,0.0,9662,Pave,IR1,Lvl,AllPub,...,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,IR1,Lvl,AllPub,...,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,Reg,Lvl,AllPub,...,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,0.0,9500,Pave,IR1,Lvl,AllPub,...,0,185,0,,,,0,7,2009,WD


In [145]:
train_clean.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,lot_shape,land_contour,utilities,...,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type
0,109,533352170,60,RL,0.0,13517,Pave,IR1,Lvl,AllPub,...,0,0,0,,,,0,3,2010,WD
1,544,531379050,60,RL,43.0,11492,Pave,IR1,Lvl,AllPub,...,0,0,0,,,,0,4,2009,WD
2,153,535304180,20,RL,68.0,7922,Pave,Reg,Lvl,AllPub,...,0,0,0,,,,0,1,2010,WD
3,318,916386060,60,RL,73.0,9802,Pave,Reg,Lvl,AllPub,...,0,0,0,,,,0,4,2010,WD
4,255,906425045,50,RL,82.0,14235,Pave,IR1,Lvl,AllPub,...,0,0,0,,,,0,3,2010,WD


In [146]:
# Checking status of the saleprice column that was separated out at the beginning
train_saleprice.head()

Unnamed: 0,saleprice
0,130500
1,220000
2,109000
3,174000
4,138500


##### Merging back sale price into encoded training dataset

In [147]:
# Joining back sale price column into training set (row-by-row on the index).
# - drop(..., errors='ignore') makes the cell safe to re-run: without it a second
#   run would merge onto a frame that already has saleprice and produce
#   saleprice_x / saleprice_y columns.
# - validate='one_to_one' makes pandas raise if either index contains duplicates.
# - the assert catches rows silently dropped by the inner join when the two
#   indexes do not line up.
n_rows_before = len(train_enc)
train_enc = pd.merge(
    train_enc.drop(columns='saleprice', errors='ignore'),
    train_saleprice,
    how='inner',
    left_index=True,
    right_index=True,
    validate='one_to_one',
)
assert len(train_enc) == n_rows_before == len(train_saleprice), \
    "sale price join changed the number of training rows"
# Checking sale price has been merged back
train_enc.head()

Unnamed: 0,id,pid,ms_subclass,lot_frontage,lot_area,lot_shape,utilities,land_slope,overall_qual,overall_cond,...,sale_type_CWD,sale_type_Con,sale_type_ConLD,sale_type_ConLI,sale_type_ConLw,sale_type_New,sale_type_Oth,sale_type_VWD,sale_type_WD,saleprice
0,109,533352170,60,0.0,13517,2,3,2,6,8,...,0,0,0,0,0,0,0,0,1,130500
1,544,531379050,60,43.0,11492,2,3,2,7,5,...,0,0,0,0,0,0,0,0,1,220000
2,153,535304180,20,68.0,7922,3,3,2,5,7,...,0,0,0,0,0,0,0,0,1,109000
3,318,916386060,60,73.0,9802,3,3,2,5,5,...,0,0,0,0,0,0,0,0,1,174000
4,255,906425045,50,82.0,14235,2,3,2,6,8,...,0,0,0,0,0,0,0,0,1,138500


In [148]:
# Getting the total null values for all columns in the encoded training set,
# keeping only the columns that contain nulls, ordered by column name
train_enc_null = (
    train_enc.isnull()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_index()
    .to_frame(name='null_count')
)

# Checking there are no null values
train_enc_null

Unnamed: 0,null_count


##### Checking number of rows and cols for cleaned and encoded train and test datasets

In [149]:
print(f"Clean and Encoded Train dataset: {train_enc.shape}")
print(f"Clean and Encoded Test dataset: {test_enc.shape}")

Clean and Encoded Train dataset: (1937, 205)
Clean and Encoded Test dataset: (878, 204)


#### Writing cleaned and encoded training and test dataset into a csv file to datasets folder

In [150]:
# Persist the encoded train and test sets; index=False keeps the row index
# out of the files, so only the data columns are written
train_enc.to_csv('../datasets/train_clean_enc.csv', index=False)
test_enc.to_csv('../datasets/test_clean_enc.csv', index=False)