# Project 2 - Ames Housing Data and Kaggle Challenge (02_Data_Cleaning_Test_Set)

## Library Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import r2_score

%matplotlib inline 

## Data Cleaning - Test Set

In [2]:
# Show every column when wide frames are rendered
pd.set_option('display.max_columns', None)
# Load the raw test split
test = pd.read_csv('../datasets/test.csv')

In [3]:
# Report the (rows, columns) dimensions of the raw test data
test.shape

(878, 80)

In [4]:
# Summarise the dtype and non-null count of every column in the test data
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               878 non-null    int64  
 1   PID              878 non-null    int64  
 2   MS SubClass      878 non-null    int64  
 3   MS Zoning        878 non-null    object 
 4   Lot Frontage     718 non-null    float64
 5   Lot Area         878 non-null    int64  
 6   Street           878 non-null    object 
 7   Alley            58 non-null     object 
 8   Lot Shape        878 non-null    object 
 9   Land Contour     878 non-null    object 
 10  Utilities        878 non-null    object 
 11  Lot Config       878 non-null    object 
 12  Land Slope       878 non-null    object 
 13  Neighborhood     878 non-null    object 
 14  Condition 1      878 non-null    object 
 15  Condition 2      878 non-null    object 
 16  Bldg Type        878 non-null    object 
 17  House Style     

##### Checking if Id and PID are unique

In [5]:
# Both identifiers should have as many distinct values as there are rows
for label, column in (('Total unique rows for Id:', 'Id'),
                      ('Total unique rows for PID:', 'PID')):
    print(label, test[column].unique().size)

Total unique rows for Id: 878
Total unique rows for PID: 878


##### Setting column names to lower case and replacing spacing with a '_'

In [6]:
# Normalise the headers in one pass: lower-case first, then spaces -> underscores
test.columns = test.columns.str.lower().str.replace(' ', '_')
test.head()

Unnamed: 0,id,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,utilities,lot_config,land_slope,neighborhood,condition_1,condition_2,bldg_type,house_style,overall_qual,overall_cond,year_built,year_remod/add,roof_style,roof_matl,exterior_1st,exterior_2nd,mas_vnr_type,mas_vnr_area,exter_qual,exter_cond,foundation,bsmt_qual,bsmt_cond,bsmt_exposure,bsmtfin_type_1,bsmtfin_sf_1,bsmtfin_type_2,bsmtfin_sf_2,bsmt_unf_sf,total_bsmt_sf,heating,heating_qc,central_air,electrical,1st_flr_sf,2nd_flr_sf,low_qual_fin_sf,gr_liv_area,bsmt_full_bath,bsmt_half_bath,full_bath,half_bath,bedroom_abvgr,kitchen_abvgr,kitchen_qual,totrms_abvgrd,functional,fireplaces,fireplace_qu,garage_type,garage_yr_blt,garage_finish,garage_cars,garage_area,garage_qual,garage_cond,paved_drive,wood_deck_sf,open_porch_sf,enclosed_porch,3ssn_porch,screen_porch,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,2fmCon,2Story,6,8,1910,1950,Gable,CompShg,AsbShng,AsbShng,,0.0,TA,Fa,Stone,Fa,TA,No,Unf,0,Unf,0,1020,1020,GasA,Gd,N,FuseP,908,1020,0,1928,0,0,2,0,4,2,Fa,9,Typ,0,,Detchd,1910.0,Unf,1,440,Po,Po,Y,0,60,112,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Sawyer,Norm,Norm,Duplex,1Story,5,4,1977,1977,Gable,CompShg,Plywood,Plywood,,0.0,TA,TA,CBlock,Gd,TA,No,Unf,0,Unf,0,1967,1967,GasA,TA,Y,SBrkr,1967,0,0,1967,0,0,2,0,6,2,TA,10,Typ,0,,Attchd,1977.0,Fin,2,580,TA,TA,Y,170,0,0,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,7,5,2006,2006,Gable,CompShg,VinylSd,VinylSd,,0.0,Gd,TA,PConc,Gd,Gd,Av,GLQ,554,Unf,0,100,654,GasA,Ex,Y,SBrkr,664,832,0,1496,1,0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,2006.0,RFn,2,426,TA,TA,Y,100,24,0,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,1Fam,1Story,5,6,1923,2006,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,Gd,TA,CBlock,TA,TA,No,Unf,0,Unf,0,968,968,GasA,TA,Y,SBrkr,968,0,0,968,0,0,1,0,2,1,TA,5,Typ,0,,Detchd,1935.0,Unf,2,480,Fa,TA,N,0,0,184,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,5,1963,1963,Gable,CompShg,Plywood,Plywood,BrkFace,247.0,TA,TA,CBlock,Gd,TA,No,BLQ,609,Unf,0,785,1394,GasA,Gd,Y,SBrkr,1394,0,0,1394,1,0,1,1,3,1,TA,6,Typ,2,Gd,Attchd,1963.0,RFn,2,514,TA,TA,Y,0,76,0,0,185,0,,,,0,7,2009,WD


##### Checking for Duplicated values

In [7]:
# Count rows that are exact copies of an earlier row
duplicate_rows = test.duplicated().sum()
print('Duplicated test values:', duplicate_rows)

Duplicated test values: 0


##### Getting numerical datatypes

In [8]:
# Collect the names of all int64 / float64 columns, in their original order
num_column = test.select_dtypes(include=['int64', 'float64']).columns.tolist()
num_column

['id',
 'pid',
 'ms_subclass',
 'lot_frontage',
 'lot_area',
 'overall_qual',
 'overall_cond',
 'year_built',
 'year_remod/add',
 'mas_vnr_area',
 'bsmtfin_sf_1',
 'bsmtfin_sf_2',
 'bsmt_unf_sf',
 'total_bsmt_sf',
 '1st_flr_sf',
 '2nd_flr_sf',
 'low_qual_fin_sf',
 'gr_liv_area',
 'bsmt_full_bath',
 'bsmt_half_bath',
 'full_bath',
 'half_bath',
 'bedroom_abvgr',
 'kitchen_abvgr',
 'totrms_abvgrd',
 'fireplaces',
 'garage_yr_blt',
 'garage_cars',
 'garage_area',
 'wood_deck_sf',
 'open_porch_sf',
 'enclosed_porch',
 '3ssn_porch',
 'screen_porch',
 'pool_area',
 'misc_val',
 'mo_sold',
 'yr_sold']

## Null Values

### Checking for Null values

In [9]:
# The twenty columns with the most missing values, largest first
test.isna().sum().sort_values(ascending=False).head(20)

pool_qc           874
misc_feature      837
alley             820
fence             706
fireplace_qu      422
lot_frontage      160
garage_cond        45
garage_qual        45
garage_yr_blt      45
garage_finish      45
garage_type        44
bsmt_exposure      25
bsmtfin_type_1     25
bsmt_qual          25
bsmtfin_type_2     25
bsmt_cond          25
mas_vnr_area        1
mas_vnr_type        1
electrical          1
overall_cond        0
dtype: int64

### Handling Numerical Null Values

In [10]:
# Null totals for the numerical columns of the test dataset, keeping only the
# columns that actually contain nulls and ordering them by column name
numeric_null_counts = test[num_column].isna().sum()
test_null_num = (
    numeric_null_counts[numeric_null_counts > 0]
    .to_frame(name='null_count')
    .sort_index()
)
test_null_num

Unnamed: 0,null_count
garage_yr_blt,45
lot_frontage,160
mas_vnr_area,1


##### Cleaning column garage_yr_blt null values

In [11]:
# Shape of the garage_yr_blt slice restricted to its missing rows
test.loc[test['garage_yr_blt'].isna(), 'garage_yr_blt'].shape

(45,)

- As this is test set data, rows of test dataset can't be dropped.


- Assigning a value of 0 to the null values, as garage_yr_blt has only a small number of null values

In [12]:
# Impute every missing garage_yr_blt entry with 0.0
test['garage_yr_blt'] = test['garage_yr_blt'].fillna(0.0)

In [13]:
# No missing garage_yr_blt rows should remain after the fill
test.loc[test['garage_yr_blt'].isna(), 'garage_yr_blt'].shape

(0,)

In [14]:
# Sum the per-value row counts; NaN keys are left out of the groups, so a
# total equal to the row count means every garage_yr_blt entry is populated
test.groupby('garage_yr_blt')['id'].count().sum()

878

##### Cleaning column lot_frontage null values

In [15]:
# Shape of the lot_frontage slice restricted to its missing rows
test.loc[test['lot_frontage'].isna(), 'lot_frontage'].shape

(160,)

- Assigning a value of 0 to the null values, as lot_frontage is a continuous variable

In [16]:
# Impute every missing lot_frontage entry with 0.0
test['lot_frontage'] = test['lot_frontage'].fillna(0.0)

In [17]:
# No missing lot_frontage rows should remain after the fill
test.loc[test['lot_frontage'].isna(), 'lot_frontage'].shape

(0,)

In [18]:
# Sum the per-value row counts; NaN keys are left out of the groups, so a
# total equal to the row count means every lot_frontage entry is populated
test.groupby('lot_frontage')['id'].count().sum()

878

##### Cleaning column mas_vnr_area null values

In [19]:
# Shape of the mas_vnr_area slice restricted to its missing rows
test.loc[test['mas_vnr_area'].isna(), 'mas_vnr_area'].shape

(1,)

- Assigning a value of 0 to the null values, as mas_vnr_area is a continuous variable

In [20]:
# Impute every missing mas_vnr_area entry with 0.0
test['mas_vnr_area'] = test['mas_vnr_area'].fillna(0.0)

In [21]:
# No missing mas_vnr_area rows should remain after the fill
test.loc[test['mas_vnr_area'].isna(), 'mas_vnr_area'].shape

(0,)

In [22]:
# Sum the per-value row counts; NaN keys are left out of the groups, so a
# total equal to the row count means every mas_vnr_area entry is populated
test.groupby('mas_vnr_area')['id'].count().sum()

878

#### Getting null values and checking if any null values are left within numerical columns for test dataset

In [23]:
# Recompute the numerical null totals; an empty frame means nothing is left to fill
numeric_null_counts = test[num_column].isna().sum()
test_null_num = (
    numeric_null_counts[numeric_null_counts > 0]
    .to_frame(name='null_count')
    .sort_index()
)
test_null_num

Unnamed: 0,null_count


### Handling Categorical Null Values

In [24]:
# Null totals across all columns of the test dataset, ordered by column name.
# The numerical columns were filled above, so the columns still listed here
# are the categorical ones.
column_null_counts = test.isna().sum()
test_null_cat = (
    column_null_counts[column_null_counts > 0]
    .to_frame(name='null_count')
    .sort_index()
)
test_null_cat

Unnamed: 0,null_count
alley,820
bsmt_cond,25
bsmt_exposure,25
bsmt_qual,25
bsmtfin_type_1,25
bsmtfin_type_2,25
electrical,1
fence,706
fireplace_qu,422
garage_cond,45


##### Cleaning column alley null values

In [25]:
# Frequency of each populated alley category (value_counts leaves NaN out by default)
test['alley'].value_counts()

Grvl    35
Pave    23
Name: alley, dtype: int64

In [26]:
# Number of rows where alley is missing
test['alley'].isna().sum()

820

In [27]:
# Share of test rows with a missing alley value, expressed as a percentage
test['alley'].isna().sum() / len(test) * 100

93.39407744874715

- As 93.39% of the values in Alley are null, which is a large share of missing data (similar to the train dataset), this column is not likely to affect the pricing of the house. Thus, column Alley shall be dropped.

In [28]:
# Drop the sparsely populated alley column. Rebinding `test` (instead of
# inplace=True) together with errors='ignore' keeps this cell safe to re-run:
# a second execution no longer raises KeyError once the column is gone.
test = test.drop(columns='alley', errors='ignore')

In [29]:
# Confirm the column count after dropping alley
test.shape

(878, 79)

- Total columns for test dataset is now 79

##### Cleaning column bsmt_cond null values

In [30]:
# Number of populated (non-NaN) bsmt_cond entries before imputation
test['bsmt_cond'].notna().sum()

853

In [31]:
# Frequency of each bsmt_cond category before imputation (NaN is left out by default)
test['bsmt_cond'].value_counts()

TA    781
Fa     39
Gd     33
Name: bsmt_cond, dtype: int64

In [32]:
# Number of rows where bsmt_cond is missing
test['bsmt_cond'].isna().sum()

25

- The number of null values in column bsmt_cond is relatively small compared to the size of the entire test dataset


- From the data dictionary, it is noted that the NA value for bsmt_cond is representing no basement


- Thus, assigning the NaN values as a neutral category for further investigation

In [33]:
# Turn missing bsmt_cond entries into the explicit 'None' category
test['bsmt_cond'] = test['bsmt_cond'].fillna('None')

# Remaining null count for bsmt_cond (expected to be 0)
test['bsmt_cond'].isna().sum()

0

In [34]:
# Frequency table after imputation; the former nulls should now appear as 'None'
test['bsmt_cond'].value_counts()

TA      781
Fa       39
Gd       33
None     25
Name: bsmt_cond, dtype: int64

In [35]:
# Populated bsmt_cond entries after imputation; should equal the row count
test['bsmt_cond'].notna().sum()

878

##### Cleaning column bsmt_exposure null values

In [36]:
# Number of populated (non-NaN) bsmt_exposure entries before imputation
test['bsmt_exposure'].notna().sum()

853

In [37]:
# Frequency of each bsmt_exposure category before imputation (NaN is left out by default)
test['bsmt_exposure'].value_counts()

No    567
Av    130
Gd     80
Mn     76
Name: bsmt_exposure, dtype: int64

In [38]:
# Number of rows where bsmt_exposure is missing
test['bsmt_exposure'].isna().sum()

25

- The number of null values in column bsmt_exposure is relatively small compared to the size of the entire test dataset


- From the data dictionary, it is noted that the NA value for bsmt_exposure is representing no basement


- Thus, assigning the NaN values as a neutral category for further investigation

In [39]:
# Turn missing bsmt_exposure entries into the explicit 'None' category
test['bsmt_exposure'] = test['bsmt_exposure'].fillna('None')

# Remaining null count for bsmt_exposure (expected to be 0)
test['bsmt_exposure'].isna().sum()

0

In [40]:
# Frequency table after imputation; the former nulls should now appear as 'None'
test['bsmt_exposure'].value_counts()

No      567
Av      130
Gd       80
Mn       76
None     25
Name: bsmt_exposure, dtype: int64

In [41]:
# Populated bsmt_exposure entries after imputation; should equal the row count
test['bsmt_exposure'].notna().sum()

878

##### Cleaning column bsmt_qual null values

In [42]:
# Number of populated (non-NaN) bsmt_qual entries before imputation
test['bsmt_qual'].notna().sum()

853

In [43]:
# Frequency of each bsmt_qual category before imputation (NaN is left out by default)
test['bsmt_qual'].value_counts()

TA    396
Gd    355
Ex     73
Fa     28
Po      1
Name: bsmt_qual, dtype: int64

In [44]:
# Number of rows where bsmt_qual is missing
test['bsmt_qual'].isna().sum()

25

- The number of null values in column bsmt_qual is relatively small compared to the size of the entire test dataset


- From the data dictionary, it is noted that the NA value for bsmt_qual is representing no basement


- Thus, assigning the NaN values as a neutral category for further investigation

In [45]:
# Turn missing bsmt_qual entries into the explicit 'None' category
test['bsmt_qual'] = test['bsmt_qual'].fillna('None')

# Remaining null count for bsmt_qual (expected to be 0)
test['bsmt_qual'].isna().sum()

0

In [46]:
# Frequency table after imputation; the former nulls should now appear as 'None'
test['bsmt_qual'].value_counts()

TA      396
Gd      355
Ex       73
Fa       28
None     25
Po        1
Name: bsmt_qual, dtype: int64

In [47]:
# Populated bsmt_qual entries after imputation; should equal the row count
test['bsmt_qual'].notna().sum()

878

##### Cleaning column bsmtfin_type_1 null values

In [48]:
# Number of populated (non-NaN) bsmtfin_type_1 entries before imputation
test['bsmtfin_type_1'].notna().sum()

853

In [49]:
# Frequency of each bsmtfin_type_1 category before imputation (NaN is left out by default)
test['bsmtfin_type_1'].value_counts()

Unf    248
GLQ    243
ALQ    136
Rec    105
BLQ     69
LwQ     52
Name: bsmtfin_type_1, dtype: int64

In [50]:
# Number of rows where bsmtfin_type_1 is missing
test['bsmtfin_type_1'].isna().sum()

25

- The number of null values in column bsmtfin_type_1 is relatively small compared to the size of the entire test dataset


- From the data dictionary, it is noted that the NA value for bsmtfin_type_1 is representing no basement


- Thus, assigning the NaN values as a neutral category for further investigation

In [51]:
# Turn missing bsmtfin_type_1 entries into the explicit 'None' category
test['bsmtfin_type_1'] = test['bsmtfin_type_1'].fillna('None')

# Remaining null count for bsmtfin_type_1 (expected to be 0)
test['bsmtfin_type_1'].isna().sum()

0

In [52]:
# Frequency table after imputation; the former nulls should now appear as 'None'
test['bsmtfin_type_1'].value_counts()

Unf     248
GLQ     243
ALQ     136
Rec     105
BLQ      69
LwQ      52
None     25
Name: bsmtfin_type_1, dtype: int64

In [53]:
# Populated bsmtfin_type_1 entries after imputation; should equal the row count
test['bsmtfin_type_1'].notna().sum()

878

##### Cleaning column bsmtfin_type_2 null values

In [54]:
# Number of populated (non-NaN) bsmtfin_type_2 entries before imputation
test['bsmtfin_type_2'].notna().sum()

853

In [55]:
# Frequency of each bsmtfin_type_2 category before imputation (NaN is left out by default)
test['bsmtfin_type_2'].value_counts()

Unf    749
LwQ     29
Rec     26
BLQ     20
ALQ     18
GLQ     11
Name: bsmtfin_type_2, dtype: int64

In [56]:
# Number of rows where bsmtfin_type_2 is missing
test['bsmtfin_type_2'].isna().sum()

25

- The number of null values in column bsmtfin_type_2 is relatively small compared to the size of the entire test dataset


- From the data dictionary, it is noted that the NA value for bsmtfin_type_2 is representing no basement


- Thus, assigning the NaN values as a neutral category for further investigation

In [57]:
# Turn missing bsmtfin_type_2 entries into the explicit 'None' category
test['bsmtfin_type_2'] = test['bsmtfin_type_2'].fillna('None')

# Remaining null count for bsmtfin_type_2 (expected to be 0)
test['bsmtfin_type_2'].isna().sum()

0

In [58]:
# Frequency table after imputation; the former nulls should now appear as 'None'
test['bsmtfin_type_2'].value_counts()

Unf     749
LwQ      29
Rec      26
None     25
BLQ      20
ALQ      18
GLQ      11
Name: bsmtfin_type_2, dtype: int64

In [59]:
# Populated bsmtfin_type_2 entries after imputation; should equal the row count
test['bsmtfin_type_2'].notna().sum()

878

##### Cleaning column electrical null values

In [60]:
# Number of populated (non-NaN) electrical entries before imputation
test['electrical'].notna().sum()

877

In [61]:
# Frequency of each electrical category before imputation (NaN is left out by default)
test['electrical'].value_counts()

SBrkr    813
FuseA     48
FuseF     15
FuseP      1
Name: electrical, dtype: int64

In [62]:
# Number of rows where electrical is missing
test['electrical'].isna().sum()

1

- Checking the most frequent category of the electrical column within each neighborhood.

In [63]:
# Per-neighborhood summary of electrical: count, number of distinct categories,
# the most frequent category (top) and how often it occurs (freq)
test.groupby('neighborhood')['electrical'].describe()

Unnamed: 0_level_0,count,unique,top,freq
neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Blmngtn,6,1,SBrkr,6
Blueste,4,1,SBrkr,4
BrDale,11,1,SBrkr,11
BrkSide,32,3,SBrkr,24
ClearCr,17,1,SBrkr,17
CollgCr,87,1,SBrkr,87
Crawfor,32,3,SBrkr,28
Edwards,50,3,SBrkr,41
Gilbert,49,2,SBrkr,48
Greens,5,1,SBrkr,5


- As there is only 1 null value in column Electrical, the value will be kept.


- However, there is no 'none' category value in column Electrical from the data dictionary.


- Thus, assigning the null value to Standard Circuit Breakers & Romex (SBrkr), which is the most frequently used category for column Electrical across the neighborhoods.

In [64]:
# Impute missing electrical entries with 'SBrkr', the most frequent category
test['electrical'] = test['electrical'].fillna('SBrkr')

# Remaining null count for electrical (expected to be 0)
test['electrical'].isna().sum()

0

In [65]:
# Frequency table after imputation; SBrkr should have gained the imputed row
test['electrical'].value_counts()

SBrkr    814
FuseA     48
FuseF     15
FuseP      1
Name: electrical, dtype: int64

In [66]:
# Populated electrical entries after imputation; should equal the row count
test['electrical'].notna().sum()

878

##### Cleaning column fence null values

In [67]:
# Number of populated (non-NaN) fence entries before imputation
test['fence'].notna().sum()

172

In [68]:
# Frequency of each fence category before imputation (NaN is left out by default)
test['fence'].value_counts()

MnPrv    103
GdPrv     35
GdWo      32
MnWw       2
Name: fence, dtype: int64

In [69]:
# Number of rows where fence is missing
test['fence'].isna().sum()

706

- Although it seems that column Fence has many NaN values, there is a possibility that it might be a valuable predictor for pricing.


- From the data dictionary, it is noted that the NA value for Fence is representing no Fence.


- Thus, assigning the NaN values as a neutral category for further investigation

In [70]:
# Turn missing fence entries into the explicit 'None' category
test['fence'] = test['fence'].fillna('None')

# Remaining null count for fence (expected to be 0)
test['fence'].isna().sum()

0

In [71]:
# Frequency table after imputation; the former nulls should now appear as 'None'
test['fence'].value_counts()

None     706
MnPrv    103
GdPrv     35
GdWo      32
MnWw       2
Name: fence, dtype: int64

In [72]:
# Populated fence entries after imputation; should equal the row count
test['fence'].notna().sum()

878

##### Cleaning column fireplace_qu null values

In [73]:
# Number of populated (non-NaN) fireplace_qu entries before imputation
test['fireplace_qu'].notna().sum()

456

In [74]:
# Frequency of each fireplace_qu category before imputation (NaN is left out by default)
test['fireplace_qu'].value_counts()

Gd    220
TA    193
Fa     16
Po     15
Ex     12
Name: fireplace_qu, dtype: int64

In [75]:
# Number of rows where fireplace_qu is missing
test['fireplace_qu'].isna().sum()

422

- Although it seems that column fireplace_qu has many NaN values, there is a possibility that it might be a valuable predictor for pricing.


- From the data dictionary, it is noted that the NA value for fireplace_qu is representing no fireplace.


- Thus, assigning the NaN values as a neutral category for further investigation

In [76]:
# Turn missing fireplace_qu entries into the explicit 'None' category
test['fireplace_qu'] = test['fireplace_qu'].fillna('None')

# Remaining null count for fireplace_qu (expected to be 0)
test['fireplace_qu'].isna().sum()

0

In [77]:
# Frequency table after imputation; the former nulls should now appear as 'None'
test['fireplace_qu'].value_counts()

None    422
Gd      220
TA      193
Fa       16
Po       15
Ex       12
Name: fireplace_qu, dtype: int64

In [78]:
# Populated fireplace_qu entries after imputation; should equal the row count
test['fireplace_qu'].notna().sum()

878

##### Cleaning column garage_cond null values

In [79]:
# Number of populated (non-NaN) garage_cond entries before imputation
test['garage_cond'].notna().sum()

833

In [80]:
# Frequency of each garage_cond category before imputation (NaN is left out by default)
test['garage_cond'].value_counts()

TA    796
Fa     27
Po      6
Gd      3
Ex      1
Name: garage_cond, dtype: int64

In [81]:
# Number of rows where garage_cond is missing
test['garage_cond'].isna().sum()

45

- The number of null values in column garage_cond is relatively small compared to the size of the entire test dataset.


- From the data dictionary, it is noted that the NA value for garage_cond is representing no garage.


- Thus, assigning the NaN values as a neutral category for further investigation

In [82]:
# Turn missing garage_cond entries into the explicit 'None' category
test['garage_cond'] = test['garage_cond'].fillna('None')

# Remaining null count for garage_cond (expected to be 0)
test['garage_cond'].isna().sum()

0

In [83]:
# Frequency table after imputation; the former nulls should now appear as 'None'
test['garage_cond'].value_counts()

TA      796
None     45
Fa       27
Po        6
Gd        3
Ex        1
Name: garage_cond, dtype: int64

In [84]:
# Populated garage_cond entries after imputation; should equal the row count
test['garage_cond'].notna().sum()

878

##### Cleaning column garage_finish null values

In [85]:
# Number of populated (non-NaN) garage_finish entries before imputation
test['garage_finish'].notna().sum()

833

In [86]:
# Frequency of each garage_finish category before imputation (NaN is left out by default)
test['garage_finish'].value_counts()

Unf    382
RFn    233
Fin    218
Name: garage_finish, dtype: int64

In [87]:
# Number of rows where garage_finish is missing
test['garage_finish'].isna().sum()

45

- The number of null values in column garage_finish is relatively small compared to the size of the entire test dataset.


- From the data dictionary, it is noted that the NA value for garage_finish is representing no garage.


- Thus, assigning the NaN values as a neutral category for further investigation

In [88]:
# Turn missing garage_finish entries into the explicit 'None' category
test['garage_finish'] = test['garage_finish'].fillna('None')

# Remaining null count for garage_finish (expected to be 0)
test['garage_finish'].isna().sum()

0

In [89]:
# Frequency table after imputation; the former nulls should now appear as 'None'
test['garage_finish'].value_counts()

Unf     382
RFn     233
Fin     218
None     45
Name: garage_finish, dtype: int64

In [90]:
# Populated garage_finish entries after imputation; should equal the row count
test['garage_finish'].notna().sum()

878

##### Cleaning column garage_qual null values

In [91]:
# Number of populated (non-NaN) garage_qual entries before imputation
test['garage_qual'].notna().sum()

833

In [92]:
# Frequency of each garage_qual category before imputation (NaN is left out by default)
test['garage_qual'].value_counts()

TA    782
Fa     42
Gd      6
Po      3
Name: garage_qual, dtype: int64

In [93]:
# Number of rows where garage_qual is missing
test['garage_qual'].isna().sum()

45

- The number of null values in column garage_qual is relatively small compared to the size of the entire test dataset.


- From the data dictionary, it is noted that the NA value for garage_qual is representing no garage.


- Thus, assigning the NaN values as a neutral category for further investigation

In [94]:
# Turn missing garage_qual entries into the explicit 'None' category
test['garage_qual'] = test['garage_qual'].fillna('None')

# Remaining null count for garage_qual (expected to be 0)
test['garage_qual'].isna().sum()

0

In [95]:
# Frequency table after imputation; the former nulls should now appear as 'None'
test['garage_qual'].value_counts()

TA      782
None     45
Fa       42
Gd        6
Po        3
Name: garage_qual, dtype: int64

In [96]:
# Populated garage_qual entries after imputation; should equal the row count
test['garage_qual'].notna().sum()

878

##### Cleaning column garage_type null values

In [97]:
# Number of populated (non-NaN) garage_type entries before imputation
test['garage_type'].notna().sum()

834

In [98]:
# Frequency of each garage_type category before imputation (NaN is left out by default)
test['garage_type'].value_counts()

Attchd     518
Detchd     246
BuiltIn     53
Basment      9
CarPort      4
2Types       4
Name: garage_type, dtype: int64

In [99]:
# Number of rows where garage_type is missing
test['garage_type'].isna().sum()

44

- The number of null values in column garage_type is relatively small compared to the size of the entire test dataset.


- From the data dictionary, it is noted that the NA value for garage_type is representing no garage.


- Thus, assigning the NaN values as a neutral category for further investigation

In [100]:
# Turn missing garage_type entries into the explicit 'None' category
test['garage_type'] = test['garage_type'].fillna('None')

# Remaining null count for garage_type (expected to be 0)
test['garage_type'].isna().sum()

0

In [101]:
# Frequency table after imputation; the former nulls should now appear as 'None'
test['garage_type'].value_counts()

Attchd     518
Detchd     246
BuiltIn     53
None        44
Basment      9
CarPort      4
2Types       4
Name: garage_type, dtype: int64

In [102]:
# Populated garage_type entries after imputation; should equal the row count
test['garage_type'].notna().sum()

878

##### Cleaning column mas_vnr_type null values

In [103]:
# Number of populated (non-NaN) mas_vnr_type entries before imputation
test['mas_vnr_type'].notna().sum()

877

In [104]:
# Frequency of each mas_vnr_type category before imputation (NaN is left out by default)
test['mas_vnr_type'].value_counts()

None       534
BrkFace    250
Stone       80
BrkCmn      12
CBlock       1
Name: mas_vnr_type, dtype: int64

In [105]:
# Number of rows where mas_vnr_type is missing
test['mas_vnr_type'].isna().sum()

1

- There is only 1 null value for mas_vnr_type


- From the data dictionary, it is noted that the NA value for mas_vnr_type is representing as 'None'.


- Thus, assigning the NaN values as a neutral category for further investigation

In [106]:
# Fold missing mas_vnr_type entries into the 'None' category
test['mas_vnr_type'] = test['mas_vnr_type'].fillna('None')

# Remaining null count for mas_vnr_type (expected to be 0)
test['mas_vnr_type'].isna().sum()

0

In [107]:
# Frequency table after imputation; 'None' should have gained the imputed row
test['mas_vnr_type'].value_counts()

None       535
BrkFace    250
Stone       80
BrkCmn      12
CBlock       1
Name: mas_vnr_type, dtype: int64

In [108]:
# Populated mas_vnr_type entries after imputation; should equal the row count
test['mas_vnr_type'].notna().sum()

878

##### Cleaning column misc_feature null values

In [109]:
# Number of populated (non-NaN) misc_feature entries before imputation
test['misc_feature'].notna().sum()

41

In [110]:
# Frequency of each misc_feature category before imputation (NaN is left out by default)
test['misc_feature'].value_counts()

Shed    39
Gar2     1
Othr     1
Name: misc_feature, dtype: int64

In [111]:
# Number of rows where misc_feature is missing
test['misc_feature'].isna().sum()

837

- Although it seems that column misc_feature has many NaN values, there is a possibility that it might be a valuable predictor for pricing.


- From the data dictionary, it is noted that the NA value for misc_feature is representing as 'None'.


- Thus, assigning the NaN values as a neutral category for further investigation

In [112]:
# Turn missing misc_feature entries into the explicit 'None' category
test['misc_feature'] = test['misc_feature'].fillna('None')

# Remaining null count for misc_feature (expected to be 0)
test['misc_feature'].isna().sum()

0

In [113]:
# Frequency table after imputation; the former nulls should now appear as 'None'
test['misc_feature'].value_counts()

None    837
Shed     39
Gar2      1
Othr      1
Name: misc_feature, dtype: int64

In [114]:
# Populated misc_feature entries after imputation; should equal the row count
test['misc_feature'].notna().sum()

878

##### Cleaning column pool_qc null values

In [115]:
# Number of populated (non-NaN) pool_qc entries before imputation
test['pool_qc'].notna().sum()

4

In [116]:
# Frequency of each pool_qc category before imputation (NaN is left out by default)
test['pool_qc'].value_counts()

Ex    3
TA    1
Name: pool_qc, dtype: int64

In [117]:
# Number of rows where pool_qc is missing
test['pool_qc'].isna().sum()

874

- Although it seems that column pool_qc has many NaN values, there is a possibility that it might be a valuable predictor for pricing.


- From the data dictionary, it is noted that the NA value for pool_qc is representing no pool.


- Thus, assigning the NaN values as a neutral category for further investigation

In [118]:
# Turn missing pool_qc entries into the explicit 'None' category
test['pool_qc'] = test['pool_qc'].fillna('None')

# Remaining null count for pool_qc (expected to be 0)
test['pool_qc'].isna().sum()

0

In [119]:
# Frequency table after imputation; the former nulls should now appear as 'None'
test['pool_qc'].value_counts()

None    874
Ex        3
TA        1
Name: pool_qc, dtype: int64

In [120]:
# Populated pool_qc entries after imputation; should equal the row count
test['pool_qc'].notna().sum()

878

#### Getting null values and checking if any null values are left within all columns for test dataset

In [121]:
# Tally the remaining nulls in every column of the test dataset.
# Only columns that still contain nulls are kept, ordered by column name
# (the earlier sort_values calls were overridden by sort_index and are dropped).
remaining_nulls = test.isna().sum()
test_null = (
    remaining_nulls[remaining_nulls > 0]
    .to_frame(name='null_count')
    .sort_index()
)

# An empty frame confirms that no null values are left
test_null

Unnamed: 0,null_count


- Checking that the non-null count of every test column matches the total number of rows

In [122]:
# Re-inspect dtypes and non-null counts now that the null handling is complete
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 79 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               878 non-null    int64  
 1   pid              878 non-null    int64  
 2   ms_subclass      878 non-null    int64  
 3   ms_zoning        878 non-null    object 
 4   lot_frontage     878 non-null    float64
 5   lot_area         878 non-null    int64  
 6   street           878 non-null    object 
 7   lot_shape        878 non-null    object 
 8   land_contour     878 non-null    object 
 9   utilities        878 non-null    object 
 10  lot_config       878 non-null    object 
 11  land_slope       878 non-null    object 
 12  neighborhood     878 non-null    object 
 13  condition_1      878 non-null    object 
 14  condition_2      878 non-null    object 
 15  bldg_type        878 non-null    object 
 16  house_style      878 non-null    object 
 17  overall_qual    

- It appears that some ms_zoning category labels contain spaces and parenthesised suffixes (e.g. 'C (all)'), unlike the names in the data dictionary

In [123]:
# Inspect the raw ms_zoning labels and their frequencies
test['ms_zoning'].value_counts()

RL         674
RM         146
FV          38
RH          13
C (all)      6
I (all)      1
Name: ms_zoning, dtype: int64

- Replacing the ms_zoning category names that contain spaces with a single character for easy referencing

In [124]:
# Collapse the verbose zoning labels to their single-letter codes.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the column selection: that form is chained
# assignment, which under pandas Copy-on-Write modifies a temporary object
# and leaves `test` unchanged.
zoning_codes = {'C (all)': 'C', 'A (agr)': 'A', 'I (all)': 'I'}
test['ms_zoning'] = test['ms_zoning'].replace(zoning_codes)

In [125]:
# Confirm the zoning labels are now the shortened codes
test['ms_zoning'].value_counts()

RL    674
RM    146
FV     38
RH     13
C       6
I       1
Name: ms_zoning, dtype: int64

#### Writing cleaned test dataset into a csv file to datasets folder

In [126]:
# Persist the cleaned test set without writing the index as a column.
# NOTE(review): newer pandas versions (2.0+, to be confirmed) appear to treat the literal
# string 'None' as NaN in read_csv by default, so the 'None' categories written here may
# load back as nulls unless the reader passes keep_default_na=False -- verify downstream.
test.to_csv('../datasets/test_clean.csv', index=False)