In [1]:
import pandas as pd
import numpy as np
import itertools
import re
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')
import helper

In [2]:
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
isu_pal = [
    "#FF0B04", "#9A3324", "#B9975B", "#707372", "#F1BE48",
    "#FF0B04", "#9A3324", "#B9975B", "#707372", "#F1BE48",
    "#FF0B04", "#9A3324", "#B9975B", "#707372", "#F1BE48",
    "#FF0B04", "#9A3324", "#B9975B", "#707372", "#F1BE48",
    "#FF0B04", "#9A3324", "#B9975B", "#707372", "#F1BE48",
    "#FF0B04", "#9A3324", "#B9975B", "#707372", "#F1BE48"
]
sns.set_palette(sns.color_palette(isu_pal))

In [3]:
hous_trn = pd.read_csv('train.csv', index_col=0, low_memory=False)

In [4]:
hous_trn.head()

Unnamed: 0,PID,GrLivArea,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,528382020,2270,290000,60,RL,92.0,11764,Pave,,IR1,...,0,0,,,,0,4,2010,WD,Normal
1,527450110,987,103400,160,RM,21.0,1680,Pave,,Reg,...,0,0,,,,0,2,2008,WD,Normal
2,909282020,2320,259500,70,RL,61.0,7259,Pave,,IR1,...,0,0,,MnPrv,,0,7,2007,WD,Normal
3,905200010,912,129000,20,RL,,8169,Pave,,Reg,...,0,0,,MnPrv,,0,7,2007,WD,Normal
4,914465020,2052,251000,60,RL,75.0,10125,Pave,,Reg,...,0,0,,,,0,5,2009,WD,Normal


In [5]:
# Removing the duplicated record has eliminated dulpicates from PID.
hous_trn[hous_trn.PID.duplicated()]

Unnamed: 0,PID,GrLivArea,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition


In [6]:
pd.DataFrame(hous_trn.columns.to_list()).to_csv('vars.csv')

In [7]:
myvar_dict = {'Lot':['LotFrontage', 'LotArea', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope'], 
         'Neighborhood':['Neighborhood'],
          'Bsmnt':['Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 
                   'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'],
         'Bedroom':['BedroomAbvGr'],
         'Porch':['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']
         }

In [8]:
myvar = list(itertools.chain(*myvar_dict.values()))

In [9]:
hous_trn[myvar].head()

Unnamed: 0,LotFrontage,LotArea,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,Foundation,BsmtQual,BsmtCond,...,BsmtUnfSF,TotalBsmtSF,BsmtFullBath,BsmtHalfBath,BedroomAbvGr,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch
0,92.0,11764,IR1,Lvl,CulDSac,Gtl,NoRidge,PConc,Gd,TA,...,628.0,1152.0,0.0,0.0,4,132,57,0,0,0
1,21.0,1680,Reg,Lvl,Inside,Gtl,BrDale,CBlock,TA,TA,...,25.0,483.0,0.0,1.0,2,52,0,0,0,0
2,61.0,7259,IR1,Lvl,Inside,Mod,Crawfor,CBlock,TA,TA,...,104.0,1028.0,1.0,0.0,3,224,0,0,0,0
3,,8169,Reg,Lvl,Corner,Gtl,Sawyer,CBlock,TA,TA,...,261.0,912.0,1.0,0.0,3,204,0,0,0,0
4,75.0,10125,Reg,Lvl,Inside,Gtl,Mitchel,PConc,Gd,TA,...,412.0,1107.0,0.0,0.0,3,210,91,0,0,0


In [10]:
hous_trn[myvar].dtypes 

LotFrontage      float64
LotArea            int64
LotShape          object
LandContour       object
LotConfig         object
LandSlope         object
Neighborhood      object
Foundation        object
BsmtQual          object
BsmtCond          object
BsmtExposure      object
BsmtFinType1      object
BsmtFinSF1       float64
BsmtFinType2      object
BsmtFinSF2       float64
BsmtUnfSF        float64
TotalBsmtSF      float64
BsmtFullBath     float64
BsmtHalfBath     float64
BedroomAbvGr       int64
WoodDeckSF         int64
OpenPorchSF        int64
EnclosedPorch      int64
3SsnPorch          int64
ScreenPorch        int64
dtype: object

In [11]:
for var in hous_trn[myvar].columns.to_list():
    print(f'{var}: {hous_trn[myvar][var].unique()}')

LotFrontage: [ 92.  21.  61.  nan  75.  67.  42.  78. 138.  24.  60.  64.  77. 105.
  50.  80.  90.  63.  40.  87.  70.  35.  85.  65.  91.  58.  79.  59.
  76.  81. 130.  98.  62.  72.  43.  73.  41.  52. 120.  47. 149.  94.
 110.  68.  55.  88.  82. 144.  48.  74.  45.  53. 113. 107.  34.  46.
  57.  56.  54. 114. 125.  83.  69.  30.  93.  38. 128.  86. 100.  66.
 121.  44.  37.  84.  31.  51.  71.  95.  89. 112. 102.  49.  28. 129.
 195.  36. 123. 152.  99. 200.  96.  32. 109.  39.  97. 124. 116. 313.
 136. 141.  22. 108. 140. 104. 103. 115. 101. 119.  33. 118. 153. 160.
  26. 150. 106. 111. 131.  25.]
LotArea: [11764  1680  7259 ... 16059 22420  3612]
LotShape: ['IR1' 'Reg' 'IR2' 'IR3']
LandContour: ['Lvl' 'Bnk' 'HLS' 'Low']
LotConfig: ['CulDSac' 'Inside' 'Corner' 'FR2' 'FR3']
LandSlope: ['Gtl' 'Mod' 'Sev']
Neighborhood: ['NoRidge' 'BrDale' 'Crawfor' 'Sawyer' 'Mitchel' 'CollgCr' 'OldTown'
 'ClearCr' 'NPkVill' 'IDOTRR' 'Edwards' 'NridgHt' 'BrkSide' 'SawyerW'
 'NWAmes' 'Gilbert' 'Som

In [12]:
cols_na = hous_trn[myvar].loc[:,hous_trn[myvar].isna().any(axis=0)].columns.to_list()
cols_na

['LotFrontage',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinSF1',
 'BsmtFinType2',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'BsmtFullBath',
 'BsmtHalfBath']

In [13]:
hous_trn = hous_trn.astype({'LotArea':'float64', 'LotShape':'string', 
                 'LandContour':'string', 'LotConfig':'string',
                 'LandSlope':'string', 'Neighborhood':'string',
                 'Foundation':'string', 'BsmtQual':'string',
                 'BsmtCond':'string', 'BsmtExposure':'string', 
                 'BsmtFinType1':'string', 'BsmtFinType2':'string',
                 'BsmtFullBath':'Int64', 'BsmtHalfBath':'Int64',
                 'WoodDeckSF':'float64', 'OpenPorchSF':'float64',
                 'EnclosedPorch':'float64', '3SsnPorch':'float64',
                 'ScreenPorch':'float64'})

In [14]:
hous_trn[myvar].dtypes

LotFrontage      float64
LotArea          float64
LotShape          string
LandContour       string
LotConfig         string
LandSlope         string
Neighborhood      string
Foundation        string
BsmtQual          string
BsmtCond          string
BsmtExposure      string
BsmtFinType1      string
BsmtFinSF1       float64
BsmtFinType2      string
BsmtFinSF2       float64
BsmtUnfSF        float64
TotalBsmtSF      float64
BsmtFullBath       Int64
BsmtHalfBath       Int64
BedroomAbvGr       int64
WoodDeckSF       float64
OpenPorchSF      float64
EnclosedPorch    float64
3SsnPorch        float64
ScreenPorch      float64
dtype: object

In [15]:
bsmt_cols = [re.search('Bsmt', col) != None for col in hous_trn.columns]
bsmt_cols

[False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 True,
 True,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False,
 False]

In [16]:
hous_trn[hous_trn['BsmtHalfBath'].isna()].loc[:,[col.startswith('Bsmt') for col in hous_trn.columns]]

Unnamed: 0,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,BsmtFullBath,BsmtHalfBath
619,,,,,0.0,,0.0,0.0,,
1825,,,,,,,,,,


In [17]:
hous_trn.iloc[[1825]]

Unnamed: 0,PID,GrLivArea,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
1825,903230120,896,79000,20,RM,99.0,5940.0,Pave,,IR1,...,0.0,0,,MnPrv,,0,4,2008,ConLD,Abnorml


In [18]:
hous_trn[(hous_trn['TotalBsmtSF'].isna()) | (hous_trn['TotalBsmtSF']==0)].shape

(56, 81)

In [19]:
bsmt_df1 = hous_trn[(hous_trn['TotalBsmtSF'].isna()) | (hous_trn['TotalBsmtSF']==0)]

In [20]:
hous_trn[hous_trn['BsmtQual'].isna()].shape

(56, 81)

In [21]:
bsmt_df2 = hous_trn[hous_trn['BsmtQual'].isna()]

In [22]:
bsmt_df3 = bsmt_df1.merge(bsmt_df2, how='outer', indicator=True)
df3[df3['_merge']!='both']

NameError: name 'df3' is not defined

In [None]:
hous_trn[(hous_trn['BsmtQual'].isna()) & (hous_trn['TotalBsmtSF']!=0)].loc[:,[re.search('Bsmt', col) != None for col in hous_trn.columns]].shape