In [32]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import SelectKBest, mutual_info_regression, RFE
from sklearn.linear_model import LinearRegression, LassoCV

In [33]:
df = pd.read_csv('test.csv')

In [34]:
print(df.dtypes)

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 80, dtype: object


In [35]:
df.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
count,1459.0,1459.0,1232.0,1459.0,1459.0,1459.0,1459.0,1459.0,1444.0,1458.0,...,1458.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0,1459.0
mean,2190.0,57.378341,68.580357,9819.161069,6.078821,5.553804,1971.357779,1983.662783,100.709141,439.203704,...,472.768861,93.174777,48.313914,24.243317,1.79438,17.064428,1.744345,58.167923,6.104181,2007.769705
std,421.321334,42.74688,22.376841,4955.517327,1.436812,1.11374,30.390071,21.130467,177.6259,455.268042,...,217.048611,127.744882,68.883364,67.227765,20.207842,56.609763,30.491646,630.806978,2.722432,1.30174
min,1461.0,20.0,21.0,1470.0,1.0,1.0,1879.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0
25%,1825.5,20.0,58.0,7391.0,5.0,5.0,1953.0,1963.0,0.0,0.0,...,318.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0
50%,2190.0,50.0,67.0,9399.0,6.0,5.0,1973.0,1992.0,0.0,350.5,...,480.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0
75%,2554.5,70.0,80.0,11517.5,7.0,6.0,2001.0,2004.0,164.0,753.5,...,576.0,168.0,72.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0
max,2919.0,190.0,200.0,56600.0,10.0,9.0,2010.0,2010.0,1290.0,4010.0,...,1488.0,1424.0,742.0,1012.0,360.0,576.0,800.0,17000.0,12.0,2010.0


## Missing Values

In [36]:
missing_values = df.isnull().sum()
cols_missing_values = missing_values[missing_values > 0].sort_values()
cols_missing_values

TotalBsmtSF        1
GarageArea         1
GarageCars         1
KitchenQual        1
BsmtUnfSF          1
BsmtFinSF2         1
BsmtFinSF1         1
SaleType           1
Exterior1st        1
Exterior2nd        1
Functional         2
Utilities          2
BsmtHalfBath       2
BsmtFullBath       2
MSZoning           4
MasVnrArea        15
MasVnrType        16
BsmtFinType2      42
BsmtFinType1      42
BsmtQual          44
BsmtExposure      44
BsmtCond          45
GarageType        76
GarageFinish      78
GarageQual        78
GarageCond        78
GarageYrBlt       78
LotFrontage      227
FireplaceQu      730
Fence           1169
Alley           1352
MiscFeature     1408
PoolQC          1456
dtype: int64

### Electrical - filled with dominant class

In [37]:
print(df.Electrical.value_counts(normalize = True))
df.Electrical.fillna('SBrkr', inplace = True)

SBrkr    0.916381
FuseA    0.064428
FuseF    0.015764
FuseP    0.003427
Name: Electrical, dtype: float64


### MasVnrType - filled with dominant class 

In [38]:
print(df.MasVnrType.value_counts())
df.MasVnrType.fillna('None', inplace = True)

None       878
BrkFace    434
Stone      121
BrkCmn      10
Name: MasVnrType, dtype: int64


### MasVnrArea - filled with 0 class to match class type

In [39]:
print(df.MasVnrArea.value_counts())
df.MasVnrArea.fillna(0, inplace = True)

0.0       877
176.0      10
144.0       9
120.0       8
216.0       8
         ... 
647.0       1
1290.0      1
495.0       1
292.0       1
382.0       1
Name: MasVnrArea, Length: 303, dtype: int64


### BsmtQual, BsmtCond, BsmtFinType1, BsmtExposure,BsmtFinType2 - Filled with NA to indicate no basement (from data description file)

In [40]:
df.BsmtQual.fillna('NA', inplace = True)

In [41]:
df.BsmtCond.fillna('NA', inplace = True)

In [42]:
df.BsmtFinType1.fillna('NA', inplace = True)

In [43]:
df.BsmtExposure.fillna('NA', inplace = True)

In [44]:
df.BsmtFinType2.fillna('NA', inplace = True)

### Garage - Filled with NA to indicate no garage (from data description file)

In [45]:
df.GarageType.fillna('NA', inplace = True)

In [46]:
df.GarageYrBlt.fillna('NA', inplace = True)

In [47]:
df.GarageFinish.fillna('NA', inplace = True)

In [48]:
df.GarageQual.fillna('NA', inplace = True)

In [49]:
df.GarageCond.fillna('NA', inplace = True)

### LotFrontage

In [50]:
df.LotFrontage.fillna(np.mean(df.LotFrontage), inplace = True)

### FireplaceQu

In [51]:
df.FireplaceQu.fillna('NA', inplace = True)

### Fence

In [52]:
df.Fence.fillna('NA', inplace = True)

### Alley

In [53]:
df.Alley.fillna('NA', inplace = True)

### Pool

In [54]:
df.PoolQC.fillna('NA', inplace = True)

### Misc Features

In [55]:
df.MiscFeature.fillna('None', inplace = True)

### Missing Values Removed

In [56]:
missing_values = df.isnull().sum()
cols_missing_values = missing_values[missing_values > 0].sort_values()
cols_missing_values

Exterior1st     1
Exterior2nd     1
BsmtFinSF1      1
BsmtFinSF2      1
BsmtUnfSF       1
TotalBsmtSF     1
KitchenQual     1
GarageCars      1
GarageArea      1
SaleType        1
Utilities       2
BsmtFullBath    2
BsmtHalfBath    2
Functional      2
MSZoning        4
dtype: int64

# Columns Examination


In [57]:
cols_to_plot = ['LotFrontage', 'LotArea', 'OverallQual', 'HouseStyle', 'OverallCond', 'YearBuilt', 'YearRemodAdd'
               'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'TotRmsAbvGrd',
               'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'MoSold', 'YrSold', 'SalePrice']


In [58]:
a = list(df.columns)
for i in a:
    print(i)
    print(df[i].value_counts(normalize = True))
    print('----------------------------')

Id
1461    0.000685
2441    0.000685
2439    0.000685
2438    0.000685
2437    0.000685
          ...   
1945    0.000685
1944    0.000685
1943    0.000685
1942    0.000685
2919    0.000685
Name: Id, Length: 1459, dtype: float64
----------------------------
MSSubClass
20     0.372173
60     0.189171
50     0.098012
120    0.065113
30     0.047978
70     0.046607
160    0.044551
80     0.041124
90     0.039068
190    0.021247
85     0.019191
75     0.004798
180    0.004798
45     0.004112
40     0.001371
150    0.000685
Name: MSSubClass, dtype: float64
----------------------------
MSZoning
RL         0.765636
RM         0.166323
FV         0.050859
C (all)    0.010309
RH         0.006873
Name: MSZoning, dtype: float64
----------------------------
LotFrontage
68.580357     0.155586
60.000000     0.091158
80.000000     0.046607
70.000000     0.043180
50.000000     0.041124
                ...   
117.000000    0.000685
31.000000     0.000685
119.000000    0.000685
25.000000     0.000685
14

In [59]:
df.shape

(1459, 80)

In [60]:
#cols wo enough variance, drop
cols_to_drop = ['MSZoning', 'Street', 'Alley', 'LandContour', 'Utilities','LandSlope', 'Condition1', 'Condition2',
               'BldgType','RoofMatl', 'BsmtCond', 'BsmtFinType2', 'BsmtFinSF2', 'Heating', 'LowQualFinSF', 'BsmtHalfBath',
               'Functional', 'FireplaceQu', 'GarageYrBlt','GarageQual','GarageCond','PavedDrive', 'EnclosedPorch', 
                '3SsnPorch', 'PoolQC', 'MiscFeature', 'MiscVal', 'ExterCond', 'PoolArea']
df.drop(cols_to_drop, inplace = True, axis = 1)





In [61]:
#cols that need to be condensed
cols_to_combine = ['LotShape', 'LotConfig', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
                  'ExterQual', 'Foundation', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'HeatingQC'
                  'Electrical', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
                  'Fireplaces', 'GarageType', 'GarageFinish', 'GarageCars', 'PoolArea', 'Fence', 'SaleType',
                  'SaleCondition']

## Column Condensation

In [62]:
#cols that need to be condensed
cols_to_combine = ['LotShape', 'LotConfig', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
                  'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'HeatingQC'
                  'Electrical', 'FullBath', 'HalfBath', 'KitchenQual',
                  'GarageType', 'GarageFinish', 'GarageCars', 'Fence', 'SaleType',
                  'SaleCondition']

In [63]:
#LotShape
for i in range(len(df.LotShape)):
    if df.LotShape[i] == 'Reg':
        df['LotShape'][i] = 1
    else:
        df['LotShape'][i] = 0
df.LotShape = df.LotShape.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['LotShape'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['LotShape'][i] = 0


In [64]:
print(df.LotConfig.value_counts())
for i in range(len(df.LotConfig)):
    if df.LotConfig[i] == 'Inside':
        df['LotConfig'][i] = 1
    else:
        df['LotConfig'][i] = 0
df.LotConfig = df.LotConfig.astype(int)

Inside     1081
Corner      248
CulDSac      82
FR2          38
FR3          10
Name: LotConfig, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['LotConfig'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['LotConfig'][i] = 0


In [65]:
print(df.HouseStyle.value_counts())

1Story    745
2Story    427
1.5Fin    160
SLvl       63
SFoyer     46
2.5Unf     13
1.5Unf      5
Name: HouseStyle, dtype: int64


In [66]:
for i in range(len(df.HouseStyle)):
    if df.HouseStyle[i] in ('SLvl', 'SFoyer', '1.5Unf', '2.5Unf', '2.5Fin'):
        df.HouseStyle[i] = 'Other'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.HouseStyle[i] = 'Other'


In [67]:
print(df.RoofStyle.value_counts())
for i in range(len(df.RoofStyle)):
    if df.RoofStyle[i] == 'Gable':
        df['RoofStyle'][i] = 1
    else:
        df['RoofStyle'][i] = 0
df.RoofStyle = df.RoofStyle.astype(int)

Gable      1169
Hip         265
Gambrel      11
Flat          7
Mansard       4
Shed          3
Name: RoofStyle, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RoofStyle'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['RoofStyle'][i] = 0


In [68]:
cols_to_combine = ['LotShape', 'LotConfig', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
                  'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'HeatingQC'
                  'Electrical', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
                  'Fireplaces', 'GarageType', 'GarageFinish', 'GarageCars', 'PoolArea', 'Fence', 'SaleType',
                  'SaleCondition']

In [92]:
df.Exterior1st.value_counts()
for i in range(len(df.Exterior1st)):
    if df.Exterior1st[i] in ('CemntBd', 'BrkFace', 'WdShing', 'Stucco', 'AsbShng', 'BrkComm', 'Stone', 'AsphShn', 'ImStucc', 'CBlock'):
        df.Exterior1st[i] = 'Other'


NameError: name 'na' is not defined

In [70]:
df.Exterior1st.value_counts()

VinylSd    510
MetalSd    230
HdBoard    220
Wd Sdng    205
Other      180
Plywood    113
Name: Exterior1st, dtype: int64

In [71]:
df.Exterior2nd.value_counts()
for i in range(len(df.Exterior2nd)):
    if df.Exterior2nd[i] in ('CmentBd', 'Wd Shng', 'Brk Cmn','CemntBd', 'BrkFace', 'WdShing', 'Stucco', 'AsbShng', 'BrkComm', 'Stone', 'AsphShn', 'ImStucc', 'CBlock'):
        df.Exterior2nd[i] = 'Other'

        
        
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Exterior2nd[i] = 'Other'


In [72]:
df.MasVnrType.value_counts(normalize = True)
for i in range(len(df.MasVnrType)):
    if df.MasVnrType[i] == 'None':
        df['MasVnrType'][i] = 0
    else:
        df['MasVnrType'][i] = 1
df.MasVnrType = df.MasVnrType.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['MasVnrType'][i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['MasVnrType'][i] = 1


In [73]:
df.ExterQual.value_counts(normalize = True)
for i in range(len(df.ExterQual)):
    if df.ExterQual[i] == 'Fa':
        df['ExterQual'][i] = 'TA'


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ExterQual'][i] = 'TA'


In [74]:
df.Foundation.value_counts(normalize = True)
for i in range(len(df.Foundation)):
    if df.Foundation[i] in ('Slab', 'Stone', 'Wood'):
        df['Foundation'][i] = 'Other'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Foundation'][i] = 'Other'


In [75]:
df.BsmtQual.value_counts(normalize = True)

TA    0.434544
Gd    0.405072
Ex    0.093900
Fa    0.036326
NA    0.030158
Name: BsmtQual, dtype: float64

In [76]:
df.BsmtQual.value_counts(normalize = True)
for i in range(len(df.BsmtQual)):
    if df.BsmtQual[i] in ('NA', 'Fa', 'TA'):
        df['BsmtQual'][i] = 0
    else:
        df['BsmtQual'][i] = 1
        
df.BsmtQual = df.BsmtQual.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['BsmtQual'][i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['BsmtQual'][i] = 1


In [77]:
df.BsmtExposure.value_counts(normalize = True)
for i in range(len(df.BsmtExposure)):
    if df.BsmtExposure[i] in ('No', 'NA'):
        df['BsmtExposure'][i] = 0
    else:
        df['BsmtExposure'][i] = 1
df.BsmtExposure = df.BsmtExposure.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['BsmtExposure'][i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['BsmtExposure'][i] = 1


In [78]:
df.BsmtFinType1.value_counts(normalize = True)
for i in range(len(df.BsmtFinType1)):
    if df.BsmtFinType1[i] in ('Unf', 'NA'):
        df['BsmtFinType1'][i] = 0
    elif df.BsmtFinType1[i] in ('Rec', 'BLQ', 'LwQ'):
        df['BsmtFinType1'][i] = 1
    else:
        df.BsmtFinType1[i] = 2
df.BsmtFinType1 = df.BsmtFinType1.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['BsmtFinType1'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.BsmtFinType1[i] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['BsmtFinType1'][i] = 0


In [79]:
df.HeatingQC.value_counts(normalize = True)
for i in range(len(df.HeatingQC)):
    if df.HeatingQC[i] in ('Ex'):
        df['HeatingQC'][i] = 1
    else:
        df['HeatingQC'][i] = 0
df.HeatingQC = df.HeatingQC.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['HeatingQC'][i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['HeatingQC'][i] = 1


In [80]:
df.Electrical.value_counts(normalize = True)
for i in range(len(df.Electrical)):
    if df.Electrical[i] in ('SBrkr'):
        df['Electrical'][i] = 1
    else:
        df['Electrical'][i] = 0
df.Electrical = df.Electrical.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Electrical'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Electrical'][i] = 0


In [81]:
df.HalfBath.value_counts(normalize = True)
df.FullBath.value_counts(normalize = True)

df['NoBath'] = df.FullBath + df.HalfBath/2

df.drop(['HalfBath', 'FullBath'], inplace = True, axis = 1)

df.NoBath = df.NoBath.astype(int)

In [82]:
df.KitchenQual.value_counts()
for i in range(len(df.KitchenQual)):
    if df.KitchenQual[i] in ('Fa', 'TA'):
        df['KitchenQual'][i] = 0
    elif df.KitchenQual[i] == 'Gd':
        df['KitchenQual'][i] = 1
    else:
        df['KitchenQual'][i] = 2
df.KitchenQual = df.KitchenQual.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['KitchenQual'][i] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['KitchenQual'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['KitchenQual'][i] = 2


In [83]:
df.GarageType.value_counts()
for i in range(len(df.GarageType)):
    if df.GarageType[i] in ('NA'):
        df['GarageType'][i] = 0
    else:
        df['GarageType'][i] = 1
df.GarageType = df.GarageType.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['GarageType'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['GarageType'][i] = 0


In [84]:
df.GarageFinish.value_counts()
for i in range(len(df.GarageFinish)):
    if df.GarageFinish[i] == 'NA':
        df['GarageFinish'][i] = 0
    elif df.GarageFinish[i] in ('Unf', 'RFn'):
        df['GarageFinish'][i] = 1
    else:
        df['GarageFinish'][i] = 2
df.GarageFinish = df.GarageFinish.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['GarageFinish'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['GarageFinish'][i] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['GarageFinish'][i] = 0


In [85]:
df.Fence.value_counts()
for i in range(len(df.Fence)):
    if df.Fence[i] == 'NA':
        df['Fence'][i] = 0
    else:
        df['Fence'][i] = 1
df.Fence = df.Fence.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Fence'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Fence'][i] = 0


In [86]:
df['SaleType'].value_counts()
for i in range(len(df.SaleType)):
    if df.SaleType[i] in ('WD', 'New'):
        df['SaleType'][i] = 1
    else:
        df['SaleType'][i] = 0
df.SaleType = df.SaleType.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['SaleType'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['SaleType'][i] = 0


In [87]:
df['SaleCondition'].value_counts()
for i in range(len(df.SaleCondition)):
    if df.SaleCondition[i] == 'Normal':
        df['SaleCondition'][i] = 1
    else:
        df['SaleCondition'][i] = 0
df.SaleCondition = df.SaleCondition.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['SaleCondition'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['SaleCondition'][i] = 0


In [88]:
df['CentralAir'].value_counts()
for i in range(len(df.CentralAir)):
    if df.CentralAir[i] == 'Y':
        df['CentralAir'][i] = 1
    else:
        df['CentralAir'][i] = 0
df.CentralAir = df.CentralAir.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['CentralAir'][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['CentralAir'][i] = 0


In [98]:
missing_values = df.isnull().sum()
cols_missing_values = missing_values[missing_values > 0].sort_values()
cols_missing_values

BsmtFinSF1      1
BsmtUnfSF       1
TotalBsmtSF     1
GarageCars      1
GarageArea      1
BsmtFullBath    2
dtype: int64

In [95]:
df.Exterior1st.fillna('Other', inplace = True)

In [97]:
df.Exterior2nd.fillna('Other', inplace = True)

In [99]:
df.BsmtFinSF1.fillna(0, inplace = True)
df.BsmtUnfSF.fillna(0, inplace = True)
df.TotalBsmtSF.fillna(0, inplace = True)
df.GarageCars.fillna(0, inplace = True)
df.GarageArea.fillna(0, inplace = True)
df.BsmtFullBath.fillna(0, inplace = True)

In [100]:
cols_to_dummies = ['Neighborhood', 'HouseStyle', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'Foundation']

In [101]:
df = pd.get_dummies(df, cols_to_dummies, drop_first = True)

In [None]:
df.columns

In [102]:
df.YearBuilt.apply(pd.to_datetime, 'year')
year = 2010
df['YrsOld'] = (year - df['YearBuilt'])

In [103]:
df['YrsOld']

0       49
1       52
2       13
3       12
4       18
        ..
1454    40
1455    40
1456    50
1457    18
1458    17
Name: YrsOld, Length: 1459, dtype: int64

In [104]:
df.YearRemodAdd.apply(pd.to_datetime, 'year')
df['YrsSinceRemod'] = (year - df['YearRemodAdd'])

In [105]:
df['YrsSinceRemod']

0       49
1       52
2       12
3       12
4       18
        ..
1454    40
1455    40
1456    14
1457    18
1458    16
Name: YrsSinceRemod, Length: 1459, dtype: int64

In [106]:
df.drop(['YearRemodAdd', 'YearBuilt'], axis = 1, inplace = True)

In [107]:
df.shape

(1459, 86)

In [108]:
df.to_csv('cleaned_data_submittal.csv')

# Plotting

In [None]:
cols_to_plot = ['LotFrontage', 'LotArea', 'OverallQual', 'HouseStyle', 'OverallCond', 'YearBuilt', 'YearRemodAdd'
               'MasVnrArea', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'TotRmsAbvGrd',
               'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'MoSold', 'YrSold', 'SalePrice']



In [None]:
sns.distplot(df.YrSold);
plt.title('Year Sold')
plt.show()

In [None]:
#sns.distplot(np.log(df.LotArea))

In [None]:
a = sns.distplot(df.OverallQual)
plt.title("Overall Quality"); 

In [None]:

y = df['SalePrice']
sns.distplot(y)
plt.title('Sale Price')
plt.show()

In [None]:
sns.distplot(df.OverallQual)
plt.title("Overall Quality"); 