In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
# Load the training set from the processed-data folder into `train`.
train_csv_path = '../data/processed/train.csv'
train = pd.read_csv(train_csv_path)

# Few values

- Fence
- Alley
- Heating
- Condition1 and Condition2
- Electrical

In [3]:
# Few Values: remove the columns grouped under the "Few value" heading.
few_value_columns = ['Fence', 'Alley', 'Heating', 'Condition1',
                     'Condition2', 'Electrical']
train = train.drop(columns=few_value_columns)


# Bool Variables

The variables below can each be transformed into a boolean feature. The reason is that the specific categories affect the house price less than whether or not the house has the feature at all.

- Garage
- Bsmt
- Pool
- SecondFloor
- Fireplace
- PavedDrive

In [4]:
# Presence flags: 1 when the house has the feature, 0 otherwise.
train['HasBsmt'] = train['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
train['HasPool'] = train['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
train['Has2ndFlr'] = train['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
# 'Fireplaces' may be parsed by read_csv as integers rather than strings.
# An int never equals the string '0', so comparing only against '0' would flag
# every house as having a fireplace. Treat both 0 and '0' as "no fireplace".
train['HasFirePlace'] = train['Fireplaces'].apply(lambda x: 0 if x in (0, '0') else 1)
train['HasGarage'] = train['GarageCars'].apply(lambda x: 1 if x > 0 else 0)
# NOTE(review): assumes the processed CSV encodes a paved driveway as 'Yes' - confirm.
train['HasPavedDrive'] = train['PavedDrive'].apply(lambda x: 1 if x == 'Yes' else 0)

In [8]:
# SalePrice distribution split by the HasPool flag (violin with an inner box plot).
fig = px.violin(data_frame=train, x='HasPool', y='SalePrice', box=True)
fig.show()

In [9]:
# Bool variables: remove three of the source columns the Has* flags were built from.
bool_source_columns = ['Fireplaces', 'PoolArea', 'PavedDrive']
train = train.drop(columns=bool_source_columns)

# Garage Variables

In [10]:
# Keep only rows with GarageCars > 0; colour encodes the GarageCars value.
with_garage = train.loc[train['GarageCars'] > 0]
fig = px.scatter(data_frame=with_garage, x="GarageArea", y="SalePrice",
                 color='GarageCars')
fig.show()

**Conclusion**

The most important garage-related features are GarageCars and GarageArea, so the rest will be dropped.

In [11]:
# Garage Variables: GarageCars and GarageArea are kept, the other garage columns go.
garage_columns_to_drop = ['GarageType', 'GarageQual', 'GarageYrBlt',
                          'GarageFinish', 'GarageCond']
train = train.drop(columns=garage_columns_to_drop)

# Porch Area

In [13]:
# PorchArea is the row-wise total of the individual porch/deck area columns.
porch_columns = ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
                 '3SsnPorch', 'ScreenPorch']
train['PorchArea'] = train.loc[:, porch_columns].sum(axis=1)

In [17]:
# Rows with a non-zero PorchArea, plotted against SalePrice with an OLS trendline.
with_porch = train.loc[train['PorchArea'] > 0]
fig = px.scatter(data_frame=with_porch, x='PorchArea', y='SalePrice',
                 trendline='ols')
fig.show()

In [18]:
# Porch Area: drop the five columns that were summed into PorchArea.
porch_source_columns = ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
                        '3SsnPorch', 'ScreenPorch']
train = train.drop(columns=porch_source_columns)

# Lot properties

- LotConfig
- LotShape
- LotArea
- LotFrontage
- HouseStyle
- LandSlope
- LandContour

In [37]:
# LotArea vs. SalePrice for lots under 50000, one panel per HouseStyle,
# coloured by MSSubClass.
lot_area_limit = 50000
regular_lots = train.loc[train['LotArea'] < lot_area_limit]
fig = px.scatter(data_frame=regular_lots, x="LotArea", y="SalePrice",
                 color='MSSubClass', facet_col="HouseStyle")
fig.show()

In [40]:
# LotFrontage vs. SalePrice, one panel per HouseStyle, coloured by MSSubClass.
# The frame has to be filtered with a boolean mask: indexing with the raw
# `train['LotArea']` values treats them as column labels and raises a KeyError.
# The LotArea < 50000 filter from the LotArea scatter is reused here.
fig = px.scatter(train[train['LotArea'] < 50000], x='LotFrontage', y="SalePrice",
                 color='MSSubClass', facet_col="HouseStyle")

fig.show()

In [41]:
# Lot properties: drop the lot/land descriptors listed below.
lot_columns_to_drop = ['LotConfig', 'LotShape', 'LandSlope', 'LandContour']
train = train.drop(columns=lot_columns_to_drop)

# Qual

In [20]:
# Quality/condition columns inspected in this section, then their summary stats.
qual_columns = [
    'ExterQual',
    'ExterCond',
    'BsmtQual',
    'KitchenQual',
]
train.loc[:, qual_columns].describe()

Unnamed: 0,ExterQual,ExterCond,BsmtQual,KitchenQual
count,1460,1460,1423,1460
unique,3,4,4,4
top,Fa/TA,TA,TA,TA
freq,920,1282,649,735


In [21]:
# Display the quality/condition columns selected in `qual_columns`.
train.loc[:, qual_columns]

Unnamed: 0,ExterQual,ExterCond,BsmtQual,KitchenQual
0,Gd,TA,Gd,Gd
1,Fa/TA,TA,Gd,TA
2,Gd,TA,Gd,Gd
3,Fa/TA,TA,TA,Gd
4,Gd,TA,Gd,Gd
...,...,...,...,...
1455,Fa/TA,TA,Gd,TA
1456,Fa/TA,TA,Gd,TA
1457,Ex,Gd,TA,Gd
1458,Fa/TA,TA,TA,Gd


In [33]:
# SalePrice distribution for each OverallQual level.
fig = px.violin(data_frame=train, y='SalePrice', x='OverallQual')
fig.show()

**Conclusion**

For the first model we can use just OverallQual.

In [34]:
# Qual Values: drop the per-component quality/condition columns.
quality_columns_to_drop = ['FireplaceQu', 'ExterQual', 'PoolQC', 'HeatingQC',
                           'ExterCond', 'BsmtQual', 'KitchenQual', 'BsmtCond']
train = train.drop(columns=quality_columns_to_drop)

# Bsmt

In [43]:
# Collect every column whose name contains 'Bsmt' and preview the first rows.
col = train.columns
bsmt = [name for name in col if 'Bsmt' in name]
train.loc[:, bsmt].head()

Unnamed: 0,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,HasBsmt
0,No,GLQ,706,Unf,0,150,856,1
1,Gd,ALQ,978,Unf,0,284,1262,1
2,Mn,GLQ,486,Unf,0,434,920,1
3,No,ALQ,216,Unf,0,540,756,1
4,Av,GLQ,655,Unf,0,490,1145,1


In [95]:
# Rows flagged HasBsmt == 1: SalePrice against TotalBsmtSF with an OLS trendline.
has_basement = train['HasBsmt'] == 1
fig = px.scatter(data_frame=train.loc[has_basement], x="SalePrice",
                 y="TotalBsmtSF", trendline='ols')
fig.show()

In [28]:
# Fill missing BsmtExposure with 'No' so those rows get a colour in the plot.
# Assign the result back: calling fillna(inplace=True) on the selected column is
# a chained operation that does not reliably update `train` (pandas Copy-on-Write).
train['BsmtExposure'] = train['BsmtExposure'].fillna('No')

# Same scatter as the previous cell, coloured by BsmtExposure.
fig = px.scatter(train[train['HasBsmt']==1], x="SalePrice", y="TotalBsmtSF", trendline='ols',
                 color='BsmtExposure')

fig.show()

**Conclusion**

We can use just TotalBsmtSF for the first model

In [44]:
# Bsmt: TotalBsmtSF and HasBsmt stay; the detailed basement columns are removed.
bsmt_columns_to_drop = ['BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
                        'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF']
train = train.drop(columns=bsmt_columns_to_drop)

# Zone

In [36]:
# SalePrice distribution with one coloured violin per MSZoning value.
fig = px.violin(data_frame=train, color='MSZoning', y="SalePrice", box=True)
fig.show()

# Roof

In [77]:
# Preview the two roof columns.
roof_columns = ['RoofStyle', 'RoofMatl']
train.loc[:, roof_columns].head()

Unnamed: 0,RoofStyle,RoofMatl
0,Gable,CompShg
1,Gable,CompShg
2,Gable,CompShg
3,Gable,CompShg
4,Gable,CompShg


In [81]:
# SalePrice by RoofMatl, coloured by RoofStyle, with every point drawn.
fig = px.violin(data_frame=train, x='RoofMatl', y="SalePrice",
                color='RoofStyle', points='all', box=True)
fig.show()

In [83]:
# Remove both roof columns from the feature set.
roof_columns_to_drop = ['RoofStyle', 'RoofMatl']
train = train.drop(columns=roof_columns_to_drop)

# Exterior

In [89]:
# SalePrice distribution per Exterior1st category, with every point drawn.
fig = px.violin(data_frame=train, x='Exterior1st', y="SalePrice",
                points='all', box=True)
fig.show()

In [90]:
# SalePrice distribution per Exterior2nd category, with every point drawn.
# This cell was an exact copy of the Exterior1st plot; the feature engineering
# that follows uses both exterior columns, so the second one is plotted here.
fig = px.violin(train, y="SalePrice", x='Exterior2nd', points='all', box=True)
fig.show()

In [95]:
# Score each exterior column: -1 for 'LowQualMat', +1 for 'HighQualMat', else 0.
train['LowQualExterior1st'] = -train['Exterior1st'].eq('LowQualMat').astype('int64')
train['LowQualExterior2nd'] = -train['Exterior2nd'].eq('LowQualMat').astype('int64')
train['HighQualExterior1st'] = train['Exterior1st'].eq('HighQualMat').astype('int64')
train['HighQualExterior2nd'] = train['Exterior2nd'].eq('HighQualMat').astype('int64')

# QualMatExterior is the row-wise sum of the four scores above.
exterior_score_columns = ['LowQualExterior1st', 'LowQualExterior2nd',
                          'HighQualExterior1st', 'HighQualExterior2nd']
train['QualMatExterior'] = train.loc[:, exterior_score_columns].sum(axis=1)

In [97]:
# SalePrice distribution for each QualMatExterior score.
fig = px.violin(data_frame=train, x='QualMatExterior', y="SalePrice",
                points='all', box=True)
fig.show()

In [99]:
# Drop the four intermediate score columns and the two raw exterior columns;
# only QualMatExterior is kept.
# 'LowQualExterior' was removed from this list: no cell creates that column, so
# asking drop() to remove it raises a KeyError on a fresh kernel.
exterior_columns_to_drop = ['LowQualExterior1st', 'LowQualExterior2nd',
                            'HighQualExterior1st', 'HighQualExterior2nd',
                            'Exterior1st', 'Exterior2nd']
train = train.drop(columns=exterior_columns_to_drop)

# Some other features

**MasVnrType**

In [61]:
# SalePrice distribution per MasVnrType category.
fig = px.violin(data_frame=train, x='MasVnrType', y="SalePrice", box=True)
fig.show()

**BldgType**

In [70]:
# SalePrice distribution per BldgType category.
fig = px.violin(data_frame=train, x='BldgType', y="SalePrice", box=True)
fig.show()

In [71]:
# 1 when BldgType is '1Fam', 0 for every other value.
is_one_family = train['BldgType'].eq('1Fam')
train['1FamBldg'] = is_one_family.astype('int64')

In [72]:
# SalePrice distribution split by the 1FamBldg flag.
fig = px.violin(data_frame=train, x='1FamBldg', y="SalePrice", box=True)
fig.show()

In [74]:
# Remove the raw BldgType column.
train = train.drop(columns=['BldgType'])

**Functional**

In [110]:
# Frequency of each Functional category.
functional_counts = train['Functional'].value_counts()
functional_counts

Typ     1360
Min2      34
Min1      31
Mod       15
Maj1      14
Maj2       5
Sev        1
Name: Functional, dtype: int64

In [113]:
# SalePrice distribution per Functional category.
fig = px.violin(data_frame=train, x='Functional', y="SalePrice", box=True)
fig.show()

In [114]:
# 0 when Functional is 'Typ', 1 for every other value.
is_not_typical = train['Functional'].ne('Typ')
train['FunctionalDed'] = is_not_typical.astype('int64')

In [115]:
# SalePrice distribution split by the FunctionalDed flag.
fig = px.violin(data_frame=train, x='FunctionalDed', y="SalePrice", box=True)
fig.show()

In [117]:
# Remove the raw Functional column.
train = train.drop(columns=['Functional'])

In [119]:
# List the columns currently present in `train`.
train.columns

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Neighborhood',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'MasVnrType', 'MasVnrArea', 'Foundation', 'TotalBsmtSF', 'CentralAir',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BedroomAbvGr',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea', 'MiscVal',
       'MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'SalePrice',
       'TotalBath', 'HasBsmt', 'HasPool', 'Has2ndFlr', 'HasFirePlace',
       'HasGarage', 'HasPavedDrive', 'PorchArea', '1FamBldg',
       'QualMatExterior', 'FunctionalDed'],
      dtype='object')

# Conclusions

In this notebook I dropped and replaced many features, trying to make the model simpler without losing information. Here are the results: