In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets, model_selection
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns

In [None]:
# Load the training set into `data`; the path is relative to the notebook's working directory.
data = pd.read_csv("data/train.csv")

In [None]:
'''
Drop unused features.
'''

# Columns excluded from the analysis, collected in one list so the selection is easy to audit.
unused_columns = [
    "Condition1", "Condition2", "MiscVal", "MiscFeature", "GarageYrBlt",
    "GarageCars", "GarageFinish", "GarageType", "Fireplaces", "FireplaceQu",
    "PavedDrive", "MasVnrArea", "MasVnrType", "1stFlrSF", "2ndFlrSF",
    "LowQualFinSF", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch",
    "ScreenPorch", "BsmtFinType1", "BsmtFinType2", "BsmtFinSF1", "BsmtFinSF2",
    "Id", "PoolQC", "Fence", "Alley", "PoolArea",
    "Exterior1st", "Exterior2nd", "Street", "BsmtExposure", "MoSold",
    "YrSold", "YearBuilt", "Heating", "SaleType", "SaleCondition",
    "GarageQual", "CentralAir", "Utilities", "BsmtQual", "Electrical",
    "BsmtCond", "GarageCond", "OverallCond", "KitchenAbvGr", "BedroomAbvGr",
    "BsmtUnfSF",
]
# Rebind `data` to a frame without those columns.
data = data.drop(columns=unused_columns)


In [None]:
'''
summarize all bathrooms data into one.
'''
# Each of the four counters contributes its raw value to the total
# (a half bath adds 1, exactly like a full bath).
bathroom_columns = ["BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath"]
bathroom_total = data[bathroom_columns].sum(axis=1)
# Swap the four source columns for the combined count and make it the first column.
data = data.drop(columns=bathroom_columns)
data.insert(0, 'bathrooms', bathroom_total)

In [None]:
'''
Distribution of price
'''
# `sns.distplot` is deprecated and scheduled for removal from seaborn.
# `histplot` on a density scale with a KDE overlay (cut=3, the old default)
# draws the equivalent histogram-plus-curve figure.
fig, ax = plt.subplots(figsize=(9, 8))
sns.histplot(data['SalePrice'], color='g', bins=100, alpha=0.4,
             kde=True, stat="density", kde_kws={"cut": 3}, ax=ax)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(title="Distribution of SalePrice", xlabel="SalePrice", ylabel="Density");

In [5]:
# Print the frequency table of every column, one column after another.
for col, column_values in data.items():
    print(column_values.value_counts())

2    572
3    477
1    228
4    172
5      8
6      3
Name: bathrooms, dtype: int64
20     536
60     299
50     144
120     87
30      69
160     63
70      60
80      58
90      52
190     30
85      20
75      16
45      12
180     10
40       4
Name: MSSubClass, dtype: int64
RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: MSZoning, dtype: int64
60.0     143
70.0      70
80.0      69
50.0      57
75.0      53
        ... 
137.0      1
141.0      1
38.0       1
140.0      1
46.0       1
Name: LotFrontage, Length: 110, dtype: int64
7200     25
9600     24
6000     17
9000     14
8400     14
         ..
14601     1
13682     1
4058      1
17104     1
9717      1
Name: LotArea, Length: 1073, dtype: int64
Reg    925
IR1    484
IR2     41
IR3     10
Name: LotShape, dtype: int64
Lvl    1311
Bnk      63
HLS      50
Low      36
Name: LandContour, dtype: int64
Inside     1052
Corner      263
CulDSac      94
FR2          47
FR3           4
Name: LotConfig,

In [19]:
"""
Get all numerical features
"""
# Numeric view: float64 / int64 columns only, with MSSubClass left out.
numeric_view = data.select_dtypes(include=['float64', 'int64'])
data_num = numeric_view.drop(columns=["MSSubClass"])
# Missing values in those columns are replaced by 0 in `data`.
# `data_num` was extracted above and is not touched by this fill.
data = data.fillna(value=dict.fromkeys(data_num.columns, 0))


In [None]:
# Scatter SalePrice against the numeric columns, five columns per pairplot figure.
features_per_plot = 5
numeric_names = data_num.columns
for start in range(0, len(numeric_names), features_per_plot):
    sns.pairplot(
        data=data_num,
        x_vars=numeric_names[start:start + features_per_plot],
        y_vars=['SalePrice'],
    )

In [None]:
'''
Remove outliers
'''
# Strict upper bound per column; a row survives only if it is below every bound.
outlier_upper_bounds = {
    "LotFrontage": 200,
    "LotArea": 100000,
    "TotalBsmtSF": 4000,
    "GrLivArea": 4000,
}
for column_name, upper_bound in outlier_upper_bounds.items():
    data = data[data[column_name] < upper_bound]
# Summary statistics of the filtered frame are the cell's displayed result.
data.describe()

In [20]:
# Correlation of every numeric feature with the target. SalePrice's own entry is
# removed by label, so the result does not depend on where that column sits.
data_num_corr = data_num.corr()['SalePrice'].drop(labels='SalePrice')
# No magnitude threshold is applied: every feature with a defined (non-NaN)
# correlation is listed, strongest positive correlation first.
golden_features_list = data_num_corr.dropna().sort_values(ascending=False)
print("Correlation:\n{}".format(golden_features_list))

Correlation:
OverallQual     0.790982
GrLivArea       0.708624
GarageArea      0.623431
TotalBsmtSF     0.613581
bathrooms       0.613005
TotRmsAbvGrd    0.533723
YearRemodAdd    0.507101
LotFrontage     0.351799
LotArea         0.263843
Name: SalePrice, dtype: float64


In [6]:
"""
Deal with categorical data
"""
# Object-dtype columns are taken as the categorical features.
df_cat = data.select_dtypes(include='object')

# Disabled experiment, kept for reference: treat MSSubClass as categorical, fill
# missing labels with "NA" and convert every categorical column to `category` dtype.
# df_cat['MSSubClass'] = data['MSSubClass']
#
# categorical_col = df_cat.columns
# for col in categorical_col:
#     data[col] = data[col].fillna("NA")
#     data[col] = data[col].astype("category")


Unnamed: 0,MSZoning,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,BldgType,HouseStyle,RoofStyle,RoofMatl,ExterQual,ExterCond,Foundation,HeatingQC,KitchenQual,Functional
0,RL,Reg,Lvl,Inside,Gtl,CollgCr,1Fam,2Story,Gable,CompShg,Gd,TA,PConc,Ex,Gd,Typ
1,RL,Reg,Lvl,FR2,Gtl,Veenker,1Fam,1Story,Gable,CompShg,TA,TA,CBlock,Ex,TA,Typ
2,RL,IR1,Lvl,Inside,Gtl,CollgCr,1Fam,2Story,Gable,CompShg,Gd,TA,PConc,Ex,Gd,Typ
3,RL,IR1,Lvl,Corner,Gtl,Crawfor,1Fam,2Story,Gable,CompShg,TA,TA,BrkTil,Gd,Gd,Typ
4,RL,IR1,Lvl,FR2,Gtl,NoRidge,1Fam,2Story,Gable,CompShg,Gd,TA,PConc,Ex,Gd,Typ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,Reg,Lvl,Inside,Gtl,Gilbert,1Fam,2Story,Gable,CompShg,TA,TA,PConc,Ex,TA,Typ
1456,RL,Reg,Lvl,Inside,Gtl,NWAmes,1Fam,1Story,Gable,CompShg,TA,TA,CBlock,TA,TA,Min1
1457,RL,Reg,Lvl,Inside,Gtl,Crawfor,1Fam,2Story,Gable,CompShg,Ex,Gd,Stone,Ex,Gd,Typ
1458,RL,Reg,Lvl,Inside,Gtl,NAmes,1Fam,1Story,Hip,CompShg,TA,TA,CBlock,Gd,Gd,Typ


In [8]:
# Display the frame of categorical (object-dtype) columns.
df_cat


Unnamed: 0,MSZoning,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,BldgType,HouseStyle,RoofStyle,RoofMatl,ExterQual,ExterCond,Foundation,HeatingQC,KitchenQual,Functional
0,RL,Reg,Lvl,Inside,Gtl,CollgCr,1Fam,2Story,Gable,CompShg,Gd,TA,PConc,Ex,Gd,Typ
1,RL,Reg,Lvl,FR2,Gtl,Veenker,1Fam,1Story,Gable,CompShg,TA,TA,CBlock,Ex,TA,Typ
2,RL,IR1,Lvl,Inside,Gtl,CollgCr,1Fam,2Story,Gable,CompShg,Gd,TA,PConc,Ex,Gd,Typ
3,RL,IR1,Lvl,Corner,Gtl,Crawfor,1Fam,2Story,Gable,CompShg,TA,TA,BrkTil,Gd,Gd,Typ
4,RL,IR1,Lvl,FR2,Gtl,NoRidge,1Fam,2Story,Gable,CompShg,Gd,TA,PConc,Ex,Gd,Typ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,Reg,Lvl,Inside,Gtl,Gilbert,1Fam,2Story,Gable,CompShg,TA,TA,PConc,Ex,TA,Typ
1456,RL,Reg,Lvl,Inside,Gtl,NWAmes,1Fam,1Story,Gable,CompShg,TA,TA,CBlock,TA,TA,Min1
1457,RL,Reg,Lvl,Inside,Gtl,Crawfor,1Fam,2Story,Gable,CompShg,Ex,Gd,Stone,Ex,Gd,Typ
1458,RL,Reg,Lvl,Inside,Gtl,NAmes,1Fam,1Story,Hip,CompShg,TA,TA,CBlock,Gd,Gd,Typ


In [21]:
"""
Ordinal: HeatingQC, LotShape, LandContour, LandSlope, ExterQual, ExterCond, HeatingQC, KitchenQual, Functional
Nominal: LotConfig, Neighborhood, HouseStyle, RoofStyle, RoofMatl, Foundation, MSZoning, BldgType
"""
from collections import Counter

# Map each categorical column to its distinct values, in order of first appearance
# (Counter keeps keys in insertion order).
data_cat_col_dict = {
    column_name: list(Counter(data[column_name]))
    for column_name in df_cat.columns
}

# Show the levels found for one of the quality columns.
data_cat_col_dict["HeatingQC"]


['Ex', 'Gd', 'TA', 'Fa', 'Po']

In [7]:
# Count the missing values in each column of `data`.
data.isna().sum()

bathrooms         0
MSSubClass        0
MSZoning          0
LotFrontage     259
LotArea           0
LotShape          0
LandContour       0
LotConfig         0
LandSlope         0
Neighborhood      0
BldgType          0
HouseStyle        0
OverallQual       0
YearRemodAdd      0
RoofStyle         0
RoofMatl          0
ExterQual         0
ExterCond         0
Foundation        0
TotalBsmtSF       0
HeatingQC         0
GrLivArea         0
KitchenQual       0
TotRmsAbvGrd      0
Functional        0
GarageArea        0
SalePrice         0
dtype: int64

In [None]:
# from collections import Counter
#
# Counter(data["GarageCond"])

In [None]:
# Correlations between the features themselves; SalePrice correlations were already examined.
# Non-numeric columns are removed explicitly because `DataFrame.corr()` raises on
# string columns in pandas >= 2.0 instead of silently skipping them.
feature_frame = data.drop(columns='SalePrice').select_dtypes(include='number')
corr = feature_frame.corr()

# Annotate only strong relationships, in either direction. A correlation coefficient
# lies in [-1, 1], so the mask is expressed on the absolute value; weaker cells are
# masked to NaN and left blank by the heatmap.
strong_threshold = 0.5
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr[corr.abs() >= strong_threshold],
            cmap='viridis', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True, ax=ax);

In [None]:
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile, chi2, f_regression


def _one_hot_selector_pipeline(percentile=50):
    """Build a one-hot encoder followed by univariate feature selection.

    Categories not seen during fit are encoded as all zeros
    (``handle_unknown="ignore"``). Selection scores columns with
    ``f_regression`` because the target is a continuous price; ``chi2`` is a
    classification score and is not appropriate for a regression target.

    Args:
        percentile: Percentage of one-hot columns the selector keeps.

    Returns:
        An unfitted ``Pipeline`` with the steps ``encoder`` and ``selector``.
    """
    return Pipeline(
        steps=[
            # ("imputer", SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
            ("encoder", OneHotEncoder(handle_unknown="ignore")),
            ("selector", SelectPercentile(f_regression, percentile=percentile)),
        ]
    )


# Numeric predictors: the numeric view without the target column.
data_num = data_num.drop(['SalePrice'], axis=1, errors='ignore')
# data = data.drop("MSSubClass", axis=1)

# No manual min-max rescaling of `data` here: computing min/max on the full frame
# before the train/test split leaks test-set statistics, and the numeric pipeline
# below already scales inside the fitted pipeline.

# Categorical predictors. The string columns are object dtype unless they were
# explicitly converted, so both object and category columns are selected.
data_cat = data.select_dtypes(include=['object', 'category'])

numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(missing_values=np.nan, strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Nominal and ordinal columns currently share the same one-hot treatment.
categorical_nominal_transformer = _one_hot_selector_pipeline()
categorical_ordinal_transformer = _one_hot_selector_pipeline()
# The transformer applied to every categorical column by the preprocessor below.
categorical_transformer = _one_hot_selector_pipeline()

# NOTE(review): MSSubClass is in neither column list, so with ColumnTransformer's
# default remainder="drop" it is not passed to the model - confirm this is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, list(data_num.columns)),
        ("cat", categorical_transformer, list(data_cat.columns)),
    ]
)

# Full model: preprocessing followed by a decision-tree regressor. The fixed
# random_state makes tie-breaking between equally good splits reproducible.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regression", DecisionTreeRegressor(random_state=42)),
    ]
)




In [None]:
# Separate the target from the predictors and hold out 30% of the rows for evaluation.
# A fixed random_state keeps the split, and therefore the reported score, reproducible.
y = data['SalePrice']
X = data.drop(columns=['SalePrice'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# Fit the whole pipeline (preprocessing + regressor) on the training split only.
clf.fit(X_train, y_train)

In [None]:
# Evaluate on the held-out split; for a pipeline ending in a scikit-learn regressor,
# `score` delegates to the regressor's score, which is the R^2 of its predictions.
clf.score(X_test, y_test)

In [None]:
# `plot_tree` needs the fitted tree itself, not the surrounding Pipeline. The tree
# was trained on the preprocessor's output columns, so labels come from the
# preprocessor rather than from `data.columns` (which also contains the target).
fitted_tree = clf.named_steps["regression"]
try:
    tree_feature_names = list(clf.named_steps["preprocessor"].get_feature_names_out())
except AttributeError:
    # scikit-learn releases without get_feature_names_out: fall back to generic labels.
    tree_feature_names = None

fig = plt.figure(figsize=(16, 8))
vis = plot_tree(fitted_tree, feature_names=tree_feature_names, fontsize=9,
                proportion=True, filled=True, rounded=True)