In [2]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')

In [3]:
colors = ["#FF0B04", "#F1BE48",
           "#B9975B", "#8B5B29",
           "#524727",
         ]
sns.set_palette(sns.color_palette(colors))

In [4]:
train = pd.read_csv('train_clean.csv', index_col=0)

In [6]:
cols_na = train.loc[:,train.isna().any(axis=0)].columns.to_list()
cols_na

[]

In [8]:
cat_feats = df.select_dtypes(['object','bool']).columns.to_list()


<bound method DataFrame.select_dtypes of             PID  GrLivArea  SalePrice  \
0     905101330       1296      90000   
1     909451100       1229     137000   
2     527451450        948      89000   
3     903232190       1040     123900   
4     914452120        912     156000   
...         ...        ...        ...   
1866  533212010       1200     151000   
1867  528168030       2400     290000   
1868  908275090       1346     112000   
1869  907251090       2031     237000   
1870  907230240       1320     137000   

                                           MSSubClass MSZoning  LotFrontage  \
0      MSSubClass_MSSubClass_MSSubClass_MSSubClass_90       RL    72.000000   
1     MSSubClass_MSSubClass_MSSubClass_MSSubClass_160       RM    24.000000   
2     MSSubClass_MSSubClass_MSSubClass_MSSubClass_160       RM    21.000000   
3      MSSubClass_MSSubClass_MSSubClass_MSSubClass_50       RM    52.000000   
4      MSSubClass_MSSubClass_MSSubClass_MSSubClass_85       RL    61.99

In [12]:
X = train.drop('SalePrice',axis=1)
transformer = ColumnTransformer([("Cat", OneHotEncoder(drop='first'), categorical)], remainder='passthrough')
X = transformer.fit_transform(X)

In [13]:
y = np.log(train['SalePrice'])

In [14]:
ols = linear_model.LinearRegression()

In [15]:
ols.fit(X, y)
ols.score(X, y)

0.9531732835572826

## test clean

In [16]:
test = pd.read_csv('test.csv', index_col=0)

In [17]:
# converting all similar mappings together
# most popular mapping
cat_ordinal_features = [
    'GarageQual','GarageCond',
    'FireplaceQu',
    'KitchenQual',
    'ExterQual','ExterCond',
    'BsmtQual','BsmtCond',
    'HeatingQC'
    ]
cat_ordinal_dict = {'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5}
test = helper.convert_cat_ordinal_vars_to_num(test,
                                               cat_ordinal_features,
                                               cat_ordinal_dict)
# now just unique mappings
# BsmtExposure
cat_ordinal_features = [
    'BsmtExposure'
]
cat_ordinal_dict = {'No':1,'Mn':2,'Av':3,'Gd':4}
test = helper.convert_cat_ordinal_vars_to_num(test,
                                               cat_ordinal_features,
                                               cat_ordinal_dict)
# Functional
cat_ordinal_features = [
    'Functional'
]
cat_ordinal_dict = {'Sal':1,'Sev':2,'Maj2':3,'Maj1':4,
                    'Mod':5,'Min2':6,'Min1':7,'Typ':8}
test = helper.convert_cat_ordinal_vars_to_num(test,
                                               cat_ordinal_features,
                                               cat_ordinal_dict)
# PoolQC
cat_ordinal_features = [
    'PoolQC'
]
cat_ordinal_dict = {'Fa':1,'TA':2,'Gd':3,'Ex':4}
test = helper.convert_cat_ordinal_vars_to_num(test,
                                               cat_ordinal_features,
                                               cat_ordinal_dict)
# Fence
cat_ordinal_features = [
    'Fence'
]
cat_ordinal_dict = {'MnWw':1,'GdWo':2,'MnPrv':3,'GdPrv':4}
test = helper.convert_cat_ordinal_vars_to_num(test,
                                               cat_ordinal_features,
                                               cat_ordinal_dict)

In [18]:
test.loc[test['LotFrontage'].isna(), 'LotFrontage'] = test.loc[test['LotFrontage'].isna(), :].apply(
    lambda x: LotFrontage_dict[x['BldgType']]*np.sqrt(x['LotArea']), axis=1
)

In [19]:
## LotFrontage now dealt with above. Had to add 'Electrical', 'RoofMatl' etc.

# weirdest nas. lot frontage. probably worth removing
# not dealing with them out of expediance. 
drop_now_but_look_at_later = ['MasVnrArea','GarageYrBlt','MasVnrType', 'Electrical', 'RoofMatl', 
                              'Exterior1st', 'Exterior2nd', 'Utilities']
test.drop(drop_now_but_look_at_later, axis=1,inplace = True)

In [20]:
na_none_features = ['MiscFeature','Alley','BsmtFinType1','BsmtFinType2',
                   'GarageFinish','GarageType']
for na_none_feature in na_none_features:
    test[na_none_feature] = test[na_none_feature].fillna(value = 'None')

In [21]:
## Typo similar to train?
na_zero_features = ['BsmtFullBath','BsmtHalfBath','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF']
for na_zero_feature in na_zero_features:
    test[na_zero_feature] = test[na_zero_feature].fillna(value = 0)

In [22]:
cols_na = test.loc[:,test.isna().any(axis=0)].columns.to_list()
cols_na

[]

In [23]:
X_tst = test.drop('SalePrice',axis=1)
X_tst = transformer.transform(X_tst)

In [24]:
y_tst = np.log(test['SalePrice'])

In [25]:
ols.fit(X_tst, y_tst)
ols.score(X_tst, y_tst)

0.9533403078902463