In [1]:
%reset -f
%matplotlib inline

In [2]:
import calendar
import numpy as np
import pandas as pd
import sebaba.ml as sbbml
import sebaba.utils as utils
import matplotlib.pyplot as plt
import matplotlib.ticker as tkr
from missingpy import KNNImputer

In [3]:
pd.options.display.float_format = "{:.4f}".format

In [4]:
df = pd.read_csv("data/house-prices/train.csv", sep = ",")
df = df.drop("Id", axis = 1)

In [5]:
utils.missing_var_pct(df)

PoolQC         99.5200
MiscFeature    96.3000
Alley          93.7700
Fence          80.7500
FireplaceQu    47.2600
LotFrontage    17.7400
GarageType      5.5500
GarageCond      5.5500
GarageFinish    5.5500
GarageQual      5.5500
GarageYrBlt     5.5500
BsmtFinType2    2.6000
BsmtExposure    2.6000
BsmtQual        2.5300
BsmtCond        2.5300
BsmtFinType1    2.5300
MasVnrArea      0.5500
MasVnrType      0.5500
Electrical      0.0700
dtype: float64


In [6]:
df = utils.drop_missing_var(df, threshold = 0.8)

In [7]:
df.FireplaceQu = df.FireplaceQu.fillna("NoFirePlace")
basement       = ["BsmtFinType2", "BsmtExposure", "BsmtFinType1", "BsmtCond", "BsmtQual"]
df[basement]   = df[basement].fillna("NoBasement")
garage         = ["GarageType", "GarageFinish", "GarageQual", "GarageCond"]
df[garage]     = df[garage].fillna("NoGarage")

In [8]:
df = df.dropna(how = "any", subset = ["MasVnrType", "MasVnrArea", "Electrical"])

In [9]:
imputer        = KNNImputer(n_neighbors = 5, weights = "distance", metric = "masked_euclidean")
df.LotFrontage = imputer.fit_transform(np.array(df.LotFrontage).reshape(-1, 1))
df.GarageYrBlt = imputer.fit_transform(np.array(df.GarageYrBlt).reshape(-1, 1))

In [10]:
utils.missing_var_pct(df)

The dataframe has no missing values in any column.


In [11]:
#changing numeric variables to categorical
df.MSSubClass = ["SC" + str(i) for i in df.MSSubClass]
df.MoSold     = [calendar.month_abbr[i] for i in df.MoSold]

In [12]:
#converting cat variables to an interval scale as they are ordinal in nature
df = df.replace({
    "ExterQual"   : {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
    "ExterCond"   : {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
    "BsmtQual"    : {"NoBasement": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
    "BsmtCond"    : {"NoBasement": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
    "BsmtExposure": {"NoBasement": 0, "No": 1, "Mn": 2, "Av": 3, "Gd": 4},
    "BsmtFinType1": {"NoBasement": 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6},
    "BsmtFinType2": {"NoBasement": 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4, "ALQ": 5, "GLQ": 6},
    "HeatingQC"   : {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
    "KitchenQual" : {"Po" : 1, "Fa" : 2, "TA" : 3, "Gd" : 4, "Ex" : 5},
    "FireplaceQu" : {"NoFirePlace": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex" : 5},
    "GarageFinish": {"NoGarage": 0, "Unf": 1, "RFn": 2, "Fin": 3},
    "GarageQual"  : {"NoGarage": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
    "GarageCond"  : {"NoGarage": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5},
})

#creating a list of our ordinal variables
ordinal_vars = [
    "ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "BsmtExposure",
    "BsmtFinType1", "BsmtFinType2", "HeatingQC","KitchenQual", 
    "FireplaceQu", "GarageFinish", "GarageQual", "GarageCond"
]

In [13]:
#changing features to their correct data types
df.BsmtCond     = df.BsmtCond.astype("int64")
df.BsmtFinType2 = df.BsmtFinType2.astype("int64")
df.FireplaceQu  = df.FireplaceQu.astype("int64")

In [14]:
df = utils.change_vars_to_categorical(df)

In [15]:
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,SC60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,Feb,2008,WD,Normal,208500
1,SC20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,May,2007,WD,Normal,181500
2,SC60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,Sep,2008,WD,Normal,223500
3,SC70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,Feb,2006,WD,Abnorml,140000
4,SC60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,Dec,2008,WD,Normal,250000


In [30]:
np.arange(start = -1.0, stop = 1.2, step = 0.20)

array([-1.00000000e+00, -8.00000000e-01, -6.00000000e-01, -4.00000000e-01,
       -2.00000000e-01, -2.22044605e-16,  2.00000000e-01,  4.00000000e-01,
        6.00000000e-01,  8.00000000e-01,  1.00000000e+00])