# Regression problem where we are predicting house sale prices

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Sklearn 
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

###### feature engineering packages
from feature_engine import missing_data_imputers as mdi
from feature_engine import discretisers as dsc
from feature_engine import categorical_encoders as ce
from feature_engine.categorical_encoders import WoERatioCategoricalEncoder
from feature_engine.discretisers import DecisionTreeDiscretiser
from feature_engine.outlier_removers import Winsorizer
from feature_engine.categorical_encoders import MeanCategoricalEncoder

######## Feature selection packages 
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [2]:
# Load the house-price table (reading at most 50,000 rows) and report its
# (rows, columns) shape.
source_csv = 'houseprice.csv'
data = pd.read_csv(source_csv, nrows=50000)
print(data.shape)

(1460, 81)


In [9]:
 # Share of missing values in every column, largest share first.
 data.isna().mean().sort_values(ascending=False)

PoolQC           0.995205
MiscFeature      0.963014
Alley            0.937671
Fence            0.807534
FireplaceQu      0.472603
                   ...   
CentralAir       0.000000
SaleCondition    0.000000
Heating          0.000000
TotalBsmtSF      0.000000
Id               0.000000
Length: 81, dtype: float64

In [7]:
# Preview the first five rows (head() defaults to n=5).
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [12]:
# Drop the 'Id' column before the features are partitioned.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after 'Id' is already gone no longer raises
# KeyError.
data = data.drop(columns='Id', errors='ignore')

In [15]:
# Partition the columns into three feature groups:
#   discrete    - non-object columns with fewer than 5 distinct values
#   contin      - every other non-object column
#   categorical - object-dtype columns
# The target 'SalePrice' is excluded from the two numeric groups.
# NOTE(review): the recorded counts (6 + 31 + 43 = 80) equal the column count of
# the frame after dropping 'Id' *including* the target, which suggests 'Id' was
# still present when this cell last ran -- confirm the drop cell runs first.
numeric_features = [
    col for col in data.columns
    if data[col].dtype != 'O' and col != 'SalePrice'
]
discrete = [col for col in numeric_features if data[col].nunique() < 5]
contin = [col for col in numeric_features if col not in discrete]
categorical = [col for col in data.columns if data[col].dtype == 'O']

print("there are {} discrete features".format(len(discrete)))
print("there are {} continous or numeric features".format(len(contin)))
print("there are {} categorical features".format(len(categorical)))

there are 6 discrete features
there are 31 continous or numeric features
there are 43 categorical features


In [9]:
# Count of distinct labels per categorical column (missing values not counted).
data.loc[:, categorical].nunique()

MSZoning          5
Street            2
Alley             2
LotShape          4
LandContour       4
Utilities         2
LotConfig         5
LandSlope         3
Neighborhood     25
Condition1        9
Condition2        8
BldgType          5
HouseStyle        8
RoofStyle         6
RoofMatl          8
Exterior1st      15
Exterior2nd      16
MasVnrType        4
ExterQual         4
ExterCond         5
Foundation        6
BsmtQual          4
BsmtCond          4
BsmtExposure      4
BsmtFinType1      6
BsmtFinType2      6
Heating           6
HeatingQC         5
CentralAir        2
Electrical        5
KitchenQual       4
Functional        7
FireplaceQu       5
GarageType        6
GarageFinish      3
GarageQual        5
GarageCond        5
PavedDrive        3
PoolQC            3
Fence             4
MiscFeature       4
SaleType          9
SaleCondition     6
dtype: int64

In [10]:
# Show the distinct values (NaN included) found in each categorical column.
for name, column in data[categorical].items():
    print(name, "   ", column.unique())

MSZoning     ['RL' 'RM' 'C (all)' 'FV' 'RH']
Street     ['Pave' 'Grvl']
Alley     [nan 'Grvl' 'Pave']
LotShape     ['Reg' 'IR1' 'IR2' 'IR3']
LandContour     ['Lvl' 'Bnk' 'Low' 'HLS']
Utilities     ['AllPub' 'NoSeWa']
LotConfig     ['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']
LandSlope     ['Gtl' 'Mod' 'Sev']
Neighborhood     ['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes'
 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR'
 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill'
 'Blmngtn' 'BrDale' 'SWISU' 'Blueste']
Condition1     ['Norm' 'Feedr' 'PosN' 'Artery' 'RRAe' 'RRNn' 'RRAn' 'PosA' 'RRNe']
Condition2     ['Norm' 'Artery' 'RRNn' 'Feedr' 'PosN' 'PosA' 'RRAn' 'RRAe']
BldgType     ['1Fam' '2fmCon' 'Duplex' 'TwnhsE' 'Twnhs']
HouseStyle     ['2Story' '1Story' '1.5Fin' '1.5Unf' 'SFoyer' 'SLvl' '2.5Unf' '2.5Fin']
RoofStyle     ['Gable' 'Hip' 'Gambrel' 'Mansard' 'Flat' 'Shed']
RoofMatl     ['CompShg' 'WdShngl' 'Metal' 'WdShake' 'Membran' '

# FEATURE ENGINEERING

In [16]:
# Hold out 20% of the rows as a test set; random_state=0 pins the shuffle so
# the split is reproducible.
predictors = data.drop(columns='SalePrice')
target = data['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.2, random_state=0
)

In [17]:
# Store the discrete (low-cardinality numeric) columns as object dtype in both
# splits so they are handled alongside the categorical variables downstream.
for split in (X_train, X_test):
    split[discrete] = split[discrete].astype('O')

In [20]:
# Non-null counts and dtypes of the categorical columns in the training split.
X_train.loc[:, categorical].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1168 entries, 618 to 684
Data columns (total 43 columns):
MSZoning         1168 non-null object
Street           1168 non-null object
Alley            71 non-null object
LotShape         1168 non-null object
LandContour      1168 non-null object
Utilities        1168 non-null object
LotConfig        1168 non-null object
LandSlope        1168 non-null object
Neighborhood     1168 non-null object
Condition1       1168 non-null object
Condition2       1168 non-null object
BldgType         1168 non-null object
HouseStyle       1168 non-null object
RoofStyle        1168 non-null object
RoofMatl         1168 non-null object
Exterior1st      1168 non-null object
Exterior2nd      1168 non-null object
MasVnrType       1162 non-null object
ExterQual        1168 non-null object
ExterCond        1168 non-null object
Foundation       1168 non-null object
BsmtQual         1140 non-null object
BsmtCond         1140 non-null object
BsmtExposure     114

In [17]:
# Feature-engineering pipeline. Each transformer is built and named first, then
# chained in the order the steps must run.

# Impute missing values in the continuous variables with the constant -100.
numeric_imputer = mdi.ArbitraryNumberImputer(variables=contin,
                                             arbitrary_number=-100)

# Impute missing values in the categorical and discrete variables.
categorical_imputer = mdi.CategoricalVariableImputer(
    variables=categorical + discrete)

# Outlier handling on both tails of the continuous variables.
outlier_capper = Winsorizer(variables=contin,
                            distribution='skewed',
                            tail='both',
                            fold=2.0)

# Rare-label handling for the categorical and discrete variables.
rare_label_encoder = ce.RareLabelCategoricalEncoder(
    variables=categorical + discrete,
    tol=0.03,
    n_categories=5)

# Encode the categorical and discrete variables with MeanCategoricalEncoder.
# Alternative the author left disabled for this step:
#   ce.OrdinalCategoricalEncoder(encoding_method='ordered', variables=categorical)
mean_encoder = MeanCategoricalEncoder(variables=categorical + discrete)

# A final binning step was also left disabled by the author:
#   ('BinDTE', DecisionTreeDiscretiser(variables=contin, regression=False))
fe_seq = Pipeline(steps=[
    ('imputer_num', numeric_imputer),
    ('imputer_cat', categorical_imputer),
    ('outlier_rem', outlier_capper),
    ('encoder_rare_label', rare_label_encoder),
    ('categorical_encoder', mean_encoder),
])

In [18]:
# The categorical transformers in the pipeline require object-dtype columns
# (their TypeError asks to "check that all indicated variables are of type
# object before calling the transformer"). Enforce that precondition here so the
# fit does not depend on an earlier cast cell having been executed in the
# current kernel session. Casting columns that are already object is a no-op.
object_vars = categorical + discrete
X_train[object_vars] = X_train[object_vars].astype('O')
X_test[object_vars] = X_test[object_vars].astype('O')

# NOTE(review): the recorded traceback shows the dtype check in the installed
# feature_engine/base_transformers.py commented out, which would leave the
# `raise TypeError` unconditional whenever `variables` is passed -- confirm, and
# reinstall the package if so, otherwise this cell will keep failing.
fe_seq.fit(X_train, y_train)

TypeError: variable {} is not of type object, check that all indicated variables are of type object before calling the transformer

In [None]:
# Post-mortem debugger for the most recent exception. It is interactive only;
# remove this cell once the failing fit above is resolved.
%debug

> [1;32mc:\users\obaid\anaconda3\lib\site-packages\feature_engine\base_transformers.py[0m(173)[0;36mfit[1;34m()[0m
[1;32m    171 [1;33m[1;31m#            for var in self.variables:[0m[1;33m[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m    172 [1;33m[1;31m#                if X[var].dtypes != 'O':[0m[1;33m[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m--> 173 [1;33m                [1;32mraise[0m [0mTypeError[0m[1;33m([0m[1;34m"variable {} is not of type object, check that all indicated variables are of type object before calling the transformer"[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m    174 [1;33m            [0mself[0m[1;33m.[0m[0mvariables[0m [1;33m=[0m [0mself[0m[1;33m.[0m[0mvariables[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m    175 [1;33m[1;33m[0m[0m
[0m
