# Regression problem where we are predicting audit risk

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Sklearn 
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

###### feature engineering packages
from feature_engine import missing_data_imputers as mdi
from feature_engine import discretisers as dsc
from feature_engine import categorical_encoders as ce
from feature_engine.categorical_encoders import WoERatioCategoricalEncoder
from feature_engine.discretisers import DecisionTreeDiscretiser
from feature_engine.outlier_removers import Winsorizer
from feature_engine.categorical_encoders import MeanCategoricalEncoder

######## Feature selection packages 
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

In [30]:
df1 = pd.read_csv("audit_risk.csv")
df2 = pd.read_csv("trial.csv")

### combine the two files in one dataframe
df = pd.concat([df1, df2], axis=1)
df.columns = map(str.lower, df.columns)
print(df.shape)

## remove duplicate columns 
data = df.loc[:,~df.columns.duplicated()]
data = data.drop(['risk'],axis =1)
print("the new dimensions are:",data.shape)

(776, 45)
the new dimensions are: (776, 31)


In [31]:
data.head(5)

Unnamed: 0,sector_score,location_id,para_a,score_a,risk_a,para_b,score_b,risk_b,total,numbers,...,inherent_risk,control_risk,detection_risk,audit_risk,marks,money_marks,district,loss,loss_score,history_score
0,3.89,23,4.18,0.6,2.508,2.5,0.2,0.5,6.68,5.0,...,8.574,0.4,0.5,1.7148,2,2,2,0,2,2
1,3.89,6,0.0,0.2,0.0,4.83,0.2,0.966,4.83,5.0,...,2.554,0.4,0.5,0.5108,2,2,2,0,2,2
2,3.89,6,0.51,0.2,0.102,0.23,0.2,0.046,0.74,5.0,...,1.548,0.4,0.5,0.3096,2,2,2,0,2,2
3,3.89,6,0.0,0.2,0.0,10.8,0.6,6.48,10.8,6.0,...,17.53,0.4,0.5,3.506,6,6,2,0,2,2
4,3.89,6,0.0,0.2,0.0,0.08,0.2,0.016,0.08,5.0,...,1.416,0.4,0.5,0.2832,2,2,2,0,2,2


In [32]:
discrete= [ var for var in data.columns   if data[var].dtype != 'O' and var!='audit_risk' and data[var].nunique() < 5 ]
contin= [ var for var in data.columns     if data[var].dtype != 'O' and var!='audit_risk' and var not in discrete]
categorical = [var for var in data.columns if data[var].dtype =='O']


print("there are {} discrete features".format(len(discrete)))
print("there are {} continous or numeric features".format(len(contin)))
print("there are {} categorical features".format(len(categorical)))

there are 13 discrete features
there are 16 continous or numeric features
there are 1 categorical features


In [33]:
data[categorical].nunique()

location_id    45
dtype: int64

In [34]:
for ft in categorical:
    print( ft ,"   ", data[ft].unique() )

location_id     ['23' '6' '7' '8' '13' '37' '24' '3' '4' '14' '5' '20' '19' '21' '22' '9'
 '11' '12' '29' '30' '38' '31' '2' '32' '16' '33' '15' '36' '34' '18' '25'
 '39' '27' '35' '40' '41' '42' '1' '28' 'LOHARU' 'NUH' 'SAFIDON' '43' '44'
 '17']


In [35]:
categorical = categorical + discrete 
print(categorical)

['location_id', 'score_a', 'score_b', 'score_b.1', 'score_mv', 'district_loss', 'prob', 'detection_risk', 'marks', 'money_marks', 'district', 'loss', 'loss_score', 'history_score']


In [36]:
for ft in categorical:
    data[ft] = data[ft].astype('O')  ## ensure that all categorical variables are object typ


In [37]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 776 entries, 0 to 775
Data columns (total 31 columns):
sector_score      776 non-null object
location_id       776 non-null object
para_a            776 non-null object
score_a           776 non-null object
risk_a            776 non-null object
para_b            776 non-null object
score_b           776 non-null object
risk_b            776 non-null object
total             776 non-null object
numbers           776 non-null object
score_b.1         776 non-null object
risk_c            776 non-null object
money_value       775 non-null object
score_mv          776 non-null object
risk_d            776 non-null object
district_loss     776 non-null object
prob              776 non-null object
risk_e            776 non-null object
history           776 non-null object
risk_f            776 non-null object
score             776 non-null object
inherent_risk     776 non-null object
control_risk      776 non-null object
detection_risk    776

In [38]:
#data.isnull().mean().sort_values(ascending= False)

# FEATURE ENGINEERING

In [39]:
#### split the data 
X_train, X_test, y_train, y_test = train_test_split(
data.drop('audit_risk',axis=1), ### predictors 
data['audit_risk'] ,      ### target
test_size=0.2,
    random_state =0
)

In [40]:
fe_seq=Pipeline([
    
    ### IMPUTE NUMERIC ########
    ('imputer_num',
    mdi.ArbitraryNumberImputer(arbitrary_number= -100,variables = contin)),
    ##### IMPUTE CATEGORICAL #######
    ('imputer_cat',
    mdi.CategoricalVariableImputer(variables=categorical)),
    ##### REMOVE OUTLIERS##############
    ('outlier_rem',Winsorizer(distribution='skewed',
                             tail='both',
                             fold=2.0,
                             variables=contin)),
    ####### REMOVE RARE LABELS ###########
    ('encoder_rare_label',
     ce.RareLabelCategoricalEncoder(tol=0.03,
                                   n_categories=5,
                                   variables=categorical)),
     ##########  ENCODE CATEGORICAL VARIABLES ##########
     #('categorical_encoder',
    # ce.OrdinalCategoricalEncoder(encoding_method='ordered',
       #                          variables=categorical))
     ########### ENCODE CATEGORICAL VARIABLES ##############
    ('categorical_encoder',
     MeanCategoricalEncoder( categorical)
    )
    #################  BIN NUMERICAL VARIABLES ##################
    #('BinDTE',
     # DecisionTreeDiscretiser(variables=contin,regression=False)),
     
     ###################### we will transform and then fit 
])

In [41]:
fe_seq.fit(X_train,y_train)

ValueError: Some of the selected variables are not numerical. Please cast them as numerical before calling the imputer

In [None]:
%debug

> [1;32mc:\users\obaid\anaconda3\lib\site-packages\feature_engine\base_transformers.py[0m(41)[0;36mfit[1;34m()[0m
[1;32m     39 [1;33m            [1;31m#if len(X[self.variables].select_dtypes(exclude=numerics).columns) != 0:[0m[1;33m[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     40 [1;33m            [1;32mif[0m [0mlen[0m[1;33m([0m[0mX[0m[1;33m[[0m[0mself[0m[1;33m.[0m[0mvariables[0m[1;33m][0m[1;33m.[0m[0mselect_dtypes[0m[1;33m([0m[0mexclude[0m[1;33m=[0m[1;34m'number'[0m[1;33m)[0m[1;33m.[0m[0mcolumns[0m[1;33m)[0m [1;33m!=[0m [1;36m0[0m[1;33m:[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m---> 41 [1;33m               [1;32mraise[0m [0mValueError[0m[1;33m([0m[1;34m"Some of the selected variables are not numerical. Please cast them as numerical before calling the imputer"[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m     42 [1;33m[1;33m[0m[0m
[0m[1;32m     43 [1;33m            [0mself[0m[1;33m.[0m[0mvariables[0m 