# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.ensemble import IsolationForest

In [2]:
# Importing the Datasets
Fcheck = pd.read_csv("Fraud_check.csv")

In [3]:
Fcheck

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


# EDA

In [4]:
# Renaming Columns
Fcheck = Fcheck.rename({'Taxable.Income':'tax_income', 'City.Population':'population', 'Work.Experience' : 'work_exp',
                       'Marital.Status' : 'marital_status'} ,axis=1)
Fcheck

Unnamed: 0,Undergrad,marital_status,tax_income,population,work_exp,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [5]:
Fcheck.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Undergrad       600 non-null    object
 1   marital_status  600 non-null    object
 2   tax_income      600 non-null    int64 
 3   population      600 non-null    int64 
 4   work_exp        600 non-null    int64 
 5   Urban           600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


# OneHotEncoder 1st method 

In [8]:
# creating instance of one-hot-encoder
enc = OneHotEncoder(handle_unknown='ignore')

In [9]:
# passing bridge-types-cat column (label encoded values of bridge_types)
enc_df = pd.DataFrame(enc.fit_transform(Fcheck[['Undergrad','marital_status','Urban']]).toarray())

In [10]:
enc_df

Unnamed: 0,0,1,2,3,4,5,6
0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,1.0,0.0,1.0
4,1.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...
595,0.0,1.0,1.0,0.0,0.0,0.0,1.0
596,0.0,1.0,1.0,0.0,0.0,0.0,1.0
597,1.0,0.0,1.0,0.0,0.0,0.0,1.0
598,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [14]:
# merge with main df 
Fcheck_final = Fcheck.iloc[:,0:3].join(enc_df)
Fcheck_final

Unnamed: 0,tax_income,population,work_exp,0,1,2,3,4,5,6
0,68833,50047,10,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,33700,134075,18,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2,36925,160205,30,1.0,0.0,0.0,1.0,0.0,0.0,1.0
3,50190,193264,15,0.0,1.0,0.0,0.0,1.0,0.0,1.0
4,81002,27533,28,1.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
595,76340,39492,7,0.0,1.0,1.0,0.0,0.0,0.0,1.0
596,69967,55369,2,0.0,1.0,1.0,0.0,0.0,0.0,1.0
597,47334,154058,0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
598,98592,180083,17,0.0,1.0,0.0,1.0,0.0,1.0,0.0


# OneHotEncoder 2nd method using get_dummies

In [17]:
Fcheck1 = pd.get_dummies(Fcheck,prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False
                         , drop_first=False, dtype= None) 

In [18]:
Fcheck1

Unnamed: 0,tax_income,population,work_exp,Undergrad_NO,Undergrad_YES,marital_status_Divorced,marital_status_Married,marital_status_Single,Urban_NO,Urban_YES
0,68833,50047,10,1,0,0,0,1,0,1
1,33700,134075,18,0,1,1,0,0,0,1
2,36925,160205,30,1,0,0,1,0,0,1
3,50190,193264,15,0,1,0,0,1,0,1
4,81002,27533,28,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...
595,76340,39492,7,0,1,1,0,0,0,1
596,69967,55369,2,0,1,1,0,0,0,1
597,47334,154058,0,1,0,1,0,0,0,1
598,98592,180083,17,0,1,0,1,0,1,0


In [19]:
# converting numerical values to caterical
# treating those who have taxable_income <= 30000 as "Risky" and others are "Good"
Fcheck1['tax_income'] = np.where(Fcheck1.tax_income <= 30000, 0 , 1)

In [20]:
Fcheck1

Unnamed: 0,tax_income,population,work_exp,Undergrad_NO,Undergrad_YES,marital_status_Divorced,marital_status_Married,marital_status_Single,Urban_NO,Urban_YES
0,1,50047,10,1,0,0,0,1,0,1
1,1,134075,18,0,1,1,0,0,0,1
2,1,160205,30,1,0,0,1,0,0,1
3,1,193264,15,0,1,0,0,1,0,1
4,1,27533,28,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...
595,1,39492,7,0,1,1,0,0,0,1
596,1,55369,2,0,1,1,0,0,0,1
597,1,154058,0,1,0,1,0,0,0,1
598,1,180083,17,0,1,0,1,0,1,0


# Isolation Forest

In [21]:
# training the model
clf = IsolationForest(random_state=10,contamination=.01)
clf.fit(Fcheck1)

IsolationForest(contamination=0.01, random_state=10)

In [22]:
# predictions
y_pred_outliers = clf.predict(Fcheck1)

In [23]:
#-1 for outliers and 1 for inliers.
y_pred_outliers

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1

In [24]:
Fcheck1

Unnamed: 0,tax_income,population,work_exp,Undergrad_NO,Undergrad_YES,marital_status_Divorced,marital_status_Married,marital_status_Single,Urban_NO,Urban_YES
0,1,50047,10,1,0,0,0,1,0,1
1,1,134075,18,0,1,1,0,0,0,1
2,1,160205,30,1,0,0,1,0,0,1
3,1,193264,15,0,1,0,0,1,0,1
4,1,27533,28,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...
595,1,39492,7,0,1,1,0,0,0,1
596,1,55369,2,0,1,1,0,0,0,1
597,1,154058,0,1,0,1,0,0,0,1
598,1,180083,17,0,1,0,1,0,1,0


In [26]:
## Let us add a new data point which is outlier
Fcheck1.loc[0]=[1,250000,30,1,0,0,1,0,1,0]

In [27]:
Fcheck1

Unnamed: 0,tax_income,population,work_exp,Undergrad_NO,Undergrad_YES,marital_status_Divorced,marital_status_Married,marital_status_Single,Urban_NO,Urban_YES
0,1,250000,30,1,0,0,1,0,1,0
1,1,134075,18,0,1,1,0,0,0,1
2,1,160205,30,1,0,0,1,0,0,1
3,1,193264,15,0,1,0,0,1,0,1
4,1,27533,28,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...
595,1,39492,7,0,1,1,0,0,0,1
596,1,55369,2,0,1,1,0,0,0,1
597,1,154058,0,1,0,1,0,0,0,1
598,1,180083,17,0,1,0,1,0,1,0


In [29]:
# training the model
clf = IsolationForest(random_state=10,contamination=.01)
clf.fit(Fcheck1)
# predictions
y_pred_outliers = clf.predict(Fcheck1)
y_pred_outliers

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1

In [98]:
Fcheck1['scores']=clf.decision_function(Fcheck1)

In [30]:
Fcheck1['anomaly']=clf.predict(Fcheck1.iloc[:,0:10])

In [31]:
Fcheck1

Unnamed: 0,tax_income,population,work_exp,Undergrad_NO,Undergrad_YES,marital_status_Divorced,marital_status_Married,marital_status_Single,Urban_NO,Urban_YES,anomaly
0,1,250000,30,1,0,0,1,0,1,0,1
1,1,134075,18,0,1,1,0,0,0,1,1
2,1,160205,30,1,0,0,1,0,0,1,1
3,1,193264,15,0,1,0,0,1,0,1,1
4,1,27533,28,1,0,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...
595,1,39492,7,0,1,1,0,0,0,1,1
596,1,55369,2,0,1,1,0,0,0,1,1
597,1,154058,0,1,0,1,0,0,0,1,1
598,1,180083,17,0,1,0,1,0,1,0,1


In [32]:
#Print the outlier data points
Fcheck1[Fcheck1['anomaly']==-1]

Unnamed: 0,tax_income,population,work_exp,Undergrad_NO,Undergrad_YES,marital_status_Divorced,marital_status_Married,marital_status_Single,Urban_NO,Urban_YES,anomaly
16,0,34551,29,0,1,1,0,0,1,0,-1
168,0,167629,30,1,0,1,0,0,1,0,-1
259,0,197841,7,1,0,0,1,0,1,0,-1
280,0,33184,1,1,0,1,0,0,1,0,-1
284,0,188383,22,1,0,1,0,0,0,1,-1
541,0,191874,30,0,1,1,0,0,0,1,-1


# PPS Score

In [33]:
#install the package
!pip install ppscore



In [34]:
import ppscore as pps

In [36]:
#pps.score(df, "feature_column", "target_column")  syntax
pps.score(Fcheck, "work_exp", "tax_income")

{'x': 'work_exp',
 'y': 'tax_income',
 'ppscore': 0,
 'case': 'regression',
 'is_valid_score': True,
 'metric': 'mean absolute error',
 'baseline_score': 22771.961666666666,
 'model_score': 24035.718747402032,
 'model': DecisionTreeRegressor()}

In [37]:
#calculate the whole PPS matrix
pps.matrix(Fcheck)

Unnamed: 0,x,y,ppscore,case,is_valid_score,metric,baseline_score,model_score,model
0,tax_income,tax_income,1.0,predict_itself,True,,0.000000,1.000000,
1,tax_income,population,0.0,regression,True,mean absolute error,42730.781667,59151.176667,DecisionTreeRegressor()
2,tax_income,work_exp,0.0,regression,True,mean absolute error,7.598333,10.091667,DecisionTreeRegressor()
3,tax_income,Undergrad_NO,0.0,regression,True,mean absolute error,0.480000,0.516667,DecisionTreeRegressor()
4,tax_income,Undergrad_YES,0.0,regression,True,mean absolute error,0.480000,0.516667,DecisionTreeRegressor()
...,...,...,...,...,...,...,...,...,...
95,Urban_YES,marital_status_Divorced,0.0,regression,True,mean absolute error,0.315000,0.430573,DecisionTreeRegressor()
96,Urban_YES,marital_status_Married,0.0,regression,True,mean absolute error,0.323333,0.439986,DecisionTreeRegressor()
97,Urban_YES,marital_status_Single,0.0,regression,True,mean absolute error,0.361667,0.463304,DecisionTreeRegressor()
98,Urban_YES,Urban_NO,1.0,regression,True,mean absolute error,0.496667,0.000000,DecisionTreeRegressor()
