## Use decision trees to prepare a model on fraud data treating those who have taxable_income <= 30000 as "Risky" and others are "Good"


## 1.Import the Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix,classification_report

## 2.Import the Data

In [2]:
fraud_data = pd.read_csv('Fraud_check.csv')
fraud_data

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


## 3.Data Understanding

In [3]:
fraud_data.shape

(600, 6)

In [4]:
fraud_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


## 4.Data Cleaning

In [5]:
#Renaming the columns
fraud_data.rename(columns={"Undergrad":"undergrad","Marital.Status":"marital","City.Population":"population","Work.Experience":"experience","Urban":"urban"},inplace=True)

In [6]:
fraud_data.columns

Index(['undergrad', 'marital', 'Taxable.Income', 'population', 'experience',
       'urban'],
      dtype='object')

In [7]:
##Converting the Taxable income variable to bucketing. 
fraud_data["income"]="<=30000"
fraud_data.loc[fraud_data["Taxable.Income"]>=30000,"income"]="Good"
fraud_data.loc[fraud_data["Taxable.Income"]<=30000,"income"]="Risky"

In [8]:
##Droping the Taxable income variable
fraud_data.drop(["Taxable.Income"],axis=1,inplace=True)

In [9]:
fraud_data

Unnamed: 0,undergrad,marital,population,experience,urban,income
0,NO,Single,50047,10,YES,Good
1,YES,Divorced,134075,18,YES,Good
2,NO,Married,160205,30,YES,Good
3,YES,Single,193264,15,YES,Good
4,NO,Married,27533,28,NO,Good
...,...,...,...,...,...,...
595,YES,Divorced,39492,7,YES,Good
596,YES,Divorced,55369,2,YES,Good
597,NO,Divorced,154058,0,YES,Good
598,YES,Married,180083,17,NO,Good


In [10]:
#Converting Urban and US Columns to integers
fraud_data['undergrad'] = np.where(fraud_data['undergrad'].str.contains("yes"), 1, 0)
fraud_data['urban'] = np.where(fraud_data['urban'].str.contains("yes"), 1, 0)

In [11]:
#converting marital status
dummies = pd.get_dummies(fraud_data['marital'])
dummies

Unnamed: 0,Divorced,Married,Single
0,0,0,1
1,1,0,0
2,0,1,0
3,0,0,1
4,0,1,0
...,...,...,...
595,1,0,0
596,1,0,0
597,1,0,0
598,0,1,0


In [12]:
merged = pd.concat([fraud_data,dummies],axis=1)
merged

Unnamed: 0,undergrad,marital,population,experience,urban,income,Divorced,Married,Single
0,0,Single,50047,10,0,Good,0,0,1
1,0,Divorced,134075,18,0,Good,1,0,0
2,0,Married,160205,30,0,Good,0,1,0
3,0,Single,193264,15,0,Good,0,0,1
4,0,Married,27533,28,0,Good,0,1,0
...,...,...,...,...,...,...,...,...,...
595,0,Divorced,39492,7,0,Good,1,0,0
596,0,Divorced,55369,2,0,Good,1,0,0
597,0,Divorced,154058,0,0,Good,1,0,0
598,0,Married,180083,17,0,Good,0,1,0


In [13]:
final = merged.drop(labels = ["Single","marital"],axis=1)
final

Unnamed: 0,undergrad,population,experience,urban,income,Divorced,Married
0,0,50047,10,0,Good,0,0
1,0,134075,18,0,Good,1,0
2,0,160205,30,0,Good,0,1
3,0,193264,15,0,Good,0,0
4,0,27533,28,0,Good,0,1
...,...,...,...,...,...,...,...
595,0,39492,7,0,Good,1,0
596,0,55369,2,0,Good,1,0
597,0,154058,0,0,Good,1,0
598,0,180083,17,0,Good,0,1


## 5.Building the model

In [14]:
X = final.drop("income",axis=1).values
X

array([[     0,  50047,     10,      0,      0,      0],
       [     0, 134075,     18,      0,      1,      0],
       [     0, 160205,     30,      0,      0,      1],
       ...,
       [     0, 154058,      0,      0,      1,      0],
       [     0, 180083,     17,      0,      0,      1],
       [     0, 158137,     16,      0,      1,      0]], dtype=int64)

In [15]:
y = final['income'].values
y

array(['Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Risky', 'Good', 'Risky', 'Good', 'Good', 'Good',
       'Risky', 'Good', 'Good', 'Risky', 'Good', 'Risky', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Risky', 'Good', 'Risky', 'Good', 'Risky', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Risky', 'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Risky',
       'Good', 'Good', 'Risky', 'Good', 'Good', 'Risky', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Risky',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Risky',
       'Good', 'Risky', 'Good', 'Good', 'Good', 'Risky', 'Risky', 'Good',
       'Risky', 'Good', 'Risky', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Risky', 'Good', 'Good', 'Good'

In [16]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [17]:
X_train.shape

(480, 6)

In [18]:
y_train.shape

(480,)

In [19]:
X_test.shape

(120, 6)

## 6.Model Training

In [21]:
from sklearn.ensemble import RandomForestClassifier

In [22]:
rf_model = RandomForestClassifier(n_estimators=100,criterion='entropy',max_depth=9)
rf_model.fit(X_train,y_train)

RandomForestClassifier(criterion='entropy', max_depth=9)

## 7.Model Testing

In [23]:
#Training data
y_train_pred = rf_model.predict(X_train)
y_train_pred

array(['Good', 'Good', 'Risky', 'Good', 'Risky', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Risky', 'Good', 'Risky', 'Risky', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Risky',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Risky', 'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Good',
       'Good', 'Risky', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Risky', 'Good', 'Good', 'Good', 'G

In [24]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [25]:
accuracy_score(y_train,y_train_pred)

0.8770833333333333

In [26]:
confusion_matrix(y_train,y_train_pred) #104 FN predictions

array([[379,   0],
       [ 59,  42]], dtype=int64)

In [27]:
print(classification_report(y_train,y_train_pred))

              precision    recall  f1-score   support

        Good       0.87      1.00      0.93       379
       Risky       1.00      0.42      0.59       101

    accuracy                           0.88       480
   macro avg       0.93      0.71      0.76       480
weighted avg       0.89      0.88      0.86       480

