# Assignment - 15  Random Forests (Fraud check)

## Import Necessary Libraries

In [1]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score , confusion_matrix , classification_report
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt

import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

## Import Data

In [2]:
# Location of the raw CSV. NOTE(review): this is a machine-specific absolute
# Windows path; edit it here when running the notebook elsewhere.
DATA_PATH = r'D:\Downloads\Fraud_check (1).csv'

fraud = pd.read_csv(DATA_PATH)
fraud

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


## Data Understanding

In [3]:
# Dimensions of the raw frame as (rows, columns).
fraud.shape

(600, 6)

In [4]:
# Number of missing values in each column.
fraud.isna().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

In [5]:
# Storage dtype of each column; the object columns are label-encoded further down.
fraud.dtypes

Undergrad          object
Marital.Status     object
Taxable.Income      int64
City.Population     int64
Work.Experience     int64
Urban              object
dtype: object

## Data Preparation

#### Per the problem statement, records with taxable income <= 30000 are labelled 'Risk' and records with taxable income > 30000 are labelled 'Good'

In [6]:
# Derive the categorical target from the income threshold:
# at or below the threshold -> 'Risk', strictly above -> 'Good'.
INCOME_THRESHOLD = 30000
taxable_income = fraud['Taxable.Income']

fraud.loc[taxable_income <= INCOME_THRESHOLD, 'Income'] = 'Risk'
fraud.loc[taxable_income > INCOME_THRESHOLD, 'Income'] = 'Good'

In [7]:
# Show the frame again, now including the derived 'Income' label column.
fraud

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,Income
0,NO,Single,68833,50047,10,YES,Good
1,YES,Divorced,33700,134075,18,YES,Good
2,NO,Married,36925,160205,30,YES,Good
3,YES,Single,50190,193264,15,YES,Good
4,NO,Married,81002,27533,28,NO,Good
...,...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES,Good
596,YES,Divorced,69967,55369,2,YES,Good
597,NO,Divorced,47334,154058,0,YES,Good
598,YES,Married,98592,180083,17,NO,Good


In [8]:
# Remove the raw income column: the 'Income' label was derived directly from
# it, so keeping it as a feature would leak the target.
fraud1 = fraud.drop(columns=['Taxable.Income'])
fraud1

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban,Income
0,NO,Single,50047,10,YES,Good
1,YES,Divorced,134075,18,YES,Good
2,NO,Married,160205,30,YES,Good
3,YES,Single,193264,15,YES,Good
4,NO,Married,27533,28,NO,Good
...,...,...,...,...,...,...
595,YES,Divorced,39492,7,YES,Good
596,YES,Divorced,55369,2,YES,Good
597,NO,Divorced,154058,0,YES,Good
598,YES,Married,180083,17,NO,Good


In [9]:
# Class balance of the target label.
fraud1['Income'].value_counts()

Good    476
Risk    124
Name: Income, dtype: int64

In [10]:
# Integer-encode every categorical column, target included. One encoder
# object is reused and refit per column, so after this cell `le` holds the
# mapping for 'Income' (the last column processed).
le = LabelEncoder()
for column in ('Undergrad', 'Marital.Status', 'Urban', 'Income'):
    fraud1[column] = le.fit_transform(fraud1[column])

In [11]:
# Show the frame after label encoding; all columns are now numeric.
fraud1

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban,Income
0,0,2,50047,10,1,0
1,1,0,134075,18,1,0
2,0,1,160205,30,1,0
3,1,2,193264,15,1,0
4,0,1,27533,28,0,0
...,...,...,...,...,...,...
595,1,0,39492,7,1,0
596,1,0,55369,2,1,0
597,0,0,154058,0,1,0
598,1,1,180083,17,0,0


In [12]:
# Class balance of the encoded target (integer codes instead of 'Good'/'Risk').
fraud1['Income'].value_counts()

0    476
1    124
Name: Income, dtype: int64

## Model Building

In [13]:
# Feature matrix: every column except the target.
x = fraud1.drop(columns='Income')
# Target, deliberately kept as a one-column DataFrame (2-D) rather than a Series.
y = fraud1.loc[:, ['Income']]

In [14]:
# Sanity check: features and target must have the same number of rows.
x.shape , y.shape

((600, 5), (600, 1))

In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified although the classes are imbalanced —
# consider stratify=y so both partitions keep the same class ratio.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=5,
    shuffle=True,
)

In [16]:
# Shapes of the training partition (features, target).
x_train.shape , y_train.shape

((480, 5), (480, 1))

In [17]:
# Shapes of the held-out test partition (features, target).
x_test.shape , y_test.shape

((120, 5), (120, 1))

## Model Training

In [18]:
# Shallow random forest: 100 entropy-split trees, each limited to depth 3.
# random_state pins the bootstrap sampling and feature selection so that
# Restart & Run All reproduces the same fitted model and metrics; the value
# matches the seed already used for the train/test split.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    criterion="entropy",
    random_state=5,
)

In [19]:
# Fit the forest on the training partition. y_train is a one-column DataFrame,
# which scikit-learn accepts only with a DataConversionWarning (hidden here by
# the global warnings filter), so it is flattened to the expected 1-D shape
# (n_samples,) before fitting. The fitted model is unchanged.
model.fit(x_train, y_train.values.ravel())

RandomForestClassifier(criterion='entropy', max_depth=3)

## Model Testing // Model Evaluation

In [20]:
# Predict the encoded class label for every row of the held-out test features.
y_pred = model.predict(x_test)

In [21]:
# Inspect the raw predictions (1-D array, one label per test row).
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [22]:
# Turn the predictions into a column vector so they match y_test's (n, 1) shape.
# -1 lets NumPy infer the row count, so this keeps working when the test-set
# size changes (the previous hard-coded 120 would raise ValueError).
y_pred = y_pred.reshape(-1, 1)

In [23]:
# Fraction of test rows whose predicted label equals the true label.
# NOTE(review): with imbalanced classes, accuracy alone can look good for a
# model that only ever predicts the majority class — read it together with
# the per-class report and confusion matrix below.
accuracy_score(y_test,y_pred)

0.8333333333333334

In [24]:
# Per-class precision, recall and F1; print() is needed so the text report
# renders with its line breaks instead of as an escaped string.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.83      1.00      0.91       100
           1       0.00      0.00      0.00        20

    accuracy                           0.83       120
   macro avg       0.42      0.50      0.45       120
weighted avg       0.69      0.83      0.76       120



In [25]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
confusion_matrix(y_test,y_pred)

array([[100,   0],
       [ 20,   0]], dtype=int64)