#  Question 1

#   Random Forest Fraud Data

In [30]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets  
import numpy as np
from sklearn import preprocessing 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [31]:
fraud_data = pd.read_csv("Fraud_check.csv")
fraud_data

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [32]:
fraud_data.describe()

Unnamed: 0,Taxable.Income,City.Population,Work.Experience
count,600.0,600.0,600.0
mean,55208.375,108747.368333,15.558333
std,26204.827597,49850.075134,8.842147
min,10003.0,25779.0,0.0
25%,32871.5,66966.75,8.0
50%,55074.5,106493.5,15.0
75%,78611.75,150114.25,24.0
max,99619.0,199778.0,30.0


###   Preprocessing the dataset 

In [33]:
fraud_data["Taxable.Income"] = pd.cut(fraud_data["Taxable.Income"], bins=(0,30000,100000), labels=["Risky","Good"])

In [34]:
fraud_data

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,Good,50047,10,YES
1,YES,Divorced,Good,134075,18,YES
2,NO,Married,Good,160205,30,YES
3,YES,Single,Good,193264,15,YES
4,NO,Married,Good,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,Good,39492,7,YES
596,YES,Divorced,Good,55369,2,YES
597,NO,Divorced,Good,154058,0,YES
598,YES,Married,Good,180083,17,NO


In [35]:
label_encoder = preprocessing.LabelEncoder()
fraud_data["Undergrad"] = label_encoder.fit_transform(fraud_data["Undergrad"])
fraud_data["Marital.Status"] = label_encoder.fit_transform(fraud_data["Marital.Status"])
fraud_data["Taxable.Income"] = label_encoder.fit_transform(fraud_data["Taxable.Income"])
fraud_data["Urban"] = label_encoder.fit_transform(fraud_data["Urban"])

In [36]:
fraud_data

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,0,2,0,50047,10,1
1,1,0,0,134075,18,1
2,0,1,0,160205,30,1
3,1,2,0,193264,15,1
4,0,1,0,27533,28,0
...,...,...,...,...,...,...
595,1,0,0,39492,7,1
596,1,0,0,55369,2,1
597,0,0,0,154058,0,1
598,1,1,0,180083,17,0


###  Splitting the dataset

In [37]:
x = fraud_data.drop(["Taxable.Income"], axis=1)
y = fraud_data.iloc[:,2]

In [38]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=0)

In [39]:
x_train = StandardScaler().fit_transform(X_train)
x_test = StandardScaler().fit_transform(X_test)

###  Building the random forest 

In [45]:
model = RandomForestClassifier(n_estimators=150, criterion='entropy',random_state=0)
model.fit(x_train,y_train)

RandomForestClassifier(criterion='entropy', n_estimators=150, random_state=0)

###  Predicting the dataset

In [46]:
y_pred = model.predict(x_test)
y_pred

array([0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0])

###  Evaluating the dataset

In [47]:
confusion_matrix(y_test, y_pred)

array([[135,   2],
       [ 37,   6]], dtype=int64)

In [48]:
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.7833333333333333


In [49]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.78      0.99      0.87       137
           1       0.75      0.14      0.24        43

    accuracy                           0.78       180
   macro avg       0.77      0.56      0.55       180
weighted avg       0.78      0.78      0.72       180



###  Using random forest the accuracy of model is improved to 78% 