# Random Forest(Fraud Check)


# Use Random Forest to prepare a model on fraud data treating those who have taxable_income <= 30000 as "Risky" and others are "Good"

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("Fraud_check.csv")

In [4]:
df.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


In [5]:
df.shape

(600, 6)

In [6]:
df.isnull().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

In [7]:
df['Undergrad']= df['Undergrad'].apply(lambda x: 1 if x=='YES' else 0)
df['Urban']= df['Urban'].apply(lambda x: 1 if x=='YES' else 0)
df['Taxable.Income']= df['Taxable.Income'].apply(lambda x: 'Risky' if x>30000 else 'Good')

In [8]:
df['Marital.Status'].value_counts()

Single      217
Married     194
Divorced    189
Name: Marital.Status, dtype: int64

In [9]:
df['Marital.Status'] = df['Marital.Status'].map({'Single':1, 'Married':2, 'Divorced':3})


In [10]:
df.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,0,1,Risky,50047,10,1
1,1,3,Risky,134075,18,1
2,0,2,Risky,160205,30,1
3,1,1,Risky,193264,15,1
4,0,2,Risky,27533,28,0


In [11]:
df['Marital.Status'].value_counts()

1    217
2    194
3    189
Name: Marital.Status, dtype: int64

In [12]:
import seaborn as sns

In [13]:
# riskey = 0, good = 1

df['Taxable.Income']= df['Taxable.Income'].apply(lambda x: 0 if x=='Risky'else 1)

In [14]:
# Train test the Data

train = df.drop('Taxable.Income', axis=1)
test = df['Taxable.Income']

In [15]:
# Train test split

from sklearn.model_selection import train_test_split

In [16]:
x_train, x_test, y_train, y_test = train_test_split(train, test, test_size = 0.3, random_state = 1)

In [17]:
# Standardize the Data

from sklearn.preprocessing import StandardScaler

In [18]:
scaler = StandardScaler()

In [19]:
col = x_train.columns
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

In [20]:
x_train.shape

(420, 5)

In [21]:
x_test.shape

(180, 5)

In [22]:
# Build random forest


In [23]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
df.head(2)

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,0,1,0,50047,10,1
1,1,3,0,134075,18,1


In [25]:
rf_model = RandomForestClassifier(n_estimators= 100)
rf_model.fit(x_train, y_train)

In [26]:
rf_model_pred = rf_model.predict(x_test)

In [27]:
rf_model_actual_predict = pd.DataFrame({'Actual': y_test, 'Predict': rf_model_pred})

In [28]:
rf_model_actual_predict[20:40]

Unnamed: 0,Actual,Predict
13,0,0
510,0,0
268,0,0
358,0,0
289,0,1
483,1,0
429,0,0
47,0,0
223,0,0
185,0,0


In [29]:
rf_model.score(x_test, y_test)*100

77.22222222222223

In [30]:
# Apply Bagging

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [31]:
bg = BaggingClassifier(base_estimator = DecisionTreeClassifier(), max_samples =0.5, max_features = 1.0, n_estimators = 100, random_state = 8 )

In [32]:
bg.fit(x_train, y_train)

In [33]:
bg_prid = bg.predict(x_test)

In [34]:
bg_actual_prid = pd.DataFrame({'Actual': y_test, 'predict': bg_prid})

In [35]:
bg_actual_prid[:10]

Unnamed: 0,Actual,predict
446,1,0
404,0,0
509,1,0
455,0,0
201,0,0
23,0,0
415,0,0
548,0,0
66,0,0
165,0,0


In [36]:
bg.score(x_test, y_test)

0.7888888888888889

In [37]:
# Ada Boosting

from sklearn.ensemble import AdaBoostClassifier

In [38]:
boost = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(), n_estimators = 100, learning_rate=1)

In [39]:
boost.fit(x_train, y_train)

In [40]:
boost_pred = boost.predict(x_test)

In [41]:
boost_actual_predict = pd.DataFrame({'Actual': y_test, 'predict': boost_pred})

In [42]:
boost_actual_predict[:10]

Unnamed: 0,Actual,predict
446,1,0
404,0,1
509,1,0
455,0,0
201,0,0
23,0,0
415,0,1
548,0,1
66,0,0
165,0,1


In [43]:
boost.score(x_test, y_test)

0.6444444444444445

In [44]:
# Build the Random Forest model using kfold

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

In [45]:
kfold = KFold(n_splits=10)
num_trees = 100
max_feature = 3
rfc = RandomForestClassifier(n_estimators=num_trees, max_features = max_feature)
rfc = cross_val_score(rfc, train, test, cv = kfold)
rfc

array([0.76666667, 0.75      , 0.66666667, 0.63333333, 0.76666667,
       0.63333333, 0.86666667, 0.71666667, 0.75      , 0.85      ])

In [46]:
rfc.mean()*100

74.0

In [47]:
rfc.std()*100

7.6448966274531465