# Random Forest

## Use decision trees to prepare a model to identifying risky customers on fraud data 

## Importing Libraries

In [33]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.model_selection import train_test_split

## Importing the Data

In [34]:
Fraud_data = pd.read_csv('/content/Fraud_check.csv')
Fraud_data

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


## Data Understanding

In [35]:
Fraud_data.shape

(600, 6)

In [36]:
Fraud_data.dtypes

Undergrad          object
Marital.Status     object
Taxable.Income      int64
City.Population     int64
Work.Experience     int64
Urban              object
dtype: object

In [37]:
Fraud_data['Undergrad'] = Fraud_data['Undergrad'].astype('category')
Fraud_data['Urban'] = Fraud_data['Urban'].astype('category')
Fraud_data['Marital.Status'] = Fraud_data['Marital.Status'].astype('category')

In [38]:
Fraud_data.isna().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

In [39]:
print(Fraud_data.duplicated().value_counts())

False    600
dtype: int64


In [40]:
Fraud_data.describe(include='all')

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
count,600,600,600.0,600.0,600.0,600
unique,2,3,,,,2
top,YES,Single,,,,YES
freq,312,217,,,,302
mean,,,55208.375,108747.368333,15.558333,
std,,,26204.827597,49850.075134,8.842147,
min,,,10003.0,25779.0,0.0,
25%,,,32871.5,66966.75,8.0,
50%,,,55074.5,106493.5,15.0,
75%,,,78611.75,150114.25,24.0,


In [41]:
Fraud_data['Income'] = Fraud_data['Taxable.Income'].map(lambda x : 1 if x>30000 else 0)
Fraud_data['Undergrad'] = Fraud_data['Undergrad'].cat.codes
Fraud_data['Urban'] = Fraud_data['Urban'].cat.codes
Fraud_data['Marital.Status'] = Fraud_data['Marital.Status'].cat.codes
Fraud_data

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,Income
0,0,2,68833,50047,10,1,1
1,1,0,33700,134075,18,1,1
2,0,1,36925,160205,30,1,1
3,1,2,50190,193264,15,1,1
4,0,1,81002,27533,28,0,1
...,...,...,...,...,...,...,...
595,1,0,76340,39492,7,1,1
596,1,0,69967,55369,2,1,1
597,0,0,47334,154058,0,1,1
598,1,1,98592,180083,17,0,1


## Data Preparation

In [42]:
X=Fraud_data.iloc[:,0:6]
y=Fraud_data['Income']
print(X)
print(y)

     Undergrad  Marital.Status  ...  Work.Experience  Urban
0            0               2  ...               10      1
1            1               0  ...               18      1
2            0               1  ...               30      1
3            1               2  ...               15      1
4            0               1  ...               28      0
..         ...             ...  ...              ...    ...
595          1               0  ...                7      1
596          1               0  ...                2      1
597          0               0  ...                0      1
598          1               1  ...               17      0
599          0               0  ...               16      0

[600 rows x 6 columns]
0      1
1      1
2      1
3      1
4      1
      ..
595    1
596    1
597    1
598    1
599    1
Name: Income, Length: 600, dtype: int64


In [43]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size= 0.2,random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((480, 6), (120, 6), (480,), (120,))

## Model Builing 

In [44]:
rf_model=RandomForestClassifier()

## Finding out the best hyperparameter for building the Tree


In [45]:
from sklearn.model_selection import GridSearchCV

In [46]:
grid_search = GridSearchCV(estimator = rf_model, param_grid = {'criterion':['gini','entropy'],'max_depth':[2,3,4,5,6,7,8,9,10,11,12]} , cv= 5)
grid_search

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]})

In [47]:
grid_search.fit(X,y)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]})

In [48]:
grid_search.best_params_

{'criterion': 'gini', 'max_depth': 2}

In [49]:
grid_search.best_score_

1.0

## Model Training

In [50]:
rf_model=RandomForestClassifier(criterion='gini' , max_depth=2)

In [51]:
rf_model.fit(X_train,y_train)

RandomForestClassifier(max_depth=2)

## Model Testing

In [52]:
y_pred_train = rf_model.predict(X_train)

In [53]:
y_pred_test = rf_model.predict(X_test)

## Model Evaluation

## Confusion_Matrix :- To know the Misclassification done by the model

In [54]:
accuracy_score(y_train,y_pred_train)

1.0

In [55]:
confusion_matrix(y_train,y_pred_train)

array([[ 98,   0],
       [  0, 382]])

In [56]:
print(classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        98
           1       1.00      1.00      1.00       382

    accuracy                           1.00       480
   macro avg       1.00      1.00      1.00       480
weighted avg       1.00      1.00      1.00       480



In [57]:
accuracy_score(y_test,y_pred_test)

0.9916666666666667

In [58]:
confusion_matrix(y_test,y_pred_test)

array([[25,  1],
       [ 0, 94]])

In [59]:
print(classification_report(y_test,y_pred_test))

              precision    recall  f1-score   support

           0       1.00      0.96      0.98        26
           1       0.99      1.00      0.99        94

    accuracy                           0.99       120
   macro avg       0.99      0.98      0.99       120
weighted avg       0.99      0.99      0.99       120



## Model Deployment

In [60]:
from pickle import dump

In [61]:
dump(rf_model, open('Random_Forest(Fraud_check)','wb'))

In [62]:
from pickle import load

In [63]:
rf_model = load(open('/content/Random_Forest(Fraud_check)','rb'))

In [64]:
rf_model.predict(X_test)

array([0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 0])