In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
fraud = pd.read_csv('Fraud_check.csv')
fraud

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [3]:
fraud.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


In [4]:
fraud['Taxable.Income'].value_counts

<bound method IndexOpsMixin.value_counts of 0      68833
1      33700
2      36925
3      50190
4      81002
       ...  
595    76340
596    69967
597    47334
598    98592
599    96519
Name: Taxable.Income, Length: 600, dtype: int64>

In [5]:
fraud["income"]="<=30000"
fraud.loc[fraud["Taxable.Income"]>=30000,"income"]="Good"
fraud.loc[fraud["Taxable.Income"]<=30000,"income"]="Risky"

In [6]:
fraud["income"].unique()
fraud["income"].value_counts()
### dropping the Taxable.Income columns
fraud = fraud.drop(['Taxable.Income'],axis=1)

In [7]:
fraud.tail()

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban,income
595,YES,Divorced,39492,7,YES,Good
596,YES,Divorced,55369,2,YES,Good
597,NO,Divorced,154058,0,YES,Good
598,YES,Married,180083,17,NO,Good
599,NO,Divorced,158137,16,NO,Good


In [8]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
fraud['Undergrad'] = le.fit_transform(fraud['Undergrad'])
fraud['Marital.Status'] = le.fit_transform(fraud['Marital.Status'])
fraud['Urban'] = le.fit_transform(fraud['Urban'])
fraud['income'] = le.fit_transform(fraud['income'])

In [9]:
fraud

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban,income
0,0,2,50047,10,1,0
1,1,0,134075,18,1,0
2,0,1,160205,30,1,0
3,1,2,193264,15,1,0
4,0,1,27533,28,0,0
...,...,...,...,...,...,...
595,1,0,39492,7,1,0
596,1,0,55369,2,1,0
597,0,0,154058,0,1,0
598,1,1,180083,17,0,0


In [10]:
x=fraud.iloc[:,0:5]
y=fraud.iloc[:,5]

In [11]:
#splitting
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=40)

## building random forest classifier

In [25]:
num_trees = 100
max_features = 3

In [26]:
kfold = KFold(n_splits=12, shuffle = True, random_state=7)
model = RandomForestClassifier(n_estimators=num_trees, max_features = max_features)
model.fit(x_train,y_train)
results = cross_val_score(model,x,y, cv = kfold)
print(results.mean())

0.7516666666666666


In [27]:
pred_train = model.predict(x_train)

In [30]:
from sklearn.metrics import accuracy_score
accuracy_train = accuracy_score(y_train,pred_train)

In [31]:
accuracy_train

1.0

In [32]:
pred_test = model.predict(x_test)

In [33]:
accuracy_test = accuracy_score(y_test,pred_test)

In [34]:
accuracy_test

0.7277777777777777

In [36]:
np.mean(y_test==pred_test)

0.7277777777777777

In [40]:
#confusion matrix
from sklearn.metrics import confusion_matrix
con_matrix = confusion_matrix(y_train, pred_train)
con_matrix

array([[336,   0],
       [  0,  84]], dtype=int64)

In [41]:
con_test_matrix = confusion_matrix(y_test,pred_test)
con_test_matrix

array([[131,   9],
       [ 40,   0]], dtype=int64)

In [45]:
#visualizing
from sklearn.tree import export_graphviz 
from six import StringIO
import pydotplus
colnames = list(fraud.columns)
predictors = colnames[0:5]
target = colnames[5]
tree1 = model.estimators_[20]
dot_data = StringIO()
export_graphviz(tree1,out_file = dot_data, feature_names =predictors, class_names = target, filled =True,
                rounded=True,impurity =False,proportion=False,precision =2)

In [46]:
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())


In [50]:
##Creating pdf file
graph.write_pdf('fraudrf.pdf')

True

In [51]:
##Creating png file
graph.write_png('fraudrf.png')

True