In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot

import warnings
warnings.filterwarnings('ignore')

In [2]:
fraud=pd.read_csv('Fraud_check (1).csv')
fraud


Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [3]:
fraud.columns

Index(['Undergrad', 'Marital.Status', 'Taxable.Income', 'City.Population',
       'Work.Experience', 'Urban'],
      dtype='object')

In [4]:
fraud.columns=['under_grad','marital_status','taxable_income','city_population','work_exp','urban']
fraud.head()

Unnamed: 0,under_grad,marital_status,taxable_income,city_population,work_exp,urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


In [5]:
cut_labels=['Risky','Good']
cut_bins=[0,30000,99620]
fraud['tax_inc']=pd.cut(fraud['taxable_income'],bins=cut_bins,labels=cut_labels)
fraud.pop('taxable_income')

0      68833
1      33700
2      36925
3      50190
4      81002
       ...  
595    76340
596    69967
597    47334
598    98592
599    96519
Name: taxable_income, Length: 600, dtype: int64

In [6]:
from sklearn import preprocessing
label_encoder=preprocessing.LabelEncoder()
fraud['under_grad']=label_encoder.fit_transform(fraud['under_grad'])
fraud['marital_status']=label_encoder.fit_transform(fraud['marital_status'])
fraud['urban']=label_encoder.fit_transform(fraud['urban'])

In [7]:
array=fraud.values
X=array[:,0:5]
Y=array[:,5]
#splitting data using K-Fold cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
kfold=KFold(n_splits=10,random_state=None)

In [8]:
from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier(n_estimators=100,max_features=3)
results=cross_val_score(model,X,Y,cv=kfold)

In [9]:
print(results.mean())

0.7416666666666667


### ============================================================================ 

##### treating those who have taxable_income <= 30000 as "Risky" and others are "Good"


In [10]:
fraud=pd.read_csv('Fraud_check (1).csv')
fraud


Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [11]:
##Converting the Taxable income variable to bucketing. 
fraud["income"]="<=30000"
fraud.loc[fraud["Taxable.Income"]>=30000,"income"]="Good"
fraud.loc[fraud["Taxable.Income"]<=30000,"income"]="Risky"

In [12]:
##Droping the Taxable income variable
fraud.drop(["Taxable.Income"],axis=1,inplace=True)

In [13]:
fraud.rename(columns={"Undergrad":"undergrad","Marital.Status":"marital","City.Population":"population","Work.Experience":"experience","Urban":"urban"},inplace=True)
## As we are getting error as "ValueError: could not convert string to float: 'YES'".
## Model.fit doesnt not consider String. So, we encode

In [14]:
from sklearn import preprocessing
le=preprocessing.LabelEncoder()
for column_name in fraud.columns:
    if fraud[column_name].dtype == object:
        fraud[column_name] = le.fit_transform(fraud[column_name])
    else:
        pass

In [15]:
##Splitting the data into featuers and labels
features = fraud.iloc[:,0:5]
labels = fraud.iloc[:,5]

In [16]:
## Collecting the column names
colnames = list(fraud.columns)
predictors = colnames[0:5]
target = colnames[5]
##Splitting the data into train and test

In [17]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(features,labels,test_size = 0.2,stratify = labels)

In [18]:
##Model building
from sklearn.ensemble import RandomForestClassifier as RF
model = RF(n_jobs = 3,n_estimators = 15, oob_score = True, criterion = "entropy")
model.fit(x_train,y_train)

RandomForestClassifier(criterion='entropy', n_estimators=15, n_jobs=3,
                       oob_score=True)

In [19]:
model.estimators_
model.classes_
model.n_features_
model.n_classes_

model.n_outputs_

model.oob_score_


0.7395833333333334

In [20]:
##Predictions on train data
prediction = model.predict(x_train)


In [21]:
##Accuracy
# For accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_train,prediction)
accuracy

0.9833333333333333

In [22]:
np.mean(prediction == y_train)

0.9833333333333333

In [23]:
##Confusion matrix
from sklearn.metrics import confusion_matrix
confusion = confusion_matrix(y_train,prediction)
confusion

array([[381,   0],
       [  8,  91]], dtype=int64)

In [24]:
##Prediction on test data
pred_test = model.predict(x_test)
pred_test

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [25]:
##Accuracy
acc_test =accuracy_score(y_test,pred_test)
acc_test

0.7333333333333333

### =========================================================================================

# ANOTHER METHOD

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import  DecisionTreeClassifier
import warnings 
warnings.filterwarnings('ignore')
get_ipython().run_line_magic('matplotlib', 'inline')

In [27]:
fraud=pd.read_csv('Fraud_check (1).csv')
fraud

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [28]:
fraud.shape

(600, 6)

In [29]:
fraud.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


In [30]:
fraud.isna().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

In [31]:
fraud.describe(include='all')

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
count,600,600,600.0,600.0,600.0,600
unique,2,3,,,,2
top,YES,Single,,,,YES
freq,312,217,,,,302
mean,,,55208.375,108747.368333,15.558333,
std,,,26204.827597,49850.075134,8.842147,
min,,,10003.0,25779.0,0.0,
25%,,,32871.5,66966.75,8.0,
50%,,,55074.5,106493.5,15.0,
75%,,,78611.75,150114.25,24.0,


In [32]:
fraud.dtypes

Undergrad          object
Marital.Status     object
Taxable.Income      int64
City.Population     int64
Work.Experience     int64
Urban              object
dtype: object

In [33]:
fraud['Marital.Status'].value_counts

<bound method IndexOpsMixin.value_counts of 0        Single
1      Divorced
2       Married
3        Single
4       Married
         ...   
595    Divorced
596    Divorced
597    Divorced
598     Married
599    Divorced
Name: Marital.Status, Length: 600, dtype: object>

#### Data Preprocessing

In [34]:
fraud.rename(columns = {'Marital.Status':'Marital_Status', 'Taxable.Income':'Taxable_Income',
                        'City.Population':'City_Population','Work.Experience':'Work_Experience'}, inplace = True)
fraud

Unnamed: 0,Undergrad,Marital_Status,Taxable_Income,City_Population,Work_Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [35]:
le=LabelEncoder()

In [36]:
fraud['Undergrad']=le.fit_transform(fraud.Undergrad)
fraud['Marital_Status']=le.fit_transform(fraud.Marital_Status)
fraud['Urban']=le.fit_transform(fraud.Urban)

In [37]:
fraud.dtypes

Undergrad          int32
Marital_Status     int32
Taxable_Income     int64
City_Population    int64
Work_Experience    int64
Urban              int32
dtype: object

In [38]:
fraud["Tax"] = pd.cut(fraud["Taxable_Income"], bins = [10000,30000,100000], labels = ["Risky", "Good"])

In [39]:
fraud['Tax']=le.fit_transform(fraud.Tax)

#### Model Building


In [40]:
X=fraud.drop('Tax',axis=1)
y=fraud.Tax

In [41]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.15,random_state=19)

In [42]:
classifier=DecisionTreeClassifier()

#### Model Training

 Train data
 
 plotting decision

In [43]:
from sklearn.tree import plot_tree

In [44]:
classifier=classifier.fit(X_train,y_train)

#### Model Testing
Train data

In [45]:
y_pred_train=classifier.predict(X_train)

##### Test data

In [46]:
y_pred_test=classifier.predict(X_test)

### Modal Evaluation

In [47]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
print('Accuracy Score : ',accuracy_score(y_train,y_pred_train))
print('-------------------------------------------')
print('Confusion Matrix:\n',confusion_matrix(y_train,y_pred_train))
print('-------------------------------------------')
print('Classification Report:\n',classification_report(y_train,y_pred_train))

Accuracy Score :  1.0
-------------------------------------------
Confusion Matrix:
 [[405   0]
 [  0 105]]
-------------------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       405
           1       1.00      1.00      1.00       105

    accuracy                           1.00       510
   macro avg       1.00      1.00      1.00       510
weighted avg       1.00      1.00      1.00       510



In [48]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
print('Accuracy Score : ',accuracy_score(y_test,y_pred_test))
print('-------------------------------------------')
print('Confusion Matrix:\n',confusion_matrix(y_test,y_pred_test))
print('-------------------------------------------')
print('Classification Report:\n',classification_report(y_test,y_pred_test))


Accuracy Score :  1.0
-------------------------------------------
Confusion Matrix:
 [[71  0]
 [ 0 19]]
-------------------------------------------
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        71
           1       1.00      1.00      1.00        19

    accuracy                           1.00        90
   macro avg       1.00      1.00      1.00        90
weighted avg       1.00      1.00      1.00        90

