In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.metrics import accuracy_score
warnings.filterwarnings("ignore")
from sklearn.tree import  DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split

In [None]:
# Load the fraud-check dataset and take a first look at it.
# NOTE(review): the path is absolute (looks like a Colab location) - adjust when running elsewhere.
df = pd.read_csv("/content/Fraud_check.csv")
df.info()        # prints its summary directly
print(df.shape)  # a bare `df.shape` mid-cell is evaluated but never displayed
df.head()        # last expression -> rendered as the cell output

In [None]:
# Smallest and largest taxable income in one output, replacing the commented-out
# min() line. Presumably these are the values the bin edges in the binning cell
# were read from - TODO confirm.
df["Taxable.Income"].agg(["min", "max"])


In [None]:
# Derive the binary target from taxable income: below 30000 -> 'Risky', otherwise 'Good'.
df1 = df.copy()
# Open-ended outer edges replace the hard-coded 10002 / 99620 bounds, so an income
# outside that range is still labelled instead of silently becoming NaN.
# right=False keeps the original left-closed intervals: exactly 30000 is 'Good'.
# NOTE(review): confirm whether an income of exactly 30000 should count as 'Risky'.
df1['Taxable_cat'] = pd.cut(
    x=df1['Taxable.Income'],
    bins=[-np.inf, 30000, np.inf],
    labels=['Risky', 'Good'],
    right=False,
)
print(df1["Taxable_cat"].value_counts())  # was a bare mid-cell expression, so never shown
df1.head()

In [None]:
# Correlation heatmap of the numeric columns.
# Select numeric columns explicitly before corr(): recent pandas no longer drops
# non-numeric columns silently and raises when it meets one. The columns that are
# label-encoded later in this notebook are presumably strings at this point.
numeric_corr = df.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr, annot=True, fmt='.2g', cmap="viridis", linewidths=0.5, linecolor='black')

In [None]:
# Box plots of three columns stacked vertically, each on its own independent axis.
bp = df1.copy()
fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(12, 10), sharex=False, sharey=False)
income_ax, population_ax, experience_ax = axes
sns.boxplot(data=bp, x='Taxable.Income', palette='crest', ax=income_ax)
sns.boxplot(data=bp, x="City.Population", palette='crest', ax=population_ax)
sns.boxplot(data=bp, x='Work.Experience', palette='crest', ax=experience_ax)

In [None]:
# Pairwise relationships between the columns of the raw frame.
# pairplot is a figure-level function that builds its own figure, so the former
# plt.figure(figsize=(8,8)) call only produced an extra, empty figure and is dropped.
sns.pairplot(df, palette='Set1')
plt.show()

In [None]:
# Class balance of the derived target; bars follow the order returned by value_counts().
taxable_cat_order = df1['Taxable_cat'].value_counts().index
sns.countplot(data=df1, x='Taxable_cat', order=taxable_cat_order, palette='viridis')
plt.xticks(fontsize=12)
plt.title('Risky or Good for Individual Taxable Income ')

In [None]:
# Frequency of each marital status; bars follow the order returned by value_counts().
marital_order = df1['Marital.Status'].value_counts().index
sns.countplot(data=df1, x='Marital.Status', order=marital_order, palette='viridis')
plt.xticks(fontsize=12)
plt.title('Marital Status ')

In [None]:
# Frequency of each Undergrad value; bars follow the order returned by value_counts().
undergrad_order = df1['Undergrad'].value_counts().index
sns.countplot(data=df1, x='Undergrad', order=undergrad_order, palette='viridis')
plt.xticks(fontsize=12)
plt.title('Undergrad ')

In [None]:
# Frequency of each Urban value; bars follow the order returned by value_counts().
urban_order = df1['Urban'].value_counts().index
sns.countplot(data=df1, x='Urban', order=urban_order, palette='viridis')
plt.xticks(fontsize=12)
plt.title('Urban')

In [None]:
#Label Encoding
from sklearn.preprocessing import LabelEncoder

# Replace the target and three feature columns with integer codes.
# The single encoder is re-fitted for every column, so after the loop it only
# holds the classes of the last column it saw.
LE = LabelEncoder()
for column in ("Taxable_cat", "Urban", "Undergrad", "Marital.Status"):
    df1[column] = LE.fit_transform(df1[column])

# The raw income is what the target was derived from, so it is removed from the frame.
# Rebinding with errors='ignore' (instead of inplace=True) keeps this cell safe
# to re-run: a second run no longer fails with KeyError on the missing column.
df1 = df1.drop(columns='Taxable.Income', errors='ignore')

df1.head()

In [None]:
#Data Splitting
# Select the features by name rather than by position (formerly iloc[:, 0:5]),
# so the target cannot end up in X if the column count or order of df1 changes.
# Assumes the five leading columns were exactly the non-target columns, which
# matches the columns this notebook references - TODO confirm against the CSV.
X = df1.drop(columns="Taxable_cat")
Y = df1[["Taxable_cat"]]


In [None]:
# data partition
# train_test_split and DecisionTreeClassifier are already imported in the first cell.
# Fixed seeds make both the 70/30 split and the fitted tree reproducible on re-run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=0)

# Baseline: a tree limited to depth 4.
DT = DecisionTreeClassifier(max_depth=4, random_state=0)

DT.fit(X_train, Y_train)
Y_pred_train = DT.predict(X_train)
Y_pred_test = DT.predict(X_test)
# Built-in round() works for both a NumPy scalar and a plain Python float,
# whereas the .round() method exists only on NumPy scalars.
training_accuracy = round(accuracy_score(Y_train, Y_pred_train), 3)
test_accuracy = round(accuracy_score(Y_test, Y_pred_test), 3)

print(training_accuracy)
print(test_accuracy)

print("Number of Nodes", DT.tree_.node_count)
print("Level of Depth", DT.tree_.max_depth)

In [None]:
# validation set method --> validation hold-out method
# Repeat the 70/30 split with seeds 1..100, refit a depth-1 tree each time and
# average the accuracies.
training_accuracy = []
test_accuracy = []

# Seeded so that tie-breaking between equally good splits is reproducible.
DT = DecisionTreeClassifier(max_depth=1, random_state=0)

# NOTE: the loop rebinds X_train / X_test / Y_train / Y_test on every pass, so
# cells that run after this one see the split made with random_state=100.
for i in range(1, 101):
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=i)
    DT.fit(X_train, Y_train)
    Y_pred_train = DT.predict(X_train)
    Y_pred_test = DT.predict(X_test)
    # Built-in round() works for both a NumPy scalar and a plain Python float.
    training_accuracy.append(round(accuracy_score(Y_train, Y_pred_train), 3))
    test_accuracy.append(round(accuracy_score(Y_test, Y_pred_test), 3))

print("Average trianing accuray", np.mean(training_accuracy).round(2))
print("Average test accuray", np.mean(test_accuracy).round(2))

In [None]:
#Building Decision Tree Classifier using Entropy Criteria with Alpha values Post Pruning
# Unpruned entropy tree, fitted on whatever X_train / Y_train the most recently
# executed split left in the namespace.
df1_entropy = DecisionTreeClassifier(criterion='entropy', random_state=0)
df1_entropy.fit(X_train, Y_train)

Y_pred_train = df1_entropy.predict(X_train)
Y_pred_test = df1_entropy.predict(X_test)
# Built-in round() works for both a NumPy scalar and a plain Python float,
# whereas the .round() method exists only on NumPy scalars.
training_accuracy = round(accuracy_score(Y_train, Y_pred_train), 3)
test_accuracy = round(accuracy_score(Y_test, Y_pred_test), 3)

print(training_accuracy)
print(test_accuracy)

In [None]:
# Draw the unpruned entropy tree on a 12x8 inch canvas at 500 dpi.
entropy_fig, entropy_ax = plt.subplots(figsize=(12, 8), dpi=500)
tree.plot_tree(df1_entropy, filled=True, ax=entropy_ax)
plt.show()

In [None]:
# Cost-complexity pruning path of the entropy tree: candidate alphas and the
# impurity values reported alongside them.
path = df1_entropy.cost_complexity_pruning_path(X_train, Y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

# Fit one entropy tree per candidate alpha, in the order of the path.
clfs = [
    DecisionTreeClassifier(criterion='entropy', random_state=0, ccp_alpha=alpha).fit(X_train, Y_train)
    for alpha in ccp_alphas
]
last_node_count = clfs[-1].tree_.node_count
last_alpha = ccp_alphas[-1]
print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(last_node_count, last_alpha))

In [None]:
#Accuracy vs alpha for training and testing sets

# Score every tree from the pruning path on both partitions.
train_scores = [fitted.score(X_train, Y_train) for fitted in clfs]
test_scores = [fitted.score(X_test, Y_test) for fitted in clfs]

fig, ax = plt.subplots(figsize=(16, 9))
ax.set(
    xlabel="alpha",
    ylabel="accuracy",
    title="Accuracy vs alpha for training and testing sets",
)
# Train curve first, then test, so the legend order stays the same.
for series_label, series in (("train", train_scores), ("test", test_scores)):
    ax.plot(ccp_alphas, series, marker='o', label=series_label, drawstyle="steps-post")
ax.legend()
plt.show()

In [None]:
#Choosing the value of ccp_alpha where the testing and training accuracy are closest to each other
# NOTE(review): 0.008543 is hard-coded, presumably read off the accuracy-vs-alpha
# plot for one particular split - re-check it whenever the split changes.
clf = DecisionTreeClassifier(criterion='entropy', random_state=0, ccp_alpha=0.008543)
clf.fit(X_train, Y_train)

Y_pred_train = clf.predict(X_train)
Y_pred_test = clf.predict(X_test)
# Built-in round() works for both a NumPy scalar and a plain Python float,
# whereas the .round() method exists only on NumPy scalars.
training_accuracy = round(accuracy_score(Y_train, Y_pred_train), 3)
test_accuracy = round(accuracy_score(Y_test, Y_pred_test), 3)

print(training_accuracy)
print(test_accuracy)


In [None]:
# Draw the tree currently bound to `clf` on a 15x10 inch canvas.
pruned_fig, pruned_ax = plt.subplots(figsize=(15, 10))
tree.plot_tree(clf, filled=True, ax=pruned_ax)
plt.show()

In [None]:
#Building Decision Tree Classifier using Gini Criteria with Alpha values Post Pruning

# Unpruned Gini tree, fitted on whatever X_train / Y_train the most recently
# executed split left in the namespace.
df1_gini = DecisionTreeClassifier(criterion='gini', random_state=0)
df1_gini.fit(X_train, Y_train)

Y_pred_train = df1_gini.predict(X_train)
Y_pred_test = df1_gini.predict(X_test)
# Built-in round() works for both a NumPy scalar and a plain Python float,
# whereas the .round() method exists only on NumPy scalars.
training_accuracy = round(accuracy_score(Y_train, Y_pred_train), 3)
test_accuracy = round(accuracy_score(Y_test, Y_pred_test), 3)

print(training_accuracy)
print(test_accuracy)


In [None]:
# Draw the unpruned Gini tree on a 12x8 inch canvas at 500 dpi.
gini_fig, gini_ax = plt.subplots(figsize=(12, 8), dpi=500)
tree.plot_tree(df1_gini, filled=True, ax=gini_ax)
plt.show()

In [None]:
# Cost-complexity pruning path of the Gini tree: candidate alphas and the
# impurity values reported alongside them.
path = df1_gini.cost_complexity_pruning_path(X_train, Y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

# Fit one Gini tree per candidate alpha, in the order of the path.
clfs = [
    DecisionTreeClassifier(criterion='gini', random_state=0, ccp_alpha=alpha).fit(X_train, Y_train)
    for alpha in ccp_alphas
]
last_node_count = clfs[-1].tree_.node_count
last_alpha = ccp_alphas[-1]
print("Number of nodes in the last tree is: {} with ccp_alpha: {}".format(last_node_count, last_alpha))

In [None]:
#Accuracy vs alpha for training and testing sets

# Score every tree from the pruning path on both partitions.
train_scores = [fitted.score(X_train, Y_train) for fitted in clfs]
test_scores = [fitted.score(X_test, Y_test) for fitted in clfs]

fig, ax = plt.subplots(figsize=(16, 9))
ax.set(
    xlabel="alpha",
    ylabel="accuracy",
    title="Accuracy vs alpha for training and testing sets",
)
# Train curve first, then test, so the legend order stays the same.
for series_label, series in (("train", train_scores), ("test", test_scores)):
    ax.plot(ccp_alphas, series, marker='o', label=series_label, drawstyle="steps-post")
ax.legend()
plt.show()

In [None]:
#Choosing the value of ccp_alpha where the testing and training accuracy are closest to each other
# NOTE(review): 0.00499 is hard-coded, presumably read off the accuracy-vs-alpha
# plot for one particular split - re-check it whenever the split changes.
clf = DecisionTreeClassifier(criterion='gini', random_state=0, ccp_alpha=0.00499)
clf.fit(X_train, Y_train)

Y_pred_train = clf.predict(X_train)
Y_pred_test = clf.predict(X_test)
# Built-in round() works for both a NumPy scalar and a plain Python float,
# whereas the .round() method exists only on NumPy scalars.
training_accuracy = round(accuracy_score(Y_train, Y_pred_train), 3)
test_accuracy = round(accuracy_score(Y_test, Y_pred_test), 3)

print(training_accuracy)
print(test_accuracy)

In [None]:
# Draw the tree currently bound to `clf` on a 15x10 inch canvas.
pruned_fig, pruned_ax = plt.subplots(figsize=(15, 10))
tree.plot_tree(clf, filled=True, ax=pruned_ax)
plt.show()