In [57]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.utils import resample
from os import system
# Make the Graphviz executables (`dot`) reachable through PATH.
# The entry is only appended when the directory actually exists and is not
# already listed, so re-running this cell does not keep growing PATH.
GRAPHVIZ_BIN = 'C:/Program Files (x86)/graphviz-2.38/bin/'
if os.path.isdir(GRAPHVIZ_BIN) and GRAPHVIZ_BIN not in os.environ.get("PATH", "").split(os.pathsep):
    os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + GRAPHVIZ_BIN

In [10]:
# Read the transactions CSV into a DataFrame and preview its first rows
dataset = pd.read_csv(filepath_or_buffer='dataset.csv')
dataset.head(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [12]:
# Inspect the column labels of the loaded frame
dataset.columns


Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

In [13]:
# Remove the columns that are not kept as model inputs (time step, transaction
# type and the two account-name columns).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: running it a second time no longer raises a
# KeyError because the columns are already gone.
dataset = dataset.drop(columns=['step', 'type', 'nameOrig', 'nameDest'], errors='ignore')


In [14]:
# Show the first five rows of the reduced frame
dataset.iloc[:5]

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,9839.64,170136.0,160296.36,0.0,0.0,0,0
1,1864.28,21249.0,19384.72,0.0,0.0,0,0
2,181.0,181.0,0.0,0.0,0.0,1,0
3,181.0,181.0,0.0,21182.0,0.0,1,0
4,11668.14,41554.0,29885.86,0.0,0.0,0,0


In [27]:
# Count how many rows fall in each isFraud class
dataset.isFraud.value_counts()

0    1047433
1       1142
Name: isFraud, dtype: int64

In [30]:
# Separate the rows by class label (0 = majority, 1 = minority) ...
dataset_majority = dataset.loc[dataset['isFraud'] == 0]
dataset_minority = dataset.loc[dataset['isFraud'] == 1]

# ... then draw 400 rows from each class without replacement, using a fixed
# seed so the same rows are selected on every run.
dataset_majority_downsampled = resample(
    dataset_majority,
    n_samples=400,
    replace=False,
    random_state=24,
)
dataset_minority_downsampled = resample(
    dataset_minority,
    n_samples=400,
    replace=False,
    random_state=24,
)

In [31]:
# Stack the two 400-row class samples into a single balanced frame
dataset_downsampled = pd.concat(
    objs=[dataset_majority_downsampled, dataset_minority_downsampled],
    axis=0,
)

In [32]:
# Check the class balance after resampling
dataset_downsampled.isFraud.value_counts()

1    400
0    400
Name: isFraud, dtype: int64

In [33]:
# Preview the first five rows of the balanced frame
dataset_downsampled.iloc[:5]

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
951754,159842.03,0.0,0.0,4366528.66,4526370.7,0,0
200970,6870.69,5175.41,0.0,0.0,0.0,0,0
28944,25266.92,43950.23,18683.31,0.0,0.0,0,0
253886,11084.26,878380.28,867296.02,0.0,0.0,0,0
793076,11903.96,0.0,0.0,0.0,0.0,0,0


In [34]:
# Target vector: the isFraud label; feature matrix: every remaining column
y = dataset_downsampled['isFraud']
X = dataset_downsampled.drop(columns=['isFraud'])
X.head(n=5)

Unnamed: 0,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud
951754,159842.03,0.0,0.0,4366528.66,4526370.7,0
200970,6870.69,5175.41,0.0,0.0,0.0,0
28944,25266.92,43950.23,18683.31,0.0,0.0,0
253886,11084.26,878380.28,867296.02,0.0,0.0,0
793076,11903.96,0.0,0.0,0.0,0.0,0


In [36]:
# Hold out 10% of the balanced sample for evaluation.
# random_state pins the shuffle so the reported accuracies are reproducible
# across kernel restarts; stratify=y keeps both classes equally represented
# in the small test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=0, stratify=y
)

In [39]:
# Fit a depth-limited Gini decision tree on the training split
modelTree = tree.DecisionTreeClassifier(
    criterion='gini',
    max_depth=9,
    random_state=0,
).fit(X_train, y_train)

# Mean accuracy on the training rows and on the held-out rows
accuracyTreeTrain = modelTree.score(X_train, y_train)
accuracyTreeTest = modelTree.score(X_test, y_test)

print('Précision Arbre x_test: ', accuracyTreeTest)
print('Précision Arbre x_train: ', accuracyTreeTrain)

Précision Arbre x_test:  0.975
Précision Arbre x_train:  1.0


In [42]:
# Write the fitted tree as a Graphviz .dot file.
# The context manager guarantees the file is closed (and flushed) even if
# export_graphviz raises, which the manual open()/close() pair did not.
with open("test.dot", 'w') as dotfile:
    tree.export_graphviz(
        modelTree,
        out_file=dotfile,
        # Take the labels from the training frame so they always match the
        # columns the model was fitted on.
        feature_names=list(X_train.columns),
        class_names=['0', '1'],
        filled=True,
        rounded=True,
        special_characters=True,
    )

# Render the .dot file to PNG with the Graphviz `dot` executable; the cell
# displays the shell exit status (0 means success).
system("dot -Tpng test.dot -o tree.png")

0

![Decision tree exported with Graphviz](tree.png)

In [70]:
# Prediction test: these values mirror a row of the dataset preview that is
# labelled isFraud=1 (amount 181, origin balance 181 -> 0).
x = np.array([181, 181, 0, 21182, 0, 0]).reshape(1, 6)
# Attach the training column names before predicting: the model was fitted on
# a DataFrame, so a bare ndarray makes recent scikit-learn versions warn about
# missing feature names and hides any mistake in the column order.
x_named = pd.DataFrame(x, columns=X_train.columns)
print('class :', modelTree.predict(x_named))
print('proba :', modelTree.predict_proba(x_named))

class : [1]
proba : [[0. 1.]]


In [69]:

# Prediction test on a second hand-written transaction; values are given in
# the same order as the training columns.
x = np.array([567580.98, 70615, 0, 70581.25, 638162.23, 0]).reshape(1, 6)
# Attach the training column names before predicting: the model was fitted on
# a DataFrame, so a bare ndarray makes recent scikit-learn versions warn about
# missing feature names and hides any mistake in the column order.
x_named = pd.DataFrame(x, columns=X_train.columns)
print('class :', modelTree.predict(x_named))
print('proba :', modelTree.predict_proba(x_named))

class : [0]
proba : [[1. 0.]]
