In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import IsolationForest
from pyod.models.knn import KNN 
import pyod
from pyod.utils.data import evaluate_print
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.font_manager
import numpy as np
from pyod.models.abod import ABOD
from pyod.models.knn import KNN
from pyod.utils.data import generate_data, get_outliers_inliers
import seaborn as sns 
from sklearn.neighbors import NearestNeighbors

In [2]:
from pandas import read_csv
import pandas as pd
import matplotlib.pyplot as plt

# Import Dataset

In [3]:
df=pd.read_csv('bsNET140513_032310.csv')

In [4]:
df.head()

Unnamed: 0,Source,Target,Weight,typeTrans,fraud
0,'C1093826151','M348934600',4.55,'es_transportation',0
1,'C352968107','M348934600',39.68,'es_transportation',0
2,'C2054744914','M1823072687',26.89,'es_transportation',0
3,'C1760612790','M348934600',17.25,'es_transportation',0
4,'C757503768','M348934600',35.72,'es_transportation',0


# Total Fraud and Non fraud sample

In [5]:
df['fraud'].value_counts()

0    587443
1      7200
Name: fraud, dtype: int64

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df2=df.copy()
df2.loc[:,['typeTrans']]=le.fit_transform(df2.loc[:,['typeTrans']])
df2.loc[:,['Target']]=le.fit_transform(df2.loc[:,['Target']])
df2.loc[:,['Source']]=le.fit_transform(df2.loc[:,['Source']])

  return f(*args, **kwargs)


In [7]:
data=df2[df2.columns.drop(['fraud'])].values

In [8]:
from sklearn.ensemble import IsolationForest
iso = IsolationForest(contamination=0.05)
yhat = iso.fit_predict(data)

In [9]:
data2=df2.values
mask = yhat == -1
outlier_values_if=pd.DataFrame(data2[mask, :])

# Total Outlier

In [10]:
 len(outlier_values_if)

29733

In [11]:
outlier_values_if[4].value_counts()

0.0    23821
1.0     5912
Name: 4, dtype: int64

# Train Model Without Applying Any sampling Technique

In [12]:
train,test=train_test_split(df2,random_state=42)

In [13]:
xtest=test.drop('fraud',axis=1)
ytest=test.fraud

In [14]:
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC,LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier

In [15]:
def print_confusion_matrix(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    print('True positive = ', cm[1][1])
    print('False positive = ', cm[0][1])
    print('False negative = ', cm[1][0])
    print('True negative = ', cm[0][0])

In [16]:
def print_evaluation_metrics(y_true, y_pred):
    # accuracy: (tp + tn) / (p + n)
    accuracy = accuracy_score(y_true, y_pred)
    print('Accuracy: %f' % accuracy)
    # precision tp / (tp + fp)
    precision = precision_score(y_true, y_pred)
    print('Precision: %f' % precision)
    # recall: tp / (tp + fn)
    recall = recall_score(y_true, y_pred)
    print('Recall: %f' % recall)
    # f1: 2 tp / (2 tp + fp + fn)
    f1 = f1_score(y_true, y_pred)
    print('F1 score: %f' % f1)

In [17]:
model = LinearSVC()
model.fit(train.drop('fraud',axis=1),train.fraud)



LinearSVC()

In [18]:
yhat = model.predict(xtest)

In [19]:
compare=pd.DataFrame()
compare['true']=np.array(ytest)
compare['preicted']=np.array(yhat)

In [20]:
print('Confusion Matrix \n')
print_confusion_matrix(compare['true'],compare['preicted'])
print('---------------------------------------')
print('\nEvaluation_metrics \n')
print_evaluation_metrics(compare['true'],compare['preicted'])

Confusion Matrix 

True positive =  862
False positive =  54
False negative =  888
True negative =  146857
---------------------------------------

Evaluation_metrics 

Accuracy: 0.993663
Precision: 0.941048
Recall: 0.492571
F1 score: 0.646662


In [21]:
tn, fp, fn, tp = confusion_matrix(compare['true'],compare['preicted']).ravel()
print("tn, fp, fn, tp ---> ", tn, fp, fn, tp)

tn, fp, fn, tp --->  146857 54 888 862


#  Train Model With Applying Undersampling  Technique

In [22]:
rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(train.drop('fraud',axis=1),train.fraud)

# Total Fraud and Non Fraud sample after resampling

In [23]:
y_rus.value_counts()

1    5450
0    5450
Name: fraud, dtype: int64

In [24]:
model = LinearSVC(class_weight='balanced')
model.fit(X_rus, y_rus)



LinearSVC(class_weight='balanced')

In [25]:
yhat = model.predict(xtest)
compare=pd.DataFrame()
compare['true']=np.array(ytest)
compare['preicted']=np.array(yhat)

In [26]:
print('Confusion Matrix For Undersampling Technique\n')
print_confusion_matrix(compare['true'],compare['preicted'])
print('---------------------------------------')
print('\nEvaluation_metrics For Undersampling Technique\n')
print_evaluation_metrics(compare['true'],compare['preicted'])

Confusion Matrix For Undersampling Technique

True positive =  1462
False positive =  5472
False negative =  288
True negative =  141439
---------------------------------------

Evaluation_metrics For Undersampling Technique

Accuracy: 0.961254
Precision: 0.210845
Recall: 0.835429
F1 score: 0.336711


In [27]:
print_evaluation_metrics(compare['true'],compare['preicted'])

Accuracy: 0.961254
Precision: 0.210845
Recall: 0.835429
F1 score: 0.336711


 #  Train Model With Applying Oversampling  Technique

In [28]:
rus = RandomOverSampler(random_state=42)
X_ros, y_ros = rus.fit_resample(train.drop('fraud',axis=1),train.fraud)

# Normalization

In [29]:
from sklearn.preprocessing import MinMaxScaler
scaling = MinMaxScaler(feature_range=(-1,1))
x_train=scaling.fit_transform(X_ros)
x_test=scaling.fit_transform(xtest)

#  Total Fraud and Non Fraud sample after resampling

In [30]:
y_ros.value_counts()

1    440532
0    440532
Name: fraud, dtype: int64

In [31]:
model = LinearSVC(class_weight='balanced')
model.fit(x_train, y_ros)

LinearSVC(class_weight='balanced')

In [32]:
yhat = model.predict(x_test)
compare=pd.DataFrame()
compare['true']=np.array(ytest)
compare['preicted']=np.array(yhat)

In [33]:
print('Confusion Matrix For Oversampling Technique\n')
print_confusion_matrix(compare['true'],compare['preicted'])
print('---------------------------------------')
print('\nEvaluation_metrics For Oversampling Technique\n')
print_evaluation_metrics(compare['true'],compare['preicted'])

Confusion Matrix For Oversampling Technique

True positive =  1552
False positive =  7882
False negative =  198
True negative =  139029
---------------------------------------

Evaluation_metrics For Oversampling Technique

Accuracy: 0.945648
Precision: 0.164511
Recall: 0.886857
F1 score: 0.277539


#   Train Model With Applying SMOTE  Technique

In [58]:
smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample((train.drop('fraud',axis=1)),train.fraud)

# Normalization

In [65]:
from sklearn.preprocessing import MinMaxScaler
scaling = MinMaxScaler(feature_range=(-1,1))
x_train=scaling.fit_transform(X_sm)
x_test=scaling.fit_transform(xtest)

#  Total Fraud and Non Fraud sample after resampling

In [66]:
y_sm.value_counts()

1    440532
0    440532
Name: fraud, dtype: int64

In [67]:
model = LinearSVC(class_weight='balanced')
model.fit(x_train, y_sm)

LinearSVC(class_weight='balanced')

In [68]:
yhat = model.predict(x_test)
compare=pd.DataFrame()
compare['true']=np.array(ytest)
compare['preicted']=np.array(yhat)

In [69]:
print('Confusion Matrix For SMOTE Technique\n')
print_confusion_matrix(compare['true'],compare['preicted'])
print('---------------------------------------')
print('\nEvaluation_metrics For SMOTE Technique\n')
print_evaluation_metrics(compare['true'],compare['preicted'])

Confusion Matrix For SMOTE Technique

True positive =  1551
False positive =  9600
False negative =  199
True negative =  137311
---------------------------------------

Evaluation_metrics For SMOTE Technique

Accuracy: 0.934085
Precision: 0.139091
Recall: 0.886286
F1 score: 0.240446


#   Train Model With Applying  SMOTETomek  Technique

In [70]:
# SMOTETomek
from imblearn.combine import SMOTETomek
smk = SMOTETomek()
X_smk, y_smk = smk.fit_resample(train.drop('fraud',axis=1),train.fraud)

In [71]:
y_smk.value_counts()

1    440337
0    440337
Name: fraud, dtype: int64

In [72]:
from sklearn.preprocessing import MinMaxScaler
scaling = MinMaxScaler(feature_range=(-1,1))
x_train=scaling.fit_transform(X_smk)
x_test=scaling.fit_transform(xtest)

In [73]:
model = LinearSVC(class_weight='balanced')
model.fit(x_train, y_smk)

LinearSVC(class_weight='balanced')

In [74]:
yhat = model.predict(x_test)
compare=pd.DataFrame()
compare['true']=np.array(ytest)
compare['preicted']=np.array(yhat)

In [75]:
print('Confusion Matrix For SMOTE Technique\n')
print_confusion_matrix(compare['true'],compare['preicted'])
print('---------------------------------------')
print('\nEvaluation_metrics For SMOTE Technique\n')
print_evaluation_metrics(compare['true'],compare['preicted'])

Confusion Matrix For SMOTE Technique

True positive =  1553
False positive =  9530
False negative =  197
True negative =  137381
---------------------------------------

Evaluation_metrics For SMOTE Technique

Accuracy: 0.934569
Precision: 0.140125
Recall: 0.887429
F1 score: 0.242032
