# Fraud Detection

https://www.accelebrate.com/blog/fraud-detection-using-python
    

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, cohen_kappa_score
from sklearn.metrics import f1_score, recall_score



In [2]:
def PrintStats(cmat, y_test, pred):
    """Print a binary confusion matrix together with summary metrics.

    Parameters
    ----------
    cmat : 2x2 array-like
        Confusion matrix as produced by ``sklearn.metrics.confusion_matrix``
        for labels ordered ``[0, 1]``: rows are true labels, columns are
        predicted labels, i.e. ``[[TN, FP], [FN, TP]]``.
    y_test : array-like
        True labels, forwarded to the scikit-learn metric functions.
    pred : array-like
        Predicted labels, forwarded to the scikit-learn metric functions.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # Separate out the confusion matrix components. scikit-learn puts the
    # negative class (0) first, so the top-left cell is the true-negative
    # count and the bottom-right cell is the true-positive count.
    tneg = cmat[0][0]
    fpos = cmat[0][1]
    fneg = cmat[1][0]
    tpos = cmat[1][1]
    total = tpos + tneg + fpos + fneg
    # Calculate F1 and recall scores.
    f1Score = round(f1_score(y_test, pred), 2)
    recallScore = round(recall_score(y_test, pred), 2)
    # Calculate and display metrics. Accuracy is the share of samples on the
    # matrix diagonal (correctly classified negatives plus positives).
    print(cmat)
    print( 'Accuracy: '+ str(np.round(100*float(tpos + tneg)/float(total),2))+'%')
    print( 'Cohen Kappa: '+ str(np.round(cohen_kappa_score(y_test, pred),3)))
    print("Sensitivity/Recall for Model : {recall_score}".format(recall_score = recallScore))
    print("F1 Score for Model : {f1_score}".format(f1_score = f1Score))

In [3]:
def RunModel(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, X_test : array-like
        Feature matrices for training and evaluation.
    y_train : array-like
        Training labels. May be a pandas DataFrame/Series, a NumPy array or
        a plain sequence; it is flattened to one dimension before fitting.
    y_test : array-like
        True labels for ``X_test``.

    Returns
    -------
    tuple
        ``(matrix, pred)`` where ``matrix`` is the confusion matrix of
        ``y_test`` against the predictions and ``pred`` is the prediction
        array returned by ``model.predict``.
    """
    # np.ravel works for pandas objects as well as arrays and lists, whereas
    # ``.values`` only exists on pandas containers.
    labels = np.ravel(y_train)
    model.fit(X_train, labels)
    pred = model.predict(X_test)
    matrix = confusion_matrix(y_test, pred)
    return matrix, pred

In [4]:
# Human-readable labels for the two values of the FraudResult column.
class_names = {0: 'Not Fraud', 1: 'Fraud'}

# Load the training data and show how many rows fall in each class.
df = pd.read_csv('training.csv')
print(
    df['FraudResult']
    .value_counts()
    .rename(index=class_names)
)

Not Fraud    95469
Fraud          193
Name: FraudResult, dtype: int64


In [5]:
import numpy as np
# Columns used as model input and as the label.
feature_names = ['Amount']
target = ['FraudResult']

# Both selections use a list of column names, so each result is a DataFrame
# (the target is a single-column frame, not a Series).
data_features = df.loc[:, feature_names]
data_target = df.loc[:, target]
print(data_features.head())

    Amount
0   1000.0
1    -20.0
2    500.0
3  20000.0
4   -644.0


In [6]:
from sklearn.model_selection import train_test_split
# Seed NumPy's global generator (used by np.random calls further down);
# the split itself is pinned separately through random_state.
np.random.seed(123)
X_train, X_test, y_train, y_test = train_test_split(
    data_features,
    data_target,
    train_size=0.70,
    test_size=0.30,
    random_state=1,
)

In [7]:
from sklearn.linear_model import LogisticRegression
# Baseline: logistic regression with default settings on the full split.
lr = LogisticRegression()
cmat, pred = RunModel(
    model=lr,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
PrintStats(cmat=cmat, y_test=y_test, pred=pred)



[[28621    21]
 [   39    18]]
Accuracy: 99.79%
Cohen Kappa: 0.374
Sensitivity/Recall for Model : 0.32
F1 Score for Model : 0.37


In [8]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 100 trees, fitted using 4 parallel jobs.
rf = RandomForestClassifier(n_jobs=4, n_estimators=100)
cmat, pred = RunModel(
    model=rf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)
PrintStats(cmat=cmat, y_test=y_test, pred=pred)

[[28639     3]
 [   14    43]]
Accuracy: 99.94%
Cohen Kappa: 0.835
Sensitivity/Recall for Model : 0.75
F1 Score for Model : 0.83


In [18]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting classifier. The estimator is only configured here;
# RunModel performs the single fit, using a flattened 1-D label vector.
# Fitting at construction time as well would train the 300 estimators twice
# and pass a column-vector y, which is what raises the column_or_1d warning.
gbc = GradientBoostingClassifier(
    n_estimators=300,
    learning_rate=0.01,
    max_depth=3,
    random_state=0,
    min_samples_leaf=2,
    min_samples_split=10,
    max_features='sqrt',
    subsample=0.8,
)
cmat, pred = RunModel(gbc, X_train, y_train, X_test, y_test)
PrintStats(cmat, y_test, pred)

  y = column_or_1d(y, warn=True)


[[28641     1]
 [   16    41]]
Accuracy: 99.94%
Cohen Kappa: 0.828
Sensitivity/Recall for Model : 0.72
F1 Score for Model : 0.83


In [10]:
# Pull the index labels for fraud and valid rows.
fraud_indices = df[df.FraudResult == 1].index
normal_indices = df[df.FraudResult == 0].index
fraud_records = len(fraud_indices)
# Randomly collect as many valid rows as there are fraud rows, without
# replacement, so the undersampled set is balanced.
under_sample_indices = np.random.choice(normal_indices, size=fraud_records, replace=False)
# The arrays above hold index *labels*, so select with the label-based
# indexer (.loc). The positional indexer (.iloc) only gives the same rows
# while df has a default 0..n-1 index.
df_undersampled = df.loc[np.concatenate([fraud_indices, under_sample_indices]), :]
X_undersampled = df_undersampled[feature_names]
Y_undersampled = df_undersampled.FraudResult
(
    X_undersampled_train,
    X_undersampled_test,
    Y_undersampled_train,
    Y_undersampled_test,
) = train_test_split(X_undersampled, Y_undersampled, test_size=0.3)
lr_undersampled = LogisticRegression(C=1)
# Run the new model on the balanced subset.
cmat, pred = RunModel(
    lr_undersampled,
    X_undersampled_train,
    Y_undersampled_train,
    X_undersampled_test,
    Y_undersampled_test,
)
PrintStats(cmat, Y_undersampled_test, pred)

[[26 34]
 [ 2 54]]
Accuracy: 68.97%
Cohen Kappa: 0.39
Sensitivity/Recall for Model : 0.96
F1 Score for Model : 0.75




# Now to apply the model to the data for submission

In [11]:
# Refit the gradient boosting model on the training split. The target is a
# single-column DataFrame, so flatten it to 1-D first; passing the column
# vector directly makes scikit-learn emit the column_or_1d warning.
trained_model = gbc.fit(X_train, y_train.values.ravel())
# Score the held-out competition file using the same feature columns.
df_test = pd.read_csv('test.csv')
X_sub = df_test[feature_names]
result = trained_model.predict(X_sub)


  y = column_or_1d(y, warn=True)


In [12]:
# Pair each prediction with its transaction id (aligned on the row index),
# then key the frame by TransactionId so it is written as the first column.
Submission = (
    pd.DataFrame({"FraudResult": result})
    .assign(TransactionId=df_test['TransactionId'])
    .set_index('TransactionId')
)
Submission.to_csv('submission.csv')
Submission.head()

Unnamed: 0_level_0,FraudResult
TransactionId,Unnamed: 1_level_1
TransactionId_50600,0
TransactionId_95109,0
TransactionId_47357,0
TransactionId_28185,0
TransactionId_22140,0


In [13]:
# Tally the predicted classes in the submission, shown with readable labels.
predicted_counts = Submission['FraudResult'].value_counts()
print(predicted_counts.rename(index=class_names))

Not Fraud    44958
Fraud           61
Name: FraudResult, dtype: int64
