# Fraud Detection

https://www.accelebrate.com/blog/fraud-detection-using-python
    

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, cohen_kappa_score
from sklearn.metrics import f1_score, recall_score


In [3]:
def PrintStats(cmat, y_test, pred):
   # separate out the confusion matrix components
   tpos = cmat[0][0]
   fneg = cmat[1][1]
   fpos = cmat[0][1]
   tneg = cmat[1][0]
   # calculate F!, Recall scores
   f1Score = round(f1_score(y_test, pred), 2)
   recallScore = round(recall_score(y_test, pred), 2)
   # calculate and display metrics
   print(cmat)
   print( 'Accuracy: '+ str(np.round(100*float(tpos+fneg)/float(tpos+fneg + fpos + tneg),2))+'%')
   print( 'Cohen Kappa: '+ str(np.round(cohen_kappa_score(y_test, pred),3)))
   print("Sensitivity/Recall for Model : {recall_score}".format(recall_score = recallScore))
   print("F1 Score for Model : {f1_score}".format(f1_score = f1Score))

In [4]:
def RunModel(model, X_train, y_train, X_test, y_test):
   model.fit(X_train, y_train.values.ravel())
   pred = model.predict(X_test)
   matrix = confusion_matrix(y_test, pred)
   return matrix, pred

In [5]:
df = pd.read_csv('training.csv')
class_names = {0:'Not Fraud', 1:'Fraud'}
print(df.FraudResult.value_counts().rename(index = class_names))

Not Fraud    95469
Fraud          193
Name: FraudResult, dtype: int64


In [6]:

import numpy as np
from sklearn.preprocessing import LabelEncoder
feature_names = ['ProviderId', 'ProductCategory','ChannelId','Amount','PricingStrategy']#'ProductId',
#target = ['FraudResult']
data_features = df[feature_names]
data_target = df.loc[:,'FraudResult']

encoder = LabelEncoder()                                                  
data_features  = encoder.fit_transform(data_features)

ValueError: bad input shape (95662, 5)

In [None]:
print(data_features.head())

In [None]:
from sklearn.model_selection import train_test_split
np.random.seed(123)
X_train, X_test, y_train, y_test = train_test_split(data_features,data_target, train_size=0.70, test_size=0.30, random_state=1)

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
cmat, pred = RunModel(lr, X_train, y_train, X_test, y_test)
PrintStats(cmat, y_test, pred)

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 100, n_jobs =4)
cmat, pred = RunModel(rf, X_train, y_train, X_test, y_test)
PrintStats(cmat, y_test, pred)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators=400,learning_rate=0.01,max_depth=3, random_state=0,
                                 min_samples_leaf= 2, min_samples_split= 5,
                                  max_features = 'sqrt' ,subsample = 0.8).fit(X_train, y_train)
cmat, pred = RunModel(gbc, X_train, y_train, X_test, y_test)
PrintStats(cmat, y_test, pred)

# Now to appply the model to the data for submission¶

In [None]:
trained_model=gbc.fit(X_train, y_train)
df_test = pd.read_csv('test.csv')
X_sub = df_test [feature_names]
encoder.transform(X_sub)
result=trained_model.predict(X_sub)


In [None]:
Submission=pd.DataFrame(result)
Submission['TransactionId']=df_test ['TransactionId']
Submission = Submission.set_index('TransactionId')
Submission.columns=["FraudResult"]
#Submission=Submission.assign(id=y_sub)
Submission.to_csv('submission.csv')
Submission.head()

In [None]:
print(Submission.FraudResult.value_counts().rename(index = class_names))