<a href="https://colab.research.google.com/github/lucianomattar/ml-fraud-detection/blob/main/credit%20card%20frau%20detection%20ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Machine learning for fraud detection with the Kaggle credit card dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
from sklearn.cluster import KMeans
from google.colab import files
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Load the Kaggle fraud file (creditcard.csv).
# Source: https://drive.google.com/file/d/1s_bSWBT_e5RzfzD7-x-Db0AZByyD3Xsv/view?usp=sharing

try:
    # Reuse a copy that is already in the working directory so that re-running
    # the notebook does not stop at an interactive upload prompt.
    df = pd.read_csv("creditcard.csv")
except FileNotFoundError:
    # File not present yet: ask the user to upload it, then read it.
    files.upload()
    df = pd.read_csv("creditcard.csv")

df.head()

In [None]:
# Descriptive analysis of the data as loaded.
# Take a copy (not an alias) so that columns added to this frame later on can
# never show up in `df`.
dfDesc = df.copy()

# display() is needed here: only the last bare expression of a cell is
# rendered, so intermediate results would otherwise be silently dropped.
display(dfDesc.describe())

# For each selected column, cross-tabulate the column maximum against the
# class of the row(s) where that maximum occurs.
for column in ['Time', 'V3', 'V1']:
    dfCT = dfDesc.loc[dfDesc[column] == dfDesc[column].max()]
    display(pd.crosstab(dfCT[column], dfCT.Class))

In [None]:
# Standardization
# Scale only the predictor columns. The 'Class' label is kept as-is so it
# remains a 0/1 indicator instead of being converted to z-scores.
feature_columns = df.columns.drop('Class')
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df[feature_columns]), columns=feature_columns)
# Re-attach the label by position: the scaled frame has a fresh 0..n-1 index.
df_scaled['Class'] = df['Class'].to_numpy()
# NOTE(review): the scaler is still fitted on all rows, i.e. before the
# train/test split further down; fitting on the training rows only would avoid
# test-set statistics influencing the features.
df = df_scaled

In [None]:
# Descriptive statistics of the standardized frame.
# display() is required: a bare expression that is not the last statement of
# the cell produces no output.
display(df.describe())

# Number of rows per value of the target column.
count = df['Class'].value_counts()
print(count)

In [None]:
# Bring the target back to a 0/1 integer flag:
# values >= 1 become 1, everything else becomes 0.
df['Class'] = (df['Class'] >= 1).astype(int)

# Rows per class after the conversion.
count = df['Class'].value_counts()
print(count)

Analysis

In [None]:
# K-means clustering with 4 clusters on the descriptive frame.
# Fit on predictor columns only: leave out the 'Class' label that the clusters
# are compared against below, and any 'cluster' column left over from an
# earlier run of this cell, so re-running the cell gives the same result.
cluster_inputs = dfDesc.drop(columns=['Class', 'cluster'], errors='ignore')
# NOTE(review): confirm whether clustering un-standardized values is intended;
# k-means distances are sensitive to column scale.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(cluster_inputs)

# Cluster ids that were actually assigned (printed, since a bare mid-cell
# expression is not rendered).
print(np.unique(kmeans.labels_))

dfDesc.loc[:,"cluster"] = kmeans.labels_

# Cluster sizes.
count = dfDesc['cluster'].value_counts()
print(count)

# Distribution of the true classes across the clusters.
pd.crosstab(dfDesc.Class, dfDesc.cluster)

In [None]:
# Predictors are columns V1..V28 followed by Amount and Time; the target is Class.
feature_names = [f'V{i}' for i in range(1, 29)] + ['Amount', 'Time']
features = df[feature_names]
target = df[['Class']]

In [None]:
# Hold out 30% of the rows for testing (fixed seed for reproducibility).
# stratify keeps the class proportions (approximately) equal in the training
# and test partitions; with a rare positive class a purely random split can
# leave the two partitions with different shares of positives.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target['Class'],
    test_size=0.3,
    random_state=42,
    stratify=target['Class'],
)

In [None]:
# Logistic regression on the full data set.
lr = LogisticRegression(random_state=42, max_iter=1000)

# fit() hands back the estimator itself; predict() gives the labels for the
# held-out rows; score() is the mean accuracy on those rows.
# NOTE(review): if the classes are heavily imbalanced, accuracy alone says
# little about how well the positive class is detected.
fit = lr.fit(X_train, y_train)
predict = lr.predict(X_test)
score = lr.score(X_test, y_test)

# Print in the same order as computed: model, predictions, accuracy.
for result in (fit, predict, score):
    print(result)

In [None]:
# Multi-layer Perceptron (MLP) with one hidden layer of 10 units.
mlp = MLPClassifier(alpha=0.001, hidden_layer_sizes=(10,), activation='relu', solver='adam', random_state=1, max_iter=1000)

# fit() returns the estimator itself, so this prints the configured model.
fit = mlp.fit(X_train, y_train)
print(fit)

# Store the MLP's predictions before printing them. Without the assignment,
# `predict` would still hold the output of the model evaluated before this cell.
predict = mlp.predict(X_test)
print(predict)

# Mean accuracy on the held-out rows.
score = mlp.score(X_test, y_test)
print(score)

Subsampling

In [None]:
# Row labels of the fraud (Class == 1) and non-fraud (Class == 0) observations.
fraud = np.array(df[df.Class == 1].index)
no_fraud = np.array(df[df.Class == 0].index)
n_of_fraud = len(fraud)

# Draw, without replacement, as many non-fraud rows as there are fraud rows.
np.random.seed(0)
no_fraud_choice = np.random.choice(no_fraud, n_of_fraud, replace=False)

# Combine both sets of row labels into a single array.
indx_subsampling = np.concatenate([fraud, no_fraud_choice], axis=None)

# The values collected above are index *labels*, so select with .loc.
# Positional .iloc would only return the intended rows while the index
# happens to be exactly 0..n-1.
subsampling_data = df.loc[indx_subsampling, :]

In [None]:
# Features and target for the subsampled data:
# columns V1..V28, then Amount and Time, with Class as the target.
feature_names_sa = [f'V{i}' for i in range(1, 29)] + ['Amount', 'Time']
features_sa = subsampling_data[feature_names_sa]
target_sa = subsampling_data[['Class']]

In [None]:
# Split the subsample: 70% of the rows for training, 30% for testing (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    features_sa,
    target_sa['Class'],
    test_size=0.3,
    random_state=42,
)

In [None]:
# Number of training rows in each class (absolute counts, most frequent first).
count = y_train.value_counts(normalize=False, sort=True)
print(count)

In [None]:
# Logistic regression on the subsampled data.
lr = LogisticRegression(random_state=42, max_iter=1000)

# fit() hands back the estimator itself; predict() gives the labels for the
# held-out rows; score() is the mean accuracy on those rows.
fit = lr.fit(X_train, y_train)
predict = lr.predict(X_test)
score = lr.score(X_test, y_test)

# Print in the same order as computed: model, predictions, accuracy.
for result in (fit, predict, score):
    print(result)

In [None]:
# Per-class precision, recall and F1 of the logistic regression on the test rows.
lr_test_predictions = lr.predict(X_test)
report = classification_report(y_test, lr_test_predictions)
print(report)

In [None]:
# Multi-layer Perceptron (MLP) with one hidden layer of 10 units, trained on
# the subsampled data.
mlp = MLPClassifier(alpha=0.001, hidden_layer_sizes=(10,), activation='relu', solver='adam', random_state=1, max_iter=3000)

# fit() returns the estimator itself, so this prints the configured model.
fit = mlp.fit(X_train, y_train)
print(fit)

# Store the MLP's predictions before printing them. Without the assignment,
# `predict` would still hold the logistic-regression output from an earlier cell.
predict = mlp.predict(X_test)
print(predict)

# Mean accuracy on the held-out rows.
score = mlp.score(X_test, y_test)
print(score)
