In [None]:
%%writefile requirements.txt
# colabcode
fastapi
uvicorn
python-multipart
# pyngrok
kaggle

In [None]:
!pip install -r requirements.txt

In [None]:
from google.colab import files

# Upload kaggle.json through the Colab file picker.
# The result is bound to a name on purpose: left as a bare last expression, the
# returned upload contents (i.e. the API token) would be echoed into the cell
# output and saved with the notebook.
uploaded = files.upload()

In [None]:
! mkdir ~/.kaggle

! cp kaggle.json ~/.kaggle/
! chmod 600 /root/.kaggle/kaggle.json

In [None]:
!kaggle datasets download -d ealaxi/paysim1
!unzip /content/paysim1.zip

In [None]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

warnings.filterwarnings("ignore")
%matplotlib inline

In [None]:
# Display every column of a frame instead of eliding the middle ones.
# 'display.max_columns' is the registered option name; the previous spelling
# 'display.max.columns' only resolved through pandas' regex fallback.
pd.set_option('display.max_columns', None)

# Load the extracted PaySim transaction log and preview the first rows.
df = pd.read_csv('PS_20174392719_1491204439457_log.csv')
df.head()

# EDA:

In [None]:
# List the column labels of the loaded frame.
df.columns

In [None]:
# Print the per-column dtype / non-null summary of the loaded frame.
df.info()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Summary statistics, transposed so each described column becomes a row.
df.describe().transpose()

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Count transactions per class. The label column is 'isFraud'
# (the previous 'is_Fraud' lookup raised a KeyError).
fraud = int((df['isFraud'] == 1).sum())
not_fraud = int((df['isFraud'] == 0).sum())

# `arr` and `labels` are kept in the same order so they can be paired when plotting.
arr = np.array([fraud, not_fraud])
labels = ['Fraudulent', 'Not Fraudulent']
print(f"Total Fraudulent Cases: {fraud}\nTotal Non Fraudulent Cases: {not_fraud}")

In [None]:
# Set the seaborn context before creating the axes so the first run renders with
# the same font scaling as a re-run.
sns.set_context('poster')
fig, ax = plt.subplots(figsize=(21,10))

# Correlate numeric columns only: the frame still holds string columns at this
# point, and DataFrame.corr() raises on those in recent pandas versions.
corr = df.select_dtypes(include=[np.number]).corr()
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns,
            cmap='gist_rainbow', annot=True, ax=ax)
ax.set_title('Collinearity of Feature Attributes')
fig.savefig('cormap.png')

In [None]:
# Distribution graphs (histogram/bar graph) of column data
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
    """Plot the value distribution of each low-cardinality column of ``df``.

    Only columns with more than 1 and fewer than 50 distinct values are
    considered. Numeric columns are drawn as histograms, all others as bar
    charts of their value counts.

    Args:
        df: DataFrame whose columns are plotted.
        nGraphShown: Maximum number of columns to plot.
        nGraphPerRow: Number of subplots per grid row.

    Returns:
        None. The figure is shown with ``plt.show()``; if no column qualifies,
        a message is printed and nothing is drawn.
    """
    nunique = df.nunique()
    df = df[[col for col in df if nunique[col] > 1 and nunique[col] < 50]] # For displaying purposes, pick columns that have between 1 and 50 unique values
    nCol = df.shape[1]
    if nCol == 0:
        # Nothing to draw; avoids creating a zero-height figure.
        print('No columns with between 2 and 49 unique values to plot')
        return
    columnNames = list(df)
    # Integer (ceiling) division: plt.subplot requires an integer row count.
    nGraphRow = (nCol + nGraphPerRow - 1) // nGraphPerRow
    plt.figure(num = None, figsize = (6 * nGraphPerRow, 8 * nGraphRow), dpi = 80, facecolor = 'w', edgecolor = 'k')
    for i in range(min(nCol, nGraphShown)):
        plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        # Decide the plot type from the first value of the column.
        if (not np.issubdtype(type(columnDf.iloc[0]), np.number)):
            valueCounts = columnDf.value_counts()
            valueCounts.plot.bar()
        else:
            columnDf.hist()
        plt.ylabel('counts')
        plt.xticks(rotation = 90)
        plt.title(f'{columnNames[i]} (column {i})')
    plt.tight_layout(pad = 1.0, w_pad = 1.0, h_pad = 1.0)
    plt.show()

In [None]:
#@title 
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth):
    """Draw the correlation matrix of the numeric columns of ``df``.

    Non-numeric columns, columns containing NaN, and constant columns are
    dropped first. If fewer than two columns remain, a message is printed and
    nothing is drawn.

    Args:
        df: DataFrame to analyse. An optional ``dataframeName`` attribute is
            used in the plot title; a generic label is used when it is absent.
        graphWidth: Width and height of the square figure, in inches.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Plain DataFrames do not carry `dataframeName`; fall back instead of raising.
    filename = getattr(df, 'dataframeName', 'DataFrame')
    # Correlation is only defined for numeric columns.
    df = df.select_dtypes(include=[np.number])
    df = df.dropna(axis='columns') # drop columns with NaN
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    corrMat = plt.matshow(corr, fignum = 1)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()

In [None]:
#@title 
# Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
    """Draw a scatter matrix with KDE diagonals for the numeric columns of ``df``.

    Columns containing NaN and constant columns are dropped, and at most the
    first 10 remaining columns are plotted. Each upper-triangle panel is
    annotated with the correlation coefficient of its column pair.

    Args:
        df: DataFrame to plot.
        plotSize: Width and height of the square figure, in inches.
        textSize: Font size of the correlation annotations.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    df = df.select_dtypes(include =[np.number]) # keep only numerical columns
    # Remove rows and columns that would lead to df being singular
    df = df.dropna(axis='columns')
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    columnNames = list(df)
    if len(columnNames) > 10: # reduce the number of columns for matrix inversion of kernel density plots
        columnNames = columnNames[:10]
    df = df[columnNames]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    # Use numpy directly rather than the `plt.np` implementation detail.
    for i, j in zip(*np.triu_indices_from(ax, k = 1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()

In [None]:
# Share of all transactions that are fraudulent. The denominator is the total
# row count; dividing by the non-fraud count would give a fraud-to-legit ratio.
percent_fraud = len(df.loc[df.isFraud == 1]) / len(df) * 100
print(f"Percentage of Fraudulent Transanctions: {percent_fraud}%")

In [None]:
# Build the modelling frame without the account-identifier columns.
# drop() returns a new frame, so `df` is left untouched and this cell can be
# re-run (an in-place drop fails the second time because the columns are gone).
data = df.drop(columns=['nameOrig', 'nameDest'])

# Every remaining non-numeric column is label-encoded to integer codes.
categorical_cols = list(data.select_dtypes(exclude=[np.number]).columns)

le=LabelEncoder()
for col in categorical_cols:
    # `le` is refit for each column; afterwards it holds the mapping of the last one.
    data[col] = le.fit_transform(data[col])

# Preview the encoded frame (the one the models are trained on).
data.head()

In [None]:
def evaluate(y_test, y_pred):
    """Print classification metrics for a set of predictions.

    Accuracy, recall, precision, F1 and the confusion matrix are computed with
    the scikit-learn metric functions (default settings) and printed in that
    order.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_test``.

    Returns:
        None.
    """
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy Score: {acc}")

    rec = recall_score(y_test, y_pred)
    print(f"Recall Score: {rec}")

    prec = precision_score(y_test, y_pred)
    print(f"Precision Score: {prec}")

    f1 = f1_score(y_test, y_pred)
    print(f"F1 Score: {f1}")

    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix: ", cm)

# Data:

In [None]:
# Features are every column except the label.
X = data.drop('isFraud', axis=1)
Y = data.isFraud

# Stratify on the label: fraud is a tiny minority class, and an unstratified
# split can leave the train and test sets with different fraud rates.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=27, stratify=Y
)

# Models:

In [None]:
# Candidate models. The stochastic estimators are seeded (same seed as the
# train/test split) so the reported metrics and the saved model are reproducible.
lr_clf = LogisticRegression()
rfc_clf = RandomForestClassifier(n_estimators=10, random_state=27)
xgb_clf = XGBClassifier(colsample_bytree = 1.0,
                        learning_rate = 0.1,
                        max_depth =4,
                        n_estimators = 200,
                        subsample = 1.0,
                        random_state = 27)

In [None]:
# Logistic regression: fit on the training split, predict the test split, report metrics.
# fit() returns the estimator itself, so the prediction can be chained onto it.
y_preds_lr = lr_clf.fit(X_train, Y_train).predict(X_test)
evaluate(y_test=Y_test, y_pred=y_preds_lr)

In [None]:
# Random forest classifier: fit on the training split, predict the test split, report metrics.
# fit() returns the estimator itself, so the prediction can be chained onto it.
y_preds_rfc = rfc_clf.fit(X_train, Y_train).predict(X_test)
evaluate(y_test=Y_test, y_pred=y_preds_rfc)

In [None]:
#XGBoost:
# Early stopping needs a validation set. It is carved out of the training data
# so the held-out test set is used only for the final evaluation below; stopping
# on the test set would leak it into model selection.
X_fit, X_val, Y_fit, Y_val = train_test_split(
    X_train, Y_train, test_size=0.2, random_state=27, stratify=Y_train
)
eval_set = [(X_val, Y_val)]

# Current xgboost releases take early_stopping_rounds / eval_metric as estimator
# parameters; passing them to fit() is no longer accepted.
xgb_clf.set_params(early_stopping_rounds=30, eval_metric='logloss')
xgb_clf.fit(X_fit, Y_fit, eval_set=eval_set, verbose=True)

y_preds_xgb = xgb_clf.predict(X_test)
evaluate(Y_test, y_preds_xgb)

In [None]:
# Persist the fitted random forest to disk.
joblib.dump(value=rfc_clf, filename='credit_card_fraud.pkl')