In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score, roc_curve
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import classification_report
from collections import Counter
from imblearn.over_sampling import SMOTE



In [4]:
import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing; stand-in for ``warnings.warn``."""
    return None


# Replace the module attribute so that later ``warnings.warn(...)`` lookups
# resolve to the no-op above. References to the original function that were
# captured before this point are not affected.
warnings.warn = warn

In [None]:
# Preview the first rows of `df`.
# NOTE(review): no visible cell creates `df` — confirm that a data-loading cell
# runs before this one, otherwise a fresh-kernel run fails here.
df.head()

In [None]:
# Print the per-column dtype / non-null summary of `df`.
df.info()

In [None]:
# Keep only the rows whose 'Is Laundering' value is non-zero, re-wrapped as a
# DataFrame object of their own.
is_fraudulent = pd.DataFrame(df.loc[df['Is Laundering'].ne(0)])

In [None]:
def chunker(df, n):
    """Split ``df`` into consecutive slices of at most ``n`` items each.

    Parameters
    ----------
    df : sliceable sequence
        Any object supporting ``len()`` and ``df[start:stop]`` slicing.
    n : int
        Maximum size of each chunk; must be positive.

    Returns
    -------
    list
        The slices in their original order; the last one may hold fewer than
        ``n`` items. An empty input yields an empty list.

    Raises
    ------
    ValueError
        If ``n`` is zero or negative.
    """
    if n <= 0:
        # range() yields nothing for a negative step, which would silently
        # return no chunks at all; fail loudly instead.
        raise ValueError(f"chunk size n must be positive, got {n!r}")
    return [df[start:start + n] for start in range(0, len(df), n)]

# Maximum number of rows per chunk.
n = 250000

# Cut `df` into consecutive pieces of at most n rows each.
chunked_list = chunker(df, n)

In [None]:
# Access the first chunk. Reuse the chunks already held in `chunked_list`
# rather than re-slicing the whole frame, and take an explicit copy so that the
# column assignments and drops applied to df1 later operate on an independent
# frame instead of a slice of `df`.
df1 = chunked_list[0].copy()

In [None]:
# Show the column index of the first chunk, then its (rows, columns) shape.
print(df1.columns, df1.shape, sep="\n")

In [None]:
# List the column labels of `df` as a plain Python list.
df.columns.tolist()

In [None]:
# Rebind to a new frame instead of dropping in place, so re-running this cell
# is harmless: errors='ignore' skips columns a previous run already removed.
is_fraudulent = is_fraudulent.drop(
    columns=['Account', 'Amount Received', 'Receiving Currency'],
    errors='ignore',
)

In [None]:
# Count rows per payment currency. The method has to be called; a bare
# `.value_counts` only displays the bound method object.
is_fraudulent['Payment Currency'].value_counts()

In [None]:
# Bar chart of the number of rows in `is_fraudulent` per payment currency.
fig, ax = plt.subplots(figsize=(25, 6))
sns.countplot(data=is_fraudulent, x="Payment Currency", ax=ax)
ax.set_title('Fraudulent Transactions by Payment Currency', fontsize=25)
plt.show()

In [None]:
# Boolean flag column: True where the payment currency is 'US Dollar'.
df['USD'] = df['Payment Currency'].eq('US Dollar')

In [None]:
# Store the flag as integers (1 = 'US Dollar', 0 = anything else) with one
# vectorized assignment. Writing 1/0 through .loc into an existing boolean
# column depends on implicit dtype upcasting, which recent pandas releases
# deprecate.
df['USD'] = df['Payment Currency'].eq('US Dollar').astype('int64')

# Show a short preview rather than echoing the entire frame.
df.head()

In [None]:
# assign 0 to non USD and 1 to USD
# Vectorized comparison instead of a per-row Python lambda; yields the same
# 1/0 values.
df1['USD'] = df1['Payment Currency'].eq('US Dollar').astype('int64')
df1['USD']

In [None]:
cols_to_drop = ['Timestamp', 'Account.1', 'Amount Received', 'Receiving Currency']

# Rebind to a new frame instead of dropping in place, so re-running this cell
# is harmless: errors='ignore' skips columns a previous run already removed.
df1 = df1.drop(columns=cols_to_drop, errors='ignore')

df1.head()

In [None]:
# Map each distinct 'Payment Format' value to the number of rows carrying it.
Payment_Format_Dict = df1['Payment Format'].value_counts().to_dict()

# Print one (payment format, count) tuple per line.
for format_and_count in Payment_Format_Dict.items():
    print(format_and_count)

In [None]:
# Labels of every df1 column whose dtype is not `object`.
numeric_cols = df1.select_dtypes(exclude='object').columns
numeric_cols

In [None]:
# Labels of every df1 column whose dtype is `object`.
object_cols = df1.select_dtypes(include='object').columns
object_cols

In [None]:
# Number of distinct values in each column of df1.
df1.nunique()

In [None]:
# Number of missing values in each column of df1.
df1.isna().sum()

In [None]:
# Row count per 'Is Laundering' value in df1. Give the figure a title so it
# reads on its own, and end with plt.show() so the Axes repr is not echoed.
ax = sns.countplot(data=df1, x='Is Laundering')
ax.set_title('Transactions by Is Laundering label')
plt.show()

In [None]:
# Separate the target column from the remaining predictor columns.
y = df1['Is Laundering']
X = df1.drop(columns='Is Laundering')

In [None]:
# Predictor columns whose dtype is not `object`.
numerical_features = X.select_dtypes(exclude='object').columns
numerical_features

In [None]:
# Predictor columns whose dtype is `object`.
categorical_features = X.select_dtypes(include='object').columns
categorical_features

In [None]:
## Define the pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, RobustScaler

# NOTE(review): the visible ColumnTransformer cell builds its own encoder and
# scaler and does not reference these two pipelines — confirm whether they are
# meant to be wired in.

# Numeric columns: fill missing values with the column median, then scale
# with RobustScaler.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", RobustScaler()),
    ]
)

# Categorical columns: fill missing values with the most frequent value, then
# integer-encode. Categories not seen during fit are encoded as -1 instead of
# raising, matching the encoder settings used in the ColumnTransformer cell.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "ordinalencoder",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        ),
    ]
)

In [None]:
from sklearn.compose import ColumnTransformer

# Integer-encode the object-typed columns (categories unseen at fit time map
# to -1) and robust-scale the remaining ones; any column not listed is passed
# through unchanged.
categorical_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
transformer = ColumnTransformer(
    transformers=[
        ("OrdinalEncoder", categorical_encoder, categorical_features),
        ("RobustScaler", RobustScaler(), numerical_features),
    ],
    remainder="passthrough",
)

In [None]:
# Hold out 20% of the rows for testing. stratify=y keeps the proportion of
# each 'Is Laundering' class the same in both splits.
# NOTE(review): stratification needs at least two rows of every class in y —
# confirm this holds for the chunk in use.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Fit the column transformer on the training split only, then apply the fitted
# transformer to the test split.
# NOTE(review): X_train / X_test are overwritten with the transformed output,
# so running this cell a second time would feed already-transformed data back
# into the transformer.
X_train = transformer.fit_transform(X_train)
X_test = transformer.transform(X_test)

In [None]:
# Shapes of the transformed training and test matrices.
X_train.shape, X_test.shape

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score

# Candidate hyper-parameters: maximum tree depth and learning rate (eta).
param_grid = {
    'max_depth': [4, 8, 16],
    'eta': [0.1, 0.2, 0.3],
}

xgb = XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
)

# Rank the candidates by ROC AUC — the metric this notebook reports on the
# test split — instead of accuracy, so model selection and evaluation agree.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=2,
    verbose=2,
)

grid_search.fit(X_train, y_train)

print("Best Parameters: ", grid_search.best_params_)

# Estimator refit on the whole training split with the best parameters.
best_model = grid_search.best_estimator_

# Predicted probability of the positive class for each test row, and the
# resulting test ROC AUC.
y_pred = best_model.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_pred)

In [None]:
# Predicted probability of the positive class for each test row.
test_probabilities = best_model.predict_proba(X_test)[:, 1]

test_auc = roc_auc_score(y_test, test_probabilities)
print("Test AUC: ", test_auc)

# False/true positive rates across all decision thresholds.
fpr, tpr, thresholds = roc_curve(y_test, test_probabilities)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC Curve (area = %0.2f)' % test_auc)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0,1], [0,1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
# The true positive rate is plotted on the y axis, so label that axis.
plt.ylabel('True Positive Rate')
plt.title("Receiver Operating Characteristics ")
plt.legend(loc="lower right")
plt.show()