<a href="https://colab.research.google.com/github/meshrifalruily/Machine-learning-projects/blob/main/payments_fraud_detection_using_multiple_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'online-payments-fraud-detection-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F2093649%2F3478314%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240510%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240510T141013Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D91b712d90f1fa12168ac61b567f1476f4ad7fa058ce37946e899c8d9c1f9e796034bf9b9adb50414bd6fe331a0d06d3b87f264d96bb322eb89b03eb657b3ae4a414eb3498d865ca0355e3791323de68a02b3b24f4da637d8e552eb783fb729ae7e9c3647b9361039d0d471e3a9e7f4b4edf68cbe5393910d5dd0539545b6682b2fe23a6a06a98f2d49558c6f86dea680837ab28f2af78c0a094ee20729490a5a6aeee30e55ad374f3f9555f034c20d12aebdf2552b0faf0833aff7d24f05c2c6b767e5e2777c794cc5909e66e2cf08f461e7040c81b8b3ba98beb464f679113085085899daa1e47a28d0faae9f07f86aae34a5349c8d5e5f462389e071f0cc3f'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
# Start from an empty input directory, then make sure both Kaggle-style
# directories exist.
shutil.rmtree('/kaggle/input', ignore_errors=True)
for kaggle_dir in (KAGGLE_INPUT_PATH, KAGGLE_WORKING_PATH):
    os.makedirs(kaggle_dir, 0o777, exist_ok=True)

# Mirror Kaggle's "../input" and "../working" layout with symlinks; a link
# that already exists from an earlier run is left in place.
for link_target, link_name in (
    (KAGGLE_INPUT_PATH, 'input'),
    (KAGGLE_WORKING_PATH, 'working'),
):
    try:
        os.symlink(link_target, os.path.join("..", link_name), target_is_directory=True)
    except FileExistsError:
        pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING
# into a temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first colon only; everything after it is the encoded URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The Content-Length header may be missing; without it the
            # progress bar stays empty instead of crashing on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # tarfile.open() below reopens the file by name, so buffered
            # bytes must reach disk first.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name so the `tarfile` module
                # is not overwritten for later iterations.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it has to be caught first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory



In [None]:
# Load the raw transaction log from the Kaggle input directory and preview
# the first two rows.
data = pd.read_csv('/kaggle/input/online-payments-fraud-detection-dataset/PS_20174392719_1491204439457_log.csv')
data.head(2)

In [None]:
# Print the column dtypes, non-null counts and memory usage of the raw frame.
data.info()

In [None]:
# Write to the exact path the next cell reads from; a relative path only
# lands there when the current working directory happens to be /kaggle/working.
data.to_parquet('/kaggle/working/fraud_detection.parquet')

In [None]:
# Reload the dataset from the parquet copy and preview the first two rows.
df = pd.read_parquet('/kaggle/working/fraud_detection.parquet')
df.head(2)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# How many columns there are of each dtype.
df.dtypes.value_counts()

In [None]:
# Names of the object-dtype columns.
df.select_dtypes('object').columns

In [None]:
def types_count(col):
    """Return how often each distinct value occurs in column ``col`` of ``df``."""
    counts = df[col].value_counts()
    return counts


# Show the value frequencies of every object-dtype column, each followed by
# a 50-character '#' divider line.
for col in df.select_dtypes('object').columns:
    print(types_count(col), '#'*50, sep='\n')

In [None]:
# Remove the two account-name columns and the isFlaggedFraud column.
cleaned_df = df.drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud'])

In [None]:
# Preview the frame after the column drop.
cleaned_df.head(2)

In [None]:
# Row count for each value of the isFraud label.
cleaned_df.isFraud.value_counts()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of the isFraud label counts on the full (un-sampled) frame.
sns.countplot(data=df, x= 'isFraud')
plt.show()

In [None]:
def down_sample(df, n=None, random_state=42):
    """Return a class-balanced frame by down-sampling the non-fraud rows.

    Every row with ``isFraud == 1`` is kept. From the ``isFraud == 0`` rows,
    ``n`` are drawn at random without replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``isFraud`` column with values 0 and 1.
    n : int, optional
        Number of non-fraud rows to keep. Defaults to the number of fraud
        rows, which gives a 1:1 class ratio. Values larger than the number
        of available non-fraud rows are capped to that number.
    random_state : int, optional
        Seed passed to ``DataFrame.sample`` so the draw is repeatable.

    Returns
    -------
    pandas.DataFrame
        The sampled non-fraud rows followed by all fraud rows, with a fresh
        0..N-1 index.
    """
    fraud = df[df['isFraud'] == 1]
    non_fraud_pool = df[df['isFraud'] == 0]
    if n is None:
        n = len(fraud)
    # Sampling without replacement cannot return more rows than exist.
    n = min(n, len(non_fraud_pool))
    not_fraud = non_fraud_pool.sample(n=n, random_state=random_state)
    return pd.concat([not_fraud, fraud], axis=0).reset_index(drop=True)

# Down-sampled frame; the later train/test split is taken from it.
sample_df = down_sample(cleaned_df)

In [None]:
# Preview the down-sampled frame.
sample_df.head(2)

In [None]:
# Bar chart of the isFraud label counts after down-sampling.
sns.countplot(data=sample_df, x= 'isFraud')
plt.show()

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.decomposition import TruncatedSVD

# Column selectors: object-dtype columns are treated as categorical,
# numeric-dtype columns as numerical.
cat_col = make_column_selector(dtype_include= 'object')
num_col = make_column_selector(dtype_include='number')

# Categorical branch: fill gaps with the most frequent value, one-hot encode
# (unknown categories are ignored at transform time), then reduce the
# one-hot columns to 3 components.
# random_state is fixed because this same transformer object is refit by
# every pipeline below; an unseeded SVD could yield slightly different
# components on each refit.
cat_pipe = Pipeline([
    ('cat_impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    ('dr', TruncatedSVD(n_components=3, random_state=42)),
])

# Numerical branch: mean imputation followed by standardisation.
num_pipe = Pipeline([
    ('num_impute', SimpleImputer(strategy='mean')),
    ('stand', StandardScaler(with_mean=True)),
])

# Step and branch names are referenced by the grid-search parameter keys
# further down (e.g. 'transformer__cat__dr__n_components'); keep them stable.
transformer = ColumnTransformer([
    ('cat', cat_pipe, cat_col),
    ('num', num_pipe, num_col),
]).set_output(transform='pandas')


print(transformer)

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

# Separate the label from the predictors, then hold out a stratified 30%
# test split with a fixed seed.
target = sample_df['isFraud']
features = sample_df.drop(columns='isFraud')
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.30,
    shuffle=True,
    stratify=target,
    random_state=42,
)

In [None]:
from sklearn.metrics import *
def model_eveluation(model, X_test= X_test, y_test= y_test):
    """Print accuracy, precision, recall and F1 for ``model`` and plot its confusion matrix.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_test, y_test : evaluation features and labels. The defaults are the
        ``X_test`` / ``y_test`` objects that existed when this function was
        defined; later rebinding of those globals does not change them.

    Training accuracy is computed on the module-level ``X_train`` / ``y_train``.
    Returns ``None``; all results are printed or plotted.
    """
    prediction = model.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round swaps precision with recall and transposes the confusion matrix.
    train_accuracy = accuracy_score(y_train, model.predict(X_train))
    test_accuracy = accuracy_score(y_test, prediction)
    precision = precision_score(y_test, prediction, average="macro")
    recall = recall_score(y_test, prediction, average="macro")
    f1 = f1_score(y_test, prediction, average="macro")

    # Format as a percentage directly; round(x, 4) * 100 can print float
    # artefacts such as 98.72999999999999.
    print(f'the traning accuracy of the model is {train_accuracy * 100:.2f}%')
    print(f'the testing accuracy of the model is {test_accuracy * 100:.2f}%')
    print(f'the precision of the model is {precision}')
    print(f'the recall of the model is {recall}')
    print(f'the f1 of the model is {f1}')

    cm = confusion_matrix(y_test, prediction)
    ConfusionMatrixDisplay(cm).plot()

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline: shared preprocessing followed by L2-regularised logistic regression.
log_reg = LogisticRegression(C=1, penalty='l2', max_iter=2000)
clf = Pipeline(steps=[
    ('transformer', transformer),
    ('model', log_reg),
])

clf.fit(X_train, y_train)

In [None]:
# Accuracy of the pipeline on each of 5 cross-validation folds over the
# whole down-sampled data.
cross_val_score(clf, features, target, cv=5, scoring='accuracy')

In [None]:
# Mean of the 5-fold cross-validated accuracy.
np.mean(cross_val_score(clf, features, target, cv=5, scoring='accuracy'))

In [None]:
# Share of test rows whose out-of-fold prediction matches the true label.
(cross_val_predict(clf, X_test, y_test) == y_test).mean()

In [None]:
# Pull the input feature names and the coefficients out of the fitted
# 'model' step, then display the coefficient array.
feature_names = clf['model'].feature_names_in_
model_coef = clf['model'].coef_
model_coef

In [None]:
def plot_coef(feature_names, model_coef):
    """Draw a bar chart with one bar per feature, bar height = coefficient.

    ``feature_names`` supplies the x positions/labels and ``model_coef`` the
    matching bar heights.
    """
    plt.bar(feature_names, model_coef)
    plt.title('Model Coefficients')
    plt.ylabel('Columns weights')
    plt.xticks(rotation=45)
    plt.show()


# coef_ is 2-D; its first row is the one plotted.
plot_coef(feature_names, model_coef[0])

In [None]:
# Report the metrics of the logistic-regression pipeline.
model_eveluation(clf)

## Tune Logistic Regression Pipeline

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate settings for the preprocessing steps and for the regularisation
# strength of the logistic regression.
params = {
    'transformer__cat__dr__n_components': [2, 3],
    'transformer__num__stand__with_mean': [True, False],
    'transformer__num__num_impute__strategy': ['mean', 'median'],
    'model__C': [0.001, 0.01, 0.1, 1, 10, 100],
}

# Exhaustive search, 3-fold CV, ranked by F1, using all CPU cores.
gscv = GridSearchCV(estimator=clf, param_grid=params, scoring='f1', cv=3, n_jobs=-1)

gscv.fit(X_train, y_train)

In [None]:
# Keep the refitted winner, then report its settings and mean CV F1.
best_model = gscv.best_estimator_
print(f'best params {gscv.best_params_}')
print(f'best f1 score {gscv.best_score_}')

In [None]:
# Print and plot the coefficients of the tuned logistic regression.
tuned_lr = best_model['model']
tuned_coef = tuned_lr.coef_[0]
print(tuned_coef)
plot_coef(tuned_lr.feature_names_in_, tuned_coef)

In [None]:
# Train score is printed, test score is the cell output; both scaled to percent.
print(best_model.score(X_train, y_train)*100)
best_model.score(X_test, y_test)*100

## Using SVC

In [None]:
from sklearn.svm import SVC

# Shared preprocessing followed by a linear-kernel support vector classifier.
linear_svc = SVC(C=10, kernel='linear')
clf = Pipeline(steps=[
    ('transformer', transformer),
    ('model', linear_svc),
])

clf.fit(X_train, y_train)

In [None]:
# Report the metrics of the SVC pipeline.
model_eveluation(clf)

# Tune SVC Pipeline

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate kernels plus 10 values of C drawn uniformly from [0.001, 500).
# The draw uses a seeded generator so the same grid is searched on every run;
# an unseeded np.random.uniform gave different candidates each time.
params = {
    'model__kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
    'model__C': np.random.default_rng(42).uniform(0.001, 500, 10),
}

# Exhaustive search, 3-fold CV, ranked by F1, using all CPU cores.
gscv = GridSearchCV(clf, params, cv=3, scoring = 'f1', n_jobs=-1)

gscv.fit(X_train, y_train)

In [None]:
# Report the winning settings and keep the refitted best pipeline.
print(f'best params {gscv.best_params_}')
print(f'best f1 score {gscv.best_score_}') # average score over the validation splits of the training data
best_model = gscv.best_estimator_


In [None]:
# Score of the tuned pipeline on the training split, then on the test split.
print(best_model.score(X_train, y_train))
print(best_model.score(X_test, y_test))

## Using RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Shared preprocessing followed by a seeded random forest.
forest = RandomForestClassifier(random_state=42)
clf = Pipeline(steps=[
    ('transformer', transformer),
    ('model', forest),
])

clf.fit(X_train, y_train)

In [None]:
# Report the metrics of the random-forest pipeline.
model_eveluation(clf)

In [None]:
# Importances and matching input feature names from the fitted forest.
features_im = clf['model'].feature_importances_
col_names = clf['model'].feature_names_in_

# NOTE: plt.style.use changes the matplotlib style for every later plot too.
#plt.style.use('ggplot')
plt.style.use('fivethirtyeight')
plt.bar(col_names, features_im )
plt.xticks(rotation= 90)
plt.title('Features Importance')
plt.show()

# Tune RandomForest

In [None]:
# Forest size, tree depth and per-tree sample fraction to try.
params = {
    'model__n_estimators': [50, 100, 150, 200],
    'model__max_depth': [3, 5, 10, 15, 20],
    'model__max_samples': [0.7, 0.9, 1.0],
}

# Exhaustive search, 3-fold CV, ranked by F1, using all CPU cores.
gscv = GridSearchCV(estimator=clf, param_grid=params, scoring='f1', cv=3, n_jobs=-1)

gscv.fit(X_train, y_train)

In [None]:
# Report the winning settings and keep the refitted best pipeline.
print(f'RF best params {gscv.best_params_}')
print(f'RF best f1 score {gscv.best_score_}') # average score over the validation splits of the training data
best_model = gscv.best_estimator_


In [None]:
# model_eveluation prints its own report and returns None, so wrapping it in
# print() only added a stray "None" line.
model_eveluation(best_model)

![PHOTO-2024-05-05-18-35-37 2.jpg](attachment:c279f988-703a-4518-b391-90d3c54ead14.jpg)

In [None]:
from xgboost import XGBClassifier


# Shared preprocessing followed by gradient-boosted trees. The first step is
# named 'transformer' like in every other pipeline here (it was misspelled
# 'trasn'), so 'transformer__...' parameter keys work for this pipeline too.
clf = Pipeline([
    ('transformer', transformer),
    ('model', XGBClassifier(learning_rate=0.1, n_estimators=100)),
])


clf.fit(X_train, y_train)

In [None]:
# Score of the XGBoost pipeline on the training split.
clf.score(X_train, y_train)

In [None]:
# Report the metrics of the XGBoost pipeline.
model_eveluation(clf)

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Shared preprocessing followed by k-nearest-neighbours with default settings.
knn = Pipeline(steps=[
    ('transformer', transformer),
    ('model', KNeighborsClassifier()),
])

knn.fit(X_train, y_train)

In [None]:
# Renumber the test rows 0..n-1. The labels are renumbered together with the
# features so that both keep matching index values.
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [None]:
# Display the test features.
X_test

In [None]:
# A list indexer returns the first row as a one-row DataFrame.
X_test.iloc[[0]]

In [None]:
# Predict the class of the second test row. iloc[[1]] yields a one-row
# DataFrame that keeps each column's dtype; iloc[1].to_frame().T turned
# every column into object dtype.
knn.predict(X_test.iloc[[1]])

In [None]:
# Class probabilities for the second test row. iloc[[1]] yields a one-row
# DataFrame that keeps each column's dtype; iloc[1].to_frame().T turned
# every column into object dtype.
knn.predict_proba(X_test.iloc[[1]])

In [None]:
# Position of the highest class probability for the second test row.
# iloc[[1]] keeps each column's dtype, unlike iloc[1].to_frame().T.
knn.predict_proba(X_test.iloc[[1]]).argmax()

In [None]:
# Store the predictions in a copy so X_test keeps only feature columns.
# Previously the 'prediction' column was added to X_test first and the
# widened frame was then passed to predict_proba.
knn_results = X_test.copy()
knn_results['prediction'] = knn.predict(X_test)
knn_results['predcition_proba'] = knn.predict_proba(X_test)[:,1]
knn_results

In [None]:
# Report the metrics of the KNN pipeline.
model_eveluation(knn)

In [None]:
# Neighbourhood sizes to try.
params = {'model__n_neighbors': [5, 7, 10, 15, 30, 50, 100]}

# Exhaustive search, 3-fold CV, ranked by F1, using all CPU cores.
gscv = GridSearchCV(estimator=knn, param_grid=params, scoring='f1', cv=3, n_jobs=-1)

gscv.fit(X_train, y_train)

In [None]:
# Winning parameter setting of the KNN search.
best_params = gscv.best_params_
best_params

In [None]:
# Report the metrics of the best pipeline found by the search.
model = gscv.best_estimator_
model_eveluation(model)

In [None]:
# from sklearn.model_selection import cross_val_score

# for metric in ['f1', 'accuracy', 'precision', 'recall']:
#     print(metric, '= ',cross_val_score(clf, features, target, cv=10, n_jobs = -1, scoring=metric).mean())

# Voting

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble of logistic regression, random forest and SVC.
#   - soft voting sums the class probabilities predicted by each model and
#     picks the class with the largest total (SVC therefore needs
#     probability=True);
#   - hard voting would instead take the majority of the predicted labels.
# max_iter=1000 matches the logistic regressions used in the voting grid
# search below.
vote = VotingClassifier(
    estimators=[
        ('logisticR', LogisticRegression(C=300, max_iter=1000, random_state=42)),
        ('RF', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('svm', SVC(C=500, random_state=42, probability=True)),
    ],
    n_jobs=-1,
    verbose=True,
    voting='soft',
)

# The first step is named 'transformer' like in the other pipelines
# (it was misspelled 'transfrom').
vote_pipe = Pipeline([
    ('transformer', transformer),
    ('model', vote),
])


vote_pipe.fit(X_train, y_train)


In [None]:
# Report the metrics of the voting pipeline.
model_eveluation(vote_pipe)

In [None]:
# Compare three line-ups of base estimators for the voting classifier.
# Every estimator with a random component is seeded (as in the voting
# pipeline itself) so the comparison is repeatable between runs.
params = {
    'model__estimators': [
        [
            ('log', LogisticRegression(C=500, max_iter=1000)),
            ('svc', SVC(probability=True, C=500, random_state=42)),
        ],
        [
            ('log', LogisticRegression(C=500, max_iter=1000)),
            ('svc', SVC(probability=True, C=500, random_state=42)),
            ('rf', RandomForestClassifier(random_state=42)),
        ],
        [
            ('svc', SVC(probability=True, C=500, random_state=42)),
            ('rf', RandomForestClassifier(random_state=42)),
        ],
    ]
}

# Exhaustive search, 3-fold CV, ranked by F1, using all CPU cores.
gscv = GridSearchCV(vote_pipe, params, cv=3, n_jobs=-1, scoring = 'f1')

gscv.fit(X_train, y_train)

In [None]:
# Report the winning line-up and its mean CV score, keep the refitted best
# pipeline and display it.
print('best params', gscv.best_params_)
print('best score', gscv.best_score_)
best_model = gscv.best_estimator_
best_model

In [None]:
# Full cross-validation results of the search as a table.
pd.DataFrame.from_dict(gscv.cv_results_)

In [None]:
# Report the metrics of the best voting pipeline.
model_eveluation(best_model)

# *Using Bagging*

In [None]:
from sklearn.ensemble import BaggingClassifier # default estimator Decision Tree and n_estimators= 10

#bc = BaggingClassifier(estimator= LogisticRegression(C =500, max_iter=1000, solver= 'newton-cg'), n_estimators= 200, n_jobs= -1, random_state = 42)

#bc = BaggingClassifier(estimator= KNeighborsClassifier(n_neighbors=5), n_estimators= 100, n_jobs= -1, random_state = 42)

# Bagging with the library defaults for the base estimator and ensemble
# size. The seed fixes the bootstrap draws so repeated runs give the same
# model (the commented-out variants above were seeded the same way).
bc = BaggingClassifier(random_state=42)
bc_pipe = Pipeline([
    ('transformer', transformer),
    ('model', bc),
])


bc_pipe.fit(X_train, y_train)

In [None]:
# Report the metrics of the bagging pipeline.
model_eveluation(bc_pipe)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Base estimators and ensemble sizes to try for the bagging classifier.
params = {
    'model__estimator': [
        LogisticRegression(),
        KNeighborsClassifier(),
        DecisionTreeClassifier(),
    ],
    'model__n_estimators': [50, 100, 150],
}

# Exhaustive search, 3-fold CV, ranked by accuracy, using all CPU cores.
gscv = GridSearchCV(estimator=bc_pipe, param_grid=params, scoring='accuracy', cv=3, n_jobs=-1)

gscv.fit(X_train, y_train)

In [None]:
# Winning parameter setting of the bagging search.
gscv.best_params_

In [None]:
# The search object itself is passed here; with the default refit it is
# expected to predict through its best estimator (confirm if refit changes).
model_eveluation(gscv)

# *Neural Network*

In [None]:
from sklearn.neural_network import MLPClassifier

# clf = MLPClassifier(hidden_layer_sizes= (150),
#                     activation="relu",
#                     solver="adam",
#                     learning_rate="adaptive",
#                     learning_rate_init=0.001,
#                     max_iter=500,
#                    validation_fraction=0.1)

# Multi-layer perceptron with default architecture and a generous iteration
# budget. The seed fixes weight initialisation and batch shuffling so
# repeated runs give the same network.
clf = MLPClassifier(max_iter=2000, random_state=42)


# The classifier step is named 'clf'; the search grids below rely on that
# prefix ('clf__...').
mlp_pipe = Pipeline([
    ('transformer', transformer),
    ('clf', clf),
])

mlp_pipe.fit(X_train, y_train)

In [None]:
# Report the metrics of the MLP pipeline.
model_eveluation(mlp_pipe)

In [None]:
# Search space for the MLP step of mlp_pipe.
params = {
    # A one-layer architecture has to be a one-element tuple: (50) is just
    # the integer 50, (50,) is a tuple describing one layer of 50 units.
    'clf__hidden_layer_sizes': [(50,), (50, 100), (50, 100, 150)],
    # NOTE(review): scikit-learn documents learning_rate as used only with
    # solver='sgd'; with the default solver these two values may train
    # identical models. Kept so the search space is unchanged - confirm.
    'clf__learning_rate': ['constant', 'adaptive'],
    'clf__learning_rate_init': [0.001, 0.01, 0.0001],
    'clf__alpha': [0.01, 0.001, 1, 200, 500],
    'clf__activation': ['relu', 'logistic', 'tanh'],
}

# Exhaustive search, 3-fold CV, ranked by F1, using all CPU cores.
gscv = GridSearchCV(mlp_pipe, params, cv=3, n_jobs=-1, scoring= 'f1')

gscv.fit(X_train, y_train)

In [None]:
# Mean cross-validated F1 of the best MLP setting.
gscv.best_score_

In [None]:
# Winning parameter setting of the MLP grid search.
gscv.best_params_

In [None]:
# Report the metrics of the best MLP pipeline from the grid search.
model_eveluation(gscv.best_estimator_)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomised search over the same MLP grid: up to 150 sampled settings,
# 2-fold CV, ranked by F1. random_state fixes which settings are sampled so
# the search is repeatable.
rscv = RandomizedSearchCV(mlp_pipe, params, n_jobs=-1, scoring='f1', cv=2, n_iter=150, random_state=42)

rscv.fit(X_train, y_train)

In [None]:
# Winning parameter setting of the randomised search.
rscv.best_params_

In [None]:
# All sampled settings, best mean validation score first.
pd.DataFrame(rscv.cv_results_).sort_values('mean_test_score', ascending = False)

In [None]:
# Report the metrics of the best pipeline from the randomised search.
model_eveluation(rscv.best_estimator_)

In [None]:
# Mean 5-fold cross-validated F1 of the best pipeline over the whole
# down-sampled data.
cross_val_score(rscv.best_estimator_, features, target, cv= 5, n_jobs= -1, scoring='f1' ).mean()