In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Introduction
![](https://cdn.pixabay.com/photo/2022/01/09/17/08/phishing-6926470_1280.png)

Hello everyone,

I used the "creditcardfraud" data set in this notebook. I developed a model that decides whether a transaction is fraudulent or legitimate using Logistic Regression, which is one of the binary classification methods.

While reviewing our data set, I realized that the data set [belonged](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud) to a bank from past competitions and that the data was standardized and shared with us.

In the EDA step I saw that there was no missing (NaN) data in our data set, which meant less work for me. Later, when I looked at the distribution of the dependent variable in the data set, I realized that the data was biased (imbalanced).

During the modeling phase, I first eliminated this bias (class imbalance) with the under-sampling method (there are many other methods; I have listed them below under "+ Plus"). Then, I separated our data into dependent and independent variables and split it into train and test sets. Finally, I created our model and moved on to the evaluation phase.

During the evaluation phase, I tried to use all evaluation metrics used in Classification models and tried to be as descriptive as I could.

Please keep in mind that I am still learning, so if you spot any incorrect explanations or anything, please let me know. Enjoy your journey, as well.

Thank you for reading this far.

-------------------

Herkese merhaba,

Bu notebook'umda "creditcardfraud" veri setini kullandım. Binary Classification yöntemlerinden birisi olan Logistic Regresyon modeli ile gerçekleşen işlemlerin dolandırıcılık mı yoksa gerçek bir işlem mi olduğuna karar veren bir model geliştirdim. 

Veri setimizi incelerken veri setinin geçmiş yarışmalardan bir bankaya ait olduğunu ve verinin Standartize edilerek bizlerle paylaşıldığını farkettim.

EDA adımlarımızda verimizde hiç eksik (NaN) veri olmadığını gördüm ve bu, işlerimizi biraz hafifletti. Daha sonra veri setinde bağımlı değişkenin dağılımını incelediğimde verinin yanlı olduğunu gördüm.

Modelleme aşamasında ilk olarak bu yanlılığı Under-sampling metodu ile giderdim (daha birçok farklı yöntem var, bunları da aşağıya ekledim (+ Plus)). Daha sonra verimizi bağımlı-bağımsız değişkenler olarak ayırıp test-train seti olarak böldüm. Son olarak modelimizi oluşturdum ve değerlendirme aşamasına geçtim.

Değerlendirme aşamasında Classification modellerinde kullanılan tüm değerlendirme metriklerini kullanmaya çalıştım ve elimden geldiğince açıklayıcı olmaya çalıştım.

Buraya kadar okuduğunuz için teşekkür ederim.


# + Plus
Methods for Handling Imbalanced Data in a Classification Problem

* [5 Techniques to Handle Imbalanced Data For a Classification Problem](https://www.analyticsvidhya.com/blog/2021/06/5-techniques-to-handle-imbalanced-data-for-a-classification-problem/)
    1. Choose Proper Evaluation Metric
    1. Resampling (Oversampling and Undersampling)
    1. SMOTE
    1. BalancedBaggingClassifier
    1. Threshold Moving

# Content
* [<font size=4>EDA</font>](#1)
     * [Adding important libraries](#1.1)
     * [First review to data ](#1.2)
     * [NAN values](#1.3)
     * [Distribution of Legit-Fraud classes](#1.4)
* [<font size=4>Modelling</font>](#2)
     * [Under-Sampling for Unbalanced dataset ](#2.1)
     * [Creating dependent-independent variables](#2.2)
     * [Splitting Test-Train Dataset](#2.3)
     * [Model Instance and Fitting](#2.4)
* [<font size=4>Model Evaluation</font>](#3)    
     * [Evaluating with Accuracy score ](#3.1)
     * [Evaluating with Classification Evaluation Metrics](#3.2)
     * [Evaluating with ROC curve and AUC score](#3.3)
         * [Plotting ROC Curve](#3.3.1)
         * [Calculating AUC score](#3.3.2)
* [<font size=4>Conclusion</font>](#4)

# EDA <a id="1"></a>

## Adding important libraries <a id="1.1"></a>

In [None]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## First review to data <a id="1.2"></a>

In [None]:
# Load the credit card transactions from the Kaggle input directory into a DataFrame
df1 = pd.read_csv("/kaggle/input/creditcardfraud/creditcard.csv")

In [None]:
df1
# NOTE(review): per the dataset description, the Time column appears to hold the seconds
# elapsed between each transaction and the first transaction in the dataset - TODO confirm.
# The first transactions are at around 1 second and the last ones at about 172786 seconds,
# which is roughly 48 hours (172786 / 3600 ~= 48.0), i.e. the data spans about two days.

## NAN values  <a id="1.3"></a>

In [None]:
# Count the missing (NaN) values in each column
df1.isna().sum()

## Distribution of Legit-Fraud classes <a id="1.4"></a>

In [None]:
# Distribution of the Class column (the target).
# The data is highly imbalanced: one class makes up more than 99% of the rows,
# so the data should be rebalanced before it is fed to the ML model.
df1.Class.value_counts()

In [None]:
# Split the transactions by label for analysis: Class 0 -> legit, Class 1 -> fraud
legit = df1.loc[df1["Class"] == 0]
fraud = df1.loc[df1["Class"] == 1]

In [None]:
# Summary statistics of the Amount column for legit transactions
legit["Amount"].describe()

In [None]:
# Summary statistics of the Amount column for fraud transactions
fraud["Amount"].describe()

In [None]:
# Compare the per-class mean of every column (legit vs. fraud)
df1.groupby(by="Class").mean()


# Modelling  <a id="2"></a>

## Under-Sampling for Unbalanced dataset  <a id="2.1"></a>

In [None]:
# Handle the class imbalance by under-sampling the majority (legit) class down to
# the size of the minority (fraud) class.
# - n=len(fraud) replaces the hard-coded 492 (presumably the fraud row count - confirm
#   with df1["Class"].value_counts()), so the classes stay balanced if the data changes.
# - random_state pins the drawn rows; without it every run samples different legit
#   transactions and all downstream scores change from run to run.
legit_sample = legit.sample(n=len(fraud), random_state=10)

In [None]:
# Build the new dataset: the under-sampled legit rows stacked on top of all fraud rows
df2 = pd.concat([legit_sample, fraud])

In [None]:
# df2 is the new, balanced dataset: the under-sampled legit rows followed by all fraud rows
df2

## Creating dependent-independent variables  <a id="2.2"></a>

In [None]:
# Independent variables: every column except the label
X = df2.drop(columns="Class")
# Dependent variable: the Class label
y = df2["Class"]

## Splitting Test-Train Dataset  <a id="2.3"></a>

In [None]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class proportions, and fix the seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
    stratify=y,
)

## Model Instance and Fitting <a id="2.4"></a>

In [None]:
# Model training: binary logistic regression.
# The features are fed in unscaled (e.g. Time runs up to ~172786), so the solver can
# hit LogisticRegression's default cap of max_iter=100 before converging and emit a
# ConvergenceWarning. Raising the cap lets the optimisation run to convergence.
# NOTE(review): scaling the features (e.g. StandardScaler) would be the more thorough
# fix - consider it if the warning persists.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Model Evaluation <a id="3"></a>

## Evaluating with Accuracy score <a id="3.1"></a>

In [None]:
# Accuracy on the training data.
# accuracy_score's signature is (y_true, y_pred): pass the true labels first and the
# predictions second. Accuracy itself is symmetric, so the printed value is unchanged,
# but the correct order matches the other metric calls in this notebook.
X_train_predicted = model.predict(X_train)
print(accuracy_score(y_train, X_train_predicted))

In [None]:
# Accuracy on the held-out test data.
# accuracy_score's signature is (y_true, y_pred): pass the true labels first and the
# predictions second. Accuracy itself is symmetric, so the printed value is unchanged,
# but the correct order matches the other metric calls in this notebook.
X_test_predicted = model.predict(X_test)
print(accuracy_score(y_test, X_test_predicted))

## Evaluating with Classification Evaluation Metrics  <a id="3.2"></a>

In [None]:
from sklearn.metrics import classification_report

# Text report of the per-class metrics for the test-set predictions
report = classification_report(y_test, X_test_predicted)
print(report)

## Evaluating with ROC curve and AUC score <a id="3.3"></a>

In [None]:
# Predicted class probabilities for every test row
y_test_prob = model.predict_proba(X_test)

# Show the first 10 rows. The columns follow the order of model.classes_ (the sorted
# labels), so column 0 is the probability of Class 0 (legit) and column 1 is the
# probability of Class 1 (fraud).
y_test_prob[:10]

In [None]:
# Keep only the second column: the predicted probability of the positive class (Class 1)
y_test_prob_positve = y_test_prob[:, 1]

# Compute the false positive rates, true positive rates and the matching thresholds
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_test, y_test_prob_positve)

# Inspect the false positive rates
fpr

### Plotting ROC Curve  <a id="3.3.1"></a>

In [None]:
# Creating function for plotting ROC Curve
import matplotlib.pyplot as plt


def plot_roc_curve(fpr, tpr):
    """Plot a ROC curve together with the chance-level diagonal.

    Parameters
    ----------
    fpr : array-like
        False positive rates, used as the x values of the curve.
    tpr : array-like
        True positive rates, used as the y values of the curve.

    Returns
    -------
    None
        The figure is drawn on the current matplotlib axes and shown.
    """
    # ROC curve of the model
    plt.plot(fpr, tpr, color="orange", label="ROC")

    # Baseline with no predictive power (random guessing)
    plt.plot([0, 1], [0, 1], color="darkblue", linestyle="--", label="Guessing")

    # Customize the plot
    plt.xlabel("False Positive Rate(fpr)")
    plt.ylabel("True Positive Rate(tpr)")
    plt.title("Receiver Operating Characteristic (ROC) Curve")

    # Without a legend the label= values given above are never displayed
    plt.legend()
    plt.show()


plot_roc_curve(fpr, tpr)

### Calculating AUC score  <a id="3.3.2"></a>

In [None]:
from sklearn.metrics import roc_auc_score
# Area under the ROC curve, computed from the true test labels and the
# predicted positive-class probabilities
roc_auc_score(y_true=y_test, y_score=y_test_prob_positve)

# Conclusion <a id="4"></a>

So I can say that my model predicts very well when we take the AUC score into consideration (the closer the AUC score is to 1, the better the classification model).