# Setup

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv("datasets/transactions.csv")

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO
0,0,0,2023-01-01 00:00:31,596,3156,533.07,31,0,0,0
1,1,1,2023-01-01 00:02:10,4961,3412,808.56,130,0,0,0
2,2,2,2023-01-01 00:07:56,2,1365,1442.94,476,0,1,1
3,3,3,2023-01-01 00:09:29,4128,8737,620.65,569,0,0,0
4,4,4,2023-01-01 00:10:34,927,9906,490.66,634,0,0,0


#### Deleting Unnamed Column
* Deleting first column

In [4]:
del df[df.columns[0]]

In [5]:
df.head()

Unnamed: 0,TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO
0,0,2023-01-01 00:00:31,596,3156,533.07,31,0,0,0
1,1,2023-01-01 00:02:10,4961,3412,808.56,130,0,0,0
2,2,2023-01-01 00:07:56,2,1365,1442.94,476,0,1,1
3,3,2023-01-01 00:09:29,4128,8737,620.65,569,0,0,0
4,4,2023-01-01 00:10:34,927,9906,490.66,634,0,0,0


# Exploratory Analysis

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 954155 entries, 0 to 954154
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   TRANSACTION_ID     954155 non-null  int64  
 1   TX_DATETIME        954155 non-null  object 
 2   CUSTOMER_ID        954155 non-null  int64  
 3   TERMINAL_ID        954155 non-null  int64  
 4   TX_AMOUNT          954155 non-null  float64
 5   TX_TIME_SECONDS    954155 non-null  int64  
 6   TX_TIME_DAYS       954155 non-null  int64  
 7   TX_FRAUD           954155 non-null  int64  
 8   TX_FRAUD_SCENARIO  954155 non-null  int64  
dtypes: float64(1), int64(7), object(1)
memory usage: 65.5+ MB


In [7]:
df.describe()

Unnamed: 0,TRANSACTION_ID,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO
count,954155.0,954155.0,954155.0,954155.0,954155.0,954155.0,954155.0,954155.0
mean,477077.0,2504.415626,4997.540207,537.708275,4296716.0,49.231433,0.133886,0.143796
std,275440.967387,1445.805839,2886.152602,878.083762,2481134.0,28.717332,0.34053,0.384762
min,0.0,0.0,0.0,0.0,31.0,0.0,0.0,0.0
25%,238538.5,1252.0,2502.0,180.05,2137039.0,24.0,0.0,0.0
50%,477077.0,2506.0,4994.0,422.15,4285684.0,49.0,0.0,0.0
75%,715615.5,3765.0,7496.0,750.675,6442178.0,74.0,0.0,0.0
max,954154.0,4999.0,9999.0,77212.5,8594915.0,99.0,1.0,3.0


#### Checking Null Values

In [8]:
df.isnull().sum()

TRANSACTION_ID       0
TX_DATETIME          0
CUSTOMER_ID          0
TERMINAL_ID          0
TX_AMOUNT            0
TX_TIME_SECONDS      0
TX_TIME_DAYS         0
TX_FRAUD             0
TX_FRAUD_SCENARIO    0
dtype: int64

#### Checking Variables Correlations

In [9]:
df.corr().style.background_gradient(cmap='coolwarm')

  df.corr().style.background_gradient(cmap='coolwarm')


Unnamed: 0,TRANSACTION_ID,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO
TRANSACTION_ID,1.0,-0.000369,-0.001646,0.00149,0.999995,0.999949,0.002757,0.006346
CUSTOMER_ID,-0.000369,1.0,-9e-05,-0.000905,-0.000372,-0.000378,0.003939,0.002794
TERMINAL_ID,-0.001646,-9e-05,1.0,-0.001314,-0.001643,-0.001634,-0.002934,-0.004463
TX_AMOUNT,0.00149,-0.000905,-0.001314,1.0,0.001485,0.001482,0.415752,0.546737
TX_TIME_SECONDS,0.999995,-0.000372,-0.001643,0.001485,1.0,0.999973,0.002758,0.006345
TX_TIME_DAYS,0.999949,-0.000378,-0.001634,0.001482,0.999973,1.0,0.002761,0.006348
TX_FRAUD,0.002757,0.003939,-0.002934,0.415752,0.002758,0.002761,1.0,0.950552
TX_FRAUD_SCENARIO,0.006346,0.002794,-0.004463,0.546737,0.006345,0.006348,0.950552,1.0


In [10]:
df.columns

Index(['TRANSACTION_ID', 'TX_DATETIME', 'CUSTOMER_ID', 'TERMINAL_ID',
       'TX_AMOUNT', 'TX_TIME_SECONDS', 'TX_TIME_DAYS', 'TX_FRAUD',
       'TX_FRAUD_SCENARIO'],
      dtype='object')

#### Using columns to train data
* **CUSTOMER_ID** : 2
* **TERMINAL_ID** : 3
* **TX_AMOUNT** : 4
* **TX_TIME_SECONDS** : 5
* **TX_TIME_DAYS** : 6

In [11]:
x=df.iloc[:,[2,3,4,5,6]].values

In [12]:
x

array([[5.960000e+02, 3.156000e+03, 5.330700e+02, 3.100000e+01,
        0.000000e+00],
       [4.961000e+03, 3.412000e+03, 8.085600e+02, 1.300000e+02,
        0.000000e+00],
       [2.000000e+00, 1.365000e+03, 1.442940e+03, 4.760000e+02,
        0.000000e+00],
       ...,
       [4.799000e+03, 4.767000e+03, 5.711100e+02, 8.594904e+06,
        9.900000e+01],
       [1.259000e+03, 2.956000e+03, 8.130000e+01, 8.594908e+06,
        9.900000e+01],
       [2.118000e+03, 7.614000e+03, 6.065500e+02, 8.594915e+06,
        9.900000e+01]])

In [13]:
# x = x[:, np.newaxis]

In [14]:
x

array([[5.960000e+02, 3.156000e+03, 5.330700e+02, 3.100000e+01,
        0.000000e+00],
       [4.961000e+03, 3.412000e+03, 8.085600e+02, 1.300000e+02,
        0.000000e+00],
       [2.000000e+00, 1.365000e+03, 1.442940e+03, 4.760000e+02,
        0.000000e+00],
       ...,
       [4.799000e+03, 4.767000e+03, 5.711100e+02, 8.594904e+06,
        9.900000e+01],
       [1.259000e+03, 2.956000e+03, 8.130000e+01, 8.594908e+06,
        9.900000e+01],
       [2.118000e+03, 7.614000e+03, 6.065500e+02, 8.594915e+06,
        9.900000e+01]])

In [15]:
y = df.iloc[:,[-2]].values

In [16]:
y

array([[0],
       [0],
       [1],
       ...,
       [0],
       [0],
       [0]])

* **TX_FRAUD** 

* Transform `y` Matrix in a 1d array

In [17]:
y

array([[0],
       [0],
       [1],
       ...,
       [0],
       [0],
       [0]])

### Applying Stadard Scale

In [18]:
s = SMOTE()
x,y = s.fit_resample(x,y)

In [19]:
ss = StandardScaler()
x = ss.fit_transform(x)

## Creating Training and Test Dataset

In [20]:
X_train, X_test, y_train, y_test = train_test_split(x,y ,
                                   random_state=11, 
                                   test_size=0.2, 
                                   shuffle=True)

#### Changing y shape to 1d array

In [21]:
y_train

array([0, 1, 0, ..., 1, 1, 0])

In [22]:
y_train = np.ravel(y_train)

In [23]:
y_train

array([0, 1, 0, ..., 1, 1, 0])

## Apply Logistic Regression to Predict Frauds

In [24]:
model = LogisticRegression()

In [25]:
model.fit(X_train, y_train)

In [26]:
test_predict = model.predict(X_test)
test_predict

array([1, 0, 0, ..., 1, 0, 0])

#### Testing Model Accuracy

In [27]:
accuracy_score(y_test, test_predict) * 100

96.12025544298666

#### Simulation of Online Transaction 

In [28]:
online_transaction = [[ 123, 5435, 12030.00, 788, 0 ]]

In [29]:
online_transaction

[[123, 5435, 12030.0, 788, 0]]

In [30]:
online_transaction_scaled = ss.fit_transform(online_transaction)

In [31]:
online_transaction_scaled

array([[0., 0., 0., 0., 0.]])

In [32]:
test_transaction = model.predict(online_transaction_scaled)

In [33]:
test_transaction

array([1])

### FRAUD!