<a href="https://colab.research.google.com/github/ozaydiner/ozaydiner/blob/main/test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# loading the dataset

transaction_data = pd.read_csv("/content/drive/MyDrive/train.csv")

In [11]:
# first 5 rows

transaction_data.head()

Unnamed: 0,Id,step,action,amount,nameOrig,oldBalanceOrig,newBalanceOrig,nameDest,oldBalanceDest,newBalanceDest,isFraud,isFlaggedFraud
0,0,0,TRANSFER,10.0,C0198526315,30112.0,30102.0,C4653045645,21927.84,21937.84,0,0
1,1,1,CASH_IN,104925.19,C9864462944,42.23,104967.41,C7853342674,0.0,0.0,0,0
2,2,1,CASH_IN,37383.17,C1474610910,49.83,37433.01,C3584357969,0.0,0.0,0,0
3,3,1,CASH_IN,131908.49,C1474610910,37433.01,169341.49,C4996589500,0.0,0.0,0,0
4,4,1,CASH_IN,127105.65,C1474610910,169341.49,296447.14,C5989915138,0.0,0.0,0,0


In [12]:
# dataset info

transaction_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2372805 entries, 0 to 2372804
Data columns (total 12 columns):
 #   Column          Dtype  
---  ------          -----  
 0   Id              int64  
 1   step            int64  
 2   action          object 
 3   amount          float64
 4   nameOrig        object 
 5   oldBalanceOrig  float64
 6   newBalanceOrig  float64
 7   nameDest        object 
 8   oldBalanceDest  float64
 9   newBalanceDest  float64
 10  isFraud         int64  
 11  isFlaggedFraud  int64  
dtypes: float64(5), int64(4), object(3)
memory usage: 217.2+ MB


In [13]:
# checking for missing values

transaction_data.isnull().sum()

Id                0
step              0
action            0
amount            0
nameOrig          0
oldBalanceOrig    0
newBalanceOrig    0
nameDest          0
oldBalanceDest    0
newBalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [14]:
# distribution of fraud & fraud transactions

transaction_data["isFraud"].value_counts()

0    2372627
1        178
Name: isFraud, dtype: int64

In [15]:
# fitting action and orig&dest columns

action_ec = LabelEncoder()
action_ec.fit(transaction_data["action"])

names = pd.concat([transaction_data["nameOrig"], transaction_data["nameDest"]], axis=0)
name_ec = LabelEncoder()
name_ec.fit(names)

LabelEncoder()

In [16]:
# converting columns to numerical values

transaction_data["action"] = action_ec.transform(transaction_data["action"])

transaction_data["nameOrig"] = name_ec.transform(transaction_data["nameOrig"])
transaction_data["nameDest"] = name_ec.transform(transaction_data["nameDest"])

In [17]:
# first 5 rows

transaction_data.head()

Unnamed: 0,Id,step,action,amount,nameOrig,oldBalanceOrig,newBalanceOrig,nameDest,oldBalanceDest,newBalanceDest,isFraud,isFlaggedFraud
0,0,0,4,10.0,1189,30112.0,30102.0,25734,21927.84,21937.84,0,0
1,1,1,0,104925.19,54204,42.23,104967.41,43194,0.0,0.0,0,0
2,2,1,0,37383.17,8182,49.83,37433.01,19854,0.0,0.0,0,0
3,3,1,0,131908.49,8182,37433.01,169341.49,27551,0.0,0.0,0,0
4,4,1,0,127105.65,8182,169341.49,296447.14,32953,0.0,0.0,0,0


In [18]:
# seperating for analysis

legit = transaction_data[transaction_data.isFraud == 0]
legit_X = legit.drop(columns="isFraud", axis=1)
legit_Y = legit["isFraud"]

fraud = transaction_data[transaction_data.isFraud == 1]
fraud_X = fraud.drop(columns="isFraud", axis=1)
fraud_Y = fraud["isFraud"]

In [19]:
"""
Splitting data into training data and testing data
"""

legit_X_train, legit_X_test, legit_Y_train, legit_Y_test = train_test_split(legit_X, legit_Y, test_size=0.2)
fraud_X_train, fraud_X_test, fraud_Y_train, fraud_Y_test = train_test_split(fraud_X, fraud_Y, test_size=0.2)

X_train = shuffle(pd.concat([legit_X_train, fraud_X_train], axis=0))
Y_train = shuffle(pd.concat([legit_Y_train, fraud_Y_train], axis=0))
X_test = shuffle(pd.concat([legit_X_test, fraud_X_test], axis=0))
Y_test = shuffle(pd.concat([legit_Y_test, fraud_Y_test], axis=0))

In [20]:
"""
Model Training
Logistic Regression
"""

model = LogisticRegression(solver="lbfgs", max_iter=3000)

model.fit(X_train, Y_train)

print("model score test: %.3f" % model.score(X_test, Y_test))
print("model score training: %.3f" % model.score(X_train, Y_train))

model score test: 1.000
model score training: 1.000
