In [1]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.covariance import EllipticEnvelope
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score, roc_curve
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
def generate_model_report(y_actual, y_predicted):
    print("Accuracy = ", accuracy_score(y_actual, y_predicted))
    print("Precision = ", precision_score(y_actual, y_predicted))
    print("Recall = ", recall_score(y_actual, y_predicted))
    print("F1 = ", f1_score(y_actual, y_predicted))
    pass

In [3]:
Data=pd.read_csv("creditcard.csv")

In [4]:
Data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
for i in Data.columns:
    print(i)

Time
V1
V2
V3
V4
V5
V6
V7
V8
V9
V10
V11
V12
V13
V14
V15
V16
V17
V18
V19
V20
V21
V22
V23
V24
V25
V26
V27
V28
Amount
Class


In [6]:
Data.dtypes

Time      float64
V1        float64
V2        float64
V3        float64
V4        float64
V5        float64
V6        float64
V7        float64
V8        float64
V9        float64
V10       float64
V11       float64
V12       float64
V13       float64
V14       float64
V15       float64
V16       float64
V17       float64
V18       float64
V19       float64
V20       float64
V21       float64
V22       float64
V23       float64
V24       float64
V25       float64
V26       float64
V27       float64
V28       float64
Amount    float64
Class       int64
dtype: object

In [7]:
Data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [8]:
count =0
for i in range(len(Data["Class"])):
    if Data["Class"][i]!=0:
        count=count+1
print("Fraud Cases: ", count)
print("Non-Fraud Cases: ", i-count)

Fraud Cases:  492
Non-Fraud Cases:  284314


In [9]:
scaler = MinMaxScaler()
Data = pd.DataFrame(scaler.fit_transform(Data), columns = Data.columns)

## Undersampling

In [10]:
minor_class_len = len(Data[Data["Class"]==1])
print(minor_class_len)

492


In [11]:
major_class_indices = Data[Data["Class"]==0].index
print(major_class_indices)

Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            284797, 284798, 284799, 284800, 284801, 284802, 284803, 284804,
            284805, 284806],
           dtype='int64', length=284315)


In [12]:
random_major_indices = np.random.choice(major_class_indices, minor_class_len, replace="False")
print(len(random_major_indices))

492


In [13]:
minor_class_indices = Data[Data["Class"]==1].index
print(minor_class_indices)

Int64Index([   541,    623,   4920,   6108,   6329,   6331,   6334,   6336,
              6338,   6427,
            ...
            274382, 274475, 275992, 276071, 276864, 279863, 280143, 280149,
            281144, 281674],
           dtype='int64', length=492)


In [14]:
under_sample_indices = np.concatenate([minor_class_indices,random_major_indices])

In [15]:
under_sample = Data.loc[under_sample_indices]

In [16]:
X= under_sample.loc[:, Data.columns!="Class"]
Y = under_sample.loc[:, Data.columns=="Class"]

In [17]:
X_train, X_test, Y_train,Y_test = train_test_split(X,Y,test_size=0.2, random_state=42)
clf = LogisticRegression().fit(X_train, Y_train)
Y_test_pred= clf.predict(X_test)

  y = column_or_1d(y, warn=True)


In [18]:
generate_model_report(Y_test, Y_test_pred)

Accuracy =  0.9187817258883249
Precision =  1.0
Recall =  0.8383838383838383
F1 =  0.9120879120879121


In [19]:
dls = DecisionTreeRegressor()
model = dls.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

In [20]:
generate_model_report(Y_test, Y_pred)

Accuracy =  0.9187817258883249
Precision =  0.8952380952380953
Recall =  0.9494949494949495
F1 =  0.9215686274509803
