In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [6]:
# Read the raw transactions CSV into a DataFrame.
raw_csv_path = '/content/creditcard.csv'
credit_card_data_raw = pd.read_csv(raw_csv_path)

In [7]:
# Preview the first five rows of the raw dataset.
credit_card_data_raw.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [8]:
# Preview the last five rows of the raw dataset.
credit_card_data_raw.tail(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
13949,24754,1.252924,-0.182189,-0.802716,-0.210981,1.916713,3.643624,-0.778711,0.818295,1.706962,...,-0.497088,-1.211285,0.043809,0.964159,0.44203,0.261483,-0.051402,0.005112,23.74,0.0
13950,24756,-0.346979,-2.103284,-0.685061,1.961605,-0.401125,0.473632,1.133816,-0.256528,0.893409,...,0.359662,-0.316275,-0.864259,-0.279881,0.491802,-0.353996,-0.149931,0.129795,794.2,0.0
13951,24759,-6.053652,-5.988723,0.810413,-0.011811,1.308135,-0.590803,-0.725838,-0.23484,1.624646,...,-0.77197,1.474668,3.176363,-0.30241,0.052529,-0.373871,-0.700463,2.508443,60.0,0.0
13952,24759,1.169121,-1.284945,0.032717,-0.68167,0.660598,4.412578,-1.913115,1.076592,1.50123,...,-0.557596,-0.882435,-0.041523,0.975445,0.297229,0.550515,0.015029,0.032067,90.0,0.0
13953,24759,-6.917152,5.854171,-1.652458,-1.488884,-0.833891,-0.344418,0.393789,0.379968,6.133597,...,-1.404681,-1.124694,0.174333,-0.528234,0.990685,-0.035875,1.071374,-0.168831,,


In [9]:
# Summary of the raw frame: index range, column names, non-null counts and dtypes.
credit_card_data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13954 entries, 0 to 13953
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    13954 non-null  int64  
 1   V1      13954 non-null  float64
 2   V2      13954 non-null  float64
 3   V3      13954 non-null  float64
 4   V4      13954 non-null  float64
 5   V5      13954 non-null  float64
 6   V6      13954 non-null  float64
 7   V7      13954 non-null  float64
 8   V8      13954 non-null  float64
 9   V9      13954 non-null  float64
 10  V10     13954 non-null  float64
 11  V11     13954 non-null  float64
 12  V12     13954 non-null  float64
 13  V13     13954 non-null  float64
 14  V14     13954 non-null  float64
 15  V15     13954 non-null  float64
 16  V16     13954 non-null  float64
 17  V17     13954 non-null  float64
 18  V18     13954 non-null  float64
 19  V19     13954 non-null  float64
 20  V20     13954 non-null  float64
 21  V21     13954 non-null  float64
 22

Data Analysis

In [11]:
# Number of missing values in each column of the raw frame.
credit_card_data_raw.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    1
Class     1
dtype: int64

In [16]:
# How many transactions carry each Class label (0.0 vs 1.0).
class_labels = credit_card_data_raw['Class']
class_labels.value_counts()

Class
0.0    13897
1.0       56
Name: count, dtype: int64

In [17]:
# Frequency of each distinct transaction amount, most common first.
credit_card_data_raw.loc[:, 'Amount'].value_counts()

Amount
1.00      708
11.85     593
15.95     352
5.00      299
0.89      199
         ... 
44.23       1
130.12      1
242.02      1
66.58       1
794.20      1
Name: count, Length: 4674, dtype: int64

In [18]:
# Split the raw frame by label: Class == 0 rows go to Right, Class == 1 rows to Wrong.
# Rows whose Class is missing match neither comparison and end up in neither frame.
class_column = credit_card_data_raw['Class']
Right = credit_card_data_raw.loc[class_column.eq(0)]
Wrong = credit_card_data_raw.loc[class_column.eq(1)]

In [19]:
# (rows, columns) of each subset: Right first, then Wrong.
for subset in (Right, Wrong):
    print(subset.shape)

(13897, 31)
(56, 31)


In [20]:
# Summary statistics of 'Amount' for the Class == 0 subset.
Right['Amount'].describe()

count    13897.000000
mean        63.773909
std        177.164503
min          0.000000
25%          5.770000
50%         15.900000
75%         52.370000
max       7712.430000
Name: Amount, dtype: float64

In [21]:
# Summary statistics of 'Amount' for the Class == 1 subset.
Wrong['Amount'].describe()

count      56.000000
mean       90.815893
std       310.308450
min         0.000000
25%         1.000000
50%         1.000000
75%         1.025000
max      1809.680000
Name: Amount, dtype: float64

Data Pre-processing

Handling missing values

In [23]:
# Clean up missing values without mutating the frame in place:
#   1. drop rows that have no 'Class' label (they cannot be used as targets);
#   2. fill any remaining gaps in 'Amount' with the median of the rows that are left.
#
# The previous version called fillna(inplace=True) on
# credit_card_data_raw['Amount'], i.e. on a column selection. That is chained
# assignment: pandas 2.x emits a FutureWarning for it, and with Copy-on-Write
# enabled the parent frame is not updated at all. Filling through
# DataFrame.fillna with a per-column dict and rebinding the name avoids both
# problems and keeps the cell safe to re-run.
credit_card_data_raw = (
    credit_card_data_raw
    .dropna(subset=['Class'])
    .pipe(lambda frame: frame.fillna({'Amount': frame['Amount'].median()}))
)

 Separating the features and the target

In [26]:
# Target vector: the 'Class' label column.
y = credit_card_data_raw['Class']
# Feature matrix: every column except 'Class'.
X = credit_card_data_raw.drop(columns=['Class'])

In [27]:
# Plain-text dump of the feature frame; pandas elides the middle rows and columns.
print(X)

        Time        V1        V2        V3        V4        V5        V6  \
0          0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388   
1          0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361   
2          1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499   
3          1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203   
4          2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921   
...      ...       ...       ...       ...       ...       ...       ...   
13948  24753 -0.565780  0.754977  0.737825  0.030154 -1.383391  0.268107   
13949  24754  1.252924 -0.182189 -0.802716 -0.210981  1.916713  3.643624   
13950  24756 -0.346979 -2.103284 -0.685061  1.961605 -0.401125  0.473632   
13951  24759 -6.053652 -5.988723  0.810413 -0.011811  1.308135 -0.590803   
13952  24759  1.169121 -1.284945  0.032717 -0.681670  0.660598  4.412578   

             V7        V8        V9  ...       V20       V21       V22  \
0      0.2395

In [28]:
# Plain-text dump of the target Series; pandas elides the middle rows.
print(y)

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
13948    0.0
13949    0.0
13950    0.0
13951    0.0
13952    0.0
Name: Class, Length: 13953, dtype: float64


Feature scaling for 'Amount' and 'Time'

In [25]:
# Standardise the two columns named below and write the scaled values back
# into X.
# NOTE(review): this cell's execution count (25) is lower than that of the cell
# which defines X (26), so the saved run scaled an earlier X. Restart the
# kernel and run all cells in order so the scaling reaches the X used below.
columns_to_scale = ['Amount', 'Time']
scaler = StandardScaler()
X[columns_to_scale] = scaler.fit_transform(X[columns_to_scale])

Separate majority and minority classes

In [29]:
# Rows labelled Class == 0 (majority) and Class == 1 (minority).
# NOTE(review): neither frame is referenced again in the cells shown here.
is_majority = credit_card_data_raw['Class'].eq(0)
is_minority = credit_card_data_raw['Class'].eq(1)
majority_class = credit_card_data_raw.loc[is_majority]
minority_class = credit_card_data_raw.loc[is_minority]

Undersampling the majority class (right transactions)

In [32]:
# Randomly drop majority-class rows, but only part of the way to balance.
#
# For a binary target, a float sampling_strategy is the desired
# minority / majority ratio after resampling. The previous value of 1.0
# already produced a 1:1 dataset: it discarded almost every majority row and
# left the SMOTE step that follows with nothing to synthesise. A ratio of 0.1
# keeps about ten majority rows per minority row, and the following SMOTE cell
# closes the remaining gap.
UNDERSAMPLE_RATIO = 0.1  # minority / majority after undersampling; tune as needed
rus = RandomUnderSampler(sampling_strategy=UNDERSAMPLE_RATIO, random_state=42)
X_undersampled, y_undersampled = rus.fit_resample(X, y)

Using SMOTE to further balance the dataset.

In [33]:
# Oversample with SMOTE; sampling_strategy='minority' resamples only the
# minority class. The seed is fixed so the result is reproducible.
smote = SMOTE(random_state=42, sampling_strategy='minority')
smote_result = smote.fit_resample(X_undersampled, y_undersampled)
X_balanced, y_balanced = smote_result

Combining the features and target into a single, new processed dataset.

In [34]:
# Rebuild one frame from the resampled features (using X's column labels) and
# attach the resampled target as the 'Class' column.
balanced_data = (
    pd.DataFrame(X_balanced, columns=X.columns)
    .assign(Class=y_balanced)
)

In [35]:
# Write the processed dataset to disk, leaving out the index column.
processed_csv_path = '/content/credit_card_data_processed.csv'
balanced_data.to_csv(processed_csv_path, index=False)