# Preprocessing on Credit Card Dataset
## Before applying any algorithm first we need to preprocess this data


In [1]:
# Importing required libraries
import pandas as pd

## Data Gathering
Now we need to load the dataset.<br><br>
This is done using the pd.read_csv function.<br><br>
Reading csv file.<br><br>
When we have a lot of data(Bigger dataset) processing takes time.<br><br>
We can use nrows(Parameter for number of rows) to select required number of rows as the dataset is very big.<br>

In [3]:
# Importing Dataset/CSV file
df = pd.read_csv("creditcard.csv")
print("Dataset is:\n",df)

Dataset is:
             Time         V1         V2        V3        V4        V5  \
0            0.0  -1.359807  -0.072781  2.536347  1.378155 -0.338321   
1            0.0   1.191857   0.266151  0.166480  0.448154  0.060018   
2            1.0  -1.358354  -1.340163  1.773209  0.379780 -0.503198   
3            1.0  -0.966272  -0.185226  1.792993 -0.863291 -0.010309   
4            2.0  -1.158233   0.877737  1.548718  0.403034 -0.407193   
...          ...        ...        ...       ...       ...       ...   
284802  172786.0 -11.881118  10.071785 -9.834783 -2.066656 -5.364473   
284803  172787.0  -0.732789  -0.055080  2.035030 -0.738589  0.868229   
284804  172788.0   1.919565  -0.301254 -3.249640 -0.557828  2.630515   
284805  172788.0  -0.240440   0.530483  0.702510  0.689799 -0.377961   
284806  172792.0  -0.533413  -0.189733  0.703337 -0.506271 -0.012546   

              V6        V7        V8        V9  ...       V21       V22  \
0       0.462388  0.239599  0.098698  0.363787 

In [4]:
# Printing Columns of Dataframe
print("\nColumns of dataset are:\n",df.columns)


Columns of dataset are:
 Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')


In [5]:
# Printing Information of Dataframe
print("\nInformation of dataset is::\n",df.info)


Information of dataset is::
 <bound method DataFrame.info of             Time         V1         V2        V3        V4        V5  \
0            0.0  -1.359807  -0.072781  2.536347  1.378155 -0.338321   
1            0.0   1.191857   0.266151  0.166480  0.448154  0.060018   
2            1.0  -1.358354  -1.340163  1.773209  0.379780 -0.503198   
3            1.0  -0.966272  -0.185226  1.792993 -0.863291 -0.010309   
4            2.0  -1.158233   0.877737  1.548718  0.403034 -0.407193   
...          ...        ...        ...       ...       ...       ...   
284802  172786.0 -11.881118  10.071785 -9.834783 -2.066656 -5.364473   
284803  172787.0  -0.732789  -0.055080  2.035030 -0.738589  0.868229   
284804  172788.0   1.919565  -0.301254 -3.249640 -0.557828  2.630515   
284805  172788.0  -0.240440   0.530483  0.702510  0.689799 -0.377961   
284806  172792.0  -0.533413  -0.189733  0.703337 -0.506271 -0.012546   

              V6        V7        V8        V9  ...       V21       V22  

In [8]:
# Checking the number of missing values in each column
# This is done to ensure that the data does not have any missing value which may lead to ML algorithm not working properly
print("Count of Missing Values in Column is:")
df.isnull().sum()

Count of Missing Values in Column is:


Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [12]:
# Now since we are trying to make a ML model for credit card fraud detection first we need to ensure that the dataset has enough fraud data values
# Checking the distribution/count of Fraud and Not Fraud data values
# 0 represents Not fraud and 1 represents fraud data
print("Number/Count of Fraud and Not Fraud Data values is:")
df['Class'].value_counts()

Number/Count of Fraud and Not Fraud Data values is:


0    284315
1       492
Name: Class, dtype: int64

## Now we can see that this datset is highly imbalanced
The number of Fraud cases data is very less.<br><br>
We need to balance this dataset before model building else model will be prone to overfitting.<br><br>
In other words becuase most data is not fraud model will classify most data points as not fraud.<br>

In [14]:
# Balancing the dataset
# First we will extract the Fraud and Not Fraud Data
Not_Fraud_data = df[df.Class == 0]
Fraud_data = df[df.Class == 1]

In [16]:
# Checking the Shape of this Separated Data
print("Shape of Not Fraud data after separating is: ",Not_Fraud_data.shape)
print("\nShape of Fraud data after separating is: ",Fraud_data.shape)

Shape of Not Fraud data after separating is:  (284315, 31)

Shape of Fraud data after separating is:  (492, 31)


In [20]:
# Seeing the description of this dataset now
print("Mathematical Description of Not Fraud Data is:\n")
print(Not_Fraud_data.Amount.describe())
print("\nMathematical Description of Fraud Data is:\n")
print(Fraud_data.Amount.describe())

Mathematical Description of Not Fraud Data is:

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

Mathematical Description of Fraud Data is:

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64


In [30]:
# Comparing the values for both transactions Fraud and Not fraud
print("Mean for Not Fraud data(0) and Fraud(1) is: ")
df.groupby('Class').mean()

Mean for Not Fraud data(0) and Fraud(1) is: 


Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


## Now this is Under Sampling for Fraud Data
So we will build a new dataset which will contain required number of Fraud and Not Fraud.<br><br>
Now to ensure that both Fraud and Not Fraud data are in same amount we will extract rows from Not Fraud Data.<br><br>
Finally we will save this extracted data and Fraud data in a new dataframe.<br>

In [21]:
# We are using the sample function for extracting data
# 492 rows are selected because the number of Fraud data is 492
# To ensure correct fitting of model we will take the same number of rows for both Fraud and Not Fraud Data
Not_Fraud_Sample = Not_Fraud_data.sample(n=492)

In [25]:
# Now finally we will concatenate both the samples for Fraud and Not Fraud data
Updated_df = pd.concat([Not_Fraud_Sample, Fraud_data], axis=0)

In [26]:
# Now seeing the counts in the new dataframe
print("Number/Count of Fraud and Not Fraud Data values in Updated Dataframe is:")
Updated_df['Class'].value_counts()

Number/Count of Fraud and Not Fraud Data values in Updated Dataframe is:


0    492
1    492
Name: Class, dtype: int64

In [32]:
# Checking the mean for both Fraud and Not Fraud Data for the Updated values
# Comparing the values for both transactions Fraud and Not fraud
# We can see that the overall properties of dataframe has not changed 
# Thus now we can use this dataset for our model
print("Mean for Not Fraud data(0) and Fraud(1) is: ")
Updated_df.groupby('Class').mean()

Mean for Not Fraud data(0) and Fraud(1) is: 


Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,93446.058943,-0.062798,0.115769,0.041786,-0.074059,0.059579,0.030228,-0.044352,-0.017418,0.051942,...,0.025146,-0.04824,-0.037424,-0.024859,-0.006434,0.003275,-0.005635,-0.015735,-0.00678,83.418455
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [34]:
# Saving the dataframe as CSV file
Updated_df.to_csv("Updated_Credit_card.csv")