In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, date
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the raw credit-card transactions into the working frame.
df = pd.read_csv(filepath_or_buffer='dataset/credit-card-data.csv')

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61080 entries, 0 to 61079
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   trans_date_trans_time  61080 non-null  object 
 1   cc_num                 61080 non-null  int64  
 2   merchant               61080 non-null  object 
 3   category               61080 non-null  object 
 4   amt                    61080 non-null  float64
 5   first                  61080 non-null  object 
 6   last                   61080 non-null  object 
 7   gender                 61080 non-null  object 
 8   street                 61080 non-null  object 
 9   city                   61080 non-null  object 
 10  state                  61080 non-null  object 
 11  zip                    61080 non-null  int64  
 12  lat                    61080 non-null  float64
 13  long                   61080 non-null  float64
 14  city_pop               61080 non-null  int64  
 15  jo

In [5]:
# df.isnull().sum()

In [6]:
def age(born, today=None):
    """Return the age in whole years for a date of birth.

    Parameters
    ----------
    born : str
        Date of birth formatted as ``YYYY-MM-DD``.
    today : datetime.date, optional
        Reference date the age is measured against. Defaults to the current
        date (``date.today()``). Pass an explicit date to make the result
        reproducible regardless of when the notebook is run.

    Returns
    -------
    int
        Number of full years between ``born`` and ``today``.

    Raises
    ------
    ValueError
        If ``born`` does not match the ``%Y-%m-%d`` format.
    """
    birth_date = datetime.strptime(born, "%Y-%m-%d").date()
    if today is None:
        today = date.today()
    # True (== 1) when this year's birthday has not been reached yet, in which
    # case one year is subtracted from the plain difference of the years.
    birthday_pending = (today.month, today.day) < (birth_date.month, birth_date.day)
    return today.year - birth_date.year - birthday_pending
  
# Derive each card holder's age element-wise from the 'dob' strings.
df['age'] = df['dob'].map(age)

In [7]:
# 'dob' has been replaced by the derived 'age' column; remove it in place.
df.drop(columns=['dob'], inplace=True)

In [8]:
# Write the frame (with 'age', without 'dob') to disk, omitting the row index.
df.to_csv(path_or_buf='dataset/eda_data.csv', index=False)

In [9]:
# Build the full name as "<first> <last>", joined by a single space.
df['name'] = df['first'].str.cat(df['last'], sep=" ")

In [10]:
# The separate name parts are now combined in 'name'; remove them in place.
df.drop(columns=['first', 'last'], inplace=True)

In [11]:
# Keep only the listed columns, in this order; every other column is dropped.
df = df.loc[:, [
    'cc_num', 'name', 'gender', 'age', 'job', 'city_pop', 'zip',
    'merchant', 'category', 'amt', 'unix_time',
    'merch_lat', 'merch_long', 'is_fraud',
]]

## Data Dictionary

|Feature|Type|Description|
|---|---|---|
|cc_num|int64|Credit Card Number of Customer|
|name|object|Name of Credit Card Holder| 
|gender|object|Gender of Credit Card Holder| 
|age|int64|Age of Credit Card Holder| 
|job | object|Job of Credit Card Holder| 
|city_pop|int64|Credit Card Holder's City Population|
|zip|int64|Zipcode of Credit Card Holder| 
|merchant | object|Merchant Name| 
|category|object|Category of Merchant| 
|amt | float64|Amount of Transaction| 
|unix_time|int64|UNIX Time of transaction| 
|merch_lat | float64|Latitude Location of Merchant| 
|merch_long|float64|Longitude Location of Merchant| 
|is_fraud | int64| Fraud Flag <--- Target Class| 

In [12]:
LE = LabelEncoder()
# Per-column record of the fitted label classes. LE is re-fitted for every
# column, so without this the label -> integer mapping of all but the last
# encoded column would be lost. The integer code of a label is its position
# in the stored array.
label_classes = {}
for col in df.columns[1:]:  # the slice skips the first column
    if df[col].dtype == 'object':
        codes = LE.fit_transform(df[col])
        label_classes[col] = LE.classes_
        # The original column is overwritten with its integer codes, and the
        # same codes are also exposed under '<col>_encoded'.
        df[col] = codes
        df[col + '_encoded'] = codes

In [13]:
# Preview the first five rows after label encoding.
df.head(n=5)

Unnamed: 0,cc_num,name,gender,age,job,city_pop,zip,merchant,category,amt,unix_time,merch_lat,merch_long,is_fraud,name_encoded,gender_encoded,job_encoded,merchant_encoded,category_encoded
0,2291163933867244,405,1,53,287,333497,29209,319,10,2.86,1371816865,33.986391,-81.200714,0,405,1,287,319,10
1,3573030041201292,461,0,31,404,302,84002,591,10,29.84,1371816873,39.450498,-109.960431,0,461,0,404,591,10
2,3598215285024754,74,0,50,271,34496,11710,611,5,41.28,1371816893,40.49581,-74.196111,0,74,0,271,611,5
3,3591919803438423,119,1,34,420,54767,32780,222,9,60.05,1371816915,28.812398,-80.883061,0,119,1,420,222,9
4,3526826139003047,727,1,66,204,1126,49632,292,13,3.19,1371816917,44.959148,-85.884734,0,727,1,204,292,13


In [14]:
# Write the encoded frame to disk, omitting the row index.
df.to_csv(path_or_buf='dataset/data.csv', index=False)