# Libraries

In [1]:
import csv
import datetime
import hashlib
import os

import numpy as np
import pandas as pd
import psycopg2
import pytz
from psycopg2.extensions import ISOLATION_LEVEL_AUTOCOMMIT
from psycopg2.extras import execute_batch

In [2]:
# Disable pandas' row/column truncation so displayed frames are shown in full
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

In [3]:
# Load the raw transactions CSV and discard the exported row-number column.
# NOTE(review): the path is absolute and machine-specific; the notebook only
# runs as-is on the author's machine.
fraud = r'C:\Users\SANTIAGO\OneDrive - Nova Scotia Community College\4 Winter Semester 2024\DBAS 3035 Information Systems Design\Project\Dataset\Original\fraud.csv'
fraud_df = pd.read_csv(fraud, index_col=False).drop(columns=['Unnamed: 0'])
fraud_df.head(3)

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,21/06/2020 12:14,2291160000000000.0,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,19/03/1968,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,21/06/2020 12:14,3573030000000000.0,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",17/01/1990,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,21/06/2020 12:14,3598220000000000.0,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",21/10/1970,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0


### Data Dictionary

In [4]:
# Build a per-column profile ("data dictionary") of fraud_df: dtype, string
# length range, null count/percentage, distinct count, numeric min/max/mean
# and a handful of sample values.
dictionary = []

# total rows
total_rows = len(fraud_df)

# Fixed seed so the Sample_Values column is reproducible between runs
sample_seed = 42

for col in fraud_df.columns:
    column = fraud_df[col]
    data_type = column.dtype
    is_numeric = np.issubdtype(data_type, np.number)

    # Character-length range, only computed for string columns
    max_length = None
    min_length = None
    if pd.api.types.is_string_dtype(column):
        lengths = column.str.len()
        max_length = lengths.max()
        min_length = lengths.min()

    max_length_format = '{:,.0f}'.format(max_length) if max_length is not None else None
    min_length_format = '{:,.0f}'.format(min_length) if min_length is not None else None

    # Null and cardinality statistics
    num_nulls = column.isnull().sum()
    num_nulls_format = '{:,}'.format(num_nulls)
    percentage_null = (num_nulls/total_rows) * 100
    percentage_null_format = '{:,.2f}%'.format(percentage_null)
    num_distinct = column.nunique()
    num_distinct_format = '{:,}'.format(num_distinct)

    # Range and average are only reported for numeric columns
    min_value = None
    max_value = None
    average = None
    if is_numeric:
        min_value = column.min()
        max_value = column.max()
        average = column.mean()
    average_format = '{:,.2f}'.format(average) if average is not None else None

    # Sample values: never ask for more rows than there are non-null values,
    # otherwise Series.sample raises for sparse or very short columns
    non_null = column.dropna()
    sample_values = non_null.sample(min(5, len(non_null)), random_state=sample_seed).tolist()

    #Append all info to dictionary list
    dictionary.append([col, data_type, max_length_format, min_length_format,num_nulls_format, percentage_null_format, num_distinct_format, min_value, max_value, average_format, sample_values])

# Create a df for the dictionary
dictionary_df = pd.DataFrame(dictionary, columns=['Column', 'Data_Type', 'Max_Character_Length', 'Min_Character_Length','Null_Value_Count', 'Null_Percentage','Distinct_Value_Count', 'Min_Value', 'Max_Value','Average','Sample_Values'])

# So that Sample_Values are not truncated
pd.set_option('display.max_colwidth', 1000)

dictionary_df

Unnamed: 0,Column,Data_Type,Max_Character_Length,Min_Character_Length,Null_Value_Count,Null_Percentage,Distinct_Value_Count,Min_Value,Max_Value,Average,Sample_Values
0,trans_date_trans_time,object,16.0,16.0,0,0.00%,226976,,,,"[28/11/2020 12:32, 24/07/2020 20:58, 01/09/2020 09:08, 24/12/2020 17:29, 28/09/2020 22:47]"
1,cc_num,float64,,,0,0.00%,904,60416210000.0,4.99235e+18,4.1783874831987296e+17,"[3587040000000000.0, 2720430000000000.0, 6011400000000000.0, 213174000000000.0, 376657000000000.0]"
2,merchant,object,43.0,13.0,0,0.00%,693,,,,"[fraud_Fadel-Hilpert, fraud_Bernier, Volkman and Hoeger, fraud_Eichmann, Hayes and Treutel, fraud_Fadel-Hilpert, fraud_McDermott-Rice]"
3,category,object,14.0,4.0,0,0.00%,14,,,,"[food_dining, entertainment, grocery_net, personal_care, food_dining]"
4,amt,float64,,,0,0.00%,37256,1.0,22768.11,69.39,"[34.5, 27.94, 9.32, 45.91, 40.46]"
5,first,object,11.0,3.0,0,0.00%,341,,,,"[Amber, Dalton, Gina, Christopher, Meredith]"
6,last,object,11.0,2.0,0,0.00%,471,,,,"[Singh, Tucker, Fisher, Vaughan, Waters]"
7,gender,object,1.0,1.0,0,0.00%,2,,,,"[F, M, F, M, F]"
8,street,object,35.0,12.0,0,0.00%,924,,,,"[269 Sanchez Rapids, 597 Jenny Ford Apt. 543, 3433 Jones Branch, 034 Kimberly Mountains, 74890 Rodriguez Springs Apt. 540]"
9,city,object,25.0,3.0,0,0.00%,849,,,,"[Brooklin, Wilmington, Lakeland, South Londonderry, Issaquah]"


### Converting dtypes

In [5]:
# Inspect the dtypes pandas inferred on load, before converting any of them
fraud_df.dtypes

trans_date_trans_time     object
cc_num                   float64
merchant                  object
category                  object
amt                      float64
first                     object
last                      object
gender                    object
street                    object
city                      object
state                     object
zip                        int64
lat                      float64
long                     float64
city_pop                   int64
job                       object
dob                       object
trans_num                 object
unix_time                  int64
merch_lat                float64
merch_long               float64
is_fraud                   int64
dtype: object

In [6]:
# cc_num was read as float64 (see the dtypes listing); store it as a 64-bit integer
fraud_df['cc_num'] = fraud_df['cc_num'].astype('int64')

In [8]:
# Parse the day-first "dd/mm/YYYY HH:MM" strings into datetime values
trans_time_format = '%d/%m/%Y %H:%M'
fraud_df['trans_date_trans_time'] = pd.to_datetime(
    fraud_df['trans_date_trans_time'],
    format=trans_time_format,
)

In [9]:
# Parse the day-first "dd/mm/YYYY" birth dates into datetime values
dob_format = '%d/%m/%Y'
fraud_df['dob'] = pd.to_datetime(fraud_df['dob'], format=dob_format)

In [10]:
# The 0/1 integer flag becomes a proper boolean column
fraud_df['is_fraud'] = fraud_df['is_fraud'].astype('bool')

In [11]:
# Keep only the calendar date of dob (drops the time-of-day component).
# dob was already parsed with an explicit day-first format in an earlier cell;
# this cell relies on that and must run after it.
fraud_df['dob'] = pd.to_datetime(fraud_df['dob']).dt.date

### Data Cleansing

In [12]:
# Strip the leading "fraud_" marker from merchant names. The pattern is anchored
# so only the prefix is removed (never the same text inside a name), and regex
# is set explicitly because its default differs between pandas versions.
fraud_df['merchant'] = fraud_df['merchant'].str.replace(r'^fraud_', '', regex=True)

In [13]:
# Category labels use underscores as word separators; show them as spaces
fraud_df['category'] = fraud_df['category'].str.replace('_', ' ', regex=False)

In [14]:
# Preview the frame after the dtype conversions and string clean-up
fraud_df.head(5)

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,2020-06-21 12:14:00,2291160000000000,Kirlin and Sons,personal care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,2013-06-21 12:14:25,33.986391,-81.200714,False
1,2020-06-21 12:14:00,3573030000000000,Sporer-Keebler,personal care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,2013-06-21 12:14:33,39.450498,-109.960431,False
2,2020-06-21 12:14:00,3598220000000000,"Swaniawski, Nitzsche and Welch",health fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,2013-06-21 12:14:53,40.49581,-74.196111,False
3,2020-06-21 12:15:00,3591920000000000,Haley Group,misc pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,FL,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,2013-06-21 12:15:15,28.812398,-80.883061,False
4,2020-06-21 12:15:00,3526830000000000,Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,MI,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,2013-06-21 12:15:17,44.959148,-85.884734,False


### cc_num

In [15]:
# Count transactions per (card number, first name, last name) and look at one
# card number that turns out to be shared by two different people.
customer = (
    fraud_df
    .groupby(['cc_num', 'first', 'last'])
    .size()
    .reset_index(name='count')
)
cust_sort = customer.sort_values(by='cc_num')
x = cust_sort.loc[cust_sort['cc_num'] == 213161000000000]
x

# cc_num cannot serve as the customer identifier; a customerID has to be created

Unnamed: 0,cc_num,first,last,count
274,213161000000000,Alyssa,Morgan,654
275,213161000000000,Catherine,Weber,723


In [16]:
# Export the per-(cc_num, first, last) transaction counts for inspection outside the notebook.
# NOTE(review): absolute, machine-specific path; it also shadows nothing today but
# `file` is a very generic global name.
file = r'C:\Users\SANTIAGO\OneDrive - Nova Scotia Community College\4 Winter Semester 2024\DBAS 3035 Information Systems Design\Project\Dataset\Original\customer.csv'
customer.to_csv(file, index=False)

In [17]:
# Same grouping as before, this time looking at one name that appears under
# two different card numbers.
customer = (
    fraud_df
    .groupby(['cc_num', 'first', 'last'])
    .size()
    .reset_index(name='count')
)
cust_sort = customer.sort_values(by='cc_num')
is_jeffrey_smith = cust_sort['first'].eq('Jeffrey') & cust_sort['last'].eq('Smith')
x = cust_sort.loc[is_jeffrey_smith]
x

Unnamed: 0,cc_num,first,last,count
485,3534330000000000,Jeffrey,Smith,842
862,4292900000000000000,Jeffrey,Smith,684


In [18]:
# One row per distinct (cc_num, dob, first, last) combination
customer_key_columns = ['cc_num', 'dob', 'first', 'last']
customers = fraud_df.loc[:, customer_key_columns].drop_duplicates()
customers

Unnamed: 0,cc_num,dob,first,last
0,2291160000000000,1968-03-19,Jeff,Elliott
1,3573030000000000,1990-01-17,Joanne,Williams
2,3598220000000000,1970-10-21,Ashley,Lopez
3,3591920000000000,1987-07-25,Brian,Williams
4,3526830000000000,1955-07-06,Nathan,Massey
5,30407700000000,1991-10-13,Danielle,Evans
6,213181000000000,1951-01-15,Kayla,Sutton
7,3589290000000000,1972-03-05,Paula,Estrada
8,3596360000000000,1973-05-27,David,Everett
9,3546900000000000,1956-05-30,Kayla,Obrien


In [19]:
# Customers whose first/last name pair occurs more than once, ordered by first name
shares_name = customers.duplicated(subset=['first', 'last'], keep=False)
x = customers.loc[shares_name]
y = x.sort_values(by='first')
y

Unnamed: 0,cc_num,dob,first,last
488,3534330000000000,1978-01-15,Jeffrey,Smith
583,4292900000000000000,1995-08-16,Jeffrey,Smith
590,2248350000000000,1960-08-05,Jennifer,Scott
12214,6526960000000000,1950-12-14,Jennifer,Scott
316,4348790000000000,1979-01-08,John,Nichols
2216,30357400000000,1967-09-30,John,Nichols
423,4710790000000000,1986-12-13,Justin,Bell
1011,4158950000000000,1973-10-19,Justin,Bell
729,4433090000000000,1936-03-28,Linda,Davis
1641,4452370000000000,1978-03-04,Linda,Davis


In [20]:
# Number of distinct customer key combinations
count = customers.shape[0]
count

924

In [21]:
# Generate customerID from the combination of dob, first, last and cc_num.
# The SHA-256 digest of the concatenated key is reduced modulo 10**8, so the ID
# has at most 8 digits (it can have fewer, e.g. 6865772).

# Concatenate columns (kept as a standalone Series, so no temporary column has
# to be added to and dropped from fraud_df)
combination = fraud_df['dob'].astype(str) + fraud_df['first'] + fraud_df['last'] + fraud_df['cc_num'].astype(str)

# Hash each distinct key once instead of once per transaction row
customer_id_by_key = {
    key: int(hashlib.sha256(key.encode()).hexdigest(), 16) % 10**8
    for key in combination.unique()
}

# Truncating the digest to 8 digits can map two different customers to the same
# ID; fail loudly here rather than silently merging customers downstream
assert len(set(customer_id_by_key.values())) == len(customer_id_by_key), \
    'customerID collision: two distinct customers hash to the same ID'

fraud_df['customerID'] = combination.map(customer_id_by_key)

fraud_df.head(5)

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,customerID
0,2020-06-21 12:14:00,2291160000000000,Kirlin and Sons,personal care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,2013-06-21 12:14:25,33.986391,-81.200714,False,74078269
1,2020-06-21 12:14:00,3573030000000000,Sporer-Keebler,personal care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,2013-06-21 12:14:33,39.450498,-109.960431,False,18702918
2,2020-06-21 12:14:00,3598220000000000,"Swaniawski, Nitzsche and Welch",health fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,2013-06-21 12:14:53,40.49581,-74.196111,False,95127892
3,2020-06-21 12:15:00,3591920000000000,Haley Group,misc pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,FL,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,2013-06-21 12:15:15,28.812398,-80.883061,False,59734903
4,2020-06-21 12:15:00,3526830000000000,Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,MI,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,2013-06-21 12:15:17,44.959148,-85.884734,False,81017923


In [22]:
#Checking if customerID was generated properly
customerID = fraud_df[['cc_num','dob','first','last','customerID']].drop_duplicates()

# The row count of this frame is the same whether or not two customers share an
# ID (the other four columns keep the rows distinct), so test uniqueness directly
assert customerID['customerID'].is_unique, 'customerID is not unique per customer'

customerID

Unnamed: 0,cc_num,dob,first,last,customerID
0,2291160000000000,1968-03-19,Jeff,Elliott,74078269
1,3573030000000000,1990-01-17,Joanne,Williams,18702918
2,3598220000000000,1970-10-21,Ashley,Lopez,95127892
3,3591920000000000,1987-07-25,Brian,Williams,59734903
4,3526830000000000,1955-07-06,Nathan,Massey,81017923
5,30407700000000,1991-10-13,Danielle,Evans,94593647
6,213181000000000,1951-01-15,Kayla,Sutton,59629408
7,3589290000000000,1972-03-05,Paula,Estrada,6865772
8,3596360000000000,1973-05-27,David,Everett,26474803
9,3546900000000000,1956-05-30,Kayla,Obrien,74916531


In [23]:
# Number of distinct (customer key, customerID) rows
count = customerID.shape[0]
count

924

In [24]:
# Spot check: the two different "Jeffrey Smith" customers must get different IDs
is_jeffrey_smith = customerID['first'].eq('Jeffrey') & customerID['last'].eq('Smith')
x = customerID.loc[is_jeffrey_smith]
x

Unnamed: 0,cc_num,dob,first,last,customerID
488,3534330000000000,1978-01-15,Jeffrey,Smith,22542243
583,4292900000000000000,1995-08-16,Jeffrey,Smith,67240582


### city

In [25]:
# Distinct (city, state, population) rows in alphabetical city order
city = (
    fraud_df[['city', 'state', 'city_pop']]
    .drop_duplicates()
    .sort_values(by='city')
)
city
# A city name can occur in more than one state (e.g. Afton), and a state has many cities.
# To avoid a bridging table, city and city_pop are kept together with state in a single table.

Unnamed: 0,city,state,city_pop
286,Achille,OK,608
925,Acworth,NH,477
4797,Adams,WI,3508
1933,Afton,MI,937
451,Afton,MN,2916
1793,Akron,OH,272134
500,Albany,NY,151022
2040,Albuquerque,NM,641349
2646,Alder,MT,286
123,Aledo,TX,13602


### address

In [26]:
# Relationship of customer to address: list the distinct addresses by street
address = (
    fraud_df[['street', 'city', 'state']]
    .drop_duplicates()
    .sort_values(by='street')
)
address

Unnamed: 0,street,city,state
202,000 Jennifer Mills,Issaquah,WA
1511,0005 Morrison Land,Mounds,OK
237,00315 Ashley Valleys,Burrton,KS
562,0043 Henry Plaza,Brantley,AL
2254,005 Cody Estates,Louisville,KY
997,0069 Robin Brooks Apt. 695,Elberta,MI
328127,007 Tonya Isle Suite 299,Wever,IA
2216,00821 Joanna Meadow,Dalton,WI
986,010 Salazar Walk,Cromona,KY
6,010 Weaver Land,Carlotta,CA


In [27]:
# Streets that occur under more than one (city, state); an empty result means
# the street alone already identifies the address
street_repeats = address.duplicated(subset=['street'], keep=False)
add_duplicate = address.loc[street_repeats].sort_values(by='street')
add_duplicate

Unnamed: 0,street,city,state


In [28]:
x = len(address)

# 924 unique customers and 924 unique addresses suggest a 1:1 relationship, but
# equal counts alone do not prove it, so verify the mapping in both directions.
customer_address = fraud_df[['customerID', 'street', 'city', 'state']].drop_duplicates()
assert customer_address['customerID'].is_unique, 'a customer has more than one address'
assert not customer_address.duplicated(['street', 'city', 'state']).any(), \
    'an address is shared by more than one customer'

x

924

In [29]:
# creation of address_id: SHA-256 of street + city + state, reduced modulo 10**8
# (an ID of at most 8 digits)
address_key = fraud_df['street'] + fraud_df['city'] + fraud_df['state']

# Hash each distinct address once instead of once per transaction row
address_id_by_key = {
    key: int(hashlib.sha256(key.encode()).hexdigest(), 16) % 10**8
    for key in address_key.unique()
}

# Truncating the digest can map two different addresses to the same ID
assert len(set(address_id_by_key.values())) == len(address_id_by_key), \
    'address_id collision: two distinct addresses hash to the same ID'

fraud_df['address_id'] = address_key.map(address_id_by_key)
fraud_df.head(5)

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,customerID,address_id
0,2020-06-21 12:14:00,2291160000000000,Kirlin and Sons,personal care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,2013-06-21 12:14:25,33.986391,-81.200714,False,74078269,57867665
1,2020-06-21 12:14:00,3573030000000000,Sporer-Keebler,personal care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,2013-06-21 12:14:33,39.450498,-109.960431,False,18702918,6717278
2,2020-06-21 12:14:00,3598220000000000,"Swaniawski, Nitzsche and Welch",health fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,2013-06-21 12:14:53,40.49581,-74.196111,False,95127892,90914912
3,2020-06-21 12:15:00,3591920000000000,Haley Group,misc pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,FL,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,2013-06-21 12:15:15,28.812398,-80.883061,False,59734903,62699687
4,2020-06-21 12:15:00,3526830000000000,Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,MI,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,2013-06-21 12:15:17,44.959148,-85.884734,False,81017923,83216614


In [30]:
# Check: number of distinct address_id values
count_add = fraud_df.loc[:, ['address_id']].drop_duplicates()
count = count_add.shape[0]
count

924

### transaction

In [31]:
# Transactions per customer, least active customers first
trans_count = (
    fraud_df
    .groupby('customerID')['trans_num']
    .size()
    .reset_index(name='trans_count')
    .sort_values(by='trans_count', ascending=True)
)
trans_count

Unnamed: 0,customerID,trans_count
617,69514816,6
179,19654975,7
724,78976384,8
684,75619426,9
415,49315855,9
817,89708185,10
845,92446933,10
301,34461572,10
394,46595251,10
162,18131504,11


In [32]:
# The per-customer counts must add up to the total number of rows
trans_count_sum = trans_count['trans_count'].sum()
total_transactions = len(fraud_df)
trans_count_sum == total_transactions

True

In [33]:
# Number of customers that appear in the per-customer transaction counts
count = trans_count.shape[0]
count
# all customers have transaction

924

### merchant

In [34]:
# Transactions per merchant, in alphabetical merchant order
merchant = (
    fraud_df
    .groupby('merchant')['trans_num']
    .size()
    .reset_index(name='trans_count')
    .sort_values(by='merchant')
)
merchant

Unnamed: 0,merchant,trans_count
0,Abbott-Rogahn,803
1,Abbott-Steuber,766
2,Abernathy and Sons,762
3,Abshire PLC,838
4,"Adams, Kovacek and Kuhlman",414
5,Adams-Barrows,789
6,"Altenwerth, Cartwright and Koss",851
7,Altenwerth-Kilback,1091
8,Ankunding LLC,859
9,Ankunding-Carroll,334


In [35]:
# The per-merchant counts must add up to the total number of rows
num_merchant = merchant['trans_count'].sum()
total_transactions = len(fraud_df)
num_merchant == total_transactions

True

In [36]:
# Number of rows (and therefore merchants) recorded per transaction number
trans = (
    fraud_df
    .groupby('trans_num')['merchant']
    .size()
    .reset_index(name='merch_count')
    .sort_values(by='merch_count')
)
trans.head(100)
# by sampling, 1 transaction belongs to only 1 merchant

Unnamed: 0,trans_num,merch_count
0,00005fc67bb45d98730559d40c9ca601,1
370486,aacb0b6d2bac913634c11dab46948635,1
370485,aacadffa0affb3de9cb3ee1e660fd5e4,1
370484,aaca939b75eaf047ea704ab0ecd18a59,1
370483,aaca5cd2c992f8ea810bd2e28a8c3355,1
370482,aaca32fa7d341c052431a01b0f920d40,1
370481,aaca0f5bed3517d4a8001d3c9f5994ee,1
370480,aac9fbc02c891842aad8871c6b3f54de,1
370479,aac9f660a3483f9dcba9dbae3aefbd00,1
370478,aac9e094a5c1bb50e31a13a2577aeb87,1


In [37]:
# Transactions recorded against more than one row; empty when trans_num is unique.
# Note that this rebinds trans_count, which previously held the per-customer counts.
has_several_rows = trans['merch_count'].gt(1)
trans_count = trans.loc[has_several_rows]
trans_count

Unnamed: 0,trans_num,merch_count


In [38]:
# merchant_id creation: 1-based position of the merchant name among the
# category levels of the merchant column
merchant_codes = pd.Categorical(fraud_df['merchant']).codes
fraud_df['merchant_id'] = merchant_codes + 1
fraud_df.head(5)

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,customerID,address_id,merchant_id
0,2020-06-21 12:14:00,2291160000000000,Kirlin and Sons,personal care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,2013-06-21 12:14:25,33.986391,-81.200714,False,74078269,57867665,320
1,2020-06-21 12:14:00,3573030000000000,Sporer-Keebler,personal care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,2013-06-21 12:14:33,39.450498,-109.960431,False,18702918,6717278,592
2,2020-06-21 12:14:00,3598220000000000,"Swaniawski, Nitzsche and Welch",health fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,2013-06-21 12:14:53,40.49581,-74.196111,False,95127892,90914912,612
3,2020-06-21 12:15:00,3591920000000000,Haley Group,misc pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,FL,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,2013-06-21 12:15:15,28.812398,-80.883061,False,59734903,62699687,223
4,2020-06-21 12:15:00,3526830000000000,Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,MI,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,2013-06-21 12:15:17,44.959148,-85.884734,False,81017923,83216614,293


In [39]:
# Check: there should be 693 distinct merchant_id values
merch_id = fraud_df.loc[:, ['merchant_id']].drop_duplicates()
count_merch = merch_id.shape[0]
count_merch

693

### trans_date_trans_time and unix_time

In [40]:
# Compare the two timestamp columns side by side, one row per distinct pair
time = fraud_df[['trans_date_trans_time','unix_time']].drop_duplicates()
time = time.sort_values(by='trans_date_trans_time')
time.head(100)
# unix_time is precise to the second, whereas trans_date_trans_time stops at the minute.
# NOTE(review): this comment used to say "drop trans_date_trans_time", but the next cell
# drops unix_time and keeps trans_date_trans_time. In the saved output the two columns
# also disagree on the year (2020 vs 2013) - confirm which column is meant to be kept.
# NOTE(review): the saved output shows unix_time as a datetime, yet it is loaded as int64
# and no visible cell converts it (execution count 7 is missing), so a fresh
# Restart & Run All would presumably show raw integers here.

Unnamed: 0,trans_date_trans_time,unix_time
0,2020-06-21 12:14:00,2013-06-21 12:14:25
1,2020-06-21 12:14:00,2013-06-21 12:14:33
2,2020-06-21 12:14:00,2013-06-21 12:14:53
3,2020-06-21 12:15:00,2013-06-21 12:15:15
4,2020-06-21 12:15:00,2013-06-21 12:15:17
5,2020-06-21 12:15:00,2013-06-21 12:15:37
6,2020-06-21 12:15:00,2013-06-21 12:15:44
7,2020-06-21 12:15:00,2013-06-21 12:15:50
8,2020-06-21 12:16:00,2013-06-21 12:16:10
9,2020-06-21 12:16:00,2013-06-21 12:16:11


In [41]:
# Remove the unix_time column from fraud_df in place
fraud_df.drop('unix_time', axis=1, inplace=True)
fraud_df.head(5)

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,merch_lat,merch_long,is_fraud,customerID,address_id,merchant_id
0,2020-06-21 12:14:00,2291160000000000,Kirlin and Sons,personal care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,33.986391,-81.200714,False,74078269,57867665,320
1,2020-06-21 12:14:00,3573030000000000,Sporer-Keebler,personal care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,39.450498,-109.960431,False,18702918,6717278,592
2,2020-06-21 12:14:00,3598220000000000,"Swaniawski, Nitzsche and Welch",health fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,40.49581,-74.196111,False,95127892,90914912,612
3,2020-06-21 12:15:00,3591920000000000,Haley Group,misc pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,FL,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,28.812398,-80.883061,False,59734903,62699687,223
4,2020-06-21 12:15:00,3526830000000000,Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,MI,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,44.959148,-85.884734,False,81017923,83216614,293


# Rename columns

In [42]:
# Prefix every column with the table it will belong to:
# c_ = customer, a_ = address, cs_ = city_state, merch_ = merchant, trans_ = transaction
column_renames = {
    'cc_num': 'c_cc_num',
    'merchant': 'merch_name',
    'category': 'trans_category_type',
    'amt': 'trans_amt',
    'first': 'c_first',
    'last': 'c_last',
    'gender': 'c_gender',
    'street': 'a_street',
    'city': 'cs_city',
    'state': 'cs_state',
    'zip': 'a_zip',
    'lat': 'a_lat',
    'long': 'a_long',
    'city_pop': 'cs_city_pop',
    'job': 'c_job',
    'dob': 'c_dob',
    'customerID': 'customer_id',
    'trans_date_trans_time': 'trans_date_time',
    'is_fraud': 'trans_is_fraud',
}
fraud_df = fraud_df.rename(columns=column_renames)

In [43]:
# Confirm the column names after the rename
fraud_df.columns

Index(['trans_date_time', 'c_cc_num', 'merch_name', 'trans_category_type',
       'trans_amt', 'c_first', 'c_last', 'c_gender', 'a_street', 'cs_city',
       'cs_state', 'a_zip', 'a_lat', 'a_long', 'cs_city_pop', 'c_job', 'c_dob',
       'trans_num', 'merch_lat', 'merch_long', 'trans_is_fraud', 'customer_id',
       'address_id', 'merchant_id'],
      dtype='object')

In [44]:
# Preview the renamed frame
fraud_df.head(5)

Unnamed: 0,trans_date_time,c_cc_num,merch_name,trans_category_type,trans_amt,c_first,c_last,c_gender,a_street,cs_city,cs_state,a_zip,a_lat,a_long,cs_city_pop,c_job,c_dob,trans_num,merch_lat,merch_long,trans_is_fraud,customer_id,address_id,merchant_id
0,2020-06-21 12:14:00,2291160000000000,Kirlin and Sons,personal care,2.86,Jeff,Elliott,M,351 Darlene Green,Columbia,SC,29209,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,33.986391,-81.200714,False,74078269,57867665,320
1,2020-06-21 12:14:00,3573030000000000,Sporer-Keebler,personal care,29.84,Joanne,Williams,F,3638 Marsh Union,Altonah,UT,84002,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,39.450498,-109.960431,False,18702918,6717278,592
2,2020-06-21 12:14:00,3598220000000000,"Swaniawski, Nitzsche and Welch",health fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,Bellmore,NY,11710,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,40.49581,-74.196111,False,95127892,90914912,612
3,2020-06-21 12:15:00,3591920000000000,Haley Group,misc pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,Titusville,FL,32780,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,28.812398,-80.883061,False,59734903,62699687,223
4,2020-06-21 12:15:00,3526830000000000,Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,Falmouth,MI,49632,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,44.959148,-85.884734,False,81017923,83216614,293


# Updated Data Dictionary

In [45]:
# Rebuild the data dictionary for the cleaned and renamed frame: dtype, string
# length range, null count/percentage, distinct count, min/median/max for
# numeric columns, min/max for datetime columns, and a few sample values.
dictionary = []
total_rows = len(fraud_df)

# Fixed seed so the Sample_Values column is reproducible between runs
sample_seed = 42

for col in fraud_df.columns:
    column = fraud_df[col]
    data_type = column.dtype
    max_length = None
    min_length = None
    max_value = None
    min_value = None
    med_value = None

    # Character-length range, only computed for string columns
    if pd.api.types.is_string_dtype(column):
        lengths = column.str.len()
        max_length = lengths.max()
        min_length = lengths.min()

    # Numeric columns report min / median / max
    if np.issubdtype(data_type, np.number):
        min_value = column.min()
        max_value = column.max()
        med_value = column.median()

    # Datetime columns report their range only (no median)
    if np.issubdtype(data_type, np.datetime64):
        min_value = column.min()
        max_value = column.max()

    num_nulls = column.isnull().sum()
    num_nulls_format = '{:,}'.format(num_nulls)
    percentage_null = (num_nulls / total_rows) * 100
    percentage_null_format = '{:.2f}%'.format(percentage_null)
    num_distinct = column.nunique()
    num_distinct_format = '{:,}'.format(num_distinct)

    # Never ask for more rows than there are non-null values, otherwise
    # Series.sample raises for sparse or very short columns
    non_null = column.dropna()
    sample_values = non_null.sample(min(5, len(non_null)), random_state=sample_seed).tolist()

    dictionary.append([
        col, data_type, max_length, min_length, num_nulls_format,
        percentage_null_format, num_distinct_format, min_value,
        med_value, max_value, sample_values
    ])

dictionary_df = pd.DataFrame(
    dictionary,
    columns=[
        'Column', 'Data_Type', 'Max_Character_Length', 'Min_Character_Length',
        'Null_Value_Count', 'Null_Percentage', 'Distinct_Value_Count',
        'Min_Value', 'Med_Value', 'Max_Value', 'Sample_Values'
    ]
)

# So that Sample_Values are not truncated
pd.set_option('display.max_colwidth', 1000)

dictionary_df

Unnamed: 0,Column,Data_Type,Max_Character_Length,Min_Character_Length,Null_Value_Count,Null_Percentage,Distinct_Value_Count,Min_Value,Med_Value,Max_Value,Sample_Values
0,trans_date_time,datetime64[ns],,,0,0.00%,226976,2020-06-21 12:14:00,,2020-12-31 23:59:00,"[2020-10-09 05:11:00, 2020-12-01 09:16:00, 2020-12-29 22:42:00, 2020-10-14 10:35:00, 2020-12-29 22:15:00]"
1,c_cc_num,int64,,,0,0.00%,904,60416207185,3521420000000000.0,4992350000000000000,"[4981130000000, 4908850000000000, 38947700000000, 180067000000000, 560881000000]"
2,merch_name,object,37.0,7.0,0,0.00%,693,,,,"[Casper, Hand and Zulauf, Goodwin-Nitzsche, Dooley Inc, Cartwright PLC, Eichmann-Kilback]"
3,trans_category_type,object,14.0,4.0,0,0.00%,14,,,,"[shopping pos, shopping net, travel, misc net, health fitness]"
4,trans_amt,float64,,,0,0.00%,37256,1.0,47.29,22768.11,"[63.1, 8.47, 86.83, 1.11, 147.08]"
5,c_first,object,11.0,3.0,0,0.00%,341,,,,"[Vicki, Dawn, Brooke, Raymond, Walter]"
6,c_last,object,11.0,2.0,0,0.00%,471,,,,"[Smith, Khan, Walker, Myers, Rodriguez]"
7,c_gender,object,1.0,1.0,0,0.00%,2,,,,"[M, F, M, M, M]"
8,a_street,object,35.0,12.0,0,0.00%,924,,,,"[1833 Jeanette Stravenue, 3522 Park Wells Suite 528, 17666 David Valleys, 7854 Stephanie Island Suite 788, 551 Zachary Freeway]"
9,cs_city,object,25.0,3.0,0,0.00%,849,,,,"[Powell Butte, New York City, Alva, Monmouth Beach, San Angelo]"


In [51]:
# Write the cleaned, renamed frame to fraud.csv in the current working directory.
# NOTE(review): this cell's execution count (51) is out of sequence with its
# neighbours; confirm it still belongs at this position on Restart & Run All.
fraud_df.to_csv('fraud.csv', index=False)

# Database

In [46]:
# Connection parameters for the PostgreSQL server (default 'postgres' database).
# User, password and host can be supplied through the standard PGUSER /
# PGPASSWORD / PGHOST environment variables so credentials do not have to live
# in the notebook.
# NOTE(review): the literal fallbacks are kept only so the notebook still runs
# unchanged on the original machine; remove the password fallback (and rotate
# the password) before sharing this notebook.
server_params = {
    'dbname': 'postgres',
    'user': os.environ.get('PGUSER', 'postgres'),
    'password': os.environ.get('PGPASSWORD', 'welcome1'),
    'host': os.environ.get('PGHOST', 'localhost'),
}

In [47]:
# Connect to the default database ('postgres') to create a new database.
# The connection is switched to autocommit before the cursor is used for the
# CREATE DATABASE statement in the next cell.
conn = psycopg2.connect(**server_params)
conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
cursor = conn.cursor()

In [None]:
# Create the 'fraud' database if it does not exist yet, so the notebook can be
# re-run from the top without failing on "database already exists".
# The server connection is always released, even when a statement fails.
try:
    cursor.execute("SELECT 1 FROM pg_database WHERE datname = %s;", ("fraud",))
    if cursor.fetchone() is None:
        cursor.execute("CREATE DATABASE fraud;")
finally:
    cursor.close()
    conn.close()

In [None]:
# Connection parameters for the newly created 'fraud' database: identical to
# server_params except for the database name, so the credentials are defined
# in one place only.
db_params = {**server_params, 'dbname': 'fraud'}

# Tables

In [None]:
# Connect to the newly created database to create a table.
# conn and cursor are rebound here; every table-creation and load cell below
# uses this connection and commits explicitly.
conn = psycopg2.connect(**db_params)
cursor = conn.cursor()

### customer

In [None]:
# Create the customer table (one row per customer_id).
# IF NOT EXISTS keeps the cell re-runnable against an existing database.
cursor.execute("""
CREATE TABLE IF NOT EXISTS customer (
    customer_id INT4 PRIMARY KEY,
    c_cc_num INT8,
    c_first VARCHAR(50),
    c_last VARCHAR(50),
    c_gender VARCHAR(2),
    c_dob date,
    c_job VARCHAR(70)    
);
""")
conn.commit()

In [None]:
# Extract the customer table from fraud_df: distinct rows of the customer columns
customer_columns = ['customer_id', 'c_cc_num', 'c_first', 'c_last', 'c_gender', 'c_dob', 'c_job']
customer = fraud_df.loc[:, customer_columns].drop_duplicates()
customer.head(3)

In [None]:
# Load to customer table (upsert keyed on customer_id)
insert_query = """
INSERT INTO customer (customer_id, c_cc_num, c_first, c_last, c_gender, c_dob, c_job)
VALUES (%s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (customer_id) DO UPDATE SET
    c_cc_num = EXCLUDED.c_cc_num,
    c_first = EXCLUDED.c_first,
    c_last = EXCLUDED.c_last,
    c_gender = EXCLUDED.c_gender,
    c_dob = EXCLUDED.c_dob,
    c_job = EXCLUDED.c_job;
"""

# One plain tuple per customer row, in the column order of the INSERT statement
data_to_insert = list(
    customer[['customer_id', 'c_cc_num', 'c_first', 'c_last', 'c_gender', 'c_dob', 'c_job']]
    .itertuples(index=False, name=None)
)

# execute_batch sends the statements in pages instead of one round trip per row.
# On failure the transaction is rolled back so the connection stays usable, and
# the error is re-raised so it is not hidden.
try:
    execute_batch(cursor, insert_query, data_to_insert)
    conn.commit()
except psycopg2.Error:
    conn.rollback()
    raise

In [None]:
# Manual recovery cell: roll back the current transaction after a failed
# statement so the connection can be used again.
# NOTE(review): this runs unconditionally on Run All; it looks intended to be
# run by hand only when the load above failed - confirm.
conn.rollback()

### address

In [None]:
# Create the address table (one row per address_id, linked to a customer and a
# city/state pair). IF NOT EXISTS keeps the cell re-runnable against an
# existing database.
cursor.execute("""
CREATE TABLE IF NOT EXISTS address (
    address_id INT4 PRIMARY KEY,
    a_street VARCHAR(50),
    a_zip INT4,
    a_lat FLOAT8,
    a_long FLOAT8,
    customer_id INT4,
    cs_city VARCHAR(50),
    cs_state VARCHAR(3)
);
""")
conn.commit()

In [None]:
# Extract the address table from fraud_df: distinct rows of the address columns
address_columns = ['address_id', 'a_street', 'a_zip', 'a_lat', 'a_long', 'customer_id', 'cs_city', 'cs_state']
address = fraud_df.loc[:, address_columns].drop_duplicates()
address.head(5)

In [None]:
# Load to address table (upsert keyed on address_id)
insert_query = """
INSERT INTO address (address_id, a_street, a_zip, a_lat, a_long, customer_id, cs_city, cs_state)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (address_id) DO UPDATE SET
    a_street = EXCLUDED.a_street,
    a_zip = EXCLUDED.a_zip,
    a_lat = EXCLUDED.a_lat,
    a_long = EXCLUDED.a_long,
    customer_id = EXCLUDED.customer_id,
    cs_city = EXCLUDED.cs_city,
    cs_state = EXCLUDED.cs_state;
"""

# One plain tuple per address row, in the column order of the INSERT statement
data_to_insert = list(
    address[['address_id', 'a_street', 'a_zip', 'a_lat', 'a_long', 'customer_id', 'cs_city', 'cs_state']]
    .itertuples(index=False, name=None)
)

# execute_batch sends the statements in pages instead of one round trip per row.
# On failure the transaction is rolled back so the connection stays usable, and
# the error is re-raised so it is not hidden.
try:
    execute_batch(cursor, insert_query, data_to_insert)
    conn.commit()
except psycopg2.Error:
    conn.rollback()
    raise

In [None]:
# Creation of FK customer_id: address.customer_id -> customer.customer_id.
# Any existing constraint of the same name is dropped first so the cell can be
# re-run without failing on "constraint already exists".
FK_customer_id = """
    ALTER TABLE address
    DROP CONSTRAINT IF EXISTS fk_customer_id;
    ALTER TABLE address
    ADD CONSTRAINT fk_customer_id
    FOREIGN KEY (customer_id)
    REFERENCES customer(customer_id)
    ON DELETE CASCADE;
"""

# Execute the ALTER TABLE statements and commit; roll back on failure so the
# connection is not left in an aborted transaction
try:
    cursor.execute(FK_customer_id)
    conn.commit()
except psycopg2.Error:
    conn.rollback()
    raise

In [None]:
# Rollback the current transaction if there is an error
# Discards any uncommitted work on `conn`.
# NOTE(review): presumably a manual recovery cell, meant to be run only after a
# statement above raised and left the transaction aborted — confirm.
conn.rollback()

### city_state

In [None]:
# Create the city_state table, keyed on the (cs_city, cs_state) pair.
# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE TABLE raises when the
# table is already there and leaves the transaction in an aborted state.
# The state column is spelled cs_state to match the INSERT and FOREIGN KEY
# statements; PostgreSQL folds unquoted identifiers to lower case, so the
# resulting schema is the same as with the former "CS_state" spelling.
cursor.execute("""
CREATE TABLE IF NOT EXISTS city_state (
    cs_city VARCHAR(50),
    cs_state VARCHAR(3),
    cs_city_pop INT4,
    PRIMARY KEY (cs_city, cs_state)
);
""")
conn.commit()

In [None]:
# Project the city/state columns out of fraud_df and keep one row per distinct combination
city_state = fraud_df.loc[
    :,
    ['cs_city', 'cs_state', 'cs_city_pop'],
].drop_duplicates()
# Preview the first three rows
city_state.iloc[:3]

In [None]:
# Load to city_state table
# Upsert keyed on (cs_city, cs_state) so that re-running the cell refreshes the
# population value instead of failing on the primary key.
insert_query = """
INSERT INTO city_state (cs_city, cs_state, cs_city_pop)
VALUES (%s, %s, %s)
ON CONFLICT (cs_city, cs_state) DO UPDATE SET
    cs_city_pop = EXCLUDED.cs_city_pop;
"""

# Inserting city_state df data
# One plain tuple per row, in the same column order as the INSERT placeholders.
# itertuples avoids building a pandas Series for every row, which iterrows does.
data_to_insert = list(
    city_state[
        ['cs_city', 'cs_state', 'cs_city_pop']
    ].itertuples(index=False, name=None)
)

# execute_batch (imported at the top of the notebook) sends the statements in
# pages, which needs far fewer server round trips than executemany.
try:
    execute_batch(cursor, insert_query, data_to_insert, page_size=1000)
    conn.commit()
except Exception:
    # Undo the partial load so the connection stays usable, then surface the error.
    conn.rollback()
    raise

In [None]:
# Creation of FK cs_city and cs_state (address -> city_state composite key)
# PostgreSQL has no "ADD CONSTRAINT IF NOT EXISTS", so any earlier copy of the
# constraint is dropped first; this lets the cell be re-run without a
# "constraint already exists" error.
FK_city_state = """
    ALTER TABLE address
    DROP CONSTRAINT IF EXISTS fk_city_state;

    ALTER TABLE address
    ADD CONSTRAINT fk_city_state
    FOREIGN KEY (cs_city, cs_state)
    REFERENCES city_state (cs_city, cs_state)
    ON DELETE CASCADE;
    """

try:
    # Both statements run in the same transaction and are committed together
    cursor.execute(FK_city_state)
    conn.commit()
except Exception:
    # Keep the connection usable for later cells, then surface the error.
    conn.rollback()
    raise

In [None]:
# Rollback the current transaction if there is an error
# Discards any uncommitted work on `conn`.
# NOTE(review): presumably a manual recovery cell, meant to be run only after a
# statement above raised and left the transaction aborted — confirm.
conn.rollback()

### merchant

In [None]:
# Create the merchant table.
# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE TABLE raises when the
# table is already there and leaves the transaction in an aborted state.
cursor.execute("""
CREATE TABLE IF NOT EXISTS merchant (
    merchant_id INT4 PRIMARY KEY,
    merch_name VARCHAR(50),
    merch_lat FLOAT8,
    merch_long FLOAT8
);
""")
conn.commit()

In [49]:
# Project the merchant columns out of fraud_df and keep one row per distinct combination
merchant = fraud_df.loc[
    :,
    ['merchant_id', 'merch_name', 'merch_lat', 'merch_long'],
].drop_duplicates()
# Preview the first three rows
merchant.iloc[:3]

Unnamed: 0,merchant_id,merch_name,merch_lat,merch_long
0,320,Kirlin and Sons,33.986391,-81.200714
1,592,Sporer-Keebler,39.450498,-109.960431
2,612,"Swaniawski, Nitzsche and Welch",40.49581,-74.196111


In [None]:
# Load to merchant table
# Upsert keyed on merchant_id so that re-running the cell refreshes existing rows
# instead of failing on the primary key.
insert_query = """
INSERT INTO merchant (merchant_id, merch_name, merch_lat, merch_long)
VALUES (%s, %s, %s, %s)
ON CONFLICT (merchant_id) DO UPDATE SET
    merch_name = EXCLUDED.merch_name,
    merch_lat = EXCLUDED.merch_lat,
    merch_long = EXCLUDED.merch_long;
"""

# Inserting merchant df data
# One plain tuple per row, in the same column order as the INSERT placeholders.
# itertuples avoids building a pandas Series for every row, which iterrows does.
data_to_insert = list(
    merchant[
        ['merchant_id', 'merch_name', 'merch_lat', 'merch_long']
    ].itertuples(index=False, name=None)
)

# execute_batch (imported at the top of the notebook) sends the statements in
# pages, which needs far fewer server round trips than executemany.
try:
    execute_batch(cursor, insert_query, data_to_insert, page_size=1000)
    conn.commit()
except Exception:
    # Undo the partial load so the connection stays usable, then surface the error.
    conn.rollback()
    raise

In [None]:
# Rollback the current transaction if there is an error
# Discards any uncommitted work on `conn`.
# NOTE(review): presumably a manual recovery cell, meant to be run only after a
# statement above raised and left the transaction aborted — confirm.
conn.rollback()

### transaction

In [None]:
# Create the transaction table.
# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE TABLE raises when the
# table is already there and leaves the transaction in an aborted state.
# NOTE(review): trans_amt is FLOAT8, a binary floating-point type; confirm that
# exact decimal amounts (NUMERIC) are not required.
cursor.execute("""
CREATE TABLE IF NOT EXISTS transaction (
    trans_num VARCHAR(255) PRIMARY KEY,
    trans_amt FLOAT8,
    trans_category_type VARCHAR(20),
    trans_date_time TIMESTAMP,
    trans_is_fraud BOOL,
    customer_id INT4,
    merchant_id INT4
);
""")
conn.commit()

In [None]:
# Project the transaction columns out of fraud_df and keep one row per distinct combination
transaction = fraud_df.loc[
    :,
    [
        'trans_num', 'trans_amt', 'trans_category_type', 'trans_date_time',
        'trans_is_fraud', 'customer_id', 'merchant_id',
    ],
].drop_duplicates()
# Preview the first three rows
transaction.iloc[:3]

In [None]:
# Number of rows in the de-duplicated transaction frame
count = transaction.shape[0]
count

In [None]:
# Load to transaction table
# Upsert keyed on trans_num so that re-running the cell refreshes existing rows
# instead of failing on the primary key.
insert_query = """
INSERT INTO transaction (trans_num, trans_amt, trans_category_type, trans_date_time, trans_is_fraud, customer_id, merchant_id)
VALUES (%s, %s, %s, %s, %s, %s, %s)
ON CONFLICT (trans_num) DO UPDATE SET
    trans_amt = EXCLUDED.trans_amt,
    trans_category_type = EXCLUDED.trans_category_type,
    trans_date_time = EXCLUDED.trans_date_time,
    trans_is_fraud = EXCLUDED.trans_is_fraud,
    customer_id = EXCLUDED.customer_id,
    merchant_id = EXCLUDED.merchant_id;
"""

# Inserting transaction df data
# One plain tuple per row, in the same column order as the INSERT placeholders.
# itertuples avoids building a pandas Series for every row, which iterrows does.
data_to_insert = list(
    transaction[
        ['trans_num', 'trans_amt', 'trans_category_type', 'trans_date_time',
         'trans_is_fraud', 'customer_id', 'merchant_id']
    ].itertuples(index=False, name=None)
)

# execute_batch (imported at the top of the notebook) sends the statements in
# pages, which needs far fewer server round trips than executemany.
try:
    execute_batch(cursor, insert_query, data_to_insert, page_size=1000)
    conn.commit()
except Exception:
    # Undo the partial load so the connection stays usable, then surface the error.
    conn.rollback()
    raise

In [None]:
# Creation of FKs customer_id and merchant_id on the transaction table
# PostgreSQL has no "ADD CONSTRAINT IF NOT EXISTS", so any earlier copy of each
# constraint is dropped first; this lets the cell be re-run without a
# "constraint already exists" error.
FK_trans_customer_id = """
    ALTER TABLE transaction
    DROP CONSTRAINT IF EXISTS fk_trans_customer_id;

    ALTER TABLE transaction
    ADD CONSTRAINT fk_trans_customer_id
    FOREIGN KEY (customer_id)
    REFERENCES customer (customer_id)
    ON DELETE CASCADE;
    """
FK_trans_merchant_id = """
    ALTER TABLE transaction
    DROP CONSTRAINT IF EXISTS fk_trans_merchant_id;

    ALTER TABLE transaction
    ADD CONSTRAINT fk_trans_merchant_id
    FOREIGN KEY (merchant_id)
    REFERENCES merchant (merchant_id)
    ON DELETE CASCADE;
    """

try:
    # Both constraints are applied in one transaction: either both land or neither does
    cursor.execute(FK_trans_customer_id)
    cursor.execute(FK_trans_merchant_id)
    conn.commit()
except Exception:
    # Keep the connection usable for later cells, then surface the error.
    conn.rollback()
    raise

In [None]:
# Rollback the current transaction if there is an error
# Discards any uncommitted work on `conn`.
# NOTE(review): presumably a manual recovery cell, meant to be run only after a
# statement above raised and left the transaction aborted — confirm.
conn.rollback()

In [85]:
# Clean up the database connections
# The cursor is released first, then the connection.
# NOTE(review): presumably `cursor` was created from `conn` — confirm. Cells
# above that use either object will need a fresh connection before they can be
# re-run after this cell.
cursor.close()
conn.close()