In [1]:
import pandas as pd
import json

In [2]:
# Load the raw transactions file, which holds one JSON document per line.
# Iterating the file handle directly avoids building a second full list of
# raw lines with readlines() before parsing. Blank lines (e.g. a trailing
# newline at the end of the file) are skipped so json.loads does not raise
# on them. The encoding is pinned so the result does not depend on the
# platform's default locale.
with open("./data/raw/transactions.txt", "r", encoding="utf-8") as f:
    lines = [json.loads(line) for line in f if line.strip()]

In [3]:
# Build the working frame from the parsed records (one row per JSON line).
df = pd.DataFrame(lines)

In [4]:
# Preview the first rows to sanity-check that the records parsed into columns.
df.head()

Unnamed: 0,accountNumber,customerId,creditLimit,availableMoney,transactionDateTime,transactionAmount,merchantName,acqCountry,merchantCountryCode,posEntryMode,...,echoBuffer,currentBalance,merchantCity,merchantState,merchantZip,cardPresent,posOnPremises,recurringAuthInd,expirationDateKeyInMatch,isFraud
0,737265056,737265056,5000.0,5000.0,2016-08-13T14:27:32,98.55,Uber,US,US,2,...,,0.0,,,,False,,,False,False
1,737265056,737265056,5000.0,5000.0,2016-10-11T05:05:54,74.51,AMC #191138,US,US,9,...,,0.0,,,,True,,,False,False
2,737265056,737265056,5000.0,5000.0,2016-11-08T09:18:39,7.47,Play Store,US,US,9,...,,0.0,,,,False,,,False,False
3,737265056,737265056,5000.0,5000.0,2016-12-10T02:14:50,7.47,Play Store,US,US,9,...,,0.0,,,,False,,,False,False
4,830329091,830329091,5000.0,5000.0,2016-03-24T21:04:46,71.18,Tim Hortons #947751,US,US,2,...,,0.0,,,,True,,,False,False


In [5]:
# Inspect dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 786363 entries, 0 to 786362
Data columns (total 29 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   accountNumber             786363 non-null  object 
 1   customerId                786363 non-null  object 
 2   creditLimit               786363 non-null  float64
 3   availableMoney            786363 non-null  float64
 4   transactionDateTime       786363 non-null  object 
 5   transactionAmount         786363 non-null  float64
 6   merchantName              786363 non-null  object 
 7   acqCountry                786363 non-null  object 
 8   merchantCountryCode       786363 non-null  object 
 9   posEntryMode              786363 non-null  object 
 10  posConditionCode          786363 non-null  object 
 11  merchantCategoryCode      786363 non-null  object 
 12  currentExpDate            786363 non-null  object 
 13  accountOpenDate           786363 non-null  o

In [6]:
# List every column name (the head() preview elides the middle columns).
df.columns

Index(['accountNumber', 'customerId', 'creditLimit', 'availableMoney',
       'transactionDateTime', 'transactionAmount', 'merchantName',
       'acqCountry', 'merchantCountryCode', 'posEntryMode', 'posConditionCode',
       'merchantCategoryCode', 'currentExpDate', 'accountOpenDate',
       'dateOfLastAddressChange', 'cardCVV', 'enteredCVV', 'cardLast4Digits',
       'transactionType', 'echoBuffer', 'currentBalance', 'merchantCity',
       'merchantState', 'merchantZip', 'cardPresent', 'posOnPremises',
       'recurringAuthInd', 'expirationDateKeyInMatch', 'isFraud'],
      dtype='object')

In [7]:
# Print the value counts of every column (NaN included), with a row of
# asterisks after each column so the sections are easy to tell apart.
separator = "*" * 50
for column_name in df.columns:
    counts = df[column_name].value_counts(dropna=False)
    print(column_name, counts)
    print(separator)

accountNumber 380680241    32850
882815134    13189
570884863    10867
246251253    10172
369308035     7229
             ...  
675656700        1
456500351        1
587921563        1
749645399        1
266636812        1
Name: accountNumber, Length: 5000, dtype: int64
**************************************************
customerId 380680241    32850
882815134    13189
570884863    10867
246251253    10172
369308035     7229
             ...  
675656700        1
456500351        1
587921563        1
749645399        1
266636812        1
Name: customerId, Length: 5000, dtype: int64
**************************************************
creditLimit 5000.0     201863
15000.0    139307
7500.0      97913
2500.0      75429
20000.0     68629
10000.0     56889
50000.0     48781
1000.0      36430
250.0       34025
500.0       27097
Name: creditLimit, dtype: int64
**************************************************
availableMoney 250.00      6015
5000.00     5400
15000.00    4254
7500.00     4069
500.

In [8]:
# These columns carry no usable values (they are blank in the preview above),
# so they are removed.
# The frame is rebound instead of mutated with inplace=True, and
# errors="ignore" keeps the cell safe to re-run: an in-place drop raises
# KeyError the second time because the columns are already gone.
df = df.drop(
    columns=[
        "echoBuffer",
        "merchantCity",
        "merchantState",
        "merchantZip",
        "posOnPremises",
        "recurringAuthInd",
    ],
    errors="ignore",
)

In [9]:
# Show any rows where the two identifier columns disagree; an empty result
# means accountNumber and customerId always hold the same value.
df.loc[df["accountNumber"].ne(df["customerId"])]

Unnamed: 0,accountNumber,customerId,creditLimit,availableMoney,transactionDateTime,transactionAmount,merchantName,acqCountry,merchantCountryCode,posEntryMode,...,accountOpenDate,dateOfLastAddressChange,cardCVV,enteredCVV,cardLast4Digits,transactionType,currentBalance,cardPresent,expirationDateKeyInMatch,isFraud


In [10]:
# accountNumber and customerId are identical on every row (the check above
# returned no mismatches), so only customerId is kept.
# Rebinding with errors="ignore" instead of an in-place drop keeps this cell
# re-runnable: a second in-place drop would raise KeyError.
df = df.drop(columns=["accountNumber"], errors="ignore")

In [11]:
# Missing values in this data show up as empty strings rather than NaN,
# so count the "" entries in each column.
for column_name in df.columns:
    blank_count = df[column_name].eq("").sum()
    print(column_name, blank_count)

customerId 0
creditLimit 0
availableMoney 0
transactionDateTime 0
transactionAmount 0
merchantName 0
acqCountry 4562
merchantCountryCode 724
posEntryMode 4054
posConditionCode 409
merchantCategoryCode 0
currentExpDate 0
accountOpenDate 0
dateOfLastAddressChange 0
cardCVV 0
enteredCVV 0
cardLast4Digits 0
transactionType 698
currentBalance 0
cardPresent 0
expirationDateKeyInMatch 0
isFraud 0


In [12]:
# Remove every row that has an empty string in one of the affected columns;
# there are few such rows relative to the size of the data set.
print(df.shape)
columns_with_blanks = [
    "acqCountry",
    "merchantCountryCode",
    "posEntryMode",
    "posConditionCode",
    "transactionType",
]
# One combined mask replaces five successive filtered copies of the frame.
has_blank = df[columns_with_blanks].eq("").any(axis=1)
# The explicit copy makes the result an independent frame, so assigning
# columns to it later does not trigger pandas' SettingWithCopyWarning.
df = df.loc[~has_blank].copy()

(786363, 22)


In [13]:
# Check how many rows remain after removing the blank-value rows.
df.shape


(776668, 22)

In [15]:
# Parse the four date-like string columns into datetime values.
# `df` was produced by boolean filtering, so setting columns on it directly
# is prone to pandas' SettingWithCopyWarning. assign() builds a new frame
# instead, keeping each column in its existing position.
date_columns = [
    "transactionDateTime",
    "accountOpenDate",
    "dateOfLastAddressChange",
    "currentExpDate",
]
df = df.assign(**{name: pd.to_datetime(df[name]) for name in date_columns})


In [16]:
# Re-inspect dtypes: the converted date columns should now be datetime64[ns].
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 776668 entries, 0 to 786362
Data columns (total 22 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   customerId                776668 non-null  object        
 1   creditLimit               776668 non-null  float64       
 2   availableMoney            776668 non-null  float64       
 3   transactionDateTime       776668 non-null  datetime64[ns]
 4   transactionAmount         776668 non-null  float64       
 5   merchantName              776668 non-null  object        
 6   acqCountry                776668 non-null  object        
 7   merchantCountryCode       776668 non-null  object        
 8   posEntryMode              776668 non-null  object        
 9   posConditionCode          776668 non-null  object        
 10  merchantCategoryCode      776668 non-null  object        
 11  currentExpDate            776668 non-null  datetime64[ns]
 12  ac

In [17]:
# Fraud share among transactions where the entered CVV differs from the
# card's CVV. The classes are heavily unbalanced - keep that in mind later on.
df.loc[df["cardCVV"].ne(df["enteredCVV"]), "isFraud"].value_counts(normalize=True)

False    0.971429
True     0.028571
Name: isFraud, dtype: float64

In [18]:
# Overall class balance of the target: fraud is a small minority of the rows.
df["isFraud"].value_counts(normalize=True)

False    0.984593
True     0.015407
Name: isFraud, dtype: float64

In [None]:
# Optional exploratory report. This cell has no execution count, so it was not
# run in the saved notebook.
# NOTE(review): sweetviz is a third-party package imported here rather than in
# the top import cell; it must be installed for this cell to run.
import sweetviz as sv
# Run sweetviz's analysis over the cleaned frame.
my_report = sv.analyze(df)
# Presumably writes an HTML report to disk and opens it - confirm against the
# sweetviz documentation before running in a headless environment.
my_report.show_html()

In [20]:
# Store the column dtypes next to the data, because a CSV file does not
# record them.
df.dtypes.to_csv(path_or_buf="./data/interim/transactions_dtypes.csv", index=True)
# Store the cleaned transactions. The row index (non-contiguous after the
# row filtering) is written as the first column.
df.to_csv(path_or_buf="./data/interim/transactions.csv", index=True)
