In [1]:
import pandas as pd
from sklearn import preprocessing

In [2]:
# Load the interim transactions table: the first CSV column becomes the index
# and the four date columns are parsed into datetimes at read time.
df = pd.read_csv(
    filepath_or_buffer='./data/interim/transactions.csv',
    parse_dates=['transactionDateTime', 'currentExpDate', 'accountOpenDate', 'dateOfLastAddressChange'],
    index_col=0,
)

# Adding Columns

In [3]:
# Small 20-row sample kept around for eyeballing values while working.
df_ref = df.iloc[:20]

In [4]:
# Show the columns available straight after loading, before any features are added.
df.columns

Index(['customerId', 'creditLimit', 'availableMoney', 'transactionDateTime',
       'transactionAmount', 'merchantName', 'acqCountry',
       'merchantCountryCode', 'posEntryMode', 'posConditionCode',
       'merchantCategoryCode', 'currentExpDate', 'accountOpenDate',
       'dateOfLastAddressChange', 'cardCVV', 'enteredCVV', 'cardLast4Digits',
       'transactionType', 'currentBalance', 'cardPresent',
       'expirationDateKeyInMatch', 'isFraud'],
      dtype='object')

In [5]:
# Account age, in whole days, at the moment of each transaction.
df['accountAge'] = df['transactionDateTime'].sub(df['accountOpenDate']).dt.days

In [6]:
# Row counts for every (transactionType, isFraud) combination.
df.value_counts(subset=['transactionType', 'isFraud'])

transactionType       isFraud
PURCHASE              False      725108
ADDRESS_VERIFICATION  False       19856
REVERSAL              False       19738
PURCHASE              True        11528
REVERSAL              True          324
ADDRESS_VERIFICATION  True          114
dtype: int64

In [7]:
# Share of fraud / non-fraud among rows where the entered CVV differs from the card CVV.
df.loc[df['cardCVV'].ne(df['enteredCVV']), 'isFraud'].value_counts(normalize=True)

False    0.971429
True     0.028571
Name: isFraud, dtype: float64

In [8]:
# Flag whether the CVV typed at checkout equals the CVV on the card.
df['cvvMatch'] = df['cardCVV'].eq(df['enteredCVV'])

In [9]:
# Whole days between the last address change and the transaction.
df['sinceDateOfLastAddressChange'] = df['transactionDateTime'].sub(df['dateOfLastAddressChange']).dt.days

In [10]:
# Flag whether the acquirer country equals the merchant country code.
df['countryMatch'] = df['acqCountry'].eq(df['merchantCountryCode'])

In [11]:
# General date properties derived from the transaction timestamp
df['dayOfMonth'] = df.transactionDateTime.dt.day
df['month'] = df.transactionDateTime.dt.month
df['dayOfYear'] = df.transactionDateTime.dt.dayofyear
# `Series.dt.weekofyear` is deprecated (it raises a FutureWarning and is removed
# in pandas 2.0); `dt.isocalendar().week` is the replacement. It returns a
# nullable UInt32 column, so cast back to int64 to keep the dtype the old
# accessor produced. NOTE: the cast assumes transactionDateTime has no missing
# values - confirm against the interim data.
df['weekOfYear'] = df.transactionDateTime.dt.isocalendar().week.astype('int64')
df['dayOfWeek'] = df.transactionDateTime.dt.dayofweek
df['quarter'] = df.transactionDateTime.dt.quarter
df['hour'] = df.transactionDateTime.dt.hour

  df['weekOfYear'] = df.transactionDateTime.dt.weekofyear


In [12]:
# True for Monday-Friday (dayOfWeek 0-4), False for Saturday/Sunday.
df['weekday'] = df['dayOfWeek'].lt(5)

In [13]:
# Split the observed hour range into three equal-width bands labelled 0, 1, 2.
df['timeOfDay'] = pd.cut(x=df['hour'], bins=3, labels=[0, 1, 2])

## Aggregations based on customer ID

In [14]:
# df = df.sort_values('transactionDateTime').reset_index()
# df['cumsum(isFraud)'] = (df.groupby('customerId')['isFraud'].cumsum(axis=0))

In [15]:
# Per-customer running fraud rate. Each group is shifted by one row before the
# expanding mean, so a row only sees the rows that precede it and never its own label.
# A customer's first row has no history and is filled with 0.
#
# group_keys=False keeps the result on df's original index. Without it, newer
# pandas (>= 2.0) prepends customerId to the index of this transform-like apply
# and the result can no longer be aligned back onto df.
#
# NOTE: "past" here means earlier rows in df's current order; no sort by
# transactionDateTime happens in this cell - confirm the interim file is
# already in chronological order per customer.
df['cumMean'] = (
    df
    .groupby('customerId', group_keys=False)['isFraud']
    .apply(lambda x: x.shift().expanding().mean())
    .fillna(0)
    )

In [16]:
# Spot-check cumMean on the fraudulent transactions of one customer.
df.loc[
    df['customerId'].eq(152973583) & df['isFraud'].eq(True),
    ['transactionDateTime', 'isFraud', 'cumMean'],
]

Unnamed: 0,transactionDateTime,isFraud,cumMean
616074,2016-01-03 14:26:37,True,0.0
616402,2016-02-23 09:53:21,True,0.002924
616559,2016-03-21 23:53:41,True,0.004008
616730,2016-04-15 11:24:47,True,0.004484
616889,2016-05-09 13:24:22,True,0.004848
617040,2016-06-04 07:42:22,True,0.005128
617041,2016-06-04 07:43:58,True,0.006148
617384,2016-07-24 14:52:14,True,0.005315
617431,2016-07-31 21:38:16,True,0.005865
617473,2016-08-08 18:49:58,True,0.006406


# Encoding

In [17]:
# import sweetviz as sv
# my_report = sv.analyze(df)
# my_report.show_html() 

In [18]:
# Show the full column list again now that the engineered features have been added.
df.columns

Index(['customerId', 'creditLimit', 'availableMoney', 'transactionDateTime',
       'transactionAmount', 'merchantName', 'acqCountry',
       'merchantCountryCode', 'posEntryMode', 'posConditionCode',
       'merchantCategoryCode', 'currentExpDate', 'accountOpenDate',
       'dateOfLastAddressChange', 'cardCVV', 'enteredCVV', 'cardLast4Digits',
       'transactionType', 'currentBalance', 'cardPresent',
       'expirationDateKeyInMatch', 'isFraud', 'accountAge', 'cvvMatch',
       'sinceDateOfLastAddressChange', 'countryMatch', 'dayOfMonth', 'month',
       'dayOfYear', 'weekOfYear', 'dayOfWeek', 'quarter', 'hour', 'weekday',
       'timeOfDay', 'cumMean'],
      dtype='object')

In [19]:
# Credit Limit - should be fine the way it is
# {k: v for k, v in enumerate(np.sort(df.creditLimit.unique()))}

In [20]:
# availableMoney - replace the raw amount with one of nine ordinal bands (0-8)
df['availableMoney'] = pd.cut(
    x=df['availableMoney'],
    bins=[-5000, -1000, -500, -100, 0, 100, 500, 1000, 5000, 50000],
    labels=[0, 1, 2, 3, 4, 5, 6, 7, 8],
)

In [21]:
# transactionDateTime - drop

In [22]:
# transactionAmount - replace the raw amount with its quartile (0 = lowest, 3 = highest)
df['transactionAmount'] = pd.qcut(x=df['transactionAmount'], q=4, labels=[0, 1, 2, 3])

In [23]:
# merchantName - remove the individual locations (e.g. "AMC #010101" -> "AMC") so
# all branches of a merchant share one name; this choice could be revisited later.
# regex=True is required: the pattern is a regular expression, and without the
# flag newer pandas treats it as a literal string and strips nothing.
df['merchantName'] = df.merchantName.str.replace(r'\s#.*$', '', regex=True)
# Encode labels to ints
df['merchantName'] = preprocessing.LabelEncoder().fit_transform(df.merchantName)

  df['merchantName'] = df.merchantName.str.replace(r'\s#.*$', '')


In [24]:
# Encode each remaining categorical column's labels to ints.
# A fresh LabelEncoder is fitted per column, so codes are independent between columns.
for column in [
    'acqCountry',
    'merchantCountryCode',
    'posEntryMode',
    'posConditionCode',
    'merchantCategoryCode',
    'transactionType',
]:
    df[column] = preprocessing.LabelEncoder().fit_transform(df[column])



In [25]:
# Booleans -> 0/1 integers.
# Each column is cast from its own values. The earlier version assigned
# cardPresent to `weekday` (copy-paste slip), which silently destroyed the
# weekday/weekend feature; looping over the names prevents that kind of mistake.
for column in [
    'cardPresent',
    'expirationDateKeyInMatch',
    'isFraud',
    'cvvMatch',
    'countryMatch',
    'weekday',
]:
    df[column] = df[column].astype('int')

In [26]:
# account age - leave alone for now
# sinceDateOfLastAddressChange - leave alone for now
# dayOfMonth, month, dayOfYear, weekOfYear, dayOfWeek - leave alone for now
# quarter, hour - leave alone for now
# timeOfDay - leave alone for now


In [27]:
# cumMean and customerId - undecided for now

In [28]:
# Check how the rows spread over the availableMoney bands, counting NaN as well.
df['availableMoney'].value_counts(dropna=False)

7    318444
8    273283
5     92224
6     73165
4     16656
3      1534
2      1279
1        82
0         1
Name: availableMoney, dtype: int64

In [29]:
# Drop the raw columns that are not kept as features:
#   - transactionDateTime, currentExpDate, accountOpenDate, dateOfLastAddressChange
#   - cardCVV, enteredCVV
#   - cardLast4Digits (might be useful grouped by customerId; dropping for now)
df = df.drop(
    columns=[
        'transactionDateTime',
        'currentExpDate',
        'accountOpenDate',
        'dateOfLastAddressChange',
        'cardCVV',
        'enteredCVV',
        'cardLast4Digits',
    ]
)

In [30]:
# Write the column dtypes next to the data so a later load can reuse them.
df.dtypes.to_csv(path_or_buf='./data/processed/transactions_dtypes.csv')

In [31]:
# Write the processed table (index included) to the processed-data folder.
df.to_csv(path_or_buf="./data/processed/transactions.csv")