In [44]:
import h5py
import numpy as np
import pandas as pd
import json


In [45]:
#generate data
from faker import Faker
from sklearn.utils import shuffle

DataGenerator = Faker()
DataGenerator.random.seed(5467)

def generate_dataset(num_records, ratio, path):
    df = pd.DataFrame(columns=(
        'transaction_id'
        , 'card_id'
        , 'customer_id'
        , 'customer_zipcode'
        , 'merchant_id'
        , 'merchant_name'
        , 'merchant_category'
        , 'merchant_zipcode'
        , 'merchant_country'
        , 'transaction_amount'
        , 'authorization_response_code'
        , 'atm_network_xid'
        , 'cvv_2_response_xflg'
        , 'fraud_label'))

    num_non_fraud = int(round(num_records * (1 - ratio)))
    num_fraud = int(round(num_records * ratio))

    for i in range(num_non_fraud):
        if i%100==0: print("{} / {}".format(i, num_non_fraud))
        row = [DataGenerator.random_int(min=100000, max=999999)
            , DataGenerator.random_int(min=9000, max=9200)
            , DataGenerator.random_int(min=1000, max=1200)
            , DataGenerator.zipcode()
            , DataGenerator.random_int(min=1, max=1000)
            , DataGenerator.company()
            , DataGenerator.random_int(min=1000, max=9999)
            , DataGenerator.zipcode()
            , DataGenerator.bank_country()
            , DataGenerator.random_int(min=1, max=2500)
            , DataGenerator.random.choice(["A", "B", "C", "D"])
            , DataGenerator.random.choice(["A", "B", "C", "D"])
            , DataGenerator.random.choice(['M', 'P', 'N'])
            , 0]
        df.loc[i] = [item for item in row]

#fraud
    for i in range(num_fraud):
        if i%100==0: print("{} / {}".format(i, num_fraud))
        row = [DataGenerator.random_int(min=100000, max=999999)
            , DataGenerator.random_int(min=9000, max=9200)
            , DataGenerator.random_int(min=1000, max=1200)
            , DataGenerator.zipcode()
            , 1011
            , 'Mums Kitchen'
            , 5813
            , 10001
            , DataGenerator.bank_country()
            , DataGenerator.random_int(min=1, max=2500)
            , DataGenerator.random.choice(["A", "B", "C", "D"])
            , DataGenerator.random.choice(["A", "B", "C", "D"])
            , DataGenerator.random.choice(['M', 'P', 'N'])
            , 1]
        df.loc[num_non_fraud+i] = [item for item in row]

    df = shuffle(df)
    df.to_csv(path, sep=',', encoding='utf-8', header=True, index = False,  mode='a')
    return df


In [54]:
df = generate_dataset(1000, 0.1,'transactions.csv')

0 / 900
100 / 900
200 / 900
300 / 900
400 / 900
500 / 900
600 / 900
700 / 900
800 / 900
0 / 100


In [55]:
df=pd.read_csv("transactions.csv")
df

Unnamed: 0,transaction_id,card_id,customer_id,customer_zipcode,merchant_id,merchant_name,merchant_category,merchant_zipcode,merchant_country,transaction_amount,authorization_response_code,atm_network_xid,cvv_2_response_xflg,fraud_label
0,845944,9101,1138,61249,377,"Hatfield, Collier and Vargas",2910,9672,GB,2468,A,B,N,0
1,804464,9086,1054,22402,105,Nelson Inc,2101,50040,GB,1168,C,B,N,0
2,151495,9095,1022,95362,177,Carson-Larson,5369,82338,GB,757,C,C,M,0
3,217427,9114,1139,74968,544,Taylor-Baker,6304,64247,GB,966,B,C,N,0
4,757416,9088,1029,31011,204,Avery-Novak,8011,75734,GB,2395,B,B,P,0
5,157202,9193,1010,57192,872,Levine-Ford,3579,78745,GB,2283,B,B,M,0
6,701451,9172,1008,16728,562,Archer-Clark,5631,10226,GB,1383,D,A,N,0
7,469283,9000,1077,19564,139,Schmidt LLC,3272,63681,GB,2346,C,A,P,0
8,253931,9019,1196,93905,544,Conrad Group,3763,20037,GB,1849,C,C,M,0
9,324251,9056,1040,56485,524,Warren-Anthony,6479,96375,GB,1705,A,C,M,0


now that i have a dataset, i want to extract the metadata then reshape into sequence of transactions.  ultimately we need to adapt meta-data extractor to do this, but for now i will run ludwig with a base transaction oriented model solely for the purposes of generating the metadata.

In [67]:
from ludwig import LudwigModel
import logging
model = LudwigModel({}, model_definition_file='base_model.yaml', logging_level=logging.INFO)

In [68]:
train_stats = model.train(data_csv='transactions.csv')

In [57]:
df = pd.read_csv('transactions.csv')

In [58]:
g = df.groupby(['customer_id'])

In [71]:
customer_id = []
merchant_category = []
merchant_zipcode = []
transaction_amount = []
label = []

#this is quick and dirty just to get the right shape..in reality there are more examples that should be extracted.
for n, grp in g:
    s = grp.reset_index(drop=True)
  
    if s.shape[0] < 6:
        continue
    label.append(s['fraud_label'][0])
    customer_id.append(str(n))
    merchant_category.append(np.array(s['merchant_category'][0:5].astype(str)))
    merchant_zipcode.append(np.array(s['merchant_zipcode'][0:5].astype(str)))
    transaction_amount.append(np.array(s['transaction_amount'][0:5]))


In [61]:
split = np.random.choice([0,1,2], size=len(customer_id), p=[0.8, 0.1, 0.1])

In [63]:
def load_json(data_fp):
    with open(data_fp, 'r') as input_file:
        data = json.load(input_file)
        return data


def load_hdf5(data_fp):
    data = {}
    with h5py.File(data_fp, 'r') as h5_file:
        for key in h5_file.keys():
            data[key] = h5_file[key].value
    return data

In [72]:
#integerize
metadata = load_json('transactions.json')

mc = [[metadata['merchant_category']['str2idx'][a] for a in b] for b in merchant_category]
mz = [[metadata['merchant_zipcode']['str2idx'][a] for a in b] for b in merchant_zipcode]
ci = [metadata['customer_id']['str2idx'][a] for a in customer_id]





In [80]:
metadata = load_json('transactions.json')
with h5py.File('sequence_dataset.hdf5', 'w') as h5_file:
    h5_file.create_dataset('customer_id', data=np.array(ci))
    h5_file.create_dataset('merchant_category', data=np.array(mc))
    h5_file.create_dataset('merchant_zipcode', data=np.array(mz))
    h5_file.create_dataset('transaction_amount', data=np.array(transaction_amount))
    h5_file.create_dataset('fraud_label', data=np.array(label))
    h5_file.create_dataset('split', data=np.array(split))

Now we build the real model.
note that we first need to adjust the metadata file

In [75]:
!cp transactions.json sequence_dataset.json
#edit manually to add ax sequence lengths for all of the features what will be sequences. (this is already done)

In [84]:
model = LudwigModel({}, model_definition_file='model.yaml', logging_level=logging.INFO)

In [85]:
train_stats = model.train(data_hdf5='sequence_dataset.hdf5', train_set_metadata_json='sequence_dataset.json')

Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
This class is equivalent as tf.keras.layers.SimpleRNNCell, and will be replaced by that in Tensorflow 2.0.


Instructions for updating:
This class is equivalent as tf.keras.layers.SimpleRNNCell, and will be replaced by that in Tensorflow 2.0.


Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.


Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.


Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API


Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API


ValueError: Cannot feed value of shape (65, 5) for Tensor 'transaction_amount/transaction_amount_placeholder:0', which has shape '(?, 6)'

In [86]:
train_stats

{'train': OrderedDict([('fraud_label',
               OrderedDict([('loss',
                             [59.635194242670295,
                              23.02798599071717,
                              1.0479783797531985,
                              2.92749182026038,
                              3.6576714676417663,
                              3.727519838997487,
                              3.4118640342455233,
                              2.877492326029231]),
                            ('accuracy',
                             [0.10814606741573034,
                              0.11376404494382023,
                              0.9002808988764045,
                              0.9002808988764045,
                              0.9030898876404494,
                              0.9058988764044944,
                              0.9058988764044944,
                              0.9143258426966292])])),
              ('combined',
               {'loss': [59.635194242670295,
       