# Transaction Amount Generator Using Gaussian Process Regression
This notebook details how to generate realistic transaction amounts using Gaussian Process Regression based on customer information. This method is detailed in the report.

In [1]:
import numpy as np
import pandas as pd
import GPy

In [5]:
# Locations of the two input CSV files, relative to this notebook
customers_path = '../customers.csv'
transactions_path = '../database.csv'

# Read the customer table and the transaction table into DataFrames
customers = pd.read_csv(customers_path)
transactions = pd.read_csv(transactions_path)

In [7]:
# Preview the first five customer rows
customers.head(5)

Unnamed: 0.1,Unnamed: 0,account_id,district_id,frequency,date,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,0,576,55,0,1993-01-01,Brno - venkov,south Moravia,157042,49,70,18,0,9,33.9,8743,1.88,2.43,111,3659,3894
1,1,704,55,0,1993-01-01,Brno - venkov,south Moravia,157042,49,70,18,0,9,33.9,8743,1.88,2.43,111,3659,3894
2,2,192,55,0,1993-01-08,Brno - venkov,south Moravia,157042,49,70,18,0,9,33.9,8743,1.88,2.43,111,3659,3894
3,3,10364,55,0,1993-01-17,Brno - venkov,south Moravia,157042,49,70,18,0,9,33.9,8743,1.88,2.43,111,3659,3894
4,4,497,55,0,1993-04-15,Brno - venkov,south Moravia,157042,49,70,18,0,9,33.9,8743,1.88,2.43,111,3659,3894


In [8]:
# Preview the first five transaction rows
transactions.head(5)

Unnamed: 0.1,Unnamed: 0,account_id,trans_id,category,k-symbol,date,amount,mean_income,Initial Balance
0,0,1,5,0,-1,1995-03-24,122.633333,1358.788889,33.333333
1,1,1,199,29,-1,1995-04-13,420.0,1358.788889,33.333333
2,2,1,3530438,40,2,1995-04-23,0.64,1358.788889,33.333333
3,3,1,6,0,-1,1995-04-30,122.633333,1358.788889,33.333333
4,4,1,200,46,-1,1995-05-13,70.0,1358.788889,33.333333


In [9]:
# Merge the two datasets on account_id and keep only the relevant attributes
relevant_columns = ['account_id', 'category', 'amount', 'mean_income',
                    'A4', 'A11', 'A13', 'A16']
data = customers.merge(transactions, on='account_id')[relevant_columns]

In [10]:
# Derive the crime rate: number of crimes (A16) divided by the population of the area (A4).
# Note: A16 is overwritten in place, so re-running this cell divides by A4 a second time.
data['A16'] = data['A16'].div(data['A4'])

In [11]:
# Select outgoing transactions only: keep rows whose amount is strictly positive
# NOTE(review): relies on outgoing transactions being stored with a positive amount - confirm
is_positive_amount = data['amount'] > 0
data = data.loc[is_positive_amount]

In [13]:
# For every transaction category, draw 200 rows (with replacement) and store
# them in a list of (category, sampled DataFrame) pairs.
# No random_state is passed, so the sampled rows differ from run to run.
strats = [
    (cat, data[data['category'] == cat].sample(n=200, replace=True))
    for cat in data['category'].unique()
]

In [14]:
# Assign X and Y values: one feature matrix and one target column per category.
# `.values` is used instead of `.as_matrix()`, which was deprecated in pandas 0.23
# and removed in pandas 1.0.
feature_columns = ['mean_income', 'A4', 'A11', 'A13', 'A16']
X = []
Y = []
for _, datum in strats:
    # Customer/area features as a 2-D array, one row per sampled transaction
    X.append(datum[feature_columns].values)
    # Transaction amounts as a column vector: (n,) -> (n, 1)
    Y.append(datum['amount'].values[:, None])

In [15]:
# Fit GP models for each category (one independent model per category)
models = []
for XX, YY in zip(X, Y):
    # RBF kernel with ARD enabled. The input dimension is taken from the
    # feature matrix rather than hard-coded, so it stays in sync with the
    # feature columns selected when X was built.
    K = GPy.kern.RBF(XX.shape[1], ARD=True)
    m = GPy.models.GPRegression(XX, YY, K)
    # Optimise the model's hyperparameters before storing it
    m.optimize()
    models.append(m)



In [16]:
# Sample another 200 rows (with replacement) per category for testing purposes.
# Drawn from the same filtered data as the training samples, in the same
# category order; the draw is unseeded, so it differs from run to run.
test_strats = [
    (cat, data[data['category'] == cat].sample(n=200, replace=True))
    for cat in data['category'].unique()
]

In [17]:
# Extract X (customer info) for "test" cases.
# Iterate over test_strats: the original loop read the training samples
# (strats), so the freshly drawn test samples were never used.
# `.values` replaces `.as_matrix()`, which was removed in pandas 1.0.
test_X = []
for _, datum in test_strats:
    test_X.append(datum[['mean_income', 'A4', 'A11', 'A13', 'A16']].values)

In [18]:
# Generate samples from the GP posteriors - These are the synthetic data samples
test_Y = []
for m, XX in zip(models, test_X):
    # Draw a single posterior sample for every test input
    samples = m.posterior_samples(XX, size=1)
    # NOTE(review): newer GPy releases appear to return a 3-D array
    # (n, output_dim, size) instead of (n, size) - confirm against the installed
    # version. Reshaping to (n, -1) yields an (n, 1) column in both cases, so
    # the later vstack/hstack keeps working.
    test_Y.append(np.asarray(samples).reshape(len(XX), -1))



In [19]:
# Format data: build one category label per generated row, then place the
# features, the category label and the generated amount side by side.
category_labels = [cat for cat, _ in strats]
# Each category contributes 200 consecutive rows; shape (n_rows, 1)
categories = np.repeat(category_labels, 200)[:, None]

features = np.vstack(test_X)
amounts = np.vstack(test_Y)

synthetic_data = np.hstack([features, categories, amounts])

synthetic_data.shape

(10000, 7)

In [21]:
# Store synthetic data in a dataframe
synthetic_data = pd.DataFrame(synthetic_data)
# Assign a flat list of names. The original nested list ([[...]]) makes pandas
# build a one-level MultiIndex, so plain lookups such as
# synthetic_data['amount'] no longer return a Series.
synthetic_data.columns = ['mean_income', 'A4', 'A11', 'A13', 'A16', 'category', 'amount']
synthetic_data.head()

Unnamed: 0,mean_income,A4,A11,A13,A16,category,amount
0,3246.595714,138032.0,8819.0,5.66,0.032637,6.0,217.411486
1,2200.562564,51313.0,8930.0,4.2,0.028297,6.0,205.749504
2,2069.008287,88884.0,8507.0,1.85,0.030084,6.0,192.43471
3,3072.19527,157042.0,8743.0,2.43,0.024796,6.0,209.547184
4,3372.832727,87419.0,8624.0,2.66,0.036582,6.0,218.151201


In [22]:
# Save synthetic data as CSV next to this notebook
# (to_csv is called with its defaults, so the row index is written as well)
output_path = './example_synthetic_data.csv'
synthetic_data.to_csv(output_path)