# Probabilistic Data Generator
This code generates synthetic data based on a probabilistic formulation specified in the report.

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as st
import pickle

In [2]:
# Model dimensions
I = 100  # number of customers
J = 100  # number of transactions per customer
K = 2  # number of transaction types
R = 3  # dimensionality of the regression features

# Prior parameters (labelled "Gamma parameter 1" and "Gamma parameter 2")
eta = 1000
zeta = 500

# Customer features: an (I, R) array of standard-normal draws. These are
# random placeholders and can be substituted with features based on true data.
b = np.random.normal(loc=0, scale=1, size=(I, R))

# Hyperparameters (usually learned from data): a (K, R) array of
# exponential draws with scale 0.2.
lambdas = np.random.exponential(0.2, size=(K, R))

In [3]:
# Regression coefficients (usually learned from data): one zero-mean normal
# draw per entry of lambdas, using that entry as the standard deviation, so
# w has the same shape as lambdas.
w = np.random.normal(0, scale=lambdas)

# Probability vector over the types for each customer: a row-wise softmax of
# the scores, one row per customer and one column per type.
# Each row's maximum is subtracted before exponentiating. This leaves the
# softmax mathematically unchanged but keeps np.exp from overflowing to inf
# (which would turn theta into NaN) when the features have a large magnitude.
logits = np.matmul(w, b.T).T
exp_logits = np.exp(logits - np.max(logits, axis=1, keepdims=True))
theta = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

# Sample the per-type model parameters from the prior.
# NOTE(review): eta and zeta are passed as the scale of exponential
# distributions here, although they are described as "Gamma parameters"
# where they are defined - confirm against the report.
phi_alpha = np.random.exponential(eta, K)
phi_beta = np.random.exponential(zeta, K)

In [5]:
# Generate data from model: J transactions for each of the I customers.
# cid, y and x are parallel flat lists with one entry per transaction,
# grouped by customer in increasing customer index.
cid = []
y = []
x = []
for i in range(I):
    # Customer-level gamma parameters: the per-type parameters averaged with
    # the customer's type probabilities as weights.
    alpha = np.dot(phi_alpha, theta[i, :])
    beta = np.dot(phi_beta, theta[i, :])

    # Draw all J transactions of this customer with one sampler call each
    # instead of one call per transaction; .tolist() keeps the list entries
    # plain Python scalars.
    cid.extend([i] * J)
    y.extend(np.random.choice(K, size=J, p=theta[i, :]).tolist())
    # NOTE(review): np.random.gamma takes (shape, scale), so beta acts as a
    # scale (not a rate) here - confirm this matches the report.
    # NOTE(review): x is drawn from the customer-level parameters and does not
    # depend on the sampled type y - confirm this is the intended model.
    x.extend(np.random.gamma(alpha, beta, size=J).tolist())

In [6]:
# Format data
# Turn each flat per-transaction list into a single-column 2-D array.
cid, y, x = (np.vstack(values) for values in (cid, y, x))

# One row per transaction with columns customer id, type, and value.
# Joining the columns into one array gives them a common dtype, which is why
# 'y' is cast back to int below.
data = pd.DataFrame(
    np.concatenate((cid, y, x), axis=1),
    columns=['cid', 'y', 'x'],
)

# Feature vectors as a list with one row of b per customer.
bs = list(b)

# Regroup the transactions into one list per customer, in customer order.
per_customer = [data[data['cid'] == i] for i in range(I)]
x = [rows['x'].tolist() for rows in per_customer]
y = [rows['y'].astype(int).tolist() for rows in per_customer]

In [7]:
# Collect the dimensions, prior parameters, features, generated data and
# hyperparameters in a single dictionary.
# 'lambda' is a reserved word and cannot be written as a keyword argument,
# so it is added through an unpacked mapping; it stays the last key.
model_data = dict(
    I=I,
    J=J,
    K=K,
    R=R,
    eta=eta,
    zeta=zeta,
    b=bs,
    x=x,
    y=y,
    **{'lambda': lambdas},
)

In [8]:
# Serialize the data dictionary to a pickle file in the working directory
with open('./model_data.pickle', 'wb') as f:
    pickle.Pickler(f).dump(model_data)

In [9]:
# Collect the sampled model parameters (regression coefficients, per-type
# parameters and per-customer type probabilities) in a dictionary.
init_dict = dict(
    omega=w,
    phi_alpha=phi_alpha,
    phi_beta=phi_beta,
    theta=theta,
)

In [10]:
# Serialize the model parameters to a pickle file in the working directory
with open('./init_dict.pickle', 'wb') as f:
    pickle.Pickler(f).dump(init_dict)