## Sample Data Generation

This script generates artificial log files by simulating user activities 

The data is used to train and test different models (like an hidden markov model, recurrent neural networks and generative adversial networks to detect outliers / fraud in user transactional behavior. For example the employee usage of a CRM system to detect data theft or customer behavior in a online shop to sport fraudulent orders.

Simulation is done via Markov Chain Monte Carlo Simulation.

We define a set of possible activities like (start, end, action_1, ..., action_n) and a set of users. Each user belong to one of n classes (eg. normal behavior and fraudulent behavior). For each class we have a 
transition matrix giving the probabilities that a user perform a specific action given his previous action and his state.


In [2]:
import os
import numpy as np
import pandas as pd

In [42]:
# Configuration

# first two actions need to be start and end
actions = ['start', 'end', 'action_1', 'action_2', 'action_3']

n_users = 200

n_states = 1

# Probability that a user is one of these states
p_fraudulent_user = 0.02
p_commit_fraud = 0.25

p_states = [1]

n_activities_per_user_per_day = 100

n_days = 1

transitions = [
    # Normal behavior
    np.array([
        [0.00, 0.00, 1/3,  1/3,  1/3],
        [1.00, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.01, 0.09, 0.45, 0.45],
        [0.00, 0.88, 0.01, 0.10, 0.01],
        [0.00, 0.68, 0.01, 0.30, 0.01],
    ]),
]

fraud_transitions = [
    # Fraudulent Behavior
    np.array([
        [0.00, 0.00, 1.00, 0.00, 0.00],
        [1.00, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.19, 0.80, 0.005, 0.005],
        [0.00, 0.88, 0.01, 0.10, 0.01],
        [0.00, 0.68, 0.01, 0.30, 0.01],
    ])
]

assert len(p_states)==n_states, 'Inconsitent number of states and state probs'
assert np.sum(p_states)==1, 'State probs doesnt sum up to one'
assert len(transitions)==n_states, 'Inconsitent number of transition matrixes and state'
for i in range(n_states):
    assert np.allclose(transitions[i].sum(), len(actions))
    assert np.allclose(transitions[i].cumsum(axis=1)[:,-1],1)
assert np.allclose(fraud_transitions[0].sum(), len(actions))
assert np.allclose(fraud_transitions[0].cumsum(axis=1)[:,-1],1)

In [43]:
# Simulation
def simulate_markov_chain(transition_matrix, actions):
    '''
    Simulate a user session using the corresponding transition matrix
    given the current user state
    
    Parameter:
    
        transitions : list of transition matrixes
        state_id: int of the current user state
        actions: list of all available actions
        
    Returns:
    
        list of artificial activities in one session
    '''
    activities = [actions[0]]
    while activities[-1] != 'end':
        prev_action = actions.index(activities[-1])
        transition_probs = transition_matrix[prev_action]
        next_action = np.random.choice(actions, size=1, p=transition_probs)[0]
        activities.append(next_action)
    return activities


np.random.seed(42)
log_data = []
for user in range(n_users):
    user_class = np.random.choice(np.arange(0, n_states), size=1, p=p_states)[0]
    user_pot_fraud = np.random.binomial(1, p_fraudulent_user)
    if user_pot_fraud == 0:
        transitions_matrices = [transitions[user_class]]
        probs = [1]
    else:
        transitions_matrices = [transitions[user_class], fraud_transitions[0]]
        probs = [1-p_commit_fraud, p_commit_fraud]
    for day in range(n_days):
        commit_fraud = np.random.choice(np.arange(len(transitions_matrices)), size=n_activities_per_user_per_day, p=probs)
        for fraud_flat in commit_fraud:
            activities = simulate_markov_chain(transitions_matrices[fraud_flat], actions)
            log_data.append((user, day, i, activities, user_pot_fraud, fraud_flat))

In [44]:
logfile = pd.DataFrame(log_data, columns=('UserID', 'Day', 'UserSessionId', 'SessionActivity', 'PotentialFraudster', 'FraudulentActivity'))

In [45]:
logfile[logfile.PotentialFraudster==1]

Unnamed: 0,UserID,Day,UserSessionId,SessionActivity,PotentialFraudster,FraudulentActivity
9500,95,0,0,"[start, action_3, end]",1,0
9501,95,0,0,"[start, action_3, end]",1,0
9502,95,0,0,"[start, action_1, action_3, end]",1,0
9503,95,0,0,"[start, action_1, action_3, action_2, end]",1,0
9504,95,0,0,"[start, action_3, action_2, end]",1,0
9505,95,0,0,"[start, action_1, end]",1,1
9506,95,0,0,"[start, action_1, action_1, end]",1,1
9507,95,0,0,"[start, action_2, end]",1,0
9508,95,0,0,"[start, action_3, end]",1,0
9509,95,0,0,"[start, action_3, end]",1,0


In [6]:
%%bash 
mkdir Data

In [46]:
logfile.to_pickle('./Data/logfile_fraudulent_activities.pkl')

In [47]:
id2actions = np.array(['<pad>']+actions)
np.save('./Data/id2action', id2actions)

In [48]:
%%bash

cd Data
ls -lisa

total 12752
8604106216    0 drwxr-xr-x   5 matthiasgroncki  staff      160 Nov  9 07:10 .
8604100559    0 drwxr-xr-x  13 matthiasgroncki  staff      416 Nov  9 07:10 ..
8604106313    8 -rw-r--r--   1 matthiasgroncki  staff      320 Nov  9 07:10 id2action.npy
8604126099 8544 -rw-r--r--   1 matthiasgroncki  staff  4373842 Nov  9 07:10 logfile_fraudulent_activities.pkl
8604106218 4200 -rw-r--r--   1 matthiasgroncki  staff  2147215 Nov  8 16:10 logfile_v1.pkl
