## Sample Data Generation

This script generates artificial log files by simulating user activities 

The data is used to train and test different models (like an hidden markov model, recurrent neural networks and generative adversial networks to detect outliers / fraud in user transactional behavior. For example the employee usage of a CRM system to detect data theft or customer behavior in a online shop to sport fraudulent orders.

Simulation is done via Markov Chain Monte Carlo Simulation.

We define a set of possible activities like (start, end, action_1, ..., action_n) and a set of users. Each user belong to one of n classes (eg. normal behavior and fraudulent behavior). For each class we have a 
transition matrix giving the probabilities that a user perform a specific action given his previous action and his state.


In [1]:
import os
import numpy as np
import pandas as pd

In [58]:
# Configuration

# first two actions need to be start and end
actions = ['start', 'end', 'action_1', 'action_2', 'action_3', 'action_4', 'action_5']

n_users = 200

n_states = 2

# Probability that a user is one of these states
p_fraudulent_user = 0.02
p_commit_fraud = 0.2

p_states = [0.8, 0.2]

n_activities_per_user_per_day = 200

n_days = 1

transitions = [
    # Normal behavior Role 1
    np.array([
        [0.00, 0.00, 0.20, 0.20, 0.20, 0.20, 0.20],
        [1.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.01, 0.09, 0.30, 0.30, 0.15, 0.15],
        [0.00, 0.60, 0.05, 0.10, 0.05, 0.05, 0.15],
        [0.00, 0.50, 0.05, 0.25, 0.05, 0.10, 0.05],
        [0.00, 0.60, 0.01, 0.10, 0.10, 0.10, 0.09],
        [0.00, 0.60, 0.09, 0.10, 0.10, 0.10, 0.01],
    ]),
    # Normal behavior Role 2
    np.array([
        [0.00, 0.00, 0.20, 0.10, 0.10, 0.30, 0.30],
        [1.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.10, 0.20, 0.20, 0.20, 0.10, 0.20],
        [0.00, 0.70, 0.05, 0.05, 0.05, 0.05, 0.10],
        [0.00, 0.70, 0.05, 0.05, 0.05, 0.10, 0.05],
        [0.00, 0.50, 0.01, 0.01, 0.01, 0.10, 0.37],
        [0.00, 0.60, 0.09, 0.10, 0.10, 0.10, 0.01],
    ]),
]

fraud_transitions = [
    # Fraudulent Behavior
    np.array([
        [0.00, 0.00, 1.00, 0.00, 0.00, 0.00, 0.00],
        [1.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00],
        [0.00, 0.20, 0.70, 0.025, 0.025, 0.025, 0.025],
        [0.00, 0.40, 0.40, 0.05, 0.05, 0.05, 0.05],
        [0.00, 0.40, 0.40, 0.05, 0.05, 0.05, 0.05],
        [0.00, 0.50, 0.01, 0.01, 0.01, 0.10, 0.37],
        [0.00, 0.60, 0.09, 0.10, 0.10, 0.10, 0.01],
    ])
]

assert len(p_states)==n_states, 'Inconsitent number of states and state probs'
assert np.sum(p_states)==1, 'State probs doesnt sum up to one'
assert len(transitions)==n_states, 'Inconsitent number of transition matrixes and state'
for i in range(n_states):
    assert np.allclose(transitions[i].sum(), len(actions))
    assert np.allclose(transitions[i].cumsum(axis=1)[:,-1],1)
assert np.allclose(fraud_transitions[0].sum(), len(actions))
assert np.allclose(fraud_transitions[0].cumsum(axis=1)[:,-1],1)

In [59]:
# Simulation
def simulate_markov_chain(transition_matrix, actions):
    '''
    Simulate a user session using the corresponding transition matrix
    given the current user state
    
    Parameter:
    
        transitions : list of transition matrixes
        state_id: int of the current user state
        actions: list of all available actions
        
    Returns:
    
        list of artificial activities in one session
    '''
    activities = [actions[0]]
    while activities[-1] != 'end':
        prev_action = actions.index(activities[-1])
        transition_probs = transition_matrix[prev_action]
        next_action = np.random.choice(actions, size=1, p=transition_probs)[0]
        activities.append(next_action)
    return activities


np.random.seed(42)
log_data = []
for user in range(n_users):
    user_class = np.random.choice(np.arange(0, n_states), size=1, p=p_states)[0]
    user_pot_fraud = np.random.binomial(1, p_fraudulent_user)
    if user_pot_fraud == 0:
        transitions_matrices = [transitions[user_class]]
        probs = [1]
    else:
        transitions_matrices = [transitions[user_class], fraud_transitions[0]]
        probs = [1-p_commit_fraud, p_commit_fraud]
    for day in range(n_days):
        commit_fraud = np.random.choice(np.arange(len(transitions_matrices)), size=n_activities_per_user_per_day, p=probs)
        for fraud_flat in commit_fraud:
            activities = simulate_markov_chain(transitions_matrices[fraud_flat], actions)
            log_data.append((user, day, i, activities, user_class, user_pot_fraud, fraud_flat))

In [60]:
logfile = pd.DataFrame(log_data, columns=('UserID', 'Day', 'UserSessionId', 'SessionActivity', 'UserRole', 'PotentialFraudster', 'FraudulentActivity'))

In [61]:
logfile.shape

(40000, 7)

In [62]:
logfile[logfile.FraudulentActivity==1]

Unnamed: 0,UserID,Day,UserSessionId,SessionActivity,UserRole,PotentialFraudster,FraudulentActivity
10600,53,0,1,"[start, action_1, action_1, action_1, end]",0,1,1
10612,53,0,1,"[start, action_1, action_1, action_1, end]",0,1,1
10620,53,0,1,"[start, action_1, end]",0,1,1
10626,53,0,1,"[start, action_1, end]",0,1,1
10645,53,0,1,"[start, action_1, action_1, action_1, action_1...",0,1,1
10649,53,0,1,"[start, action_1, action_3, action_1, action_5...",0,1,1
10670,53,0,1,"[start, action_1, action_1, action_1, action_1...",0,1,1
10672,53,0,1,"[start, action_1, action_1, end]",0,1,1
10677,53,0,1,"[start, action_1, action_1, action_1, action_1...",0,1,1
10684,53,0,1,"[start, action_1, action_1, action_1, action_1...",0,1,1


In [63]:
%%bash 
mkdir Data

mkdir: Data: File exists


In [64]:
logfile.to_pickle('./Data/logfile.pkl')

In [65]:
id2actions = np.array(['<pad>']+actions)
np.save('./Data/id2action', id2actions)

In [66]:
%%bash

cd Data
ls -lisa

total 33408
8604356875     0 drwxr-xr-x   6 matthiasgroncki  staff       192 Nov 10 20:43 .
8604356849     0 drwxr-xr-x  13 matthiasgroncki  staff       416 Nov 10 20:51 ..
8604356877     8 -rw-r--r--   1 matthiasgroncki  staff       384 Nov 10 20:53 id2action.npy
8604364278 19944 -rw-r--r--   1 matthiasgroncki  staff  10207313 Nov 10 20:53 logfile.pkl
8604356876  9256 -rw-r--r--   1 matthiasgroncki  staff   4736465 Nov 10 16:12 logfile_fraudulent_activities.pkl
8604356878  4200 -rw-r--r--   1 matthiasgroncki  staff   2147215 Nov  8 16:10 logfile_v1.pkl


In [67]:
111/40000

0.002775