In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

monday=pd.read_csv('./data/monday.csv', parse_dates=True,index_col=0,sep=';')
tuesday=pd.read_csv('./data/tuesday.csv', parse_dates=True,index_col=0,sep=';')
wednesday=pd.read_csv('./data/wednesday.csv', parse_dates=True,index_col=0,sep=';')
thursday=pd.read_csv('./data/thursday.csv', parse_dates=True,index_col=0,sep=';')
friday=pd.read_csv('./data/friday.csv', parse_dates=True,index_col=0,sep=';')

In [2]:
df=monday.append([tuesday,wednesday,thursday, friday])
df['weekday'] = df.index.day_name()
df['time']=df.index.time
df['customer_no']=df.apply(lambda x:'%s_%s' % (x['customer_no'],x['weekday']),axis=1)

#### Initial state vector

In [3]:
df['first_location'] = df.duplicated('customer_no')
first_location = df[df['first_location'] == False]

In [4]:
#number of customers by first location
initial_state_count=first_location.groupby('location').count()['customer_no']
initial_state_vector=initial_state_count/initial_state_count.sum()

In [5]:
initial_state_vector

location
dairy     0.287576
drinks    0.153526
fruit     0.377435
spices    0.181464
Name: customer_no, dtype: float64

#### Transition probability matrix without resampling

In [6]:
df['next'] = df.groupby(['customer_no'])['location'].shift(-1)
trans_prob_matrix = pd.crosstab(df['location'], df['next'], normalize='index')
trans_prob_matrix

next,checkout,dairy,drinks,fruit,spices
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
dairy,0.391211,0.0,0.223151,0.189925,0.195713
drinks,0.537023,0.027159,0.0,0.219062,0.216756
fruit,0.499511,0.238319,0.136266,0.0,0.125904
spices,0.251,0.323553,0.27314,0.152307,0.0


In [7]:
np.dot(initial_state_vector,trans_prob_matrix)

array([0.42902996, 0.15283247, 0.16516926, 0.1158878 , 0.1370805 ])

#### Transition probability matrix with resampling

In [8]:
resampled = df.groupby('customer_no')[['location']].resample(rule='60S').ffill()
resampled['before'] = resampled['location'].shift(1)

In [9]:
#adding entrance locations instead of the locations which were from previous customers due to resampling
mask1 = resampled['before'] == 'checkout'  
mask2 = resampled['location'] != 'checkout'
resampled.loc[mask1 & mask2, 'before'] = np.nan
resampled['before'].fillna('entrance', inplace=True)

In [21]:
TM = pd.crosstab(resampled['before'], resampled['location'], normalize=1)
TM

location,checkout,dairy,drinks,fruit,spices
before,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
dairy,0.246056,0.736919,0.107341,0.070113,0.145906
drinks,0.282594,0.005964,0.598602,0.067359,0.134756
entrance,0.0,0.120176,0.116903,0.220255,0.21424
fruit,0.344479,0.068696,0.071869,0.597025,0.102899
spices,0.126871,0.068246,0.105285,0.045247,0.402198


#### Number of people not getting to checkout
* number of customers - number of times customers went through checkout point 

In [11]:
locations = df.groupby('location')['customer_no'].count()
len(df['customer_no'].unique()) - locations.iloc[0]

28

### MC Simulation
- use your transition probability matrix to propagate the states of an idealized population
- assume that there are infinite customers, so you can consider a state distribution

In [31]:
#Set an initial state distribution vector with all customers in the entrance. 
#Store the state distribution in a result object (list, DataFrame or similar).
initial_state= pd.DataFrame({'probability':[0.0, 0.0, 0.0, 0.0, 1.0], 
                             'location': ['dairy','drinks','entrance','fruit','spices']})
initial_state.set_index('location',inplace=True)
initial_state

Unnamed: 0_level_0,probability
location,Unnamed: 1_level_1
dairy,0.0
drinks,0.0
entrance,0.0
fruit,0.0
spices,1.0


In [32]:
TM

location,checkout,dairy,drinks,fruit,spices
before,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
dairy,0.246056,0.736919,0.107341,0.070113,0.145906
drinks,0.282594,0.005964,0.598602,0.067359,0.134756
entrance,0.0,0.120176,0.116903,0.220255,0.21424
fruit,0.344479,0.068696,0.071869,0.597025,0.102899
spices,0.126871,0.068246,0.105285,0.045247,0.402198


In [34]:
#Calculate the next state as a dot product of your transition probability matrix P
np.dot(TM,initial_state)

array([[0.14590634],
       [0.13475629],
       [0.2142402 ],
       [0.10289901],
       [0.40219815]])

In [38]:
#Repeat from 2 for a number of steps
next_steps=np.dot(TM,np.dot(TM,np.dot(TM,initial_state)))