## Project 08. Markov Chain Monte Carlo (MCMC): Predicting and simulating customer behaviour in a supermarket.

## 1. Defining Project Goal 

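The goal is to create a transition probability matrix that describes how customers move through the supermarket: for each section, the probability that a customer is in each section (or at the checkout) one minute later. Because the data is resampled to one-minute steps, the probability of staying in the same section also reflects the duration the customers spent there.  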

Each transition in a Markov Chain happens with a transition probability that is conditional on the present state. These probabilities can be written as a transition probability matrix. 

Markov assumption: the next state depends only on the current state, not on the states visited before it. 

## 2. Importing Libraries 

In [1]:
import os
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

## 3. Getting Data

In [2]:
# Read every CSV file in data/ and stack them into a single frame.
# os.listdir returns entries in arbitrary order, so the names are sorted to make
# the concatenation order (and therefore the row order) reproducible.
df_list = []

for file in sorted(os.listdir('data/')):
    if file.endswith(".csv"):
        # ';'-separated file; 'timestamp' becomes the index and is parsed as dates
        csv_df = pd.read_csv(f'data/{file}', parse_dates=True, sep=';', index_col='timestamp')
        df_list.append(csv_df)

df = pd.concat(df_list)

In [3]:
# order all rows chronologically (rebinding the name instead of mutating in place)
df = df.sort_values(by='timestamp')

In [4]:
df

Unnamed: 0_level_0,customer_no,location
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-09-02 07:03:00,1,dairy
2019-09-02 07:03:00,2,dairy
2019-09-02 07:04:00,3,dairy
2019-09-02 07:04:00,4,dairy
2019-09-02 07:04:00,5,spices
...,...,...
2019-09-06 21:50:00,1509,drinks
2019-09-06 21:50:00,1507,checkout
2019-09-06 21:50:00,1508,checkout
2019-09-06 21:50:00,1496,fruit


In [5]:
df.shape

(24877, 2)

## 4. Data Wrangling

### 4.1. Adding missing 'checkout' values

In [6]:
# Some customers don't have 'checkout' as their last location, so in the data
# it looks as if they never left the shop.
# As preparation, derive time-of-day, weekday and calendar-date columns from the
# timestamp index.
df = df.assign(
    time=df.index.time,
    day=df.index.dayofweek,
    date=df.index.date,
)
df.head()

Unnamed: 0_level_0,customer_no,location,time,day,date
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-09-02 07:03:00,1,dairy,07:03:00,0,2019-09-02
2019-09-02 07:03:00,2,dairy,07:03:00,0,2019-09-02
2019-09-02 07:04:00,3,dairy,07:04:00,0,2019-09-02
2019-09-02 07:04:00,4,dairy,07:04:00,0,2019-09-02
2019-09-02 07:04:00,5,spices,07:04:00,0,2019-09-02


In [7]:
# The same customer_no occurs on more than one day, so build a unique customer id
# of the form "<weekday>_<customer_no>".
day_part = df['day'].astype(str)
number_part = df['customer_no'].astype(str)
df['customer_no'] = day_part + '_' + number_part

In [8]:
df

Unnamed: 0_level_0,customer_no,location,time,day,date
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-09-02 07:03:00,0_1,dairy,07:03:00,0,2019-09-02
2019-09-02 07:03:00,0_2,dairy,07:03:00,0,2019-09-02
2019-09-02 07:04:00,0_3,dairy,07:04:00,0,2019-09-02
2019-09-02 07:04:00,0_4,dairy,07:04:00,0,2019-09-02
2019-09-02 07:04:00,0_5,spices,07:04:00,0,2019-09-02
...,...,...,...,...,...
2019-09-06 21:50:00,4_1509,drinks,21:50:00,4,2019-09-06
2019-09-06 21:50:00,4_1507,checkout,21:50:00,4,2019-09-06
2019-09-06 21:50:00,4_1508,checkout,21:50:00,4,2019-09-06
2019-09-06 21:50:00,4_1496,fruit,21:50:00,4,2019-09-06


In [9]:
# Supermarket closing time, taken as the time of day of the latest timestamp in
# the data. It is needed by the function that adds the missing 'checkout' rows.
last_timestamp = df.index.max()
closing_time = last_timestamp.time()
print(closing_time)

21:50:00


In [10]:
def add_missing_customer_checkout(df, checkout_time=None):
    """Append a 'checkout' row for every customer that has none.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'customer_no' and 'location'. The appended
        rows also set 'time', 'day' and 'date'.
    checkout_time : optional
        Value written to the 'time' column of the appended rows. When omitted,
        the notebook-level variable ``closing_time`` is used, which keeps the
        original call ``add_missing_customer_checkout(df)`` working.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when every customer already has a 'checkout' row.
        Otherwise a new frame with one extra row per affected customer and a
        fresh 0..n-1 index (the original index is discarded). 'day' and 'date'
        of the new rows are NaN and have to be filled by the caller.

    Notes
    -----
    Each affected customer id is printed, in order of first appearance.
    All rows are collected first and appended with a single ``pd.concat``;
    ``DataFrame.append`` no longer exists in pandas 2.x and appending row by
    row copies the whole frame on every iteration.
    """
    if checkout_time is None:
        # fall back to the value computed in the notebook cell above
        checkout_time = closing_time

    # customers that already have at least one 'checkout' row
    checked_out = set(df.loc[df['location'] == 'checkout', 'customer_no'])
    missing_ids = [
        customer_id
        for customer_id in df['customer_no'].unique()
        if customer_id not in checked_out
    ]

    if not missing_ids:
        return df

    for customer_id in missing_ids:
        print(customer_id)

    checkout_rows = pd.DataFrame({
        'customer_no': missing_ids,
        'location': 'checkout',
        'time': checkout_time,
        'day': np.nan,
        'date': np.nan,
    })
    return pd.concat([df, checkout_rows], ignore_index=True)

In [11]:
# append the missing 'checkout' rows; the function prints the id of every affected customer
df = add_missing_customer_checkout(df)

0_1430
0_1433
0_1437
0_1440
0_1439
0_1441
0_1443
0_1445
0_1446
0_1447
1_1411
1_1422
2_1520
2_1527
2_1529
2_1528
2_1530
3_1527
3_1532
3_1533
4_1494
4_1496
4_1500
4_1503
4_1505
4_1506
4_1510
4_1509


In [12]:
df.tail()

Unnamed: 0,customer_no,location,time,day,date
24900,4_1503,checkout,21:50:00,,
24901,4_1505,checkout,21:50:00,,
24902,4_1506,checkout,21:50:00,,
24903,4_1510,checkout,21:50:00,,
24904,4_1509,checkout,21:50:00,,


In [13]:
# checking whether the "checkout" row has been added, using one affected customer as an example
df[df['customer_no']=='4_1503']

Unnamed: 0,customer_no,location,time,day,date
24858,4_1503,dairy,21:47:00,4.0,2019-09-06
24867,4_1503,drinks,21:49:00,4.0,2019-09-06
24900,4_1503,checkout,21:50:00,,


In [14]:
# checking the NaN values 
df.isna().sum()

customer_no     0
location        0
time            0
day            28
date           28
dtype: int64

In [15]:
# Fill the missing 'day' and 'date' of the appended 'checkout' rows by forward fill.
# The fill is done per customer, so a value can never be copied over from a
# different customer's row. (fillna(method='ffill') is deprecated in recent pandas.)
df = df.sort_values(by=['customer_no', 'time'])
filled = df.groupby('customer_no')[['day', 'date']].ffill()
df = df.assign(day=filled['day'], date=filled['date'])

df.isna().sum()

customer_no    0
location       0
time           0
day            0
date           0
dtype: int64

In [16]:
# checking whether day and date of the added "checkout" row are filled now, using the same customer
df[df['customer_no']=='4_1503']

Unnamed: 0,customer_no,location,time,day,date
24858,4_1503,dairy,21:47:00,4.0,2019-09-06
24867,4_1503,drinks,21:49:00,4.0,2019-09-06
24900,4_1503,checkout,21:50:00,4.0,2019-09-06


### 4.2. Resampling data by minute

In [17]:
# rebuild a full timestamp column from the date and time columns
timestamp_text = df['date'].astype(str) + ' ' + df['time'].astype(str)
df['timestamp'] = pd.to_datetime(timestamp_text)

In [18]:
# use the rebuilt timestamp as the index again
df = df.set_index('timestamp')

In [19]:
df.columns

Index(['customer_no', 'location', 'time', 'day', 'date'], dtype='object')

In [20]:
# the timestamp is back in the index, so the separate date and time columns are redundant
df = df.drop(['date', 'time'], axis=1)

In [21]:
df.head(1)

Unnamed: 0_level_0,customer_no,location,day
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-09-02 07:03:00,0_1,dairy,0.0


In [24]:
# group each customer's rows together, in chronological order within the customer
sort_keys = ['customer_no', 'timestamp']
df = df.sort_values(by=sort_keys)
df.head(10)

Unnamed: 0_level_0,customer_no,location,day
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-09-02 07:03:00,0_1,dairy,0.0
2019-09-02 07:05:00,0_1,checkout,0.0
2019-09-02 07:06:00,0_10,fruit,0.0
2019-09-02 07:08:00,0_10,checkout,0.0
2019-09-02 07:56:00,0_100,drinks,0.0
2019-09-02 08:03:00,0_100,checkout,0.0
2019-09-02 17:44:00,0_1000,dairy,0.0
2019-09-02 17:50:00,0_1000,drinks,0.0
2019-09-02 17:51:00,0_1000,checkout,0.0
2019-09-02 17:45:00,0_1001,fruit,0.0


In [25]:
# Resample each customer's rows to one row per minute ('min' is the minute alias;
# the older 'T' spelling is deprecated in recent pandas).
# Minutes without an observation come out as NaN and are forward filled, i.e. the
# customer is assumed to stay at the last seen location.
df = df.groupby('customer_no').resample(rule='min').last().ffill()
df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,customer_no,location,day
customer_no,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_1,2019-09-02 07:03:00,0_1,dairy,0.0
0_1,2019-09-02 07:04:00,0_1,dairy,0.0
0_1,2019-09-02 07:05:00,0_1,checkout,0.0
0_10,2019-09-02 07:06:00,0_10,fruit,0.0
0_10,2019-09-02 07:07:00,0_10,fruit,0.0
0_10,2019-09-02 07:08:00,0_10,checkout,0.0
0_100,2019-09-02 07:56:00,0_100,drinks,0.0
0_100,2019-09-02 07:57:00,0_100,drinks,0.0
0_100,2019-09-02 07:58:00,0_100,drinks,0.0
0_100,2019-09-02 07:59:00,0_100,drinks,0.0


### 4.3. Creating Before-After Location Columns

In [26]:
# the location column becomes the 'before' column
df = df.rename({"location": "before"}, axis="columns")
df.head(1)

Unnamed: 0_level_0,Unnamed: 1_level_0,customer_no,before,day
customer_no,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_1,2019-09-02 07:03:00,0_1,dairy,0.0


In [27]:
# the 'after' column starts out as a copy of the 'before' column
df = df.assign(after=df['before'])
df.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,customer_no,before,day,after
customer_no,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0_1,2019-09-02 07:03:00,0_1,dairy,0.0,dairy
0_1,2019-09-02 07:04:00,0_1,dairy,0.0,dairy
0_1,2019-09-02 07:05:00,0_1,checkout,0.0,checkout
0_10,2019-09-02 07:06:00,0_10,fruit,0.0,fruit
0_10,2019-09-02 07:07:00,0_10,fruit,0.0,fruit


### 4.4. Creating "entrance" value in Before column

In [28]:
# Shift the 'before' column one row down so that each row pairs the previous
# location ('before') with the current one ('after').
# The shift runs over the whole frame, not per customer: the first row of a
# customer receives the last location of the customer above it.
df = df.assign(before=df['before'].shift(1))
df.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,customer_no,before,day,after
customer_no,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0_1,2019-09-02 07:03:00,0_1,,0.0,dairy
0_1,2019-09-02 07:04:00,0_1,dairy,0.0,dairy
0_1,2019-09-02 07:05:00,0_1,dairy,0.0,checkout
0_10,2019-09-02 07:06:00,0_10,checkout,0.0,fruit
0_10,2019-09-02 07:07:00,0_10,fruit,0.0,fruit


In [29]:
# checking the values: nan and checkout will be changed to entrance in 'before' column
df['before'].unique()

array([nan, 'dairy', 'checkout', 'fruit', 'drinks', 'spices'],
      dtype=object)

In [30]:
# Replace 'checkout' and NaN in the 'before' column with 'entrance'.
# The result is assigned back to the column: calling replace/fillna with
# inplace=True on df["before"] acts on a selected Series and does not reliably
# update df (it has no effect under pandas Copy-on-Write).
df["before"] = df["before"].replace({"checkout": "entrance"}).fillna('entrance')

In [31]:
df['before'].unique()

array(['entrance', 'dairy', 'fruit', 'drinks', 'spices'], dtype=object)

In [32]:
df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,customer_no,before,day,after
customer_no,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0_1,2019-09-02 07:03:00,0_1,entrance,0.0,dairy
0_1,2019-09-02 07:04:00,0_1,dairy,0.0,dairy
0_1,2019-09-02 07:05:00,0_1,dairy,0.0,checkout
0_10,2019-09-02 07:06:00,0_10,entrance,0.0,fruit
0_10,2019-09-02 07:07:00,0_10,fruit,0.0,fruit
0_10,2019-09-02 07:08:00,0_10,fruit,0.0,checkout
0_100,2019-09-02 07:56:00,0_100,entrance,0.0,drinks
0_100,2019-09-02 07:57:00,0_100,drinks,0.0,drinks
0_100,2019-09-02 07:58:00,0_100,drinks,0.0,drinks
0_100,2019-09-02 07:59:00,0_100,drinks,0.0,drinks


In [33]:
# there are 7445 customers who entered
df[df['before'] == 'entrance']

Unnamed: 0_level_0,Unnamed: 1_level_0,customer_no,before,day,after
customer_no,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0_1,2019-09-02 07:03:00,0_1,entrance,0.0,dairy
0_10,2019-09-02 07:06:00,0_10,entrance,0.0,fruit
0_100,2019-09-02 07:56:00,0_100,entrance,0.0,drinks
0_1000,2019-09-02 17:44:00,0_1000,entrance,0.0,dairy
0_1001,2019-09-02 17:45:00,0_1001,entrance,0.0,fruit
...,...,...,...,...,...
4_995,2019-09-06 17:17:00,4_995,entrance,4.0,spices
4_996,2019-09-06 17:17:00,4_996,entrance,4.0,spices
4_997,2019-09-06 17:18:00,4_997,entrance,4.0,fruit
4_998,2019-09-06 17:18:00,4_998,entrance,4.0,drinks


In [34]:
# there are 7445 customers who left the shop
df[df['after'] == 'checkout']

Unnamed: 0_level_0,Unnamed: 1_level_0,customer_no,before,day,after
customer_no,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0_1,2019-09-02 07:05:00,0_1,dairy,0.0,checkout
0_10,2019-09-02 07:08:00,0_10,fruit,0.0,checkout
0_100,2019-09-02 08:03:00,0_100,drinks,0.0,checkout
0_1000,2019-09-02 17:51:00,0_1000,drinks,0.0,checkout
0_1001,2019-09-02 17:46:00,0_1001,fruit,0.0,checkout
...,...,...,...,...,...
4_995,2019-09-06 17:18:00,4_995,spices,4.0,checkout
4_996,2019-09-06 17:23:00,4_996,dairy,4.0,checkout
4_997,2019-09-06 17:31:00,4_997,drinks,4.0,checkout
4_998,2019-09-06 17:23:00,4_998,drinks,4.0,checkout


## 5. Transition Matrix

In [35]:
# Transition matrix via crosstab: rows are the 'before' state, columns the 'after' state.
# normalize='index' divides each row by its total, so every row holds probabilities.
matrix = pd.crosstab(index=df['before'], columns=df['after'], normalize='index')
matrix

after,checkout,dairy,drinks,fruit,spices
before,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
dairy,0.103313,0.737023,0.058546,0.049772,0.051347
drinks,0.21563,0.0109,0.59856,0.087918,0.086992
entrance,0.000537,0.287441,0.153392,0.3773,0.18133
fruit,0.201558,0.095744,0.054834,0.597199,0.050665
spices,0.150526,0.193214,0.163109,0.090953,0.402198


In [36]:
matrix.sum(axis=1)

before
dairy       1.0
drinks      1.0
entrance    1.0
fruit       1.0
spices      1.0
dtype: float64

In [37]:
# The probability of entrance -> checkout is not 0.
# There are 4 customers who appear to go straight from the entrance to the checkout.
# NOTE(review): this looks like an artifact rather than real behaviour. All four rows are at
# 21:50:00, the time given to the appended 'checkout' rows, and the per-minute resample keeps
# only the last row of each minute, so a first sighting at 21:50 would be replaced by the
# appended 'checkout'. Confirm against the raw data.
df[(df['before'] == 'entrance') & (df['after'] == 'checkout')]

Unnamed: 0_level_0,Unnamed: 1_level_0,customer_no,before,day,after
customer_no,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0_1446,2019-09-02 21:50:00,0_1446,entrance,0.0,checkout
0_1447,2019-09-02 21:50:00,0_1447,entrance,0.0,checkout
4_1509,2019-09-06 21:50:00,4_1509,entrance,4.0,checkout
4_1510,2019-09-06 21:50:00,4_1510,entrance,4.0,checkout


### 5.1. Adding exit to the matrix 

In [38]:
# Add an 'exit' column: none of the states currently in the matrix leads directly
# to 'exit'. A scalar is broadcast to every row, so this does not depend on the
# matrix having exactly five rows.
matrix['exit'] = 0


In [39]:
matrix

after,checkout,dairy,drinks,fruit,spices,exit
before,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
dairy,0.103313,0.737023,0.058546,0.049772,0.051347,0
drinks,0.21563,0.0109,0.59856,0.087918,0.086992,0
entrance,0.000537,0.287441,0.153392,0.3773,0.18133,0
fruit,0.201558,0.095744,0.054834,0.597199,0.050665,0
spices,0.150526,0.193214,0.163109,0.090953,0.402198,0


In [40]:
# Row for the 'checkout' state: probability 0 for every existing column, 1 for 'exit'.
state_columns = ['checkout', 'dairy', 'drinks', 'fruit', 'spices']
checkout_row = {column: [0] for column in state_columns}
checkout_row['exit'] = [1]


In [41]:
# one-row frame labelled 'checkout', built from the dict above
checkout = pd.DataFrame(checkout_row, index=['checkout'])
checkout

Unnamed: 0,checkout,dairy,drinks,fruit,spices,exit
checkout,0,0,0,0,0,1


In [42]:
# append the 'checkout' row below the existing rows of the transition matrix
matrix = pd.concat([matrix, checkout])

In [43]:
matrix

Unnamed: 0,checkout,dairy,drinks,fruit,spices,exit
dairy,0.103313,0.737023,0.058546,0.049772,0.051347,0
drinks,0.21563,0.0109,0.59856,0.087918,0.086992,0
entrance,0.000537,0.287441,0.153392,0.3773,0.18133,0
fruit,0.201558,0.095744,0.054834,0.597199,0.050665,0
spices,0.150526,0.193214,0.163109,0.090953,0.402198,0
checkout,0.0,0.0,0.0,0.0,0.0,1


In [44]:
list(matrix.loc['checkout'])[0]

0.0

In [45]:
matrix.loc['drinks']*100

checkout    21.562982
dairy        1.089974
drinks      59.856041
fruit        8.791774
spices       8.699229
exit         0.000000
Name: drinks, dtype: float64

In [46]:
# Saving matrix to csv  
# matrix.to_csv('mx.csv')