In [211]:
from ipywidgets import interact
import numpy as np
import pandas as pd

from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import Legend, LegendItem

import warnings
warnings.filterwarnings('ignore')
output_notebook()

## Import Data

In [51]:
# Read the two transaction exports that make up the full history.
data_1 = pd.read_csv('customer_transactions_history-main/transactions_1.csv')
data_2 = pd.read_csv('customer_transactions_history-main/transactions_2.csv')

# Stack both files into a single frame with a fresh 0..n-1 index and discard
# the 'Unnamed: 0' column carried over from the CSV files.
data_all = pd.concat([data_1, data_2], ignore_index=True).drop(columns=['Unnamed: 0'])

# Turn every raw date value into a pd.Timestamp.
data_all['date'] = list(map(pd.Timestamp, data_all['date']))

data_all

Unnamed: 0,customer_id,product_id,date
0,6689489,0A4G5LZWCP,2017-01-31 09:31:08.873000+00:00
1,6689489,XZ6UVEYK40,2017-02-03 16:59:16.610000+00:00
2,6689489,XZ6UVEYK40,2017-02-05 19:47:55.353000+00:00
3,6689489,XZ6UVEYK40,2017-02-05 19:48:14.633000+00:00
4,6689489,XZ6UVEYK40,2017-02-06 09:59:49.053000+00:00
...,...,...,...
975950,9770266,5RPSDK28JR,2019-05-17 10:06:43.037000+00:00
975951,2552305,5RPSDK28JR,2017-10-27 17:49:54.483000+00:00
975952,6004163,I2IYFZIO6X,2019-01-09 16:31:08.133000+00:00
975953,6632569,0A4G5LZWCP,2017-03-29 16:25:05.907000+00:00


## Data slicer

In [None]:
def slice_data(data, start_year, end_year, start_month, end_month):
    """Return the rows of ``data`` whose ``date`` lies in ``[start, end)``.

    ``start`` is midnight UTC on the first day of ``start_month`` in
    ``start_year``; ``end`` is midnight UTC on the first day of ``end_month``
    in ``end_year``. The interval is half-open so that consecutive slices
    neither overlap nor lose a transaction stamped exactly on a boundary.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``date`` column that can be compared against
        timezone-aware (UTC) timestamps.
    start_year, end_year : str or int
        Calendar years of the lower / upper bound, e.g. ``'2018'`` or ``2018``.
    start_month, end_month : str or int
        Calendar months (1-12) of the lower / upper bound, e.g. ``'01'`` or ``1``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the matching rows; ``data`` itself is not modified.
    """
    start = pd.Timestamp(year=int(start_year), month=int(start_month), day=1, tz='UTC')
    end = pd.Timestamp(year=int(end_year), month=int(end_month), day=1, tz='UTC')

    # One combined mask: chaining data[mask_a][mask_b] forces pandas to reindex
    # the second mask against the already-filtered frame (and warn about it).
    in_window = (data['date'] >= start) & (data['date'] < end)

    # Copy only the selected rows so callers can modify the result freely.
    return data.loc[in_window].copy()

# Example: the transactions dated between 1 Jan 2018 and 1 Jan 2019 (UTC).
slice_data(data_all, '2018', '2019', '01', '01')

## Customer purchase frequency

In [113]:
def plot_customer_sale_freq(data):
    """Scatter-plot how many transactions each customer made.

    Customers are placed on the x-axis by rank (position 0 is the customer
    with the most transactions) and their transaction count is the y value.

    Parameters
    ----------
    data : pandas.DataFrame
        Transactions with a ``customer_id`` column; one row per transaction.
    """
    # value_counts() returns the counts sorted from most to least frequent;
    # compute it once and reuse it for both axes.
    purchase_counts = data.customer_id.value_counts()
    n_customers = len(purchase_counts)
    ranks = list(range(n_customers))
    # Fall back to 0 so an empty frame yields an empty plot instead of a crash.
    top_count = int(purchase_counts.max()) if n_customers else 0

    TOOLS = "crosshair,pan,wheel_zoom,box_zoom,reset,box_select,lasso_select"

    # Explicit ranges leave some padding around the points.
    # width/height replace plot_width/plot_height, which Bokeh 3 removed.
    p = figure(
        tools=TOOLS,
        x_range=(-50, (n_customers - 1) + 100),
        y_range=(-500, top_count + 500),
        height=400,
        width=600,
    )
    p.xaxis.axis_label = 'Customer rank (most active first)'
    p.yaxis.axis_label = 'Number of transactions'

    # radius is expressed in data units of the x-axis.
    p.circle(ranks, purchase_counts, radius=7, fill_color='darkslategray', fill_alpha=1.0, line_color=None)

    show(p, notebook_handle=True)

plot_customer_sale_freq(data=data_all)

## Product purchase frequency

In [112]:
def plot_product_sale_freq(data):
    """Scatter-plot how many times each product was sold.

    Products are placed on the x-axis by rank (position 0 is the product with
    the most transactions) and their transaction count is the y value.

    Parameters
    ----------
    data : pandas.DataFrame
        Transactions with a ``product_id`` column; one row per transaction.
    """
    # value_counts() returns the counts sorted from most to least frequent;
    # compute it once and reuse it for both axes.
    sale_counts = data.product_id.value_counts()
    n_products = len(sale_counts)
    ranks = list(range(n_products))
    # Fall back to 0 so an empty frame yields an empty plot instead of a crash.
    top_count = int(sale_counts.max()) if n_products else 0

    TOOLS = "crosshair,pan,wheel_zoom,box_zoom,reset,box_select,lasso_select"

    # Explicit ranges leave some padding around the points.
    # width/height replace plot_width/plot_height, which Bokeh 3 removed.
    p = figure(
        tools=TOOLS,
        x_range=(-3, (n_products - 1) + 3),
        y_range=(-10000, top_count + 10000),
        height=400,
        width=600,
    )
    p.xaxis.axis_label = 'Product rank (best seller first)'
    p.yaxis.axis_label = 'Number of transactions'

    # radius is expressed in data units of the x-axis.
    p.circle(ranks, sale_counts, radius=0.3, fill_color='darkslategray', fill_alpha=1.0, line_color=None)

    show(p, notebook_handle=True)

plot_product_sale_freq(data_all)

## Purchase frequency through time

In [154]:
def plot_product_monthly_sales(year, data, product_id):
    """Bar-chart the monthly number of transactions of one product in one year.

    Parameters
    ----------
    year : int
        Calendar year to plot.
    data : pandas.DataFrame
        Transactions with ``product_id`` and ``date`` columns.
    product_id : str
        Identifier of the product whose sales are counted.
    """
    # slice_data returns a new frame and nothing below mutates it, so no
    # extra defensive copy of the full history is needed.
    year_data = slice_data(data, str(year), str(year + 1), '01', '01')
    stamps = pd.DatetimeIndex(year_data.loc[year_data.product_id == product_id, 'date'])

    # bincount yields one slot per month number; slot 0 is unused because
    # months run 1-12, so it is dropped to leave exactly twelve counts.
    month_sales = np.bincount(np.asarray(stamps.month), minlength=13)[1:].tolist()

    categories = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

    # width/height replace plot_width/plot_height, which Bokeh 3 removed.
    p = figure(title = f'{year}_{product_id}_Sales', x_range=categories, height=400, width=600)
    p.xaxis.axis_label = 'Month'
    p.yaxis.axis_label = 'Number of transactions'
    p.vbar(x=categories, top=month_sales, width=0.9)

    show(p, notebook_handle=True)

plot_product_monthly_sales(year=2018, data = data_all, product_id = '5RPSDK28JR')

## Check seasonality in data

In [217]:
# Monthly transaction counts for the 10 most sold products.

# Months shown on the time axis: every month of 2017-2019 plus Jan-Mar 2020.
year_month = [(year, month) for year in (2017, 2018, 2019) for month in range(1, 13)] + [(2020, 1), (2020, 2), (2020, 3)]
agg_data = []
for product in ['5RPSDK28JR','XZ6UVEYK40','G5HEPH9A2T','0A4G5LZWCP','VD51ZQPY9D','XD4I34ED7F','O66JVFS978','GJROWT9RPV','Z4N23EOA13','I2IYFZIO6X']:
    # Start every month at zero so months without sales still get a value.
    year_month_dict = dict.fromkeys(year_month, 0)

    # Count transactions per (year, month) in one vectorised pass instead of
    # reading .year/.month from every Timestamp in a Python loop.
    stamps = pd.DatetimeIndex(data_all.loc[data_all.product_id == product, 'date'])
    monthly_counts = pd.DataFrame({'year': stamps.year, 'month': stamps.month}).groupby(['year', 'month']).size()
    for key, count in monthly_counts.items():
        # Months outside the plotted window are skipped instead of raising KeyError.
        if key in year_month_dict:
            year_month_dict[key] = int(count)
    agg_data.append(list(year_month_dict.values()))

5RPSDK28JR
XZ6UVEYK40
G5HEPH9A2T
0A4G5LZWCP
VD51ZQPY9D
XD4I34ED7F
O66JVFS978
GJROWT9RPV
Z4N23EOA13
I2IYFZIO6X


In [224]:
from bokeh.palettes import Spectral11

# One categorical x label per tracked month, e.g. '17/1' for January 2017.
# Built once from year_month rather than from the dict left over by the
# aggregation loop.
month_labels = [str(year)[2:] + '/' + str(month) for year, month in year_month]

# width/height replace plot_width/plot_height, which Bokeh 3 removed.
p = figure(width=1200, height=400, x_range=month_labels)
p.xaxis.axis_label = 'Month (yy/m)'
p.yaxis.axis_label = 'Number of transactions'

product_ids = ['5RPSDK28JR','XZ6UVEYK40','G5HEPH9A2T','0A4G5LZWCP','VD51ZQPY9D','XD4I34ED7F','O66JVFS978','GJROWT9RPV','Z4N23EOA13','I2IYFZIO6X']

# agg_data holds one list of monthly counts per product, in the same order as
# product_ids. zip stops at the shortest input, so at most 11 lines (the size
# of the Spectral11 palette) are drawn.
for name, monthly_counts, color in zip(product_ids, agg_data, Spectral11):
    # legend_label replaces the legend= keyword, which Bokeh 3 removed.
    p.line(month_labels, monthly_counts, color=color, legend_label=name)

p.legend.location = "top_right"
show(p, notebook_handle=True)

In [152]:
#pd.Timestamp('2017-01-01T00:00:00Z') + pd.Timedelta(days=-180)

Timestamp('2016-07-05 00:00:00+0000', tz='UTC')

## Plot per customer

In [229]:
# IDs of the ten customers with the most transactions (value_counts sorts descending).
data_all['customer_id'].value_counts().head(10).index

Int64Index([6689489, 5381109, 1756454, 4435859, 8350659, 2819316, 3463352,
            5987313, 7463224, 1634809],
           dtype='int64')

In [230]:
# Inspect every transaction recorded for customer 6689489.
data_all.loc[data_all['customer_id'] == 6689489]

Unnamed: 0,customer_id,product_id,date
0,6689489,0A4G5LZWCP,2017-01-31 09:31:08.873000+00:00
1,6689489,XZ6UVEYK40,2017-02-03 16:59:16.610000+00:00
2,6689489,XZ6UVEYK40,2017-02-05 19:47:55.353000+00:00
3,6689489,XZ6UVEYK40,2017-02-05 19:48:14.633000+00:00
4,6689489,XZ6UVEYK40,2017-02-06 09:59:49.053000+00:00
...,...,...,...
34958,6689489,G5HEPH9A2T,2020-02-07 11:35:26.967000+00:00
34959,6689489,G5HEPH9A2T,2020-02-07 11:35:28.037000+00:00
34960,6689489,G5HEPH9A2T,2020-02-07 11:35:29.663000+00:00
34961,6689489,G5HEPH9A2T,2020-02-07 11:35:30.887000+00:00


In [231]:
# Monthly transaction counts for the 10 most active customers.

# Months shown on the time axis: every month of 2017-2019 plus Jan-Mar 2020.
year_month = [(year, month) for year in (2017, 2018, 2019) for month in range(1, 13)] + [(2020, 1), (2020, 2), (2020, 3)]
agg_data = []
for customer in [6689489, 5381109, 1756454, 4435859, 8350659, 2819316, 3463352, 5987313, 7463224, 1634809]:
    # Start every month at zero so months without purchases still get a value.
    year_month_dict = dict.fromkeys(year_month, 0)

    # Count transactions per (year, month) in one vectorised pass instead of
    # reading .year/.month from every Timestamp in a Python loop.
    stamps = pd.DatetimeIndex(data_all.loc[data_all.customer_id == customer, 'date'])
    monthly_counts = pd.DataFrame({'year': stamps.year, 'month': stamps.month}).groupby(['year', 'month']).size()
    for key, count in monthly_counts.items():
        # Months outside the plotted window are skipped instead of raising KeyError.
        if key in year_month_dict:
            year_month_dict[key] = int(count)
    agg_data.append(list(year_month_dict.values()))

In [233]:
from bokeh.palettes import Spectral11

# One categorical x label per tracked month, e.g. '17/1' for January 2017.
# Built once from year_month rather than from the dict left over by the
# aggregation loop.
month_labels = [str(year)[2:] + '/' + str(month) for year, month in year_month]

# width/height replace plot_width/plot_height, which Bokeh 3 removed.
p = figure(width=1200, height=400, x_range=month_labels)
p.xaxis.axis_label = 'Month (yy/m)'
p.yaxis.axis_label = 'Number of transactions'

customer_ids = [6689489, 5381109, 1756454, 4435859, 8350659, 2819316, 3463352, 5987313, 7463224, 1634809]

# agg_data holds one list of monthly counts per customer, in the same order as
# customer_ids. zip stops at the shortest input, so at most 11 lines (the size
# of the Spectral11 palette) are drawn.
for name, monthly_counts, color in zip(customer_ids, agg_data, Spectral11):
    # legend_label replaces the legend= keyword, which Bokeh 3 removed; it
    # must be a string, hence str().
    p.line(month_labels, monthly_counts, color=color, legend_label=str(name))

p.legend.location = "top_right"
show(p, notebook_handle=True)