In [1]:
!python --version

Python 3.7.12


In [2]:
!pip3 install retentioneering



In [3]:
!pip show retentioneering

Name: retentioneering
Version: 2.0.3.3
Summary: Product analytics and marketing optimization framework based on deep user trajectories analysis
Home-page: https://github.com/retentioneering/retentioneering-tools
Author: Retentioneering User Trajectory Analysis Lab
Author-email: retentioneering@gmail.com
License: Retentioneering Software Non-Exclusive License (License)
Location: /usr/local/lib/python3.7/dist-packages
Requires: altair, decorator, matplotlib, matplotlib, networkx, numba, numpy, pandas, plotly, pymongo, scikit-learn, scipy, seaborn, statsmodels, tqdm, umap-learn, vega
Required-by: 


In [4]:
import retentioneering
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, time
from scipy.stats import chi2
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import IPython

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
experiment1_1 = pd.read_csv('drive/My Drive/experiment1_1.csv')
experiment1_2 = pd.read_csv('drive/My Drive/experiment1_2.csv')
experiment2_1 = pd.read_csv('drive/My Drive/experiment2_1.csv')
experiment2_2 = pd.read_csv('drive/My Drive/experiment2_2.csv')

Посмотрим на датасеты эксперимента 1 - вроде сравнимы, по 5 тыс пользователей и не пересекаются, второй эксперимент по длительности немного больше - но, видимо, добирали нужную базу

In [7]:
experiment1_1

Unnamed: 0,client_id,user_session,timestamp,event,session
0,10505,1,2020-05-01 00:00:00.000000000,main,105051
1,10505,1,2020-05-01 00:00:00.000000000,phones,105051
2,10505,1,2020-05-01 00:00:00.000000000,phones/apple,105051
3,10505,1,2020-05-01 00:00:00.000000000,phones,105051
4,10505,1,2020-05-01 00:00:00.000000000,main,105051
...,...,...,...,...,...
352211,15504,8,2020-08-13 15:27:19.500289692,phones/apple,155048
352212,15504,8,2020-08-13 15:27:19.500289692,phones,155048
352213,15504,8,2020-08-13 15:27:19.500289692,phones/apple,155048
352214,15504,8,2020-08-13 15:27:19.500289692,phones,155048


In [8]:
experiment1_1.dtypes

client_id        int64
user_session     int64
timestamp       object
event           object
session          int64
dtype: object

In [9]:
experiment1_1['client_id'].nunique()

5000

In [10]:
experiment1_2

Unnamed: 0,client_id,user_session,timestamp,event,session
0,15505,1,2020-05-01 00:00:00.000000000,main,155051
1,15505,1,2020-05-01 00:00:00.000000000,laptops,155051
2,15505,1,2020-05-01 00:00:00.000000000,main,155051
3,15505,1,2020-05-01 00:00:00.000000000,phones,155051
4,15505,1,2020-05-01 00:00:00.000000000,lost,155051
...,...,...,...,...,...
354800,20504,5,2020-07-31 17:27:53.704342993,lost,205045
354801,20504,6,2020-09-07 12:39:43.565471364,main,205046
354802,20504,6,2020-09-07 12:39:43.565471364,laptops,205046
354803,20504,6,2020-09-07 12:39:43.565471364,main,205046


In [11]:
experiment1_2.dtypes

client_id        int64
user_session     int64
timestamp       object
event           object
session          int64
dtype: object

In [12]:
experiment1_1['client_id'].nunique()

5000

In [13]:
len(set(experiment1_1['client_id']).difference(set(experiment1_2['client_id'])))

5000

Посмотрим какие события у нас есть, чтобы понять событие, которое отвечает за конверсию. События в датасетах одинаковые (и их количества, кстати, тоже более-менее похожи), за событие конверсии будем считать payment_done

In [14]:
experiment1_1['event'].value_counts()

main                    73526
phones                  61311
phones/apple            45289
lost                    40200
laptops                 36398
item_details            25522
laptops/xiaomi/1234     18242
laptops/xiaomi          17159
phones/apple/123        15013
item_reviews             6616
support                  5260
cart                     2821
fill_user_info           1996
support/message_sent     1939
checkout                  786
payment_done              138
Name: event, dtype: int64

In [15]:
experiment1_2['event'].value_counts()

main                    74269
phones                  61593
phones/apple            45642
lost                    40378
laptops                 36829
item_details            25657
laptops/xiaomi/1234     18479
laptops/xiaomi          17324
phones/apple/123        15362
item_reviews             6485
support                  5201
cart                     2724
fill_user_info           1988
support/message_sent     1858
checkout                  862
payment_done              154
Name: event, dtype: int64

Посмотрим, какое количество уникальных пользователей сконвертировалось в группах

In [16]:
experiment1_1[experiment1_1['event']=='payment_done']['client_id'].nunique()

138

In [17]:
experiment1_2[experiment1_2['event']=='payment_done']['client_id'].nunique()

151

Посмотрим на графы переходов. Они не очень информативны по переходу в покупки, потому что очень мало событий конверсии. Зато мы можем посмотреть, как пользователи уходят в lost

In [19]:
retentioneering.config.update({
    'event_col':'event',
    'event_time_col':'timestamp',
    'user_col': 'client_id'
})

In [23]:
IPython.display.HTML(
experiment1_1.rete.plot_graph(thresh=0.01,
                     targets = {'payment_done':'green',
                                'lost':'red'}
                    )
)

In [36]:
IPython.display.HTML(
experiment1_2.rete.plot_graph(thresh=0.01,
                     targets = {'payment_done':'green',
                                'lost':'red'}
                    )
)

Наконец, оценим, есть ли количественная разница в сконвертировавшихся пользователях, используя метрику хи квадрат (метрика, подходящая для конверсии):

In [None]:
O=np.array([138,151,4862,4849])
T=np.array([145,145,4855,4855])
D=np.sum(np.square(T-O)/T)
pvalue=chi2.sf(D, df=1)
print("distance d: {0}\np-value: {1}".format(D,pvalue))

distance d: 0.6037146205476047
p-value: 0.4371642296367684


p-value сильно больше 0,05, не можем с уверенностью сказать, что изменение страниц в эксперименте значимо повлияло на конверсию в лучшую сторону. Воронки сильно идентичные, даже невооруженным взглядом, и на графах это видно

Посмотрим на результаты второго эксперимента

In [26]:
experiment2_1

Unnamed: 0,client_id,user_session,timestamp,event,session
0,10505,1,2020-05-01 00:00:00.000000000,main,105051
1,10505,1,2020-05-01 00:00:00.000000000,phones,105051
2,10505,1,2020-05-01 00:00:00.000000000,phones/apple,105051
3,10505,1,2020-05-01 00:00:00.000000000,phones,105051
4,10505,1,2020-05-01 00:00:00.000000000,main,105051
...,...,...,...,...,...
352211,15504,8,2020-08-13 15:27:19.500289692,phones/apple,155048
352212,15504,8,2020-08-13 15:27:19.500289692,phones,155048
352213,15504,8,2020-08-13 15:27:19.500289692,phones/apple,155048
352214,15504,8,2020-08-13 15:27:19.500289692,phones,155048


In [27]:
experiment2_2

Unnamed: 0,client_id,user_session,timestamp,event,session
0,20505,1,2020-05-01 00:00:00.000000000,main,205051
1,20505,1,2020-05-01 00:00:00.000000000,laptops,205051
2,20505,1,2020-05-01 00:00:00.000000000,laptops/xiaomi,205051
3,20505,1,2020-05-01 00:00:00.000000000,laptops/xiaomi/1234,205051
4,20505,1,2020-05-01 00:00:00.000000000,item_details,205051
...,...,...,...,...,...
604668,25504,8,2020-11-15 01:14:40.752860170,laptops/xiaomi/1234,255048
604669,25504,8,2020-11-15 01:14:40.752860170,laptops/xiaomi,255048
604670,25504,8,2020-11-15 01:14:40.752860170,laptops,255048
604671,25504,8,2020-11-15 01:14:40.752860170,laptops/xiaomi,255048


In [28]:
experiment2_1['client_id'].nunique()

5000

In [29]:
experiment2_2['client_id'].nunique()

5000

In [30]:
len(set(experiment1_1['client_id']).difference(set(experiment1_2['client_id'])))

5000

In [31]:
experiment2_1['event'].value_counts()

main                    73526
phones                  61311
phones/apple            45289
lost                    40200
laptops                 36398
item_details            25522
laptops/xiaomi/1234     18242
laptops/xiaomi          17159
phones/apple/123        15013
item_reviews             6616
support                  5260
cart                     2821
fill_user_info           1996
support/message_sent     1939
checkout                  786
payment_done              138
Name: event, dtype: int64

Контрольная группа, похоже, та же

In [32]:
experiment2_1[experiment2_1['event']=='payment_done']['client_id'].nunique()

138

In [33]:
experiment2_2['event'].value_counts()

phones                  103255
main                     92897
phones/apple             86718
item_details             65636
laptops                  49822
laptops/xiaomi/1234      40998
lost                     39266
phones/apple/123         35741
laptops/xiaomi           31675
item_reviews             18064
support                  14547
cart                      8582
fill_user_info            6971
support/message_sent      6439
checkout                  3109
payment_done               953
Name: event, dtype: int64

О, что-то интересное

In [34]:
experiment2_2[experiment2_2['event']=='payment_done']['client_id'].nunique()

873

Кто-то даже пару раз покупал. Интересно посмотреть на граф: пользователи теперь не уходят в лост с главной и со страница ноутбуков. Меньшие доли пользователей рапределяются по разделам, и меньшие доли возвращаются из lost на main, то есть посещают сайт повторно, но зато они чаще делают покупки. Большая разница в количестве переходов на item details, нужно оценить, что мы изменили в страницах товара

In [37]:
IPython.display.HTML(
experiment2_2.rete.plot_graph(thresh=0.01,
                     targets = {'payment_done':'green',
                                'lost':'red'}
                    )
)

Посмотрим на матрицы переходов экспериментов 1 и 2

In [40]:
df = experiment1_2.rete.get_adjacency(weight_col='client_id', norm_type='full')

import seaborn as sns

#from IPython.html import widgets
from IPython.display import HTML
def rotateheader():
    return  [
      #  dict(selector="th.col_heading",
    #             props=[("writing-mode", "vertical-rl"), 
     #                   ('transform', 'rotateZ(-45deg)'),
     #                   ])
     dict(selector="th", props=[("font-size", "75%"),
                            ("text-align", "right")
                         
                          ]),
        dict(selector="th.col_heading", props=[("font-size", "75%"),
                            ("text-align", "center"),
                          ("transform", "translate(0%,-140%) rotate(-15deg)")
                          ]),
             dict(selector="caption", props=[("caption-side", "bottom")])
]
 

cm = sns.light_palette("green", as_cmap=True)
print("\n\n\n")
df.style.background_gradient(cmap=cm).set_table_styles(rotateheader()).set_caption(" .")







Unnamed: 0,cart,fill_user_info,item_reviews,lost,support/message_sent,checkout,payment_done,item_details,laptops/xiaomi/1234,phones/apple/123,support,laptops,laptops/xiaomi,main,phones,phones/apple
cart,0.0,0.1132,0.129,0.0606,0.1386,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fill_user_info,0.0352,0.0,0.0,0.0408,0.0944,0.0992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
item_reviews,0.2832,0.0,0.0,0.1088,0.0,0.0,0.0,0.4144,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lost,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9936,0.0,0.0
support/message_sent,0.0776,0.1014,0.0,0.0334,0.0,0.0,0.0,0.0,0.0,0.0,0.0848,0.0,0.0,0.0,0.0,0.0
checkout,0.0,0.0668,0.0,0.0284,0.0,0.0,0.0302,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
payment_done,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027,0.0,0.0
item_details,0.0,0.0,0.537,0.4414,0.0,0.0,0.0,0.0,0.5504,0.5878,0.4884,0.0,0.0,0.0,0.0,0.0
laptops/xiaomi/1234,0.0,0.0,0.0,0.3626,0.0,0.0,0.0,0.7192,0.0,0.0,0.0,0.0,0.5674,0.0,0.0,0.0
phones/apple/123,0.0,0.0,0.0,0.2504,0.0,0.0,0.0,0.6798,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5978


In [41]:
df = experiment2_2.rete.get_adjacency(weight_col='client_id', norm_type='full')

import seaborn as sns

#from IPython.html import widgets
from IPython.display import HTML
def rotateheader():
    return  [
      #  dict(selector="th.col_heading",
    #             props=[("writing-mode", "vertical-rl"), 
     #                   ('transform', 'rotateZ(-45deg)'),
     #                   ])
     dict(selector="th", props=[("font-size", "75%"),
                            ("text-align", "right")
                         
                          ]),
        dict(selector="th.col_heading", props=[("font-size", "75%"),
                            ("text-align", "center"),
                          ("transform", "translate(0%,-140%) rotate(-15deg)")
                          ]),
             dict(selector="caption", props=[("caption-side", "bottom")])
]
 

cm = sns.light_palette("green", as_cmap=True)
print("\n\n\n")
df.style.background_gradient(cmap=cm).set_table_styles(rotateheader()).set_caption(" .")







Unnamed: 0,cart,fill_user_info,item_reviews,lost,support/message_sent,checkout,payment_done,item_details,laptops/xiaomi/1234,phones/apple/123,support,laptops,laptops/xiaomi,main,phones,phones/apple
cart,0.0,0.3134,0.3526,0.1076,0.3574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fill_user_info,0.1252,0.0,0.0,0.0694,0.2944,0.3148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
item_reviews,0.5976,0.0,0.0,0.1706,0.0,0.0,0.0,0.7242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lost,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.994,0.0,0.0
support/message_sent,0.2234,0.3028,0.0,0.061,0.0,0.0,0.0,0.0,0.0,0.0,0.252,0.0,0.0,0.0,0.0,0.0
checkout,0.0,0.2064,0.0,0.0458,0.0,0.0,0.1746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
payment_done,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1502,0.0,0.0
item_details,0.0,0.0,0.8172,0.5342,0.0,0.0,0.0,0.0,0.823,0.8482,0.7722,0.0,0.0,0.0,0.0,0.0
laptops/xiaomi/1234,0.0,0.0,0.0,0.3976,0.0,0.0,0.0,0.9004,0.0,0.0,0.0,0.0,0.8026,0.0,0.0,0.0
phones/apple/123,0.0,0.0,0.0,0.3078,0.0,0.0,0.0,0.8908,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8408


На матрицах тоже видно, что по-разному отрабатывает item_reviews и почему-то возрастают обращения в support из корзины. У нас лучше работает поддержка? Но зачем вообще в неё обращаются?
Хи=квадрат ожидаемо показывает наличие статистической разницы.

In [None]:
O=np.array([138,873,4862,4127])
T=np.array([506,506,4494,4494])
D=np.sum(np.square(T-O)/T)
pvalue=chi2.sf(D, df=1)
print("distance d: {0}\np-value: {1}".format(D,pvalue))

distance d: 593.9254095491397
p-value: 3.508180411512846e-131


Мы получили какое-то бесконечно маленькое plavue. Очевидно, что тестовая страница во втором тесте отлично просто отрабатывает. Что бы мы там ни сделали, нужно вносить такие же изменения. Даже не верится, надо сроки проверить

In [None]:
experiment2_1['timestamp']=pd.to_datetime(experiment2_1['timestamp'])

In [None]:
experiment2_1['date_only']=experiment2_1['timestamp'].dt.date

In [None]:
pd.crosstab(experiment2_1.date_only, experiment2_1.client_id.nunique())

col_0,5000
date_only,Unnamed: 1_level_1
2020-05-01,8598
2020-05-02,9308
2020-05-03,9907
2020-05-04,10285
2020-05-05,10127
...,...
2020-12-26,1149
2020-12-27,1267
2020-12-28,1375
2020-12-29,1282


In [None]:
experiment2_2['timestamp']=pd.to_datetime(experiment2_2['timestamp'])

In [None]:
experiment2_2['date_only']=experiment2_2['timestamp'].dt.date

In [None]:
pd.crosstab(experiment2_2.date_only, experiment2_2.client_id.nunique())

col_0,5000
date_only,Unnamed: 1_level_1
2020-05-01,15946
2020-05-02,16086
2020-05-03,16869
2020-05-04,16849
2020-05-05,18391
...,...
2020-12-26,2190
2020-12-27,1881
2020-12-28,2197
2020-12-29,1986


Как-то в эксперименте 2_2 в 2 раза больше трафика в предновогодние дни. Если мы уверены, что ничего для этого не делали, люди просто возвращались наконец дооформить покупку, то эксперимент удачный. Но надо подумать