# Fun with funnels
## Logs API

`Logs API` позволяет выгрузить сырые данные со счетчика.

Документация по Logs API - https://tech.yandex.ru/metrika/doc/api2/logs/intro-docpage/

Данные для этого кейса также доступны на Яндекс.Диске - https://yadi.sk/d/XJMDdTDmDO-c9g

### Шаг 1: получаем токен
Для работы с API необходимо получить свой токен - https://tech.yandex.ru/oauth/doc/dg/tasks/get-oauth-token-docpage/

Создаем приложение тут (указываем права для чтения в Яндекс.Метрике) - https://oauth.yandex.ru/client/new

Переходим по ссылке вида - `https://oauth.yandex.ru/authorize?response_type=token&client_id=<идентификатор приложения>`

In [3]:
# Read the OAuth token from a local file; surrounding whitespace is stripped.
with open('token.txt') as token_file:
    TOKEN = token_file.read().strip()

### Шаг 2: проверяем, можно ли создать запрос в Logs API

In [4]:
import requests
import pandas as pd
import StringIO
import datetime
import json
from urllib import urlencode

import sys
# Python 2 only: sys.setdefaultencoding() is not reachable on the already
# imported module, so the module is reloaded before calling it.
# The current standard streams are saved first and restored after the reload
# (presumably because reload(sys) would otherwise replace the notebook's
# stream objects — TODO confirm).
stdin, stdout, stderr = sys.stdin, sys.stdout, sys.stderr
reload(sys)
sys.stdin, sys.stdout, sys.stderr = stdin, stdout, stderr
# Use UTF-8 for implicit str <-> unicode conversions.
sys.setdefaultencoding('utf8')

In [5]:
# Base URL of the Metrica API and the counter whose logs are requested.
API_HOST = 'https://api-metrika.yandex.ru'
COUNTER_ID = 51017531

# Date range of the export and the log source.
START_DATE = '2018-07-01'
END_DATE = '2018-07-31'
SOURCE = 'hits'

# Fields requested from the Logs API.
API_FIELDS = (
    'ym:pv:date',
    'ym:pv:dateTime',
    'ym:pv:URL',
    'ym:pv:deviceCategory',
    'ym:pv:operatingSystemRoot',
    'ym:pv:clientID',
    'ym:pv:browser',
    'ym:pv:lastTrafficSource',
)


In [6]:
# Ask the Logs API whether a request with these parameters can be created.
evaluate_args = [
    ('date1', START_DATE),
    ('date2', END_DATE),
    ('source', SOURCE),
    ('fields', ','.join(API_FIELDS)),
    ('oauth_token', TOKEN),
]
url_params = urlencode(evaluate_args)

evaluate_endpoint = '{host}/management/v1/counter/{counter_id}/logrequests/evaluate?'.format(
    host=API_HOST, counter_id=COUNTER_ID)
url = evaluate_endpoint + url_params

r = requests.get(url)

In [7]:
r.status_code

200

In [8]:
json.loads(r.text)['log_request_evaluation']

{u'max_possible_day_quantity': 2847, u'possible': True}

### Шаг 3: создаем запрос

In [9]:
# Create the log request; the field list is sorted case-insensitively.
sorted_fields = sorted(API_FIELDS, key=lambda name: name.lower())
create_args = [
    ('date1', START_DATE),
    ('date2', END_DATE),
    ('source', SOURCE),
    ('fields', ','.join(sorted_fields)),
    ('oauth_token', TOKEN),
]
url_params = urlencode(create_args)

create_endpoint = '{host}/management/v1/counter/{counter_id}/logrequests?'.format(
    host=API_HOST, counter_id=COUNTER_ID)
url = create_endpoint + url_params

r = requests.post(url)

In [10]:
r.status_code

200

In [11]:
json.loads(r.text)['log_request']

{u'counter_id': 51017531,
 u'date1': u'2018-07-01',
 u'date2': u'2018-07-31',
 u'fields': [u'ym:pv:browser',
  u'ym:pv:clientID',
  u'ym:pv:date',
  u'ym:pv:dateTime',
  u'ym:pv:deviceCategory',
  u'ym:pv:lastTrafficSource',
  u'ym:pv:operatingSystemRoot',
  u'ym:pv:URL'],
 u'request_id': 1706834,
 u'source': u'hits',
 u'status': u'created'}

In [12]:
request_id = json.loads(r.text)['log_request']['request_id']

### Шаг 4: ждем окончания обработки

In [40]:
# Poll the log request once a minute until it leaves the 'created' state.
# `time` is not among the modules imported earlier in the notebook, so it is
# imported here; without it time.sleep() raises NameError.
import time

# The status URL does not change between iterations, so build it once.
url = '{host}/management/v1/counter/{counter_id}/logrequest/{request_id}?oauth_token={token}' \
        .format(request_id=request_id,
                counter_id=COUNTER_ID,
                token=TOKEN,
                host=API_HOST)

status = 'created'
while status == 'created':
    time.sleep(60)
    print('trying')

    r = requests.get(url)
    if r.status_code != 200:
        # Surface the API error body instead of looping on a failed call.
        raise ValueError(r.text)

    # Parse the response once and reuse it for both the status and the dump.
    log_request = json.loads(r.text)['log_request']
    status = log_request['status']
    print(json.dumps(log_request, indent = 4))

trying
{
    "date1": "2018-07-01", 
    "status": "created", 
    "date2": "2018-07-31", 
    "counter_id": 51017531, 
    "fields": [
        "ym:pv:browser", 
        "ym:pv:clientID", 
        "ym:pv:date", 
        "ym:pv:dateTime", 
        "ym:pv:deviceCategory", 
        "ym:pv:lastTrafficSource", 
        "ym:pv:operatingSystemRoot", 
        "ym:pv:URL"
    ], 
    "source": "hits", 
    "request_id": 1706834
}
trying
{
    "date1": "2018-07-01", 
    "status": "processed", 
    "date2": "2018-07-31", 
    "counter_id": 51017531, 
    "fields": [
        "ym:pv:browser", 
        "ym:pv:clientID", 
        "ym:pv:date", 
        "ym:pv:dateTime", 
        "ym:pv:deviceCategory", 
        "ym:pv:lastTrafficSource", 
        "ym:pv:operatingSystemRoot", 
        "ym:pv:URL"
    ], 
    "source": "hits", 
    "parts": [
        {
            "part_number": 0, 
            "size": 83750280
        }, 
        {
            "part_number": 1, 
            "size": 136
        }
    

In [41]:
json.loads(r.text)['log_request']

{u'counter_id': 51017531,
 u'date1': u'2018-07-01',
 u'date2': u'2018-07-31',
 u'fields': [u'ym:pv:browser',
  u'ym:pv:clientID',
  u'ym:pv:date',
  u'ym:pv:dateTime',
  u'ym:pv:deviceCategory',
  u'ym:pv:lastTrafficSource',
  u'ym:pv:operatingSystemRoot',
  u'ym:pv:URL'],
 u'parts': [{u'part_number': 0, u'size': 83750280},
  {u'part_number': 1, u'size': 136}],
 u'request_id': 1706834,
 u'size': 83750416,
 u'source': u'hits',
 u'status': u'processed'}

In [42]:
parts = json.loads(r.text)['log_request']['parts']
parts

[{u'part_number': 0, u'size': 83750280}, {u'part_number': 1, u'size': 136}]

### Шаг 5: выгружаем данные

In [43]:
# Download every part of the processed request and combine the parts into a
# single DataFrame. Each part is parsed as tab-separated text.
tmp_dfs = []
for part in parts:
    part_num = part['part_number']
    url = '{host}/management/v1/counter/{counter_id}/logrequest/{request_id}/part/{part}/download?oauth_token={token}' \
            .format(
                host=API_HOST,
                counter_id=COUNTER_ID,
                request_id=request_id,
                part=part_num,
                token=TOKEN
            )

    r = requests.get(url)
    if r.status_code != 200:
        # Same exception style as upload(); also valid on Python 3.
        raise ValueError(r.text)

    tmp_dfs.append(pd.read_csv(StringIO.StringIO(r.text), sep = '\t'))

logs_df = pd.concat(tmp_dfs)

In [44]:
logs_df.shape

(757936, 8)

In [45]:
logs_df.to_csv('matemarketing_case_data.csv', sep = '\t', index = False)

## ClickHouse
### Установка 

В первую очередь нам понадобится ClickHouse.
ClickHouse можно установить на Linux и macOS (подробно про установку написано в документации https://clickhouse.yandex/).

Также удобно запустить CH в Docker'e: первая команда поднимает clickhouse-server на порту 8123, а вторая - позволяет подключиться к консольному ClickHouse клиенту.
```
docker run -d --name clickhouse-server --publish=8123:8123 --publish=9000:9000 yandex/clickhouse-server
docker run -it --rm --link clickhouse-server:9000 yandex/clickhouse-client --host clickhouse-server
```

Из библиотек для python нам понадобятся:
* `requests`
* `plotly`
* `pandas`

###  Функции для интеграции с ClickHouse

Напишем функции для интеграции с ClickHouse: первая функция просто возвращает результат запроса к базе данных в виде текста, вторая же преобразует его в pandas DataFrame.

Также напишем сразу удобную функцию для загрузки данных.

In [46]:
# Address of the ClickHouse HTTP endpoint; default `host` of the helpers below.
CH_HOST = 'http://localhost:8123'
# Raise pandas' column display width limit to 1000 characters.
pd.set_option('display.max_colwidth', 1000)

def get_clickhouse_data(query, host = CH_HOST, connection_timeout = 1500):
    """Run ``query`` against ClickHouse over HTTP and return the response text.

    The query is sent as the ``query`` URL parameter of a POST request to
    ``host``; ``connection_timeout`` is passed to requests as ``timeout``.

    Returns:
        The response body as text.

    Raises:
        ValueError: if the response status is not 200; the message is the
            response body.
    """
    r = requests.post(host, params = {'query': query}, timeout = connection_timeout)
    if r.status_code != 200:
        # Call form of raise, consistent with upload() and valid on Python 3.
        raise ValueError(r.text)
    return r.text
        
def get_clickhouse_df(query, host = CH_HOST, connection_timeout = 1500):
    """Run ``query`` via get_clickhouse_data and parse the result as a DataFrame.

    The response is read as tab-separated text with pandas' default header
    handling, so the query is expected to return a header row
    (e.g. ``FORMAT TabSeparatedWithNames``).
    """
    raw_tsv = get_clickhouse_data(query, host, connection_timeout)
    buffer = StringIO.StringIO(raw_tsv)
    return pd.read_csv(buffer, sep = '\t')

def upload(table, content, host=CH_HOST):
    """Insert ``content`` into ``table`` using the TabSeparatedWithNames format.

    ``content`` is UTF-8 encoded and sent as the POST body, while the INSERT
    statement goes in the ``query`` URL parameter.

    Returns:
        The response body as text.

    Raises:
        ValueError: if the response status is not 200; the message is the
            response body.
    """
    payload = content.encode('utf-8')
    insert_statement = 'INSERT INTO ' + table + ' FORMAT TabSeparatedWithNames '
    r = requests.post(host, data=payload, params={'query': insert_statement})
    if r.status_code != 200:
        raise ValueError(r.text)
    return r.text

### Загружаем данные

In [47]:
logs_df.head()

Unnamed: 0,ym:pv:browser,ym:pv:clientID,ym:pv:date,ym:pv:dateTime,ym:pv:deviceCategory,ym:pv:lastTrafficSource,ym:pv:operatingSystemRoot,ym:pv:URL
0,safari_mobile,17078820655649821006,2018-07-27,2018-07-27 09:00:46,3,ad,ios_double,https://supermarket.ru/product
1,ucbrowser,10376593093900581806,2018-07-27,2018-07-27 19:43:32,2,ad,android,https://supermarket.ru/product
2,ucbrowser,10376593093900581806,2018-07-27,2018-07-27 19:43:49,2,ad,android,https://supermarket.ru/product
3,chromemobile,15254990414923183452,2018-07-27,2018-07-27 15:54:29,2,organic,android,https://supermarket.ru/product
4,chrome,8540869803059348947,2018-07-27,2018-07-27 13:11:08,1,referral,windows,https://supermarket.ru/product


In [48]:
# Drop any existing `hits` table so the cell can be re-run from scratch.
q = 'drop table if exists hits'
get_clickhouse_data(q)

# Columns are declared in the same order as the sorted field list returned by
# the Logs API request (browser, clientID, date, dateTime, deviceCategory,
# lastTrafficSource, operatingSystemRoot, URL).
# NOTE(review): presumably the upload matches columns by position — confirm.
q = '''
create table hits (
    Browser String,
    ClientID UInt64,
    EventDate Date,
    EventTime DateTime,
    DeviceCategory String,
    TraficSource String,
    OSRoot String,
    URL String
) ENGINE = MergeTree(EventDate, intHash32(ClientID), (EventDate, intHash32(ClientID)), 8192)
'''

get_clickhouse_data(q)

u''

In [49]:
%%time
# Serialize the DataFrame as tab-separated text (no index column) and insert
# it into the `hits` table.
upload(
    'hits',
    logs_df.to_csv(index = False, sep = '\t'))

CPU times: user 4.64 s, sys: 432 ms, total: 5.07 s
Wall time: 6.93 s


u''

### Разминка: смотрим на пути пользователей

In [50]:
# Top 10 URLs by number of unique ClientIDs, with the hit count alongside.
q = '''
SELECT
    URL,
    uniq(ClientID) as users,
    count() as hits
FROM hits
GROUP BY URL
ORDER BY users DESC
LIMIT 10
FORMAT TabSeparatedWithNames
'''

get_clickhouse_df(q)

Unnamed: 0,URL,users,hits
0,https://supermarket.ru/product,86681,267636
1,https://supermarket.ru/catalog,64045,232665
2,https://supermarket.ru/main,53855,103584
3,https://supermarket.ru/promo,20618,38904
4,https://supermarket.ru/search,19200,49732
5,https://supermarket.ru/promo_action_1,5376,7793
6,https://supermarket.ru/shops,5274,9509
7,https://supermarket.ru/blog,4160,5813
8,https://supermarket.ru/cards,3029,10821
9,https://supermarket.ru/basket,2874,11338


In [51]:
# For 10 visitors: the array of visited URLs (`path`) and the same array cut
# to its first 5 elements (`path_lim`). The inner query sorts hits by ClientID
# and EventTime before they are grouped.
q = '''
SELECT
    ClientID,
    groupArray(URL) as path,
    arraySlice(groupArray(URL), 1, 5) as path_lim
FROM
    (SELECT * FROM hits ORDER BY ClientID, EventTime)
GROUP BY ClientID
LIMIT 10
FORMAT TabSeparatedWithNames
'''

get_clickhouse_df(q)

Unnamed: 0,ClientID,path,path_lim
0,11328496494700445631,['https://supermarket.ru/catalog'],['https://supermarket.ru/catalog']
1,4378518504153020253,['https://supermarket.ru/product'],['https://supermarket.ru/product']
2,3573365835552612154,['https://supermarket.ru/product'],['https://supermarket.ru/product']
3,6722497546488043716,"['https://supermarket.ru/main','https://supermarket.ru/search','https://supermarket.ru/product','https://supermarket.ru/search','https://supermarket.ru/product','https://supermarket.ru/product','https://supermarket.ru/search','https://supermarket.ru/product','https://supermarket.ru/user_profile','https://supermarket.ru/product','https://supermarket.ru/product','https://supermarket.ru/main','https://supermarket.ru/catalog','https://supermarket.ru/catalog','https://supermarket.ru/product']","['https://supermarket.ru/main','https://supermarket.ru/search','https://supermarket.ru/product','https://supermarket.ru/search','https://supermarket.ru/product']"
4,11782234878805415335,"['https://supermarket.ru/product','https://supermarket.ru/main','https://supermarket.ru/promo','https://supermarket.ru/search','https://supermarket.ru/search','https://supermarket.ru/product','https://supermarket.ru/search']","['https://supermarket.ru/product','https://supermarket.ru/main','https://supermarket.ru/promo','https://supermarket.ru/search','https://supermarket.ru/search']"
5,12235634756572665986,['https://supermarket.ru/search'],['https://supermarket.ru/search']
6,6390005486945094701,"['https://supermarket.ru/main','https://supermarket.ru/catalog']","['https://supermarket.ru/main','https://supermarket.ru/catalog']"
7,1755001460118103603,"['https://supermarket.ru/promo','https://supermarket.ru/catalog','https://supermarket.ru/catalog','https://supermarket.ru/search']","['https://supermarket.ru/promo','https://supermarket.ru/catalog','https://supermarket.ru/catalog','https://supermarket.ru/search']"
8,4444492313274277176,['https://supermarket.ru/product'],['https://supermarket.ru/product']
9,1789936309455297152,"['https://supermarket.ru/main','https://supermarket.ru/catalog','https://supermarket.ru/catalog','https://supermarket.ru/product','https://supermarket.ru/product','https://supermarket.ru/product','https://supermarket.ru/product','https://supermarket.ru/product','https://supermarket.ru/promo','https://supermarket.ru/promo_action_1','https://supermarket.ru/promo_action_1_rules','https://supermarket.ru/promo_action_1','https://supermarket.ru/promo_action_1_rules','https://supermarket.ru/main','https://supermarket.ru/product','https://supermarket.ru/promo','https://supermarket.ru/promo_action_1','https://supermarket.ru/promo_action_1','https://supermarket.ru/promo_action_1','https://supermarket.ru/main','https://supermarket.ru/catalog','https://supermarket.ru/main','https://supermarket.ru/product','https://supermarket.ru/main','https://supermarket.ru/main']","['https://supermarket.ru/main','https://supermarket.ru/catalog','https://supermarket.ru/catalog','https://supermarket.ru/product','https://supermarket.ru/product']"


In [52]:
# The 10 most frequent 5-element path prefixes, counted in visitors
# (one row per ClientID in the inner query).
q = '''
SELECT
    path_lim,
    count() as users
FROM
    (SELECT
        ClientID,
        arraySlice(groupArray(URL), 1, 5) as path_lim
    FROM
        (SELECT * FROM hits ORDER BY ClientID, EventTime)
    GROUP BY ClientID)
GROUP BY path_lim
ORDER BY users desc
LIMIT 10
FORMAT TabSeparatedWithNames
'''

get_clickhouse_df(q)

Unnamed: 0,path_lim,users
0,['https://supermarket.ru/product'],28563
1,['https://supermarket.ru/catalog'],10298
2,['https://supermarket.ru/promo'],5827
3,"['https://supermarket.ru/product','https://supermarket.ru/product']",5547
4,['https://supermarket.ru/main'],3638
5,"['https://supermarket.ru/main','https://supermarket.ru/catalog']",2423
6,"['https://supermarket.ru/catalog','https://supermarket.ru/catalog']",2327
7,"['https://supermarket.ru/product','https://supermarket.ru/product','https://supermarket.ru/product']",1857
8,"['https://supermarket.ru/main','https://supermarket.ru/search']",1709
9,"['https://supermarket.ru/main','https://supermarket.ru/catalog','https://supermarket.ru/product','https://supermarket.ru/catalog','https://supermarket.ru/product']",1612


Отфильтруем повторяющиеся подряд шаги в путях, чтобы было понятнее, в каких разделах были пользователи

In [53]:
# Collapse consecutive repeats in each path: arrayFilter keeps an element when
# it is the first one or differs from the previous element (`path_filt`).
# Rows are still grouped by the full, unfiltered path.
q = '''
SELECT
    path,
    count() as users,
    arrayEnumerate(path) as indexes,
    arrayFilter(val, index -> (index = 1) or (path[index - 1] != path[index]),
        path, indexes) as path_filt
FROM
    (SELECT
        ClientID,
        groupArray(URL) as path
    FROM
        (SELECT * FROM hits ORDER BY ClientID, EventTime)
    GROUP BY ClientID)
GROUP BY path
ORDER BY users desc
LIMIT 10
FORMAT TabSeparatedWithNames
'''

get_clickhouse_df(q)

Unnamed: 0,path,users,indexes,path_filt
0,['https://supermarket.ru/product'],28563,[1],['https://supermarket.ru/product']
1,['https://supermarket.ru/catalog'],10298,[1],['https://supermarket.ru/catalog']
2,['https://supermarket.ru/promo'],5827,[1],['https://supermarket.ru/promo']
3,"['https://supermarket.ru/product','https://supermarket.ru/product']",5547,"[1,2]",['https://supermarket.ru/product']
4,['https://supermarket.ru/main'],3638,[1],['https://supermarket.ru/main']
5,"['https://supermarket.ru/main','https://supermarket.ru/catalog']",2423,"[1,2]","['https://supermarket.ru/main','https://supermarket.ru/catalog']"
6,"['https://supermarket.ru/catalog','https://supermarket.ru/catalog']",2327,"[1,2]",['https://supermarket.ru/catalog']
7,"['https://supermarket.ru/product','https://supermarket.ru/product','https://supermarket.ru/product']",1857,"[1,2,3]",['https://supermarket.ru/product']
8,"['https://supermarket.ru/main','https://supermarket.ru/search']",1709,"[1,2]","['https://supermarket.ru/main','https://supermarket.ru/search']"
9,['https://supermarket.ru/promo_action_1'],1550,[1],['https://supermarket.ru/promo_action_1']


In [54]:
# Same as the previous query, but re-aggregated by the collapsed path:
# visitors of all full paths that reduce to the same `path_filt` are summed.
q = '''
SELECT 
    path_filt,
    sum(users) as users
FROM
    (SELECT
        path,
        count() as users,
        arrayEnumerate(path) as indexes,
        arrayFilter(val, index -> (index = 1) or (path[index - 1] != path[index]),
            path, indexes) as path_filt
    FROM
        (SELECT
            ClientID,
            groupArray(URL) as path
        FROM
            (SELECT * FROM hits ORDER BY ClientID, EventTime)
        GROUP BY ClientID)
    GROUP BY path)
GROUP BY path_filt
ORDER BY users DESC
LIMIT 10
FORMAT TabSeparatedWithNames
'''

get_clickhouse_df(q)

Unnamed: 0,path_filt,users
0,['https://supermarket.ru/product'],37675
1,['https://supermarket.ru/catalog'],14478
2,['https://supermarket.ru/promo'],6767
3,"['https://supermarket.ru/main','https://supermarket.ru/catalog']",5637
4,['https://supermarket.ru/main'],4463
5,"['https://supermarket.ru/main','https://supermarket.ru/catalog','https://supermarket.ru/product']",3819
6,"['https://supermarket.ru/main','https://supermarket.ru/search']",3009
7,"['https://supermarket.ru/catalog','https://supermarket.ru/product']",2898
8,['https://supermarket.ru/promo_action_1'],1829
9,"['https://supermarket.ru/product','https://supermarket.ru/catalog']",1542


## Расчет funnels

#### Подход в лоб

In [55]:
# Per-visitor funnel flags for main -> catalog -> product (10 visitors shown).
# Step 1 is "visited main at least once"; steps 2 and 3 use sequenceMatch over
# EventTime, where `.*` allows any number of other hits between the steps.
q = '''
SELECT
    ClientID,
    max(URL = 'https://supermarket.ru/main') as step1_achieved,
    sequenceMatch('(?1).*(?2)')(EventTime, URL = 'https://supermarket.ru/main', 
      URL = 'https://supermarket.ru/catalog') as step2_achieved,
    sequenceMatch('(?1).*(?2).*(?3)')(EventTime, URL = 'https://supermarket.ru/main', 
      URL = 'https://supermarket.ru/catalog', URL = 'https://supermarket.ru/product') as step3_achieved
FROM hits
GROUP BY ClientID
LIMIT 10
FORMAT TabSeparatedWithNames
'''

get_clickhouse_df(q)

Unnamed: 0,ClientID,step1_achieved,step2_achieved,step3_achieved
0,11328496494700445631,0,0,0
1,4378518504153020253,0,0,0
2,3573365835552612154,0,0,0
3,6722497546488043716,1,1,1
4,11782234878805415335,1,0,0
5,12235634756572665986,0,0,0
6,6390005486945094701,1,1,0
7,1755001460118103603,0,0,0
8,4444492313274277176,0,0,0
9,1789936309455297152,1,1,1


In [56]:
# Sum the per-visitor flags into one row with the number of visitors who
# reached each step. The conversion-rate columns are commented out in the SQL.
q = '''
SELECT
    sum(step1_achieved) as step1,
    sum(step2_achieved) as step2,
    sum(step3_achieved) as step3
    --round(100.*step2/step1, 2) as step2_to_1,
    --round(100.*step3/step1, 2) as step3_to_1
FROM
    (SELECT
        ClientID,
        max(URL = 'https://supermarket.ru/main') as step1_achieved,
        sequenceMatch('(?1).*(?2)')(EventTime, URL = 'https://supermarket.ru/main', 
          URL = 'https://supermarket.ru/catalog') as step2_achieved,
        sequenceMatch('(?1).*(?2).*(?3)')(EventTime, URL = 'https://supermarket.ru/main', 
          URL = 'https://supermarket.ru/catalog', URL = 'https://supermarket.ru/product') as step3_achieved
    FROM hits
    GROUP BY ClientID)
FORMAT TabSeparatedWithNames
'''

fdf = get_clickhouse_df(q)

In [57]:
fdf

Unnamed: 0,step1,step2,step3
0,53856,33919,22832


Визуализируем

In [58]:
from plotly.offline import init_notebook_mode, iplot
import plotly
import plotly.graph_objs as go

init_notebook_mode(connected=True)

In [71]:
# Fill colours for the funnel sections; plot_funnel indexes this list by step.
colors = [
    '#d54936', '#faca34', '#437cba', '#8bc34a', '#795548',
    '#309688', '#000000', '#40bcd4', '#9e9e9e', '#3ca9f4',
]
# Labels of the funnel steps.
phases = ['Main', 'Catalog', 'Product']
values = fdf.iloc[0].values

In [72]:
def plot_funnel(phases, values):
    """Draw a funnel chart with one coloured section per phase.

    Args:
        phases: sequence of phase names, listed from the top of the funnel.
        values: sequence of numeric values, one per phase; section widths are
            scaled relative to the largest value.

    Section colours come from the module-level ``colors`` list and are reused
    cyclically when there are more phases than colours. The computed section
    widths are printed, and the figure is rendered with ``iplot``.
    """
    n_phase = len(phases)
    plot_width = 500.

    # height of a section and difference between sections
    section_h = 100
    section_d = 10

    # multiplication factor to calculate the width of other sections;
    # an all-zero funnel gets zero-width sections instead of a division error
    max_value = max(values)
    unit_width = plot_width / max_value if max_value else 0.

    # width of each funnel section relative to the plot width
    phase_w = [int(value * unit_width) for value in values]
    print(phase_w)

    # plot height based on the number of sections and the gap in between them
    height = section_h * n_phase + section_d * (n_phase - 1)

    # list containing all the plot shapes
    shapes = []

    # list containing the Y-axis location for each section's name and value text
    label_y = []

    for i in range(n_phase):
        top_w = phase_w[i]
        # The last section has no successor, so it is drawn as a rectangle.
        bottom_w = phase_w[i] if i == n_phase - 1 else phase_w[i + 1]

        # Floor division keeps the integer coordinates that `/` yields for
        # ints on Python 2, regardless of the interpreter version.
        points = [top_w // 2, height, bottom_w // 2, height - section_h]
        path = 'M {0} {1} L {2} {3} L -{2} {3} L -{0} {1} Z'.format(*points)

        # Wrap around the palette so extra phases do not raise IndexError.
        fill = colors[i % len(colors)]

        shapes.append({
            'type': 'path',
            'path': path,
            'fillcolor': fill,
            'line': {
                'width': 1,
                'color': fill
            }
        })

        # Y-axis location for this section's details (text)
        label_y.append(height - (section_h // 2))

        height = height - (section_h + section_d)

    # For phase names
    label_trace = go.Scatter(
        x=[-350]*n_phase,
        y=label_y,
        mode='text',
        text=phases,
        textfont=dict(
            color='rgb(40,40,40)',
            size=15
        )
    )

    # For phase values
    value_trace = go.Scatter(
        x=[350]*n_phase,
        y=label_y,
        mode='text',
        text=values,
        textfont=dict(
            color='rgb(40,40,40)',
            size=15
        )
    )

    data = [label_trace, value_trace]

    layout = go.Layout(
        title="<b>Funnel Chart</b>",
        titlefont=dict(
            size=20,
            color='rgb(0,0,0)'
        ),
        shapes=shapes,
        height=560,
        width=800,
        showlegend=False,
        paper_bgcolor='rgba(255,255,255,1)',
        plot_bgcolor='rgba(255,255,255,1)',
        xaxis=dict(
            showticklabels=False,
            zeroline=False,
            showgrid=False,
            range=[-450, 450]
        ),
        yaxis=dict(
            showticklabels=False,
            zeroline=False,
            showgrid=False
        )
    )

    fig = go.Figure(data=data, layout=layout)
    iplot(fig, show_link=False)

In [73]:
plot_funnel(phases, values)

[500, 314, 211, 11, 3]


### Добавим еще два шага

In [74]:
%%time
# Extend the sequenceMatch funnel with basket and finish_order steps.
# As the recorded output of this cell shows, the query fails with
# "Pattern application proves too difficult, exceeding max iterations".
q = '''
SELECT
    sum(step1_achieved) as step1,
    sum(step2_achieved) as step2,
    sum(step3_achieved) as step3,
    sum(step4_achieved) as step4,
    sum(step5_achieved) as step5
FROM
    (SELECT
        ClientID,
        max(URL = 'https://supermarket.ru/main') as step1_achieved,
        sequenceMatch('(?1).*(?2)')(EventTime, URL = 'https://supermarket.ru/main', 
          URL = 'https://supermarket.ru/catalog') as step2_achieved,
        sequenceMatch('(?1).*(?2).*(?3)')(EventTime, URL = 'https://supermarket.ru/main', 
          URL = 'https://supermarket.ru/catalog', URL = 'https://supermarket.ru/product') as step3_achieved,
        sequenceMatch('(?1).*(?2).*(?3).*(?4)')(EventTime, URL = 'https://supermarket.ru/main', 
          URL = 'https://supermarket.ru/catalog', URL = 'https://supermarket.ru/product', 
          URL = 'https://supermarket.ru/basket') as step4_achieved,
        sequenceMatch('(?1).*(?2).*(?3).*(?4).*(?5)')(EventTime, URL = 'https://supermarket.ru/main', 
          URL = 'https://supermarket.ru/catalog', URL = 'https://supermarket.ru/product', 
          URL = 'https://supermarket.ru/basket', URL = 'https://supermarket.ru/finish_order') as step5_achieved
    FROM hits
    GROUP BY ClientID)
FORMAT TabSeparatedWithNames
'''

fdf = get_clickhouse_df(q)

ValueError: Code: 160, e.displayText() = DB::Exception: Pattern application proves too difficult, exceeding max iterations (1000000), e.what() = DB::Exception


### Пойдем другим путем

In [64]:
# Array-based alternative: collect each visitor's hit timestamps (as integers)
# and URLs, then take the first timestamp whose URL is the main page.
# In the recorded output step1_time is 0 for visitors with no main-page hit.
q = '''
SELECT
    ClientID,
    groupArray(cast(EventTime as UInt64)) as times,
    groupArray(URL) as urls,
    arrayFilter(time, url -> url = 'https://supermarket.ru/main', times, urls)[1] as step1_time
FROM
    (SELECT * FROM hits ORDER BY ClientID, EventTime)
GROUP BY ClientID
LIMIT 5
FORMAT TabSeparatedWithNames
'''

get_clickhouse_df(q)

Unnamed: 0,ClientID,times,urls,step1_time
0,11328496494700445631,[1530616006],['https://supermarket.ru/catalog'],0
1,4378518504153020253,[1533000503],['https://supermarket.ru/product'],0
2,3573365835552612154,[1530630727],['https://supermarket.ru/product'],0
3,6722497546488043716,"[1531310304,1531310317,1531310347,1531310454,1531311435,1531311438,1531312425,1531312725,1531312779,1531312785,1531316138,1532538815,1532538889,1532538902,1532538984]","['https://supermarket.ru/main','https://supermarket.ru/search','https://supermarket.ru/product','https://supermarket.ru/search','https://supermarket.ru/product','https://supermarket.ru/product','https://supermarket.ru/search','https://supermarket.ru/product','https://supermarket.ru/user_profile','https://supermarket.ru/product','https://supermarket.ru/product','https://supermarket.ru/main','https://supermarket.ru/catalog','https://supermarket.ru/catalog','https://supermarket.ru/product']",1531310304
4,11782234878805415335,"[1530824000,1532278205,1532278212,1532278223,1532278270,1532278281,1532278286]","['https://supermarket.ru/product','https://supermarket.ru/main','https://supermarket.ru/promo','https://supermarket.ru/search','https://supermarket.ru/search','https://supermarket.ru/product','https://supermarket.ru/search']",1532278205


In [65]:
# Add step 2: the first catalog hit at or after step1_time, considered only
# when step 1 was reached (step1_time != 0); otherwise step2_time is 0.
q = '''
SELECT
    ClientID,
    groupArray(cast(EventTime as UInt64)) as times,
    groupArray(URL) as urls,
    arrayFilter(time, url -> url = 'https://supermarket.ru/main', times, urls)[1] as step1_time,
    arrayFilter(time, url -> url = 'https://supermarket.ru/catalog' and step1_time != 0 and time >= step1_time, times, urls)[1] as step2_time
FROM
    (SELECT * FROM hits ORDER BY ClientID, EventTime)
GROUP BY ClientID
LIMIT 5
FORMAT TabSeparatedWithNames
'''

get_clickhouse_df(q)

Unnamed: 0,ClientID,times,urls,step1_time,step2_time
0,11328496494700445631,[1530616006],['https://supermarket.ru/catalog'],0,0
1,4378518504153020253,[1533000503],['https://supermarket.ru/product'],0,0
2,3573365835552612154,[1530630727],['https://supermarket.ru/product'],0,0
3,6722497546488043716,"[1531310304,1531310317,1531310347,1531310454,1531311435,1531311438,1531312425,1531312725,1531312779,1531312785,1531316138,1532538815,1532538889,1532538902,1532538984]","['https://supermarket.ru/main','https://supermarket.ru/search','https://supermarket.ru/product','https://supermarket.ru/search','https://supermarket.ru/product','https://supermarket.ru/product','https://supermarket.ru/search','https://supermarket.ru/product','https://supermarket.ru/user_profile','https://supermarket.ru/product','https://supermarket.ru/product','https://supermarket.ru/main','https://supermarket.ru/catalog','https://supermarket.ru/catalog','https://supermarket.ru/product']",1531310304,1532538889
4,11782234878805415335,"[1530824000,1532278205,1532278212,1532278223,1532278270,1532278281,1532278286]","['https://supermarket.ru/product','https://supermarket.ru/main','https://supermarket.ru/promo','https://supermarket.ru/search','https://supermarket.ru/search','https://supermarket.ru/product','https://supermarket.ru/search']",1532278205,0


In [66]:
# Full five-step funnel built on the array approach: each step's time is the
# first matching hit at or after the previous step's time, and a step counts
# as reached when its time is non-zero.
q = '''
SELECT
    countIf(step1_time != 0) as step1,
    countIf(step2_time != 0) as step2,
    countIf(step3_time != 0) as step3,
    countIf(step4_time != 0) as step4,
    countIf(step5_time != 0) as step5
FROM
    (SELECT
        ClientID,
        groupArray(cast(EventTime as UInt64)) as times,
        groupArray(URL) as urls,
        arrayFilter(time, url -> url = 'https://supermarket.ru/main', times, urls)[1] as step1_time,
        arrayFilter(time, url -> url = 'https://supermarket.ru/catalog' and step1_time != 0 and time >= step1_time, times, urls)[1] as step2_time,
        arrayFilter(time, url -> url = 'https://supermarket.ru/product' and step2_time != 0 and time >= step2_time, times, urls)[1] as step3_time,
        arrayFilter(time, url -> url = 'https://supermarket.ru/basket' and step3_time != 0 and time >= step3_time, times, urls)[1] as step4_time,
        arrayFilter(time, url -> url = 'https://supermarket.ru/finish_order' and step4_time != 0 and time >= step4_time, times, urls)[1] as step5_time
    FROM
        (SELECT * FROM hits ORDER BY ClientID, EventTime)
    GROUP BY ClientID)
FORMAT TabSeparatedWithNames
'''

fdf = get_clickhouse_df(q)

In [75]:
phases = ['Main', 'Catalog', 'Product', 'Basket', 'Finish order']
values = fdf.iloc[0].values

plot_funnel(phases, values)

[500, 314, 211, 11, 3]


Можем также посчитать медианное время (в секундах) между достижением соседних шагов

In [70]:
# Time between consecutive funnel steps, taken over visitors who reached the
# later step. NOTE: the column aliases say "mean", but the aggregate used is
# medianIf, so the values are medians of the timestamp differences.
q = '''
SELECT
    medianIf(step2_time - step1_time, step2_time != 0) as step2_from_step1_mean_time,
    medianIf(step3_time - step2_time, step3_time != 0) as step3_from_step2_mean_time,
    medianIf(step4_time - step3_time, step4_time != 0) as step4_from_step3_mean_time,
    medianIf(step5_time - step4_time, step5_time != 0) as step5_from_step4_mean_time
FROM
    (SELECT
        ClientID,
        groupArray(cast(EventTime as UInt64)) as times,
        groupArray(URL) as urls,
        arrayFilter(time, url -> url = 'https://supermarket.ru/main', times, urls)[1] as step1_time,
        arrayFilter(time, url -> url = 'https://supermarket.ru/catalog' and step1_time != 0 and time >= step1_time, times, urls)[1] as step2_time,
        arrayFilter(time, url -> url = 'https://supermarket.ru/product' and step2_time != 0 and time >= step2_time, times, urls)[1] as step3_time,
        arrayFilter(time, url -> url = 'https://supermarket.ru/basket' and step3_time != 0 and time >= step3_time, times, urls)[1] as step4_time,
        arrayFilter(time, url -> url = 'https://supermarket.ru/finish_order' and step4_time != 0 and time >= step4_time, times, urls)[1] as step5_time
    FROM
        (SELECT * FROM hits ORDER BY ClientID, EventTime)
    GROUP BY ClientID)
FORMAT TabSeparatedWithNames
'''

get_clickhouse_df(q)

Unnamed: 0,step2_from_step1_mean_time,step3_from_step2_mean_time,step4_from_step3_mean_time,step5_from_step4_mean_time
0,51,89,996,370
