In [405]:
import os
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import seaborn as sns
from google.cloud import bigquery
from google.oauth2 import service_account
warnings.filterwarnings('ignore')

In [427]:
# Location of the service-account key. Read from the standard
# GOOGLE_APPLICATION_CREDENTIALS environment variable when it is set, so the
# notebook is not tied to one machine; the original local path is the fallback.
key_path = os.environ.get(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/Users/peter/Documents/positive-water-229419-e00e7547dca7.json')
credentials = service_account.Credentials.from_service_account_file(key_path)
# Project the query runs under; overridable via GOOGLE_CLOUD_PROJECT.
project_id = os.environ.get('GOOGLE_CLOUD_PROJECT', 'positive-water-229419')

# NOTE(review): ORDER BY RAND() draws a different 200000-session sample on every
# run, so every number computed below changes between runs.
query = '''

#StandardSQL
SELECT
fullVisitorId, date, visitNumber, visitStartTime, channelGrouping, geoNetwork.continent, geoNetwork.country,
device.operatingSystem, device.browser, totals.pageviews
FROM
`bigquery-public-data.google_analytics_sample.ga_sessions_*`
ORDER BY RAND()
LIMIT 200000


'''
# Run the query and load the sampled sessions into a DataFrame.
df = pd.read_gbq(query, project_id=project_id, credentials=credentials, dialect='standard')


In [428]:
# Preview the first rows of the sampled sessions.
df.head()

Unnamed: 0,fullVisitorId,date,visitNumber,visitStartTime,channelGrouping,continent,country,operatingSystem,browser,pageviews
0,195733698260398481,20170531,1,1496267887,Organic Search,Europe,United Kingdom,Windows,Chrome,2.0
1,952637930210123189,20161021,1,1477089519,Direct,Americas,United States,iOS,Safari,1.0
2,9473927269737661541,20161108,1,1478621753,Organic Search,Europe,Netherlands,Windows,Chrome,4.0
3,38653694542083069,20161015,1,1476561637,Social,Asia,Indonesia,Macintosh,Safari,1.0
4,557817217226289866,20161128,2,1480401706,Referral,Asia,Japan,Macintosh,Safari,4.0


In [410]:
# DataFrame.size is the total number of cells (rows x columns), not the row count.
df.size

2600000

In [153]:
# Metrics to track: number of page views and number of visits
# Cohort interval: one week
# Observation period: 4 weeks
# Cohort type: date of first visit

In [396]:
# Count missing values per column.
df.isna().sum()

fullVisitorId       0
date                0
visitNumber         0
visitStartTime      0
channelGrouping     0
continent           0
country             0
operatingSystem     0
browser             0
pageviews          25
dtype: int64

In [430]:
def preparation(df):
    """Clean the sessions frame and attach per-visitor first/last visit days.

    Works on a copy, so the caller's frame is left untouched and the function
    can safely be re-run (including on its own output).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``fullVisitorId``, ``date`` (``YYYYMMDD`` values, or an
        already parsed datetime column) and ``pageviews``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the input rows in their original order, a fresh
        0..n-1 index, and these changes:

        * ``pageviews``     - missing values replaced by 0, cast to int
        * ``date``          - parsed to datetime
        * ``timestamp``     - ``date`` at midnight as whole epoch seconds
        * ``timestamp_min`` - earliest ``timestamp`` of the row's visitor
        * ``timestamp_max`` - latest ``timestamp`` of the row's visitor
    """
    prepared = df.copy()
    prepared['pageviews'] = prepared['pageviews'].fillna(0).astype(int)

    # Parse only when needed; this is what makes a second call harmless.
    if not pd.api.types.is_datetime64_any_dtype(prepared['date']):
        prepared['date'] = pd.to_datetime(prepared['date'].astype(str), format='%Y%m%d')

    # Whole seconds since the epoch, computed explicitly so the result does not
    # depend on the local timezone or on the datetime64 storage unit.
    epoch = pd.Timestamp('1970-01-01')
    prepared['timestamp'] = (
        (prepared['date'] - epoch) // pd.Timedelta(seconds=1)
    ).astype('int64')

    # transform() broadcasts each visitor's min/max back onto that visitor's
    # rows, which replaces the aggregate-then-merge step and keeps row order.
    per_visitor = prepared.groupby('fullVisitorId')['timestamp']
    prepared['timestamp_min'] = per_visitor.transform('min')
    prepared['timestamp_max'] = per_visitor.transform('max')

    return prepared.reset_index(drop=True)

In [431]:
# Clean pageviews/date and add the per-visit and per-visitor (first/last) timestamps.
df = preparation(df)

In [470]:
# Inspect the frame after preparation, including the timestamp columns.
df.head()

Unnamed: 0,fullVisitorId,date,visitNumber,visitStartTime,channelGrouping,continent,country,operatingSystem,browser,pageviews,timestamp,timestamp_min,timestamp_max
0,195733698260398481,2017-05-31,1,1496267887,Organic Search,Europe,United Kingdom,Windows,Chrome,2,1496188800,1496188800,1496188800
1,952637930210123189,2016-10-21,1,1477089519,Direct,Americas,United States,iOS,Safari,1,1477008000,1477008000,1477008000
2,9473927269737661541,2016-11-08,1,1478621753,Organic Search,Europe,Netherlands,Windows,Chrome,4,1478563200,1478563200,1478563200
3,38653694542083069,2016-10-15,1,1476561637,Social,Asia,Indonesia,Macintosh,Safari,1,1476489600,1476489600,1476489600
4,557817217226289866,2016-11-28,2,1480401706,Referral,Asia,Japan,Macintosh,Safari,4,1480291200,1480291200,1480291200


In [476]:
def week4_cohort_by_first_day(dftmp, cohort_day, weeks=4):
    """Summarise page views and visits per week for one first-visit cohort.

    The cohort is every row of ``dftmp`` whose ``timestamp_min`` equals
    ``cohort_day``. Each visit is assigned to ``week_1`` .. ``week_<weeks>``
    by its distance from the visitor's first visit; anything later falls into
    ``other_weeks``.

    Parameters
    ----------
    dftmp : pandas.DataFrame
        Needs ``fullVisitorId``, ``pageviews``, ``timestamp`` and
        ``timestamp_min`` (integer epoch seconds). It is not modified.
    cohort_day : int
        First-visit day of the cohort, in epoch seconds.
    weeks : int, default 4
        Number of individually labelled weeks.

    Returns
    -------
    pandas.DataFrame
        One row per week label present in the cohort, sorted by label, with
        columns ``week``, ``pageviews_sum``, ``visits_count`` (row count) and
        ``first_day`` (``cohort_day`` as a datetime).

    Raises
    ------
    ValueError
        If ``weeks`` is smaller than 1.
    """
    if weeks < 1:
        raise ValueError('weeks must be a positive integer')
    week_seconds = 7 * 24 * 60 * 60

    cohort = dftmp[dftmp['timestamp_min'] == cohort_day]

    # Seconds elapsed since each visitor's first visit.
    offset = cohort['timestamp'] - cohort['timestamp_min']
    # Ceiling division gives the 1-based week number: (0, W] -> 1, (W, 2W] -> 2, ...
    # Offsets of zero (or below) are clipped into week 1.
    # NOTE(review): the upper edge is inclusive, so with day-granular timestamps
    # week_1 covers offsets of 0..7 days. Kept as-is to preserve the existing
    # numbers - confirm whether a strict 7-day week was intended.
    week_no = (-(-offset // week_seconds)).clip(lower=1)
    labels = ('week_' + week_no.astype(str)).where(week_no <= weeks, 'other_weeks')

    # assign() returns a new frame, so the caller's data is never written to.
    cohort = cohort.assign(week=labels)

    summary = cohort.groupby('week').agg({'pageviews': 'sum', 'fullVisitorId': 'count'})\
        .reset_index().rename(columns={'fullVisitorId': 'visits_count', 'pageviews': 'pageviews_sum'})
    summary['first_day'] = pd.to_datetime(cohort_day, unit='s')
    return summary

In [477]:
# Cohort of visitors first seen on 2017-04-30 (1493510400 epoch seconds); default weeks=4.
week4_cohort_by_first_day(df, 1493510400)

Unnamed: 0,week,pageviews_sum,visits_count,first_day
0,other_weeks,11,4,2017-04-30
1,week_1,1150,384,2017-04-30
2,week_2,12,2,2017-04-30
3,week_4,3,1,2017-04-30


In [479]:
# Cohort of visitors first seen on 2017-03-30 (1490832000 epoch seconds).
week4_cohort_by_first_day(df, 1490832000, weeks=4)

Unnamed: 0,week,pageviews_sum,visits_count,first_day
0,other_weeks,43,12,2017-03-30
1,week_1,2185,596,2017-03-30
2,week_2,23,6,2017-03-30
3,week_3,21,4,2017-03-30
4,week_4,28,5,2017-03-30


In [480]:
# Cohort of visitors first seen on 2017-05-31 (1496188800 epoch seconds).
week4_cohort_by_first_day(df, 1496188800, weeks=4)

Unnamed: 0,week,pageviews_sum,visits_count,first_day
0,other_weeks,45,7,2017-05-31
1,week_1,1978,433,2017-05-31
2,week_2,31,4,2017-05-31
3,week_3,8,2,2017-05-31
4,week_4,4,2,2017-05-31


In [481]:
# Cohort of visitors first seen on 2016-11-28 (1480291200 epoch seconds).
week4_cohort_by_first_day(df, 1480291200, weeks=4)

Unnamed: 0,week,pageviews_sum,visits_count,first_day
0,other_weeks,224,18,2016-11-28
1,week_1,3841,1010,2016-11-28
2,week_2,59,10,2016-11-28
3,week_3,26,10,2016-11-28
4,week_4,41,3,2016-11-28
