In [38]:
import pandas as pd
import numpy as np
import os
import sqlalchemy

from collections import Counter
import plotly.express as pe
from dotenv import load_dotenv
import plotly.io as pio
pio.renderers.default='notebook'

from datetime import date
from datetime import datetime
import copy

In [39]:
# Widen the classic-notebook container and relax pandas' display limits so
# wide frames stay readable.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

pd.set_option('display.max_colwidth', 120)
pd.set_option('display.width', 500)

In [40]:
from dython import nominal

In [41]:
from class_lib.feature_generator import *
from class_lib.segments_preparer import *
from class_lib.graph_lib import *
from class_lib.featureprocessor import *
from class_lib.feature_minorizer import *
from class_lib.FeatureSaver import FeatureSaver
from class_lib.saver import *
from class_lib.extra_feature_gen import ExtraFeatureGenerator

-----------

## Get data

In [42]:
# Read the events-DB connection settings from the local .env file.
load_dotenv('.env')

host = os.getenv('EVENTS_DB_HOST')
db = os.getenv('EVENTS_DB_NAME')
user = os.getenv('EVENTS_DB_USER')
password = os.getenv('EVENTS_DB_PASSWORD')
port = os.getenv('EVENTS_DB_PORT')

# Fail early with a readable message instead of building a URL that contains
# the literal text "None".
_missing_settings = [
    name
    for name, value in (('EVENTS_DB_HOST', host), ('EVENTS_DB_NAME', db), ('EVENTS_DB_USER', user))
    if not value
]
if _missing_settings:
    raise RuntimeError('Missing database settings: ' + ', '.join(_missing_settings))

# Build the URL from its components so that special characters in the user
# name or password (e.g. "@", ":" or "/") are escaped instead of corrupting
# the connection string.
connection_url = sqlalchemy.engine.URL.create(
    drivername='postgresql',
    username=user,
    password=password,
    host=host,
    port=int(port) if port else None,  # no port configured -> driver default
    database=db,
)
# Kept as a plain string for any code that still expects `connection_str`.
connection_str = connection_url.render_as_string(hide_password=False)

# stream_results=True is passed through as an engine-wide execution option.
engine = sqlalchemy.create_engine(connection_url, execution_options={"stream_results":True})

In [43]:
ACCOUNT_ID = 11

In [None]:
# Load the account's sessions that started in January 2023 and are not flagged
# as garbage, with beh_id / guest_id / customer_profile_id joined on from the
# behaviour table via customer_profile_behaviour_id.
# ACCOUNT_ID is interpolated directly into the SQL text; it is a notebook
# constant here, not external input.
query_sessions = f'''
    select *
    from data.customer_profile_sessions cps 

    left join(
    select id as beh_id,guest_id, customer_profile_id from data.customer_profile_behaviour cpb
    where cpb.account_id = {ACCOUNT_ID}
    ) cpb on cpb.beh_id = cps.customer_profile_behaviour_id


    where cps.account_id = {ACCOUNT_ID} and cps.garbage_session = False and cps.session_start >= '2023-01-01' and cps.session_start < '2023-02-01' 
'''

# Runs the query through the shared engine and materialises the full result.
sessions = pd.read_sql_query(query_sessions, engine)

In [None]:
sessions.add_to_basket_count.value_counts()

In [None]:
# Sessions with at least one add-to-basket event, reduced to the columns needed
# for the per-day counts. `.copy()` makes this an independent frame, so adding
# columns to it later neither raises SettingWithCopyWarning nor risks writing
# to a temporary view of `sessions`.
atb_sessions = sessions.loc[
    sessions.add_to_basket_count > 0,
    ['id', 'add_to_basket_count', 'session_start', 'session_end'],
].copy()

In [None]:
# Day of month of the session start, used as the per-day grouping key.
# `.dt.day` yields the integer directly instead of formatting to text and
# parsing it back.
atb_sessions['date_group'] = atb_sessions.session_start.dt.day

In [None]:
atb_sessions

In [None]:
# Number of add-to-basket sessions per day, one row per day, ordered by day.
# The index and the series are named explicitly before `reset_index()`, so the
# resulting columns are always `period` / `atb_counts` regardless of how the
# installed pandas version labels `value_counts()` output.
atb_per = (
    atb_sessions['date_group']
    .value_counts()
    .rename_axis('period')
    .rename('atb_counts')
    .reset_index()
    .sort_values('period')
    .reset_index(drop=True)
)

In [None]:
atb_per.head()

In [None]:
# sessions.to_pickle('sessions_11_012023.pkl')

## Bounce sessions
A bounce session is a session whose `journey_status_id` is 1.

In [None]:
%%time
# Journey-status rows for the account. The `created` window (2022-12-15 to
# 2023-02-05) is wider than the January session window used for `sessions`.
query_sessions1 = f'''
    select *
    from data.customer_profile_session_journey_statuses
    where account_id = {ACCOUNT_ID} 
    and created >= '2022-12-15' and created < '2023-02-05'

'''
sessions1 = pd.read_sql_query(query_sessions1, engine)


# Full table of journey statuses (not filtered by account).
query_sessions2 = f'''
    select *
    from data.customer_journey_statuses
    
'''
statuses = pd.read_sql_query(query_sessions2, engine)

# Campaign dictionary table, loaded in full.
query_sessions3 = f'''
    select *
    from data.sessions_campaign_dict
    
'''
camp = pd.read_sql_query(query_sessions3, engine)

# Medium dictionary table, loaded in full.
query_sessions4 = f'''
    select *
    from data.sessions_medium_dict
    
'''
med = pd.read_sql_query(query_sessions4, engine)

# Source dictionary table, loaded in full.
query_sessions5 = f'''
    select *
    from data.sessions_source_dict
    
'''
sou = pd.read_sql_query(query_sessions5, engine)

In [None]:
sessions.info()

In [None]:
# Session length in seconds, computed with the vectorised timedelta accessor
# rather than a per-row Python lambda.
sessions['duration'] = (sessions['session_end'] - sessions['session_start']).dt.total_seconds()

In [None]:
sessions.head()

In [None]:
sessions.id.nunique()

In [None]:
len(sessions1[sessions1.id.isin(sessions.id.unique())])

In [None]:
statuses = statuses.rename(columns={'id': 'journey_status_id'})

In [None]:
sessions1 = sessions1.merge(statuses[['journey_status_id', 'status_name']], how='left', on='journey_status_id')

In [None]:
sessions1.head()

In [None]:
# Attach the journey status to each session; the inner join drops sessions
# that have no row in `sessions1`.
# NOTE(review): assumes `sessions1.id` is the session id and holds at most one
# row per session; otherwise this join multiplies session rows - TODO confirm.
# The cell overwrites `sessions`, so running it twice without reloading merges
# the status columns a second time.
sessions = sessions.merge(sessions1[['id', 'journey_status_id', 'created', 'status_name']], how='inner', on='id')

In [None]:
sessions.head()

In [None]:
# 0/1 indicator columns derived from the journey status name.
_status = sessions['status_name']
sessions['is_bounce'] = (_status == 'Bounce').astype('int64')
sessions['is_returning'] = _status.isin(['Loyalty', 'LoyaltyPlus']).astype('int64')

In [None]:
sessions[sessions['status_name'].isin(['Loyalty', 'LoyaltyPlus'])].head()

## Counting by days

In [None]:
import math
def split_df_chunks(data_df, chunk_size):
    """Split ``data_df`` into consecutive slices of at most ``chunk_size`` items.

    Parameters
    ----------
    data_df : sliceable sequence (e.g. a DataFrame or a list)
        Object supporting ``len()`` and positional ``[start:stop]`` slicing.
    chunk_size : int
        Maximum number of items per chunk; must be positive.

    Returns
    -------
    list
        The chunks in original order. Every chunk has ``chunk_size`` items
        except possibly the last one; an empty input yields an empty list.

    Raises
    ------
    ValueError
        If ``chunk_size`` is zero or negative.
    """
    if chunk_size <= 0:
        raise ValueError(f'chunk_size must be a positive integer, got {chunk_size!r}')
    total_length = len(data_df)
    # Stepping the start offset by chunk_size covers full chunks and the
    # shorter tail alike; slicing clamps the final stop to the input length.
    return [data_df[start:start + chunk_size] for start in range(0, total_length, chunk_size)]

In [None]:
# data = split_df_chunks(sessions, 100)

In [None]:
# data = np.array_split(sessions, 60)

In [None]:
# for n in data:
#     print(len(n))

In [None]:
# Day of month of the session start, used as the per-day grouping key.
# `.dt.day` yields the integer directly instead of formatting to text and
# parsing it back. The key is only unambiguous while the loaded sessions fall
# within a single calendar month, as the January query above ensures.
sessions['date_group'] = sessions.session_start.dt.day

In [None]:
# sessions.loc[sessions['date_group'] == 31, ['session_start', 'date_group']]

In [None]:
# Smallest and largest `date_group` present in the data. Despite the `week_`
# prefix these are day-of-month numbers.
week_min = sessions['date_group'].min()
week_max = sessions['date_group'].max()

In [None]:
print(sessions.date_group.unique().min())
print(sessions.date_group.unique().max())

In [None]:
print(sessions.id.nunique())
print(sessions.guest_id.nunique())

In [None]:
sessions.info()

In [None]:
# One frame per day between week_min and week_max (inclusive), each with a
# fresh 0..n-1 index and missing values replaced by 0.
data = [
    sessions[sessions['date_group'] == day].reset_index(drop=True).fillna(0)
    for day in range(week_min, week_max + 1)
]

In [None]:
sessions = sessions[['id', 'source_id', 'medium_id', 'campaign_id', 'device_type', 'browser_family',
                     'ipcountry', 'language', 'is_bounce', 'is_returning', 'duration', 'date_group']]

In [None]:
# Swap the numeric source / medium / campaign ids for the matching dictionary
# rows: each left merge brings in the dictionary's columns, after which the id
# key and the dictionary's own `id` column (suffixed `_y` because `sessions`
# already has an `id`) are dropped.
sessions = sessions.merge(sou, how='left', left_on='source_id', right_on='id', suffixes=(None, '_y')).drop(['source_id', 'id_y'], axis=1)
sessions = sessions.merge(med, how='left', left_on='medium_id', right_on='id', suffixes=(None, '_y')).drop(['medium_id', 'id_y'], axis=1)
sessions = sessions.merge(camp, how='left', left_on='campaign_id', right_on='id', suffixes=(None, '_y')).drop(['campaign_id', 'id_y'], axis=1)
sessions.head()

In [None]:
# Per-day frames of the reduced feature set: one entry for each day between
# week_min and week_max (inclusive), re-indexed from 0 with NaNs replaced by 0.
data1 = [
    sessions[sessions['date_group'] == day].reset_index(drop=True).fillna(0)
    for day in range(week_min, week_max + 1)
]

In [None]:
l = len(data1[0].columns)

In [None]:
# Association matrix for each daily frame. The keyword arguments request
# Cramer's V for nominal-nominal pairs, Pearson correlation for
# numeric-numeric pairs and the correlation ratio for nominal-numeric pairs.
# Only the 'corr' entry of each result is kept.
# NOTE(review): `compute_only=True` presumably skips plotting and
# `mark_columns=True` presumably tags column labels with their type - confirm
# against the dython documentation.
dfc = []
for i in data1:
    assc = nominal.associations(i, compute_only=True, nom_nom_assoc='cramer', num_num_assoc='pearson', nom_num_assoc='correlation_ratio', mark_columns=True)
    dfc.append(assc['corr'])

In [None]:
dfc[4]

In [None]:
def create_lofl(x, y, n=None):
    """Build a ``y``-by-``x`` nested list whose cells each start as a copy of ``n``.

    Parameters
    ----------
    x : int
        Number of cells in every inner list (columns).
    y : int
        Number of inner lists (rows).
    n : object, optional
        Initial value of every cell. Defaults to an empty list.

    Returns
    -------
    list[list]
        ``y`` lists of ``x`` cells. Every cell holds its own deep copy of
        ``n``, so mutating one cell never affects another cell, the caller's
        ``n``, or the result of a later call.
    """
    # A `None` sentinel replaces a mutable default, which would be shared
    # between calls.
    if n is None:
        n = []
    return [[copy.deepcopy(n) for _ in range(x)] for _ in range(y)]

In [None]:
dum_list = create_lofl(l, l, [0])

In [None]:
l = len(dfc[0].columns)
lists = copy.deepcopy(dum_list)
l

In [None]:
# For every feature pair (i, j), collect the association value from each
# matrix in `dfc`, in the order the matrices appear.
n=0  # NOTE(review): never read in this cell
for d in dfc:
    li = d.values.tolist()
    for i in range(0, l):

        for j in range(0, l):
            # Rebinding with `+` (rather than appending in place) keeps the
            # cells independent even if several of them start out as
            # references to one shared list object.
            lists[i][j] = lists[i][j] + [li[i][j]]

In [None]:
for i in range(l):
        for j in range(l):
            lists[i][j] = lists[i][j][1:]

In [None]:
qlists = copy.deepcopy(lists)

In [None]:
# Replace each pair's series of per-period values in `qlists` with a
# two-element band: [5th percentile, 95th percentile].
q1 = 0
q9 = 0
for m in range(0, l):
        for b in range(0, l):
            q1 = np.quantile(qlists[m][b], .05)  # lower bound of the band
            q9 = np.quantile(qlists[m][b], .95)  # upper bound of the band
            qlists[m][b] = [q1, q9]

In [None]:
anlist = copy.deepcopy(dum_list)
listsc = copy.deepcopy(lists)
qlistc = copy.deepcopy(qlists)

In [None]:
# For every feature pair, record the periods whose association value lies
# strictly outside that pair's [lower, upper] band from `qlistc`.
for i in range(l):
        for j in range(l):
            it = 0  # 1-based position of the current value within the series
            for k in listsc[i][j]:
                it += 1
                if ((k < qlistc[i][j][0]) | (k > qlistc[i][j][1])):
                    # Rebinding with `+` gives this cell its own list.
                    anlist[i][j] = anlist[i][j] + [it] 

In [None]:
for i in range(l):
        for j in range(l):
            anlist[i][j] = anlist[i][j][1:]

In [None]:
# Per-period series for one hard-coded feature pair (row 1, column 5 of the
# association matrices), indexed by day, plus constant reference lines.
figx = pd.DataFrame(lists[1][5], columns=['cor_coef'], index=range(week_min ,week_max + 1))
figx['mean'] = figx.cor_coef.mean()
# NOTE: despite their names, 'q01' and 'q99' hold the 5th and 95th percentiles.
figx['q01'] = np.quantile(figx.cor_coef, .05)
figx['q99'] = np.quantile(figx.cor_coef, .95)

In [None]:
figx.plot()

In [None]:
import matplotlib.pyplot as plt

fig, axes = plt.subplots()
ax1 = figx.plot(use_index=True, y='cor_coef',  kind='line', ax=axes, figsize=(20, 5), title='device_type (nom) - is_bounce (con) for periods', color='cornflowerblue')
ax2 = figx.plot(y='mean', ax=axes, color='green')
ax3 = figx.plot(y='q99', ax=axes, color='darkviolet')
ax4 = figx.plot(y='q01', ax=axes, color='orangered')

In [None]:
dfanlist = pd.DataFrame(anlist,index=dfc[0].columns,columns=dfc[0].columns)
dfanlist

In [None]:
dfanlist.to_csv('feat_coran_und95_202301.csv')

In [None]:
# Flatten every pair's list of flagged periods into one list, row by row.
k = [period for i in range(0, l) for j in range(0, l) for period in anlist[i][j]]

In [None]:
# %%time
# query_sessions6 = f'''
#     select 
#     id as beh_id,
#     guest_id,
#     last_session_id,
#     session_start as last_session_start,
#     session_end as last_session_end
#     from data.customer_profile_behaviour cpb 

#     left join(
#     select id as ses_id, session_start, session_end from data.customer_profile_sessions cps
#     where cps.account_id = {ACCOUNT_ID}
#     ) cps on cpb.last_session_id = cps.ses_id


#     where cpb.account_id = {ACCOUNT_ID} and cps.session_start >= '2023-01-01' and cps.session_start < '2023-02-01' 
# '''

# newd = pd.read_sql_query(query_sessions6, engine)

In [None]:
# newd

In [None]:
# Number of anomaly flags per period, one row per period, ordered by period.
# The counts series is named before `reset_index()`, so the column is always
# `anomaly_counts` regardless of how the installed pandas version labels
# `value_counts()` output.
k1 = (
    pd.DataFrame(k, columns=['period'])
    .value_counts()
    .rename('anomaly_counts')
    .reset_index()
    .sort_values('period')
    .reset_index(drop=True)
)

In [None]:
k1['first_session'] = ''
k1['last_session'] = ''
k1['conversion_rate'] = 0
k1

In [None]:
# Fill the first/last session time and conversion rate for every period listed
# in `k1`.
# Rows are matched by the period value, not by loop position: `k1['period']`
# holds 1-based positions into the per-day lists, so `data[period - 1]` is that
# period's frame and `week_min + period - 1` is its day of month. This stays
# correct when the data does not start on day 1, when some period has no
# anomaly row in `k1`, or when a day has no add-to-basket sessions.
first_loc = k1.columns.get_loc('first_session')
last_loc = k1.columns.get_loc('last_session')
conv_loc = k1.columns.get_loc('conversion_rate')

for row_pos, period in enumerate(k1['period']):
    day_sessions = data[period - 1]
    day_of_month = week_min + period - 1
    atb_match = atb_per.loc[atb_per['period'] == day_of_month, 'atb_counts']
    atbc = atb_match.iloc[0] if len(atb_match) else 0  # no basket sessions that day
    ovrl = len(day_sessions)
    k1.iloc[row_pos, first_loc] = day_sessions['session_start'].min()
    k1.iloc[row_pos, last_loc] = day_sessions['session_start'].max()
    # Guard against a day without sessions instead of dividing by zero.
    k1.iloc[row_pos, conv_loc] = atbc / ovrl if ovrl else 0

In [None]:
k1['duration'] = k1['last_session'] - k1['first_session']
k1['duration'] = k1['duration'].apply(lambda x: x.total_seconds() / 60 / 60)

In [None]:
k1

In [None]:
k1.to_csv('period_counts_und95_202301.csv')

In [None]:
week_max

In [None]:
atb_per.iloc[0]['atb_counts']

## Timeline analysis

In [44]:
from dateutil import rrule
from datetime import timedelta

In [45]:
%%time
# Reload the account's non-garbage sessions, this time starting one day
# earlier (2022-12-31) than the January-only query.
query_sessions = f'''
    select *
    from data.customer_profile_sessions cps 

    left join(
    select id as beh_id,guest_id, customer_profile_id from data.customer_profile_behaviour cpb
    where cpb.account_id = {ACCOUNT_ID}
    ) cpb on cpb.beh_id = cps.customer_profile_behaviour_id


    where cps.account_id = {ACCOUNT_ID} and cps.garbage_session = False and cps.session_start >= '2022-12-31' and cps.session_start < '2023-02-01' 
'''

sessions = pd.read_sql_query(query_sessions, engine)


# Journey-status rows for the account, created between 2022-12-25 and
# 2023-02-05.
query_sessions1 = f'''
    select *
    from data.customer_profile_session_journey_statuses
    where account_id = {ACCOUNT_ID} 
    and created >= '2022-12-25' and created < '2023-02-05'

'''
sessions1 = pd.read_sql_query(query_sessions1, engine)

# Full table of journey statuses (not filtered by account).
query_sessions2 = f'''
    select *
    from data.customer_journey_statuses
    
'''
statuses = pd.read_sql_query(query_sessions2, engine)

CPU times: user 807 ms, sys: 45.3 ms, total: 853 ms
Wall time: 16.5 s


In [46]:
len(sessions[(sessions.channel == 'Paid Search') & (sessions.add_to_basket_count >= 1)])

0

In [47]:
# Session length in seconds, computed with the vectorised timedelta accessor
# rather than a per-row Python lambda.
sessions['duration'] = (sessions['session_end'] - sessions['session_start']).dt.total_seconds()

In [48]:
statuses = statuses.rename(columns={'id': 'journey_status_id'})

In [49]:
sessions1 = sessions1.merge(statuses[['journey_status_id', 'status_name']], how='left', on='journey_status_id')

In [50]:
sessions.id.nunique()

25740

In [51]:
sessions = sessions.merge(sessions1[['id', 'journey_status_id', 'created', 'status_name']], how='inner', on='id')
sessions.id.nunique()

25740

In [52]:
chanls = sessions.channel.unique().tolist()
chanls

['Organic Search', 'Social', 'Direct', 'Referral', 'Email', 'Paid Search']

In [53]:
# Sanity check of timestamp arithmetic: the first session start shifted back
# by one day.
# Uses the directly imported `timedelta` class. `datetime.timedelta` resolves
# only when the name `datetime` is bound to the module, whereas the notebook's
# import cell binds it to the `datetime` class.
sessions['session_start'][0] - timedelta(days=1)

Timestamp('2023-01-30 23:57:42.272000')

In [81]:
# date1 = '2022-12-31'
# date1 = datetime.datetime.strptime(date1, '%Y-%m-%d').date()
# print(date1)
# now = date1
# then = now + timedelta(days=31)
# print(then)

2022-12-31
2023-01-31


In [54]:
# Build trailing 24-hour windows that advance in one-hour steps across January
# 2023: for every hour `dt` from 2023-01-01 00:00 up to and including 31 days
# later, keep the sessions with dt - 1 day < session_start <= dt.
data2 = []
# Constructed through pandas so the cell does not depend on whether the name
# `datetime` is currently bound to the module or to the class.
now = pd.Timestamp('2023-01-01').to_pydatetime()
then = now + timedelta(days=31)
window_dates = []

for dt in rrule.rrule(rrule.HOURLY, dtstart=now, until=then):
    window_start = dt - timedelta(days=1)
    row = sessions[(sessions['session_start'] > window_start) & (sessions['session_start'] <= dt)]
    data2.append(row)
    # [window start (exclusive), window end (inclusive)], same order as data2.
    window_dates.append([window_start, dt])

In [55]:
window_dates

[[datetime.datetime(2022, 12, 31, 0, 0), datetime.datetime(2023, 1, 1, 0, 0)],
 [datetime.datetime(2022, 12, 31, 1, 0), datetime.datetime(2023, 1, 1, 1, 0)],
 [datetime.datetime(2022, 12, 31, 2, 0), datetime.datetime(2023, 1, 1, 2, 0)],
 [datetime.datetime(2022, 12, 31, 3, 0), datetime.datetime(2023, 1, 1, 3, 0)],
 [datetime.datetime(2022, 12, 31, 4, 0), datetime.datetime(2023, 1, 1, 4, 0)],
 [datetime.datetime(2022, 12, 31, 5, 0), datetime.datetime(2023, 1, 1, 5, 0)],
 [datetime.datetime(2022, 12, 31, 6, 0), datetime.datetime(2023, 1, 1, 6, 0)],
 [datetime.datetime(2022, 12, 31, 7, 0), datetime.datetime(2023, 1, 1, 7, 0)],
 [datetime.datetime(2022, 12, 31, 8, 0), datetime.datetime(2023, 1, 1, 8, 0)],
 [datetime.datetime(2022, 12, 31, 9, 0), datetime.datetime(2023, 1, 1, 9, 0)],
 [datetime.datetime(2022, 12, 31, 10, 0),
  datetime.datetime(2023, 1, 1, 10, 0)],
 [datetime.datetime(2022, 12, 31, 11, 0),
  datetime.datetime(2023, 1, 1, 11, 0)],
 [datetime.datetime(2022, 12, 31, 12, 0),
  

In [56]:
data2[2]['session_start'].head()

25120   2023-01-01 01:40:39.297
25121   2023-01-01 01:38:37.675
25122   2023-01-01 01:37:40.883
25123   2023-01-01 01:31:20.288
25124   2023-01-01 01:22:57.111
Name: session_start, dtype: datetime64[ns]

In [57]:
len(data2)

745

In [58]:
data2[24]['session_start'].max()

Timestamp('2023-01-01 23:57:11.512000')

In [59]:
# Empty summary table with one row per rolling window: overall metrics first,
# then bounce / conversion / duration for each channel.
summary_columns = ['period', 'period_len', 'period_begin', 'period_end', 'first_session', 'last_session',
                   'bounce_rate', 'conversion_rate', 'med_duration']
summary_columns += [
    '{0}_{1}'.format(metric, channel)
    for metric in ('bounce', 'conversion', 'duration')
    for channel in ('organic', 'social', 'direct', 'referral', 'email', 'paid')
]
atable = pd.DataFrame(columns=summary_columns, index=list(range(len(data2))))
atable

Unnamed: 0,period,period_len,period_begin,period_end,first_session,last_session,bounce_rate,conversion_rate,med_duration,bounce_organic,...,conversion_direct,conversion_referral,conversion_email,conversion_paid,duration_organic,duration_social,duration_direct,duration_referral,duration_email,duration_paid
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
740,,,,,,,,,,,...,,,,,,,,,,
741,,,,,,,,,,,...,,,,,,,,,,
742,,,,,,,,,,,...,,,,,,,,,,
743,,,,,,,,,,,...,,,,,,,,,,


In [60]:
chanls

['Organic Search', 'Social', 'Direct', 'Referral', 'Email', 'Paid Search']

In [61]:
# Fill the overall metrics for every rolling window.
# Column positions are looked up once instead of on every assignment.
col = {
    name: atable.columns.get_loc(name)
    for name in ('period', 'period_len', 'period_begin', 'period_end', 'first_session',
                 'last_session', 'bounce_rate', 'conversion_rate', 'med_duration')
}

for per, sch in enumerate(data2):
    le = len(sch)
    bounc = len(sch[sch.status_name == 'Bounce'])
    atb = len(sch[sch.add_to_basket_count >= 1])

    atable.iloc[per, col['period']] = per
    atable.iloc[per, col['period_len']] = le
    atable.iloc[per, col['period_begin']] = window_dates[per][0]
    atable.iloc[per, col['period_end']] = window_dates[per][1]
    atable.iloc[per, col['first_session']] = sch.session_start.min()
    atable.iloc[per, col['last_session']] = sch.session_start.max()
    # A window without sessions gets rate 0 rather than raising
    # ZeroDivisionError, matching how the per-channel metrics treat an empty
    # selection.
    atable.iloc[per, col['bounce_rate']] = bounc / le if le else 0
    atable.iloc[per, col['conversion_rate']] = atb / le if le else 0
    atable.iloc[per, col['med_duration']] = sch.duration.median()

In [62]:
# Fill bounce rate, conversion rate and median duration per channel for every
# rolling window.
# Maps the column suffix to the value stored in `sessions.channel`.
channel_labels = {
    'organic': 'Organic Search',
    'social': 'Social',
    'direct': 'Direct',
    'referral': 'Referral',
    'email': 'Email',
    'paid': 'Paid Search',
}
ch = list(channel_labels)

for n, channel_name in channel_labels.items():
    # Column positions depend only on the channel, so resolve them once.
    bounce_loc = atable.columns.get_loc('bounce_{0}'.format(n))
    conversion_loc = atable.columns.get_loc('conversion_{0}'.format(n))
    duration_loc = atable.columns.get_loc('duration_{0}'.format(n))

    for per, sch in enumerate(data2):
        cat = sch[sch.channel == channel_name]
        le = len(cat)
        bounc = len(cat[cat.status_name == 'Bounce'])
        atb = len(cat[cat.add_to_basket_count >= 1])

        # A window with no sessions on this channel gets 0 for both rates.
        atable.iloc[per, bounce_loc] = bounc / le if le else 0
        atable.iloc[per, conversion_loc] = atb / le if le else 0
        # Median duration, or 0 when the durations sum to zero (which includes
        # the empty selection).
        atable.iloc[per, duration_loc] = cat.duration.median() if cat.duration.sum() != 0 else 0

In [63]:
atable

Unnamed: 0,period,period_len,period_begin,period_end,first_session,last_session,bounce_rate,conversion_rate,med_duration,bounce_organic,...,conversion_direct,conversion_referral,conversion_email,conversion_paid,duration_organic,duration_social,duration_direct,duration_referral,duration_email,duration_paid
0,0,591,2022-12-31 00:00:00,2023-01-01 00:00:00,2022-12-31 00:01:16.677000,2022-12-31 23:56:44.223000,0.43824,0.013536,26.946,0.326087,...,0.028571,0.0,0,0,39.781,26.108,32.073,36.59,0,0
1,1,591,2022-12-31 01:00:00,2023-01-01 01:00:00,2022-12-31 01:02:14.635000,2023-01-01 00:56:04.126000,0.431472,0.013536,25.948,0.319149,...,0.028037,0.0,0,0,46.177,24.912,30.802,36.59,0,0
2,2,595,2022-12-31 02:00:00,2023-01-01 02:00:00,2022-12-31 02:12:13.908000,2023-01-01 01:40:39.297000,0.428571,0.015126,26.304,0.319149,...,0.037383,0.0,0,0,46.177,25.61,28.492,36.59,0,0
3,3,601,2022-12-31 03:00:00,2023-01-01 03:00:00,2022-12-31 03:01:24.285000,2023-01-01 02:58:40.748000,0.427621,0.014975,26.304,0.319149,...,0.037037,0.0,0,0,46.177,25.587,29.647,36.59,0,0
4,4,610,2022-12-31 04:00:00,2023-01-01 04:00:00,2022-12-31 04:26:06.427000,2023-01-01 03:56:17.595000,0.42623,0.014754,27.0905,0.326087,...,0.037037,0.0,0,0,50.9915,26.108,29.647,36.59,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
740,740,870,2023-01-30 20:00:00,2023-01-31 20:00:00,2023-01-30 20:00:06.998000,2023-01-31 19:59:01.974000,0.389655,0.014943,49.263,0.226415,...,0.007246,0.0,0.0,0.0,107.542,46.327,41.3925,7.8335,99.317,39.186
741,741,873,2023-01-30 21:00:00,2023-01-31 21:00:00,2023-01-30 21:01:43.586000,2023-01-31 20:59:41.697000,0.386025,0.016037,48.503,0.23301,...,0.014599,0.0,0.0,0.0,109.856,45.3055,34.756,8.965,611.123,39.186
742,742,865,2023-01-30 22:00:00,2023-01-31 22:00:00,2023-01-30 22:01:21.007000,2023-01-31 21:59:37.877000,0.387283,0.016185,47.899,0.227723,...,0.014388,0.0,0.0,0.0,101.036,45.106,39.911,12.0805,611.123,39.186
743,743,842,2023-01-30 23:00:00,2023-01-31 23:00:00,2023-01-30 23:00:39.621000,2023-01-31 22:59:27.659000,0.397862,0.014252,44.0225,0.234043,...,0.014388,0.0,0.0,0.0,105.446,42.467,31.757,12.0805,611.123,39.186


In [64]:
metlist = ['bounce_rate', 'conversion_rate', 'med_duration',
                              'bounce_organic', 'bounce_social', 'bounce_direct', 'bounce_referral', 'bounce_email', 'bounce_paid',
                              'conversion_organic', 'conversion_social', 'conversion_direct', 'conversion_referral', 'conversion_email', 'conversion_paid',
                              'duration_organic', 'duration_social', 'duration_direct', 'duration_referral', 'duration_email', 'duration_paid']

In [65]:
# For every metric, compute its 5th / 95th percentile over all windows, flag
# the windows outside that band in an `an_<metric>` column of `atable`, and
# record the band in `qframe`.
qframe = pd.DataFrame(columns=['metric', 'bot_line', 'upp_line'], index=list(range(len(metlist))))
metric_loc = qframe.columns.get_loc('metric')
bot_loc = qframe.columns.get_loc('bot_line')
upp_loc = qframe.columns.get_loc('upp_line')

for metn, n in enumerate(metlist):
    flag_col = 'an_{}'.format(n)
    atable[flag_col] = 0
    q95 = np.quantile(atable[n], .95)
    q05 = np.quantile(atable[n], .05)
    outside_band = (atable[n] > q95) | (atable[n] < q05)
    atable.loc[outside_band, flag_col] = 1

    qframe.iloc[metn, metric_loc] = n
    qframe.iloc[metn, bot_loc] = q05
    qframe.iloc[metn, upp_loc] = q95
    print(n, q95, q05)

bounce_rate 0.5393833743505606 0.3347618957238028
conversion_rate 0.021301775147928994 0.006998556998556999
med_duration 49.089099999999995 13.2105
bounce_organic 0.3333333333333333 0.15
bounce_social 0.5586699679174222 0.31072845528455284
bounce_direct 0.6368297372356333 0.36363636363636365
bounce_referral 1.0 0.125
bounce_email 0.8888888888888888 0.0
bounce_paid 0.5 0.0
conversion_organic 0.06013306959180001 0.0
conversion_social 0.01867131030860869 0.00407942345334836
conversion_direct 0.03759398496240601 0.0
conversion_referral 0.1 0.0
conversion_email 0.09090909090909091 0.0
conversion_paid 0.0 0.0
duration_organic 163.52229999999997 62.92680000000001
duration_social 50.3401 10.4256
duration_direct 34.64039999999997 7.84
duration_referral 219.49009999999964 2.258
duration_email 559.1400000000001 0.0
duration_paid 2843.022 0.0


In [82]:
qframe

Unnamed: 0,metric,bot_line,upp_line
0,bounce_rate,0.334762,0.539383
1,conversion_rate,0.006999,0.021302
2,med_duration,13.2105,49.0891
3,bounce_organic,0.15,0.333333
4,bounce_social,0.310728,0.55867
5,bounce_direct,0.363636,0.63683
6,bounce_referral,0.125,1.0
7,bounce_email,0.0,0.888889
8,bounce_paid,0.0,0.5
9,conversion_organic,0.0,0.060133


In [33]:
qframe.to_csv('metric_lines_und202301.csv')

In [86]:
# Number of metrics flagged as anomalous in each window.
# The flag columns are selected by their `an_` prefix rather than by position:
# a positional slice such as `iloc[:, -21:-1]` leaves out the last flag column
# and selects a different set once `anomaly_coeff` itself has been appended
# (e.g. when this cell is re-run).
an_cols = [c for c in atable.columns if str(c).startswith('an_')]
atable['anomaly_coeff'] = atable[an_cols].sum(axis=1)

In [87]:
len(atable)

745

In [89]:
len(atable[atable['anomaly_coeff'] > 0])

529

In [91]:
len(data2)

745

In [97]:
sort_data = (atable[atable['anomaly_coeff'] > 0].index.tolist())

In [102]:
sort_data

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 41,
 52,
 53,
 54,
 55,
 56,
 59,
 61,
 62,
 63,
 64,
 65,
 66,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 110,
 111,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 192,
 193,
 194,
 195,
 196,
 197,
 198,
 199,
 200,
 201,
 222,
 223,
 224,
 225,
 226,
 227,
 228,
 229,
 230,
 231,
 232,
 234,
 235,
 236,
 237,
 238,
 239,
 240,
 241,
 242,
 243,
 244,
 245,
 246,
 247,
 248,
 249,
 250,
 251,
 252,
 253,
 254,
 255,
 256,
 257,
 258,
 259,
 260,
 261,
 269,
 273,
 274,
 275,
 276,
 277,
 2

In [98]:
# Session frames of the windows that have at least one anomaly flag, in the
# order given by `sort_data`.
data3 = [data2[window_idx] for window_idx in sort_data]

In [105]:
data3[1]['duration']

25132       2.162
25133       6.485
25134       3.156
25135       6.982
25136     499.410
           ...   
25718    1687.398
25719      16.340
25720       9.755
25721       0.757
25722       6.120
Name: duration, Length: 591, dtype: float64

In [35]:
# atable['anomaly_coeff'] = atable.apply(['an_bounce_rate', 'an_conversion_rate', 'an_avg_duration',
#                               'an_bounce_organic', 'an_bounce_social', 'an_bounce_direct', 'an_bounce_referral', 'an_bounce_email', 'an_bounce_paid',
#                               'an_conversion_organic', 'an_conversion_social', 'an_conversion_direct', 'an_conversion_referral', 'an_conversion_email', 'an_conversion_paid',
#                               'an_duration_organic', 'an_duration_social', 'an_duration_direct', 'an_duration_referral', 'an_duration_email', 'an_duration_paid'].sum()

In [37]:
atable.to_csv('und2023_01.csv')