In [None]:
import sys
import os
import gc
import traceback
import json
import time

# Vis
import matplotlib.pyplot as plt
import seaborn as sns

# handling
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from statistics import mode, StatisticsError
from sklearn.preprocessing import MinMaxScaler
from collections import Counter 
from typing import List
from tsfresh.feature_extraction import extract_features, ComprehensiveFCParameters
from scipy.stats import spearmanr, pearsonr
# ETC
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Print numpy floats with two decimal places.
np.set_printoptions(formatter={'float_kind': lambda x: "{0:0.2f}".format(x)}) # change decimal print format

# 1. Data Import

In [None]:
def read_files(today, input_path, dates, platform_total = False):
    """Load the daily JSON dumps for the `dates` days that precede `today`.

    One file per day is expected, named ``<yymmdd>.json``, covering
    ``today - 1 day`` back to ``today - dates days``. Days whose file cannot
    be read are reported and skipped.

    Args:
        today: Reference datetime; the reference day itself is not loaded.
        input_path: Directory that holds the daily files.
        dates: Number of past days to load.
        platform_total: When truthy, the first three '/'-separated parts of
            `input_path` are kept and both ``<base>/android`` and
            ``<base>/ios`` are read instead of `input_path` itself.

    Returns:
        A single DataFrame with all loaded days concatenated (index reset).

    Raises:
        ValueError: If not a single daily file could be loaded.
    """
    if platform_total:
        # Same derivation as before: e.g. '../in/coke/android' -> '../in/coke'.
        base_dir = '/'.join(input_path.split('/')[:3])
        directories = [base_dir + '/android', base_dir + '/ios']
    else:
        directories = [input_path]

    daily_frames = []
    for directory in directories:
        for days_back in range(1, dates + 1):
            day = today - timedelta(days=days_back)
            filename = day.strftime('%y%m%d') + '.json'
            try:
                daily_frames.append(pd.read_json(directory + '/' + filename))
            except (ValueError, FileNotFoundError):
                # Depending on the pandas version a missing file surfaces as
                # ValueError or FileNotFoundError; both mean "no data that day".
                print("Not enough data to load.")

    if not daily_frames:
        raise ValueError('No daily JSON file could be loaded from: '
                         + ', '.join(directories))
    return pd.concat(daily_frames).reset_index(drop = True)

def set_date_range(start, end, df):
    """Select the sessions whose first event lies in ``[start, end)``.

    Only the calendar day of `start` and `end` is used; any time-of-day on
    the bounds is ignored.

    Side effect: a 'real_date' column (timestamp of each row's first event)
    is written onto `df`, as callers read it afterwards.

    Args:
        start: Inclusive lower bound (object with year/month/day).
        end: Exclusive upper bound (object with year/month/day).
        df: Session frame with an 'events' column; each cell is a list of
            dicts and the first dict carries a 'date' entry.

    Returns:
        An independent copy of the selected rows, so callers can add columns
        without triggering pandas' SettingWithCopyWarning.
    """
    start_point = pd.Timestamp(year=start.year, month=start.month, day=start.day)
    end_point = pd.Timestamp(year=end.year, month=end.month, day=end.day)

    # The session-level 'date' field is unreliable, so the first event's
    # timestamp is used as the session time.
    first_event = df['events'].apply(lambda events: events[0]['date'])
    df['real_date'] = pd.to_datetime(first_event)

    in_range = (df['real_date'] >= start_point) & (df['real_date'] < end_point)
    return df.loc[in_range].copy()

# 2. Basic preprocess

In [None]:
# preprocess function
def basic_preprocess(df):
    """Run the basic cleaning pipeline on the raw session frame.

    Steps, in order: parse 'date', convert 'duration' from milliseconds to
    whole seconds, replace the date columns with the first-event timestamp
    (`real_date`), drop rows with a non-numeric duration, drop duration
    outliers, and sort by 'date'.

    The `make_end_date` step is intentionally not part of the pipeline.
    Note that the duration outlier filter IS applied here.

    Args:
        df: Raw session frame; 'date' and 'duration' are rewritten in place.

    Returns:
        The cleaned frame sorted by 'date'.
    """
    df['date'] = pd.to_datetime(df['date'])
    df['duration'] = round(df['duration'] / 1000)

    with_event_dates = real_date(df)
    numeric_only = numeric_anomaly_detection(with_event_dates, 'duration')
    without_outliers = duration_outlier_detection(numeric_only)
    return without_outliers.sort_values('date')

def real_date(df) :
    """Replace the unreliable session date with the first event's timestamp.

    Writes three columns onto `df` and returns the same frame:
      * 'real_date' - timestamp of the first entry in 'events'
      * 'date'      - calendar date (datetime.date) of 'real_date'
      * 'date_ymd'  - a copy of 'real_date'
    """
    def _first_event_stamp(events):
        return events[0]['date']

    df['real_date'] = pd.to_datetime(df['events'].apply(_first_event_stamp))

    # Keep only the calendar day in 'date'.
    df['date'] = df['real_date'].apply(lambda stamp : stamp.date())
    df['date_ymd'] = df['real_date']
    return df

def make_end_date(df):
    """Add 'end_date' = 'date' + 'duration' seconds (fraction truncated) to `df`."""
    def _as_offset(seconds):
        return timedelta(seconds=int(seconds))

    offsets = df['duration'].apply(_as_offset)
    df['end_date'] = df['date'] + offsets
    return df

def duration_outlier_detection(df):
    """Drop duration outliers and non-positive durations.

    * Strongly skewed data (|skew| > 1): keep rows strictly below the
      99.9th percentile of 'duration'.
    * Otherwise: keep rows inside the 1.5 * IQR fences (inclusive).

    In both cases rows with 'duration' <= 0 are removed afterwards.
    """
    durations = df['duration']

    if abs(durations.skew()) > 1:
        upper_cut = durations.quantile(0.999)
        keep = durations < upper_cut
    else:
        lower_quartile = durations.quantile(0.25)
        upper_quartile = durations.quantile(0.75)
        spread = upper_quartile - lower_quartile  # inter-quartile range
        keep = (durations >= lower_quartile - 1.5 * spread) & (durations <= upper_quartile + 1.5 * spread)

    trimmed = df.loc[keep]
    return trimmed.loc[trimmed['duration'] > 0]

def find_datetime(x):
    """Return True only when `x` is exactly a pandas Timestamp (subclasses and NaT excluded)."""
    return type(x) == pd.Timestamp

def is_digit(x):
    """Return True when `x` can be converted with ``float(x)``, else False.

    Values that ``float`` rejects with a TypeError (None, lists, dicts) or an
    OverflowError are treated as non-numeric instead of raising, so a single
    malformed cell cannot abort the anomaly filter.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

def date_anomaly_detection(df,date_cols):
    """Keep only the rows whose date column(s) hold pandas Timestamps.

    Args:
        df: Frame to filter.
        date_cols: A single column name, or a list of column names that must
            all hold Timestamps. Any other type leaves `df` untouched.

    Returns:
        The filtered frame. Rows are selected with a boolean mask in both
        branches, so a duplicated index label never removes valid rows.
    """
    if isinstance(date_cols, str):
        is_valid = df[date_cols].apply(find_datetime)
    elif isinstance(date_cols, list):
        is_valid = df[date_cols].apply(
            lambda row: all(find_datetime(cell) for cell in row), axis=1)
    else:
        return df
    return df[is_valid]

def numeric_anomaly_detection(df,num_cols):
    """Keep only the rows whose numeric column(s) pass `is_digit`.

    `num_cols` may be one column name or a list of names (every listed
    column must pass); any other type returns `df` unchanged.
    """
    if isinstance(num_cols, list):
        rows_ok = df[num_cols].apply(
            lambda row : all([is_digit(cell) for cell in row]), axis=1)
        return df[rows_ok]

    if isinstance(num_cols, str):
        cells_ok = df[num_cols].apply(lambda cell : is_digit(cell))
        return df[cells_ok]

    return df


# 3. Select Target
### (New-visitors vs Re-visitors)

In [None]:
# 1) divide time range
## (1) make date to real date
def real_date(df) :
    """Replace the unreliable session date with the first event's timestamp.

    Writes 'real_date' (timestamp of the first entry in 'events') and 'date'
    (its calendar date) onto `df` and returns the same frame.

    NOTE(review): this redefines the `real_date` declared in the
    "Basic preprocess" section and, unlike that version, does not create a
    'date_ymd' column. Whichever cell ran last wins - confirm which variant
    is intended and remove the other.
    """
    def _first_event_stamp(events):
        return events[0]['date']

    df['real_date'] = pd.to_datetime(df['events'].apply(_first_event_stamp))

    # Keep only the calendar day in 'date'.
    df['date'] = df['real_date'].apply(lambda stamp : stamp.date())
    return df

In [None]:
def abstract_events(x):
    """Return the 'name' of every event dict in `x`, in order."""
    return [event['name'] for event in x]

In [None]:
def divide_visitor(today, df, key_id, newb_period = 14, user_property = False) :
    """Split sessions into new visitors and returning visitors.

    A user is "new" when one of their sessions in the last `newb_period`
    days before `today` contains the marker event ('#appInstall', or
    'signUp' when `user_property` is not False).

    Args:
        today: End (exclusive) of the newcomer window.
        df: Session frame with 'events' and the `key_id` column.
            `set_date_range` adds a 'real_date' column to it.
        key_id: Name of the user-id column.
        newb_period: Length of the newcomer window in days.
        user_property: False -> '#appInstall' marker, otherwise 'signUp'.

    Returns:
        (df_new, df_re): sessions of new users inside the window (with an
        'abs_events' column), and all sessions of the remaining users.
    """
    # 1) sessions inside the newcomer window; copied so a column can be added safely
    df_last = set_date_range(today - timedelta(newb_period), today, df).copy()

    # 2) event names per session
    df_last['abs_events'] = df_last['events'].apply(abstract_events)

    # 3) sessions that contain the marker event
    standard_event = '#appInstall' if user_property == False else 'signUp'
    new_idx = df_last['abs_events'].apply(lambda names : standard_event in names)

    # 4) split by user id
    df_new_visitor_id = pd.DataFrame({key_id : df_last.loc[new_idx, key_id].unique()})
    df_new = pd.merge(df_last, df_new_visitor_id,
                      on = key_id, how = 'right')
    # Anti-join: keep every session whose user is not in the new-visitor list.
    df_re = pd.merge(df, df_new_visitor_id,
                     on = key_id, how = 'outer', indicator = True)\
                    .query('_merge != "both"').drop(columns = ['_merge'])

    return df_new, df_re


def divide_visitor_v2(today, dates, df, key_id, newb_period = 7, user_property = False) :
    """Split sessions into a new-visitor cohort and returning visitors.

    The cohort consists of users with the marker event ('#appInstall', or
    'signUp' when `user_property` is not False) during the first
    `newb_period` days of the observation window, i.e. in
    ``[today - dates, today - dates + newb_period)``.
    Returning visitors are the users with no marker event anywhere in `df`.

    Side effects on `df`: an 'abs_events' column is added here and a
    'real_date' column is added by `set_date_range`.

    Args:
        today: End of the observation window.
        dates: Length of the observation window in days.
        df: Session frame with 'events' and the `key_id` column.
        key_id: Name of the user-id column.
        newb_period: Length of the cohort window in days.
        user_property: False -> '#appInstall' marker, otherwise 'signUp'.

    Returns:
        (df_new, df_re): all sessions of the cohort users, and all sessions
        of users that never triggered the marker event.
    """
    # 1) sessions of the cohort window; copied so a column can be added safely
    window_start = today - timedelta(dates)
    df_last = set_date_range(window_start,
                             window_start + timedelta(newb_period),
                             df).copy()

    # 2) event names per session
    df_last['abs_events'] = df_last['events'].apply(abstract_events)
    df['abs_events'] = df['events'].apply(abstract_events)

    # 3) sessions that contain the marker event
    standard_event = '#appInstall' if user_property == False else 'signUp'
    has_marker = lambda names : standard_event in names
    new_idx = df_last['abs_events'].apply(has_marker)
    delete_new_idx = df['abs_events'].apply(has_marker)

    # 4) split by user id
    ## (1) df_new: every session of the cohort users
    df_new_visitor_id_target = pd.DataFrame({key_id : df_last.loc[new_idx, key_id].unique()})
    df_new = pd.merge(df, df_new_visitor_id_target,
                      on = key_id, how = 'right')
    ## (2) df_re: anti-join against every user that ever had the marker event
    df_new_visitor_id_delete = pd.DataFrame({key_id : df.loc[delete_new_idx, key_id].unique()})
    df_re = pd.merge(df, df_new_visitor_id_delete,
                     on = key_id, how = 'outer', indicator = True)\
                    .query('_merge != "both"').drop(columns = ['_merge'])

    return df_new, df_re


# 4. Activity_index
 : act_index = duration + visit_cnt + conti_visit

In [None]:
def duration_mean(df) : 
    """Return the mean of the 'duration' column rounded to two decimals."""
    return round(df['duration'].mean(), 2)

# def duration_sum(df) : 
#     duration_sum = df['duration'].sum()
#     return duration_sum

def act_days_last7(df, date) :
    """Count the distinct days-of-month present in df['date'].

    The frame is expected to already be limited to the wanted period, so no
    range filtering happens here; `date` is accepted but not used.
    """
    day_of_month = df['date'].map(lambda stamp : stamp.day)
    return len(day_of_month.unique())

def conti_act_days_last7(df, date): 
    """Return the longest run of consecutive active days in df['date'].

    A single active day, or days that are never adjacent, give 0; a run of
    k >= 2 adjacent calendar days gives k.

    Days are compared through their ordinal, so a run that crosses a month
    boundary (e.g. May 31 -> June 1) is counted correctly and the result
    does not depend on the row order of `df`. `date` is accepted for
    interface compatibility but not used.
    """
    ordinals = sorted({day.toordinal() for day in df['date']})

    longest = 0
    run = 1
    for previous, current in zip(ordinals, ordinals[1:]):
        if current - previous == 1:
            run += 1
            longest = max(longest, run)
        else:
            run = 1
    return longest

In [None]:
# make weekly_visit_cnt & weekly_activity_index
def weekly_visit_act (df) :
    """Add per-user weekly visit count and mean weekly activity index.

    Columns 1..4 of `df` (the columns right after the id column) are read as
    weekly activity indices, where 0 means "no visit that week".

    Adds to `df` and returns it:
      * 'visit_week_cnt' - number of weekly columns that are non-zero
      * 'act_index_mean' - sum of the weekly indices / visit_week_cnt,
        rounded to 3 decimals (NaN when the user has no visited week)

    The weekly columns are captured before the new columns are added, so the
    result is also correct when fewer than four weekly columns exist.
    """
    weekly = df[df.columns[1:5]]
    df['visit_week_cnt'] = (weekly != 0).sum(axis=1)
    df['act_index_mean'] = round(weekly.sum(axis=1) / df['visit_week_cnt'], 3)
    return df 

# act_index labeling [H/M/L]
def act_index_labeling (df) :
    """Attach the final activity label ('H' / 'M' / 'L') as 'act_label'.

    The cut-off is ``min + round(0.33 * (max - min), 2)`` of
    'act_index_mean'. Labels combine it with 'visit_week_cnt':

      * visits >= 3 and mean >  cut-off -> 'H'
      * visits >= 3 and mean <= cut-off -> 'M'
      * visits == 2 and mean >  cut-off -> 'M'
      * visits == 2 and mean <= cut-off -> 'L'
      * visits == 1                      -> 'L'

    Rows matching none of the rules (e.g. visits == 0) keep ''.
    `df` is modified in place and returned.
    """
    score = df['act_index_mean']
    visits = df['visit_week_cnt']

    low_point = round((score.max() - score.min()) * 0.33, 2) + score.min()
    above = score > low_point
    not_above = score <= low_point

    df['act_label'] = ''
    df.loc[(visits >= 3) & above, 'act_label'] = 'H'
    df.loc[((visits >= 3) & not_above) | ((visits == 2) & above), 'act_label'] = 'M'
    df.loc[((visits == 2) & not_above) | (visits == 1), 'act_label'] = 'L'
    return df 

# 4. purchase_index
 : purchase_index = buy_cnt + buy_amt
  - id별로 몇번을 구매했는지 컬럼, 총 얼마의 금액을 구매했는지 

In [None]:
# json flatter
def flatten_json(nested_json, exclude=['']):
    """Collapse a nested JSON-like object into a single-level dict.

    Nested dict keys are joined with '_'. Entering a list adds one extra
    '_' to the prefix, and every list element is written under that same
    prefix, so for repeated keys the last element wins
    (e.g. ``{'params': [{'k': 1}, {'k': 2}]}`` -> ``{'params__k': 2}``).

    Args:
        nested_json: The nested object (dicts, lists, scalars).
        exclude: Dict keys to skip at any depth.

    Returns:
        The flattened dict.
    """
    flat = {}

    def _walk(node, prefix='', exclude=exclude):
        if type(node) is dict:
            for key in node:
                if key in exclude:
                    continue
                _walk(node[key], prefix + key + '_')
        elif type(node) is list:
            for item in node:
                _walk(item, prefix + '_')
        else:
            # Strip the trailing separator added by the parent level.
            flat[prefix[:-1]] = node

    _walk(nested_json)
    return flat
    
def json_to_dataframe_nodeN(df,key):
    """Explode the per-session event lists into one row per event.

    Args:
        df: Session frame with an 'events' column (list of event dicts per
            row) and the id column `key`. An 'event_json' column holding the
            flattened events is added to it.
        key: Name of the id column copied onto every event row.

    Returns:
        (df, event): the input frame, and a frame with one row per flattened
        event, a 0..n-1 index and the owner id in column `key`.
    """
    df['event_json'] = df['events'].apply(
        lambda events: [flatten_json(single) for single in events])

    records = [record for row in df['event_json'] for record in row]
    # One owner id per flattened event, in the same order as `records`.
    owners = [owner for owner, row in zip(df[key], df['event_json']) for _ in row]

    event = pd.DataFrame(records)
    event[key] = owners
    return df, event

In [None]:
# df, event =  json_to_dataframe_nodeN(df,key_id)
def commerce_preprocess(event, key_id, event_buy, event_refund,
                        orderId_param = None, price_param = None,) :
    """Build the net purchase table (purchases minus matching refunds).

    Args:
        event: Flattened event frame with a 'name' column.
        key_id: Name of the user-id column.
        event_buy: Event names that count as a purchase
            (e.g. ['sapBuyStore', 'sapBuyVending']; differs per app).
        event_refund: Event names that count as a refund
            (e.g. ['sapRefundStore', 'sapRefundVending']).
        orderId_param: Column holding the order id.
        price_param: Column holding the price.

    Returns:
        Frame with [key_id, orderId_param, 'priceFinal'] for every purchase
        whose price minus refunded amount is not zero.
    """
    wanted_columns = [key_id, orderId_param, price_param]

    def _collect(event_names):
        # Stack the rows of each listed event type, in list order.
        stacked = pd.DataFrame()
        for event_name in event_names:
            matching = event.loc[event['name'] == event_name, wanted_columns]
            stacked = pd.concat([stacked, matching])
        return stacked

    ## (1) buy / refund
    purchases = _collect(event_buy)
    refunds = _collect(event_refund).rename(columns = {price_param : 'refund_price'})

    ## (2) attach the refund (if any) to each purchase
    merged = pd.merge(purchases, refunds,
                      on = [key_id, orderId_param],
                      how = 'left')

    ## (3) net price; fully refunded purchases are dropped
    merged['refund_price'] = merged['refund_price'].fillna(0)
    merged['priceFinal'] = merged[price_param] - merged['refund_price']
    still_paid = merged['priceFinal'] != 0

    return merged.loc[still_paid, [key_id, orderId_param, 'priceFinal']]

In [None]:
def commerce_feature (df, key_id) :
    """Aggregate net purchases per user into purchase features.

    Only 'priceFinal' is aggregated; other columns (such as the order id)
    are deliberately left out so they are never summed or concatenated.

    Args:
        df: Net purchase frame with `key_id` and 'priceFinal'.
        key_id: Name of the user-id column.

    Returns:
        One row per user with columns: key_id, 'priceFinal' (total),
        'buyCnt' (number of purchases), 'buyCnt_log' (log(buyCnt + 1)),
        'price_mean' (priceFinal / buyCnt) and 'price_Meanlog'
        (log(price_mean); not finite when price_mean <= 0).
    """
    # 1) total price and purchase count by key_id
    tmp_final = (df.groupby(key_id)['priceFinal']
                   .agg(priceFinal = 'sum', buyCnt = 'count')
                   .reset_index())
    # 2) log / mean features
    tmp_final['buyCnt_log'] = np.log(tmp_final['buyCnt'] + 1)
    tmp_final['price_mean'] = tmp_final['priceFinal'] / tmp_final['buyCnt']
    tmp_final['price_Meanlog'] = np.log(tmp_final['price_mean'])

    return tmp_final

def commerce_index_labeling (commerce_final,  buyCnt_weight) :
    """Compute 'purchase_index' and the final 'purchase_label'.

    purchase_index = buyCnt_weight * minmax(buyCnt_log)
                     + (1 - buyCnt_weight) * minmax(price_Meanlog),
    rounded to two decimals.

    Labels (applied in this order, later rules overwrite earlier ones):
      * 'H' : index above the mean index of users with exactly 2 purchases
      * 'M' : index above the mean index of users with exactly 1 purchase,
              up to and including the 'H' cut-off
      * 'L' : positive index up to and including the 'M' cut-off
      * 0   : index equal to 0 (integer zero, not a string)

    `commerce_final` is modified in place and returned.
    """
    # 1) min-max normalise both features
    features = commerce_final[['buyCnt_log', 'price_Meanlog']].to_numpy()
    scaled = MinMaxScaler().fit(features).transform(features)

    # 2) weighted sum of the normalised features
    price_weight = 1 - buyCnt_weight
    commerce_final['purchase_index'] = scaled[:, 0] * buyCnt_weight + scaled[:, 1] * price_weight
    commerce_final['purchase_index'] = round(commerce_final['purchase_index'], 2)

    # 3) cut-offs from the one-time and two-time buyers
    bought_twice = commerce_final['buyCnt'] == 2
    bought_once = commerce_final['buyCnt'] == 1
    high_point = commerce_final[bought_twice].purchase_index.mean()
    low_point = commerce_final[bought_once].purchase_index.mean()

    commerce_final['purchase_label'] = ''
    rules = (
        (commerce_final['purchase_index'] > high_point, 'H'),
        ((commerce_final['purchase_index'] <= high_point) &
         (commerce_final['purchase_index'] > low_point), 'M'),
        ((commerce_final['purchase_index'] <= low_point) &
         (commerce_final['purchase_index'] > 0), 'L'),
        (commerce_final['purchase_index'] == 0, 0),
    )
    for matches, label in rules:
        commerce_final.loc[matches, 'purchase_label'] = label

    return commerce_final

# 5-1. Activity_Aggregate

In [None]:
def activity_agg(df, today, key_id, duration_weight = 0.5) :
    """Summarise one period of sessions into a per-user activity index.

    Per user: active days, longest run of consecutive active days, log of
    the mean session duration, and ``act_conti_sum = act_days + 0.5 * run``.
    'act_conti_sum' and 'duration_mean_log' are min-max scaled and combined:

        act_index = (1 - duration_weight) * scaled(act_conti_sum)
                    + duration_weight * scaled(duration_mean_log) + 1

    The ``+ 1`` keeps every visitor above 0, the value later used for
    "did not visit". With the default weight of 0.5 both terms count equally.

    Args:
        df: Sessions of the period, with `key_id`, 'date' and 'duration'.
        today: Label stored in the 'period' column and forwarded to the
            day-count helpers.
        key_id: Name of the user-id column.
        duration_weight: Weight of the duration term (0..1).

    Returns:
        One row per user, ordered by user id. An empty input gives an empty
        frame with the same columns.
    """
    columns = [key_id, 'period', 'conti_act_days_last7', 'act_days_last7',
               'duration_mean_log', 'act_conti_sum']

    # summarise
    rows = []
    for user_id, sessions in df.groupby(key_id):
        conti_days = conti_act_days_last7(sessions, today)
        act_days = act_days_last7(sessions, today)
        rows.append({
            key_id : user_id,
            'period' : today,
            'conti_act_days_last7' : conti_days,
            'act_days_last7' : act_days,
            'duration_mean_log' : np.log(duration_mean(sessions)),
            'act_conti_sum' : act_days + conti_days * 0.5,
        })

    if not rows:
        # Nothing to scale; the scaler would fail on an empty matrix.
        return pd.DataFrame(columns = columns + ['act_index'])

    summary = pd.DataFrame(rows, columns = columns)

    # act_index (min_max)
    visit_weight = 1 - duration_weight
    features = summary[['act_conti_sum', 'duration_mean_log']].to_numpy()
    scaled = MinMaxScaler().fit(features).transform(features)
    summary['act_index'] = scaled[:, 0] * visit_weight + scaled[:, 1] * duration_weight
    summary['act_index'] = round(summary['act_index'], 2) + 1 # +1 distinguishes visitors from non-visitors

    return summary

# 5-2. purchase_Aggregate

In [None]:
def purchase_agg(df, key_id, event_buy = None, event_refund = None, 
                 orderId_param = None, price_param = None) :
    """Build the per-user purchase index and label for every user in `df`.

    Args:
        df: Session frame with 'events' and the `key_id` column.
        key_id: Name of the user-id column.
        event_buy: Purchase event names; defaults to the store and vending
            purchase events.
        event_refund: Refund event names; defaults to the store and vending
            refund events.
        orderId_param: Flattened order-id column name.
        price_param: Flattened price column name.

    Returns:
        One row per user (non-buyers included, with their features filled
        with 0) carrying 'purchase_index' and 'purchase_label'.
    """
    # 1) json flatten
    df, event = json_to_dataframe_nodeN(df, key_id)

    # 2) parameter defaults (tagging names can differ per app - verify before reuse)
    if event_buy is None :
        event_buy = ['sapBuyStore', 'sapBuyVending']
    if event_refund is None :
        event_refund = ['sapRefundStore', 'sapRefundVending']
    if orderId_param is None :
        orderId_param = 'params__sapOrderId'
    if price_param is None :
        price_param = 'params__sapPriceFinal'

    # 3) feature engineering
    commerce_tmp = commerce_preprocess(event, key_id,
                                       event_buy, event_refund,
                                       orderId_param, price_param)
    commerce_tmp = commerce_feature(commerce_tmp, key_id)

    # 4) merge with non-buyers (df : total data)
    all_users = pd.DataFrame({key_id : df[key_id].unique()})
    df_commerce_merge = pd.merge(all_users, commerce_tmp,
                                 on = key_id, how = 'outer').fillna(0)

    # 5) purchase index (min_max) & labeling
    buyCnt_weight = 0.8 # weight of the purchase count; change here to retune
    commerce_final = commerce_index_labeling(df_commerce_merge,  buyCnt_weight)

    return commerce_final


# 5. segment_main

In [None]:
# main
def basic_segment_main(df, today, dates, key_id, commerce = False) :
    """Compute the weekly activity labels and, optionally, purchase labels.

    The `dates` days before `today` are cut into ``int(dates / 7)`` weekly
    windows. Each window gets an activity index per user (0 for weeks
    without a visit); the weekly indices are then summarised and labelled.

    Args:
        df: Session frame with `key_id`, 'date' (datetime.date), 'duration'
            and 'events'.
        today: Reference datetime; the last window ends the day before.
        dates: Observation period in days.
        key_id: Name of the user-id column.
        commerce: When True, purchase features are merged in as well.

    Returns:
        Per-user frame with the weekly index columns, 'visit_week_cnt',
        'act_index_mean' and 'act_label'; with `commerce` True also the
        purchase columns (inner join on `key_id`).

    Raises:
        ValueError: If `dates` does not cover a single full week.
    """
    # 1) activity
    ## (1) divide time range
    n_weeks = int(dates / 7)
    df_act_merge = None
    for i in range(n_weeks) :
        end_day = (today - timedelta(days = 7 * (n_weeks - (i + 1)) + 1)).date()
        start_day = end_day - timedelta(days = 7)
        in_week = (df['date'] <= end_day) & (df['date'] > start_day)
        ## (2) apply activity agg
        weekly = activity_agg(df[in_week], end_day, key_id)[[key_id, 'act_index']]
        weekly = weekly.rename(columns = {'act_index' : str(end_day)})
        ## (3) merge weekly activity index
        if df_act_merge is None :
            df_act_merge = weekly
        else :
            df_act_merge = pd.merge(df_act_merge, weekly, on = key_id, how = 'outer')

    if df_act_merge is None :
        raise ValueError('`dates` must cover at least one full week (7 days).')
    df_act_merge = df_act_merge.fillna(0) # weeks without a visit count as 0

    ## (4) weekly visit count & mean index, then the activity label
    df_act_merge = weekly_visit_act(df_act_merge)
    df_act_merge = act_index_labeling(df_act_merge)
    df_final = df_act_merge

    # 2) purchase aggregation
    if commerce == True :
        df_commerce_tmp = purchase_agg(df, key_id)

        # merge act & commerce
        df_final = pd.merge(df_act_merge, df_commerce_tmp,
                            on = key_id)

    return df_final

# 6. final_segments_divider

In [None]:
# feature select
def feature_select (df, key_id, commerce = False) : 
    """Return the (detail, label) column subsets of the segment frame.

    Without commerce the detail view holds the activity columns only; with
    commerce it also holds the purchase columns. The label view holds the
    id plus the label column(s).
    """
    if commerce == False :
        detail_columns = ['visit_week_cnt', 'act_index_mean', 'act_label']
        label_columns = ['act_label']
    else :
        detail_columns = ['visit_week_cnt', 'act_index_mean',
                          'buyCnt', 'priceFinal', 'price_mean', 'purchase_index',
                          'act_label', 'purchase_label']
        label_columns = ['act_label', 'purchase_label']

    df_re_segment_detail = df[[key_id] + detail_columns]
    df_re_segment_label = df[[key_id] + label_columns]
    return df_re_segment_detail, df_re_segment_label

In [None]:
# final segments divide 
def final_segments_divider(df, commerce = False) :
    """Assign the final 'segment' ('HEAVY', 'LIGHT' or the string 'None').

    (1) non-commerce
        act 'H' -> HEAVY
        act 'M' -> LIGHT
    (2) commerce
        purchase 'H'               -> HEAVY
        purchase 'M' and act 'H'   -> HEAVY
        purchase 'L'               -> LIGHT
        purchase 0   and act 'H'   -> LIGHT

    Every other combination keeps 'None'. `df` is modified in place and
    returned.

    NOTE(review): the rules above are what the code implements; confirm they
    match the intended business definition (e.g. purchase 'M' with act 'M').
    """
    df['segment'] = 'None'

    if commerce == False :
        for act_label, segment in (('H', 'HEAVY'), ('M', 'LIGHT')) :
            df.loc[df['act_label'] == act_label, 'segment'] = segment
        return df

    purchase = df['purchase_label']
    very_active = df['act_label'] == 'H'

    heavy = (purchase == 'H') | (very_active & (purchase == 'M'))
    light = (purchase == 'L') | (very_active & (purchase == 0))

    df.loc[heavy, 'segment'] = 'HEAVY'
    df.loc[light, 'segment'] = 'LIGHT'
    return df

# 7. growth index

In [None]:
def make_real_date(x,y):
    """Look up every key of `x` in the mapping `y`, using 0 for missing keys.

    In this notebook `x` is the list of calendar days and `y` maps a day to
    the duration recorded on it.
    """
    return [y[day] if day in y.keys() else 0 for day in x]

def conti_corr(x, index=None):
    """Pearson correlation between a series and its position on the time axis.

    Args:
        x: Sequence of values, one per day in chronological order.
        index: Optional time axis with the same length as `x`. Defaults to
            ``0, 1, ..., len(x) - 1``, which is what the notebook-level
            ``idx`` list holds, so the function no longer depends on that
            global being defined.

    Returns:
        The correlation coefficient (NaN when `x` is constant).
    """
    if index is None:
        index = range(len(x))
    corr, _p_value = pearsonr(list(index), x)
    return corr

def min_engage_2w(x):
    """Count the non-zero entries among the last 14 values of `x`."""
    return sum(1 for value in x[-14:] if value != 0)

def preprocess_potential_users(data):
    """Find users whose daily usage time is growing over the observation period.

    Relies on two notebook-level globals that must be set before calling:
    `key_id` (name of the user-id column) and `date_list` (the calendar days
    of the observation period).

    Per user, the daily total duration is laid out over `date_list`
    (0 on days without a session) and correlated with the day position.
    A user is kept when they were active on at least 3 of the last 14 days
    and the correlation is at least 0.2.

    Args:
        data: Session frame with `key_id`, 'duration' and 'real_date'.

    Returns:
        One row per selected user with the columns `key_id`, 'date',
        'duration', 'real_date', 'mapping', 'real_duration',
        'duration_corr' and 'min_engage_2w'.
    """
    # Work on a copy so the caller's frame (possibly a slice) is never written to.
    sessions = data[[key_id, 'duration', 'real_date']].copy()
    sessions['date_ymd'] = pd.to_datetime(sessions['real_date']).dt.date
    daily = sessions.groupby([key_id, 'date_ymd'])['duration'].agg('sum').reset_index()

    per_user = daily.groupby(key_id)
    calendar = sorted(date_list)

    poten_df_agg = pd.DataFrame()
    poten_df_agg[key_id] = list(per_user.groups.keys())
    poten_df_agg['date'] = per_user['date_ymd'].apply(list).values
    poten_df_agg['duration'] = per_user['duration'].apply(list).values
    poten_df_agg['real_date'] = [list(calendar) for _ in range(poten_df_agg.shape[0])]
    # {day: total duration} for the days the user was active
    poten_df_agg['mapping'] = [dict(zip(days, durations))
                               for days, durations in zip(poten_df_agg['date'], poten_df_agg['duration'])]

    # Duration per calendar day; 0 on days without a session.
    poten_df_agg['real_duration'] = [make_real_date(calendar, mapping)
                                     for mapping in poten_df_agg['mapping']]
    # Trend of the daily duration over the period.
    poten_df_agg['duration_corr'] = [conti_corr(series) for series in poten_df_agg['real_duration']]
    # Number of active days within the last two weeks.
    poten_df_agg['min_engage_2w'] = [min_engage_2w(series) for series in poten_df_agg['real_duration']]

    growing = (poten_df_agg['min_engage_2w'] >= 3) & (poten_df_agg['duration_corr'] >= 0.2)
    return poten_df_agg.loc[growing]

# 99. run

In [None]:
# today = datetime(2020,5,31)
# input_path = '../in/coke/android'
# key_id = 'sphereId'
# dates = 28

# newb_period = 7
# user_property = False
# commerce = True 
# platform_total = False


# # 1) data import
# df = read_files(today, input_path, dates, platform_total)

# # 2) Basic preprocess
# df = basic_preprocess(df) # duration outlier

# #3) divide_target
# # df_new, df_re = divide_visitor(today, df,key_id,
# #                                newb_period=14,
# #                                user_property = False) 
# df_new, df_re = divide_visitor_v2(today, dates, df,key_id,
#                                   newb_period=7,
#                                   user_property = False)

# 4) basic_segment_main
# df_re_index = basic_segment_main(df_re, today, dates,
#                                    key_id, commerce)
# # 5) feature select
# df_re_index_detail, df_re_index = feature_select(df_re_index, 
#                                                  key_id, commerce)

# # 6) final_segments_divider
# df_re_segment = final_segments_divider(df_re_index, commerce=True)

# df_re_growth = preprocess_potential_users(df_re)
# df_new_growth = preprocess_potential_users(df_new)
# #     return df_re_segment, df_re_growth, df_new_growth

In [None]:
def segment_run (today, input_path, dates, key_id, newb_period = 14, 
                 user_property = False, commerce = True, 
                 platform_total = False) :
    """Run the whole segmentation pipeline.

    Args:
        today: Reference datetime; data of the `dates` preceding days is used.
        input_path: Directory with the daily JSON files.
        dates: Observation period in days.
        key_id: Name of the user-id column.
        newb_period: Length in days of the new-visitor cohort window
            (forwarded to `divide_visitor_v2`).
        user_property: False -> '#appInstall' marks a new user, otherwise
            'signUp' (forwarded to `divide_visitor_v2`).
        commerce: Whether purchase features and labels are used.
        platform_total: Read both the android and the ios directory.

    Returns:
        (df_re_segment, df_re_growth, df_new_growth): segment labels of the
        returning users, growing returning users, growing new users.
    """
    # 1) data import
    df = read_files(today, input_path, dates, platform_total)

    # 2) basic preprocess (includes the duration outlier filter)
    df = basic_preprocess(df)

    # 3) divide target
    df_new, df_re = divide_visitor_v2(today, dates, df, key_id,
                                      newb_period = newb_period,
                                      user_property = user_property)

    # 4) basic_segment_main
    df_re_index = basic_segment_main(df_re, today, dates,
                                     key_id, commerce)

    # 5) feature select
    df_re_index_detail, df_re_index = feature_select(df_re_index,
                                                     key_id, commerce)

    # 6) basic segments divider (copy: the label view is a slice and gets a new column)
    df_re_segment = final_segments_divider(df_re_index.copy(), commerce = commerce)

    # 7) growth segments divider
    df_re_growth = preprocess_potential_users(df_re)
    df_new_growth = preprocess_potential_users(df_new)

    return df_re_segment, df_re_growth, df_new_growth

In [None]:
%%time
# Run the full pipeline for the 28 days before 2020-05-31.
# NOTE(review): `input_path` is a user-specific absolute path; the
# repo-relative alternative is kept in the trailing comment.
today = datetime(2020,5,31)
input_path = '/home/heemok/tand/data_coke/coke_android'#'../in/coke/android'
key_id = 'sphereId'
dates = 28
# `idx` (day positions) and `date_list` (calendar days) are notebook-level
# globals read by the growth-index functions and the plotting cell.
idx = [i for i in range(dates)]
date_list = [(today - timedelta(days=x)).date() for x in range(dates)]
df_re_segment, df_re_growth, df_new_growth = segment_run (today, input_path, dates,
                             key_id, newb_period = 7, 
                             user_property = False, commerce = True, platform_total=False) 
df_re_segment.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: user 1min 41s, sys: 1.71 s, total: 1min 43s
Wall time: 1min 43s


Unnamed: 0,sphereId,act_label,purchase_label,segment
0,kqVrm1oVTLqORTbE0k7oNuFsYaRsg,M,0,
1,-C-FSJ_P64E9TNKyojYgO9uOJIKrP,H,0,LIGHT
2,K27K6kTMvdDugDuLeU6VeyqpXxGN3,H,0,LIGHT
3,PrbR8uI4vUZA_dDGfUbHGcZtNU6XW,L,0,
4,fVIUoaZ6_rRiEUrV1iAAHG57VxqaD,L,0,


In [None]:
# Preview the growing returning users, then the growing new users.
df_re_growth.head()
df_new_growth.head()

Unnamed: 0,sphereId,date,duration,real_date,mapping,real_duration,duration_corr,min_engage_2w
70,-5W_QnqpE98C2HY8bNvirzpMRqjLQ,"[2020-05-17, 2020-05-18, 2020-05-19, 2020-05-2...","[265.0, 2708.0, 618.0, 599.0, 260.0, 144.0, 13...","[2020-05-04, 2020-05-05, 2020-05-06, 2020-05-0...","{2020-05-17: 265.0, 2020-05-18: 2708.0, 2020-0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 265.0,...",0.219627,13
90,-7JSpraoM8WOt22JrHJ0qpdq6B-61,"[2020-05-25, 2020-05-26, 2020-05-28]","[676.0, 301.0, 37.0]","[2020-05-04, 2020-05-05, 2020-05-06, 2020-05-0...","{2020-05-25: 676.0, 2020-05-26: 301.0, 2020-05...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.262067,3
95,-7wxzKPpymbIp7oSw2s7nFE-QZKtU,"[2020-05-19, 2020-05-20, 2020-05-21, 2020-05-2...","[510.0, 1228.0, 647.0, 408.0, 269.0, 232.0, 23...","[2020-05-04, 2020-05-05, 2020-05-06, 2020-05-0...","{2020-05-19: 510.0, 2020-05-20: 1228.0, 2020-0...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.485639,12
98,-8800JAP9EaVW2c0PauP_JvEDtIhq,"[2020-05-25, 2020-05-26, 2020-05-27, 2020-05-2...","[527.0, 431.0, 84.0, 189.0, 20.0]","[2020-05-04, 2020-05-05, 2020-05-06, 2020-05-0...","{2020-05-25: 527.0, 2020-05-26: 431.0, 2020-05...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.370359,5
101,-8O3O5xPHImZ-krYK35PvA__VNHS0,"[2020-05-03, 2020-05-05, 2020-05-06, 2020-05-0...","[341.0, 101.0, 224.0, 141.0, 122.0, 145.0, 640...","[2020-05-04, 2020-05-05, 2020-05-06, 2020-05-0...","{2020-05-03: 341.0, 2020-05-05: 101.0, 2020-05...","[0, 101.0, 224.0, 141.0, 0, 0, 0, 0, 0, 122.0,...",0.204,11


Unnamed: 0,sphereId,date,duration,real_date,mapping,real_duration,duration_corr,min_engage_2w
1079,Aj4N1S2FulvsdNUbeYx2jXs9OkEPo,"[2020-05-07, 2020-05-08, 2020-05-12, 2020-05-1...","[59.0, 8.0, 365.0, 704.0, 67.0, 447.0, 11.0, 1...","[2020-05-04, 2020-05-05, 2020-05-06, 2020-05-0...","{2020-05-07: 59.0, 2020-05-08: 8.0, 2020-05-12...","[0, 0, 0, 59.0, 8.0, 0, 0, 0, 365.0, 704.0, 0,...",0.278139,7
4029,fGm38ai9HVrnreqTNI5R9N5_uelWF,"[2020-05-04, 2020-05-06, 2020-05-24, 2020-05-2...","[290.0, 70.0, 158.0, 290.0, 361.0, 581.0, 2067...","[2020-05-04, 2020-05-05, 2020-05-06, 2020-05-0...","{2020-05-04: 290.0, 2020-05-06: 70.0, 2020-05-...","[290.0, 0, 70.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",0.526388,7
5457,vLwKEUGzLNAqs4yi35MUtVKn91oLn,"[2020-05-08, 2020-05-13, 2020-05-21, 2020-05-2...","[1.0, 355.0, 63.0, 56.0, 101.0, 225.0, 83.0, 2...","[2020-05-04, 2020-05-05, 2020-05-06, 2020-05-0...","{2020-05-08: 1.0, 2020-05-13: 355.0, 2020-05-2...","[0, 0, 0, 0, 1.0, 0, 0, 0, 0, 355.0, 0, 0, 0, ...",0.438966,7


In [None]:
# Re-load and re-split the raw sessions so `df`, `df_new` and `df_re` are
# available at notebook level (segment_run only returns the final tables).
df = read_files(today, input_path, dates, False)
df = basic_preprocess(df)
df_new, df_re = divide_visitor_v2(today, dates, df,key_id,
                              newb_period=7,
                              user_property = False)

# 신규성장 혹은 잠재 고객의 성장 기울기 확인

In [None]:
# Pick which growth table the plotting cell below iterates over.
define_df = df_new_growth # growing new users
#define_df = df_re_growth # growing returning (potential) users

In [None]:
# (rows, columns) of both growth tables.
df_new_growth.shape, df_re_growth.shape

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats as sp
from sklearn import datasets, linear_model
def linear_reg(x, y):
    """Fit a straight line y ~ x and return points of the fitted line.

    Args:
        x: Sequence of x positions.
        y: Sequence of observed values, same length as `x`.

    Returns:
        (xf, yf, slope, intercept, r_value, p_value, std_err, coef):
        100 evenly spaced x positions between min(x) and max(x), the fitted
        values there, the `scipy.stats.linregress` results, and the
        coefficient array of the scikit-learn fit of the same line.
    """
    slope, intercept, r_value, p_value, std_err = sp.linregress(x, y)

    regr = linear_model.LinearRegression()
    # scikit-learn expects one row per observation: shape (n_samples, 1),
    # not a single row holding every observation as a separate feature.
    regr.fit(np.array(x).reshape(-1, 1), np.array(y).reshape(-1, 1))
    coef = regr.coef_

    xf = np.linspace(min(x), max(x), 100)
    yf = (slope * xf) + intercept

    return xf, yf, slope, intercept, r_value, p_value, std_err, coef

# One figure per user in `define_df`: the raw daily duration (left panel)
# and the same series with its linear fit (right panel).
# NOTE(review): three axes are created but `ax2` is never drawn on.
for index, row in define_df.iterrows():
    print('-------------------------')
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
    fig.suptitle('growth plot')
    ax1.plot(idx, row['real_duration'])
    ax1.set_title('duration')
    ax1.set(xlabel='date')

    
    # `idx` is the notebook-level list of day positions.
    pred_x, pred_y, slope, intercept, r_value, p_value, std_err, coef = linear_reg(idx,row['real_duration'])

    ax3.plot(pred_x, pred_y,label='Linear fit', lw=3)
    ax3.plot(idx, row['real_duration'])
    #print('coef = ', coef)
    print('slope = ', slope, '\n', 'intercept = ', intercept)
    print('r = ', r_value, '\n', 'p = ', p_value)

    #plt.plot(idx, row['real_click'], 'g') # plotting t, c separately 
    plt.show()

## 신규 성장성 검토 테스트 코드


In [None]:
# 0) besic setting
# Settings for the new-user growth check below. They overwrite the
# notebook-level values set in the run cell.
today = datetime(2020,5,31)
input_path = '../in/coke/android'
key_id = 'sphereId'
dates = 28

# Echo the settings (every expression is displayed, see the display config).
today, input_path, dates
key_id
newb_period = 7
user_property = False
commerce = True
platform_total = True # import android and ios at once

In [None]:
# Steps 1 and 2 are disabled here; this cell reuses the `df` that an earlier
# cell already loaded and preprocessed.
# # 1) data import
# df = read_files(today, input_path, dates, platform_total = True)

# # 2) Basic preprocess
# df = basic_preprocess(df) # duration outlier

# 3) divide_target
df_new, df_re = divide_visitor_v2(today, dates, df,key_id,
                                  newb_period=7,
                                  user_property = False)

In [None]:
# check
# Sanity checks on the split.
df_new.date.min()
df_new.date.max() # new-user period: 4 weeks (including the sign-up week)
df_new[key_id].nunique() # number of users who joined in the first week
df_re[key_id].nunique()  # remaining existing users: everyone without an appInstall during the 4 weeks

In [None]:
# Preview the new-user sessions and count the returning-user sessions.
df_new.head()
len(df_re)