In [1]:
import os
import glob
import pandas as pd
import numpy as np
import dask.dataframe as dd



In [2]:
def list_files_in_directory(directory, pattern='*'):
    # 使用 glob 模組列出資料夾中符合模式的檔案
    files = glob.glob(os.path.join(directory, pattern))
    return files

In [3]:
behavior_folder = 'D:/Desktop/新增資料夾/bda-final/bda-final/behavior'
behavior_files = list_files_in_directory(behavior_folder)
behavior_files

['D:/Desktop/新增資料夾/bda-final/bda-final/behavior\\2023-09',
 'D:/Desktop/新增資料夾/bda-final/bda-final/behavior\\2023-10',
 'D:/Desktop/新增資料夾/bda-final/bda-final/behavior\\2023-11',
 'D:/Desktop/新增資料夾/bda-final/bda-final/behavior\\2023-12',
 'D:/Desktop/新增資料夾/bda-final/bda-final/behavior\\2024-01',
 'D:/Desktop/新增資料夾/bda-final/bda-final/behavior\\2024-02']

In [None]:
dtype_spec = {
    'CategoryId': 'object',
    'ContentId': 'object',
    'ContentName': 'object',
    'ContentType': 'object',
    'PageType': 'object',
    'RegisterTunnel': 'object',
    'SearchTerm': 'object',
    'TradesGroupCode': 'object'
}
folder = list_files_in_directory(behavior_files[0])
dask_df = dd.read_csv(folder, dtype=dtype_spec)
dask_df = dask_df.sort_values(by=['FullvisitorId', 'EventTime'])
dask_df = dask_df.compute()
dask_df['EventTime'] = dd.to_datetime(dask_df['EventTime'])
dask_df = dask_df.reset_index(drop=True)
dask_df = dask_df.drop(columns=['Tunnel', 'Device', 'DeviceId', 'RegisterTunnel', 'SearchTerm', 'ContentType', 'ContentName', 'ContentId'])

In [19]:
def filter_userid(df, idx):
    return df[df['FullvisitorId'] == idx]

In [21]:
def calculate_time_to_checkout(df):
    # 初始化变量
    first_action_time = None
    actions_in_interval = []
    intervals = []

    for _, row in df.iterrows():
        if row['Behavior'] == 'purchase':
            if first_action_time:
                # 计算时间间隔并记录举动
                interval = row['EventTime'] - first_action_time
                intervals.append({
                    'customer_id': row['FullvisitorId'],
                    'actions': actions_in_interval.copy(),
                    'start_time': first_action_time,
                    'end_time': row['EventTime'],
                    'time_to_checkout': interval,
                    'trade_order': row['TradesGroupCode']
                })
            # 更新第一次动作时间
            first_action_time = row['EventTime']
            actions_in_interval = []
        else:
            if not first_action_time:
                first_action_time = row['EventTime']
            actions_in_interval.append(row['Behavior'])

    return pd.DataFrame(intervals)

In [None]:
def process_result(df):
    df['mean_time_to_check'] = df['time_to_checkout'].mean()
    df['hesitate'] = (df['time_to_checkout'] > df['mean_time_to_check']).astype(int)
    return df

In [20]:
dtype_spec = {
    'CategoryId': 'object',
    'ContentId': 'object',
    'ContentName': 'object',
    'ContentType': 'object',
    'PageType': 'object',
    'RegisterTunnel': 'object',
    'SearchTerm': 'object',
    'TradesGroupCode': 'object'
}
folder = list_files_in_directory(behavior_files[0])
dask_df = dd.read_csv(folder, dtype=dtype_spec)
dask_df = dask_df.drop(columns=['Tunnel', 'Device', 'DeviceId', 'RegisterTunnel', 'SearchTerm', 'ContentType', 'ContentName', 'ContentId'])
dask_df

Unnamed: 0_level_0,ShopId,ShopMemberId,FullvisitorId,HitTime,Behavior,CategoryId,SalePageId,UnitPrice,Qty,TotalSalesAmount,TradesGroupCode,PageType,EventTime
npartitions=216,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
,string,string,string,string,string,string,float64,float64,float64,float64,string,string,string
,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...


In [5]:
visitor_id = dask_df['FullvisitorId']
id_set = list(set(np.array(visitor_id.compute())))
id_set

['TFQ8vaC+K1wnty8lQjdGs/LFRL1m2o7RutsZISd0mzb0FUXwzZHnXm8L9ZGjY8mQ',
 'hrtw9iSt0/xhsMdRX0MwXvt4fLvsyqeCdect2TCdaO2Fxlz4U0k6jtOpuTJkxB96',
 'N6LKnL2p9W+jIU5IDa9X4sJ81G8XlBOR9JOXCV4nqBkDAaLJdFGPDtF5PPo2DxVA',
 'XPja/4zErVXg9kDoWGDC4Tei3GspI7FRAMD1SGeMoc+0sH1XydkmXWU8gEjbQxg8',
 'BYhvokik7unudlIe1IuUZue91lpo3+//85acf5xXRMjVnNNuwez5b9qiYQZejHNN',
 'rL9pRzeMR8E745IoGdOrwoixRBxh/sF/7i4Dn7gf2iy+yf1N+WhJnUThP9HwC70i',
 'MJeeFa55T7FbfEpu6pwkPb5trySokmWGHX1mD7hp/t5dtZFAUugNaJxE/i736bkU',
 'yGYuLF0TBZZvmRIQPZQaIL8qnJB53VtOerkkT1MA3JHEzHmQ+nTU47eh8a0l8vVN',
 'W6o0JyyVq5k/XgVGun5DEcVoJny423qO0VHq45KWQeH7uuO057F+F6Op3Y+r/NpH',
 'ja8oN8xxQHYsB0Ugp1H+zpTrskczh1toM9I+MtnMdHr2f0xtQO2TEY9RpYnTddsM',
 'fZiu37mvW4U4E1eUvXWDeZClAM2iTdw8VeAU2zE0GC+pQMt0KdK9+ilmgV0g7GYm',
 'aI1JEuyDYfHNjRnWZOfOYzAIi9p6mRpZLDGxPAY3N7xby8cOE2T1yk8344DpfPSd',
 'Wlj2AzVVw9wWbvvlB6hW2LRTzyRvc58ayhRanXYz0bi6KZN97I1DTd1BG4hWYLIG',
 '945G7lS0ZiKQ/vqWpB4zP38QNZ+SVNJ4lVi4k4c1QuyNLYfvekln60MGMKRJWcWU',
 '3GbG9zF5ZCymCURtAz3x/RaEcCLL3kbe

In [37]:
result_df = pd.DataFrame()
for i in range(len(id_set)):
    temp = dask_df.map_partitions(filter_userid, id_set[i]).compute()
    temp['EventTime'] = pd.to_datetime(temp['EventTime'])
    temp = temp.sort_values('EventTime')
    result = temp.groupby('FullvisitorId').apply(calculate_time_to_checkout).reset_index(drop=True)
    result_df = pd.concat([result_df, result])
result_df = process_result(result_df)

  result = temp.groupby('FullvisitorId').apply(calculate_time_to_checkout).reset_index(drop=True)


# 示範範例

In [14]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

# 生成示例数据
np.random.seed(42)

num_records = 100
fullvisitor_ids = np.random.choice(range(1, 21), size=num_records)  # 20个访客
events = np.random.choice(['view', 'click', 'purchase'], size=num_records, p=[0.6, 0.3, 0.1])
start_date = datetime(2024, 1, 1)
event_times = [start_date + timedelta(minutes=np.random.randint(0, 60*24*30)) for _ in range(num_records)]

# 创建Pandas DataFrame
df = pd.DataFrame({
    'FullvisitorId': fullvisitor_ids,
    'Behavior': events,
    'EventTime': event_times
})

# 转换为Dask DataFrame并设置分区数
ddf = dd.from_pandas(df, npartitions=2).compute()
ddf = ddf.sort_values(['FullvisitorId', 'EventTime'])
result = ddf.groupby('FullvisitorId').apply(calculate_time_to_checkout).reset_index(drop=True)
result['mean_time_to_check'] = result['time_to_checkout'].mean()
result['hesitate'] = (result['time_to_checkout'] > result['mean_time_to_check']).astype(int)
result

  result = ddf.groupby('FullvisitorId').apply(calculate_time_to_checkout).reset_index(drop=True)


Unnamed: 0,customer_id,actions,start_time,end_time,time_to_checkout,mean_time_to_check,hesitate
0,2.0,"[click, view, view, view]",2024-01-01 19:14:00,2024-01-08 10:49:00,6 days 15:35:00,10 days 11:45:48,0
1,7.0,"[click, click, click]",2024-01-01 11:39:00,2024-01-08 16:53:00,7 days 05:14:00,10 days 11:45:48,0
2,8.0,"[view, view, view, view, view, view, click, view]",2024-01-03 09:16:00,2024-01-26 23:30:00,23 days 14:14:00,10 days 11:45:48,1
3,10.0,"[view, view]",2024-01-04 21:20:00,2024-01-14 16:58:00,9 days 19:38:00,10 days 11:45:48,0
4,12.0,"[view, view]",2024-01-04 06:55:00,2024-01-09 11:03:00,5 days 04:08:00,10 days 11:45:48,0


In [18]:
ddf[ddf['FullvisitorId']==10]

Unnamed: 0,FullvisitorId,Behavior,EventTime
59,10,view,2024-01-04 21:20:00
20,10,view,2024-01-09 22:34:00
54,10,purchase,2024-01-14 16:58:00
89,10,view,2024-01-26 10:31:00
