In [1]:
import pandas as pd
import os.path as path
import time
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:

"""Read All Necessary File"""

#TRANS = path.join("..","data","transactions_train.csv")
TRANS = path.join("..","data","trans.parquet")

tran_dtype = {
    't_dat': 'datetime64[ns]',
    'customer_id': 'category',
    'article_id': 'category',
    'sales_channel_id':"int"
}

# trans = pd.read_csv(TRANS, engine = 'pyarrow', dtype = tran_dtype)
# trans.to_parquet(path.join("..","data","trans.parquet"))
trans = pd.read_parquet(TRANS, engine= 'pyarrow')
print(trans.head())

ARTICLE = path.join("..","data","articles.csv")
article_dtype = {
    'article_id':"category",
    'detail_desc':"category"
}
article = pd.read_csv(ARTICLE,usecols=['article_id','detail_desc'],dtype=article_dtype,engine='pyarrow')
print(article.head())

       t_dat                                        customer_id  article_id  \
0 2018-09-20  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   663713001   
1 2018-09-20  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   541518023   
2 2018-09-20  00007d2de826758b65a93dd24ce629ed66842531df6699...   505221004   
3 2018-09-20  00007d2de826758b65a93dd24ce629ed66842531df6699...   685687003   
4 2018-09-20  00007d2de826758b65a93dd24ce629ed66842531df6699...   685687004   

      price  sales_channel_id  
0  0.050831                 2  
1  0.030492                 2  
2  0.015237                 2  
3  0.016932                 2  
4  0.016932                 2  
  article_id                                        detail_desc
0  108775015            Jersey top with narrow shoulder straps.
1  108775044            Jersey top with narrow shoulder straps.
2  108775051            Jersey top with narrow shoulder straps.
3  110065001  Microfibre T-shirt bra with underwired, moulde...
4  110065002 

In [None]:
"""Analysis article_id without detail_desc"""
missing_des = article['detail_desc'].isnull()
missing_ids = article['article_id'][missing_des].values.to_numpy().tolist()
print(f"Miss Detail_desc : {len(missing_ids)}")
print(missing_ids)

Miss Detail_desc : 416
[351332007, 420049002, 420049003, 426199002, 426199010, 426199018, 426199019, 426199021, 426199027, 426199028, 426199029, 469039001, 469039019, 469039021, 469039024, 469039028, 469039032, 469039033, 469039034, 469039035, 469039040, 469039047, 493438001, 493438002, 493438005, 493438012, 493438013, 493438014, 493438015, 493438018, 493438019, 493438020, 493438021, 493438025, 493438027, 507883002, 507883007, 507883008, 507883009, 507883011, 507883014, 507883015, 507883018, 510419001, 510419004, 510419006, 519929006, 519929007, 519929008, 519929009, 519929010, 519929019, 522090001, 522090003, 522090005, 527657001, 538946002, 538946003, 539514001, 539844003, 539844004, 539844007, 551394002, 551394003, 555990001, 555990002, 555990003, 555990005, 556673001, 557216001, 557287001, 557287003, 558651001, 558651002, 558651003, 558651005, 558651006, 558651007, 559551001, 561551010, 561551012, 561551014, 561551015, 561551017, 561551019, 561551020, 562327001, 562327006, 56233900

In [5]:
"""Analysis article which are low interacted """

interacted_freq = [1,5,10,20,25,30]
trans_frq = trans['article_id'].value_counts()
print(f"Total Article in transcation :{trans['article_id'].nunique()}")

for t in interacted_freq:
    cuting = trans_frq[trans_frq<=t]
    print(f"Article bought less than  {t} : {len(cuting)}, accouting for all article {100*(len(cuting)/trans['article_id'].nunique()):.2f}%")
    


Total Article in transcation :104547
Article bought less than  1 : 4491, accouting for all article 4.30%
Article bought less than  5 : 14669, accouting for all article 14.03%
Article bought less than  10 : 22477, accouting for all article 21.50%
Article bought less than  20 : 32403, accouting for all article 30.99%
Article bought less than  25 : 35968, accouting for all article 34.40%
Article bought less than  30 : 39056, accouting for all article 37.36%


In [7]:
"""Cross Analysis Missing desc and Low Transcation Article"""
for t in interacted_freq:
    cuting = trans_frq[trans_frq<=t].index.to_list()
    cross = list(set(cuting+missing_ids))
    print(f"Total Remove Articles (less {t}) : {len(cross)} accouting for {100*(len(cross)/trans['article_id'].nunique()):.2f}%")


Total Remove Articles (less 1) : 4880 accouting for 4.67%
Total Remove Articles (less 5) : 15012 accouting for 14.36%
Total Remove Articles (less 10) : 22791 accouting for 21.80%
Total Remove Articles (less 20) : 32672 accouting for 31.25%
Total Remove Articles (less 25) : 36220 accouting for 34.64%
Total Remove Articles (less 30) : 39294 accouting for 37.59%


In [12]:
"""Session Analysis"""
def remove_article_ids(trans, ids:list)->pd.DataFrame:
    return trans[~trans['article_id'].isin(ids)]

def reorganize_session(trans, remove_t):
    print("產生 session")
    df_grouped = trans.groupby("customer_id").agg({
        'article_id': list,
        'price': list,
        't_dat': list,
        'sales_channel_id': list
    }).reset_index()

    df_grouped["session"] = list(zip(
        df_grouped["article_id"],
        df_grouped["price"],
        df_grouped["t_dat"],
        df_grouped["sales_channel_id"]
    ))
    df_grouped["session"] = df_grouped["session"].apply(lambda x: str(x))

    df_grouped = df_grouped[["customer_id", "session"]]
    df_grouped.to_parquet(
        path.join("..", "data", f"session_{remove_t}.parquet"),
        engine='pyarrow',
    )
    return df_grouped


remove_threshold = [1,5,10]
for t in remove_threshold:
    cuting = trans_frq[trans_frq<=t].index.to_list()
    cross = list(set(cuting+missing_ids))
    trans_removed = remove_article_ids(trans,cross)
    session = reorganize_session(trans_removed,t)


產生 session


  df_grouped = trans.groupby("customer_id").agg({


產生 session


  df_grouped = trans.groupby("customer_id").agg({


產生 session


  df_grouped = trans.groupby("customer_id").agg({


In [15]:
filepath = path.join("..", "data", f"session_{1}.parquet")
df = pd.read_parquet(filepath, engine='pyarrow')
row_d = df.iloc[0]
print(type(row_d['customer_id']))
print((row_d['customer_id']))
print(type(row_d['session']))
print((row_d['session']))

<class 'str'>
00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657
<class 'str'>
([625548001, 176209023, 627759010, 697138006, 568601006, 568601006, 607642008, 745232001, 656719005, 797065001, 797065001, 785186005, 694736004, 785710001, 812683013, 841260003, 887593002, 890498002, 795440001, 859416011, 568601043], [0.044050847457627114, 0.035576271186440675, 0.03049152542372881, 0.010152542372881357, 0.050830508474576264, 0.050830508474576264, 0.012694915254237289, 0.021169491525423727, 0.044050847457627114, 0.05422033898305084, 0.05422033898305084, 0.016932203389830508, 0.018288135593220338, 0.02440677966101695, 0.03659322033898305, 0.011508474576271186, 0.02159322033898305, 0.031762711864406774, 0.014389830508474576, 0.014389830508474576, 0.050830508474576264], [Timestamp('2018-12-27 00:00:00'), Timestamp('2018-12-27 00:00:00'), Timestamp('2018-12-27 00:00:00'), Timestamp('2019-05-02 00:00:00'), Timestamp('2019-05-25 00:00:00'), Timestamp('2019-05-25 00:00:00'), Timestamp(

In [3]:
import ast
safe_globals = {"__builtins__": None, "Timestamp": pd.Timestamp}

def safe_eval_with_timestamp(val):
    try:
        return eval(val, safe_globals, {})
    except Exception as e:
        print(f"[警告] 無法解析 session: {val}\n錯誤: {e}")
        return ([], [], [], [])

def load_session_file(remove_t):
    filepath = path.join("..", "data", f"session_{remove_t}.parquet")
    df = pd.read_parquet(filepath, engine='pyarrow')
    df["session"] = df["session"].apply(safe_eval_with_timestamp)
    return df

def analyze_session_lengths(df):
    df["session_length"] = df["session"].apply(lambda sess: len(sess[0]))
    return df[["customer_id", "session_length"]]

def filter_short_sessions(df, min_length):
    df["session_length"] = df["session"].apply(lambda sess: len(sess[0]))
    before_count = len(df)
    filtered_df = df[df["session_length"] >= min_length].reset_index(drop=True)
    after_count = len(filtered_df)
    
    print(f"--- 過濾條件: session 長度 >= {min_length} ---")
    print(f"原始筆數: {before_count}")
    print(f"保留筆數: {after_count}")
    print(f"移除筆數: {before_count - after_count}")
    print(f"移除比例: {(100*(before_count - after_count)/before_count):.2f}%\n")

    
    return filtered_df


remove_threshold = [1,5,10]
for t in remove_threshold:
    session_df = load_session_file(remove_t=t)
    session_lengths = analyze_session_lengths(session_df)
    print("="*30, f" session_{t} ","="*30)
    print(session_lengths.describe())  # 顯示統計資料，例如平均、最大、最小

    
    filter_threshold = [3,5,10]
    for f in filter_threshold:
        filtered_df = filter_short_sessions(session_df, min_length=f)

        filtered_df["session"] = filtered_df["session"].apply(lambda x: str(x))
        filtered_df.to_parquet(f"session_{t}_filtered_{f}.parquet", engine='pyarrow')

    # 若需要儲存過濾後的結果也可以加上：
    # filtered_df.to_parquet("filtered_session.parquet", engine='pyarrow')



       session_length
count    1.362281e+06
mean     2.324692e+01
std      3.910678e+01
min      0.000000e+00
25%      3.000000e+00
50%      9.000000e+00
75%      2.700000e+01
max      1.894000e+03
--- 過濾條件: session 長度 >= 3 ---
原始筆數: 1362281
保留筆數: 1102071
移除筆數: 260210
移除比例: 19.10%

--- 過濾條件: session 長度 >= 5 ---
原始筆數: 1362281
保留筆數: 924170
移除筆數: 438111
移除比例: 32.16%

--- 過濾條件: session 長度 >= 10 ---
原始筆數: 1362281
保留筆數: 674691
移除筆數: 687590
移除比例: 50.47%

       session_length
count    1.362281e+06
mean     2.322230e+01
std      3.905783e+01
min      0.000000e+00
25%      3.000000e+00
50%      9.000000e+00
75%      2.700000e+01
max      1.894000e+03
--- 過濾條件: session 長度 >= 3 ---
原始筆數: 1362281
保留筆數: 1101717
移除筆數: 260564
移除比例: 19.13%

--- 過濾條件: session 長度 >= 5 ---
原始筆數: 1362281
保留筆數: 923735
移除筆數: 438546
移除比例: 32.19%

--- 過濾條件: session 長度 >= 10 ---
原始筆數: 1362281
保留筆數: 674300
移除筆數: 687981
移除比例: 50.50%

       session_length
count    1.362281e+06
mean     2.317757e+01
std      3.896803e+01
min     