In [None]:
import pandas as pd
import pandas as pd
import numpy as np
import os.path as path

def filter_cold_start_articles(df: pd.DataFrame, min_purchases: int = 5) -> pd.DataFrame:
    """
    Remove cold-start articles based on a minimum interaction threshold.

    Args:
    - df: DataFrame, must contain column 'article_id'.
          Typically includes [t_dat, customer_id, article_id, price, sales_channel_id]
    - min_purchases : int, default 5. The minimum number of total purchases (rows) an
                      article must have to be retained in the dataset.

    Return:
    - filtered_df : new DataFrame containing only the transactions whose article_id appears
                    at least `min_purchases` times in `df`. Row order is preserved and the
                    index is reset. The input `df` is not modified.

    Raises:
    - KeyError : if `df` has no 'article_id' column.
    """
    # No defensive df.copy() here: nothing below mutates `df`, and boolean indexing
    # followed by reset_index() already yields an independent frame. Skipping the copy
    # avoids duplicating the whole transaction table in memory.
    # observed=True only matters when article_id is categorical; it keeps pandas from
    # emitting the `observed` FutureWarning and does not change the per-row counts.
    purchases_per_article = (
        df.groupby('article_id', observed=True)['article_id'].transform('count')
    )

    return df[purchases_per_article >= min_purchases].reset_index(drop=True)


def filter_weeks_length(df, weeks=24, min_purchases=4, count_in_window=False):
    """
    Filter user transactions based on their last active date and minimum purchase count.

    Args:
    - df              : DataFrame, must contain cols 't_dat' and 'customer_id'.
                        Typically [t_dat, customer_id, article_id, price, sales_channel_id]
    - weeks           : int or float, default 24. The number of weeks to retain, counted back
                        from each user's last transaction date.
    - min_purchases   : int, default 4. Minimum number of transactions required for a user
                        to be retained.
    - count_in_window : bool, default False. If False, `min_purchases` is checked against
                        the user's total number of rows in `df` (so a retained user may end up
                        with fewer than `min_purchases` rows inside the window). If True, it is
                        checked against the number of rows that fall inside the window, which
                        guarantees every returned user has at least `min_purchases` rows.

    Return:
    - filtered_df : new DataFrame containing only transactions that occurred within the
                    [last_date - weeks, last_date] time window of each customer, and only for
                    customers that satisfy the `min_purchases` rule above.
                    Rows keep their original order and the index is reset.
                    In the result 't_dat' is datetime and 'customer_id' is categorical;
                    the input `df` is not modified.
    """
    # assign() builds a new frame, so the dtype normalisation never touches the caller's data.
    df = df.assign(
        t_dat=pd.to_datetime(df['t_dat']),
        customer_id=df['customer_id'].astype('category'),
    )

    # observed=True: customer_id is categorical, and leaving `observed` unset makes newer
    # pandas emit a FutureWarning on every groupby. For transform() the per-row result is
    # identical either way.
    dates_by_customer = df.groupby('customer_id', observed=True)['t_dat']
    last_date = dates_by_customer.transform('max')

    # pd.Timedelta (instead of np.timedelta64(weeks*7, 'D')) also accepts fractional weeks.
    start_date = last_date - pd.Timedelta(weeks=weeks)

    # last_date is the per-customer maximum, so an explicit upper bound would always be True.
    in_window = df['t_dat'] >= start_date

    if count_in_window:
        # Number of rows per customer that fall inside that customer's window.
        purchase_count = in_window.groupby(df['customer_id'], observed=True).transform('sum')
    else:
        # Number of rows per customer over the whole input (original behaviour).
        purchase_count = dates_by_customer.transform('size')

    keep = in_window & (purchase_count >= min_purchases)

    return df[keep].reset_index(drop=True)



In [9]:
"""main"""
# Both input files are resolved relative to one data directory, so the notebook does not
# depend on an absolute, machine-specific path.
DATA_DIR = path.join("..", "data")
TRANSACTIONS = path.join(DATA_DIR, "transactions_train.csv")
ARTICLE = path.join(DATA_DIR, "articles.csv")

# Compact dtypes keep the transaction table small in memory.
trans = pd.read_csv(TRANSACTIONS,
                 parse_dates=['t_dat'],
                 dtype={
                     'customer_id':'category',
                     'article_id': 'int32',
                     'sales_channel_id':'uint8'
                 })

article_dtype = {
    'article_id':"int32",
    'detail_desc':"category"
}

# Only the id and the description are needed to decide which articles are usable.
article = pd.read_csv(ARTICLE,usecols=['article_id','detail_desc'],dtype=article_dtype,engine='pyarrow')

# Remove the transaction records of articles whose description is missing,
# then save the cleaned log to the current working directory.
valid_ids = article.loc[article['detail_desc'].notnull(), 'article_id']
trans = trans[trans['article_id'].isin(valid_ids)]
trans.to_csv("transactions_train_clean.csv",index=False)

# Remove cold-start articles (fewer than 5 purchases) from the transactions.
# The customer count after this step is the baseline for the retention figures below.
trans_remove_cold = filter_cold_start_articles(trans,min_purchases = 5)
origin_cust_sum = trans_remove_cold['customer_id'].nunique()

# Keep each customer's last 24 weeks and compare two minimum-length thresholds (4 and 6).
# Note: filter_weeks_length applies the threshold to a customer's total transaction count,
# so the retained in-window history of a customer can be shorter than the threshold.
trans_minLen_4 = filter_weeks_length(trans_remove_cold, weeks=24, min_purchases = 4)
min4_cust_sum = trans_minLen_4['customer_id'].nunique()

trans_minLen_6 = filter_weeks_length(trans_remove_cold, weeks=24, min_purchases = 6)
min6_cust_sum = trans_minLen_6['customer_id'].nunique()

print(f"After Filter length 4 : retain ratio ({min4_cust_sum} : {origin_cust_sum}) , remove: {origin_cust_sum-min4_cust_sum}")
print(f"After Filter length 6 : retain ratio ({min6_cust_sum} : {origin_cust_sum}) , remove: {origin_cust_sum-min6_cust_sum}")

  last_date  = df.groupby('customer_id')['t_dat'].transform('max')
  row_count  = df.groupby('customer_id')['t_dat'].transform('size')
  last_date  = df.groupby('customer_id')['t_dat'].transform('max')
  row_count  = df.groupby('customer_id')['t_dat'].transform('size')


After Filter length 4 : retain ratio (1006003 : 1361469) , remove: 355466
After Filter length 6 : retain ratio (859243 : 1361469) , remove: 502226


In [None]:
# Optional export of the filtered transaction sets as CSV files in the current working
# directory. Left disabled; uncomment the lines below to write the files.
# NOTE(review): the file-name suffixes appear to encode the thresholds used above
# (cold-start minimum 5, then minimum length 4 / 6) — confirm before relying on it.
# trans_remove_cold.to_csv("transactions_5.csv",index=False)
# trans_minLen_4.to_csv("transactions_5_4.csv",index=False)
# trans_minLen_6.to_csv("transactions_5_6.csv",index=False)
