In [1]:
# Cell 1 – imports & params
import pandas as pd, numpy as np, gc

AGE_TOP_PER_BUCKET = 100      # number of top-scored articles kept per age bucket
N_PER_USER         = 100      # max candidate rows kept per customer after the bucket join
HISTORY_WEEKS      = 2       # length (in weeks) of the history window that ends at cut_ts
EXCLUDE_LAST_DAYS  = 7       # cut_ts is this many days before the newest transaction date

def hex_last16_to_uint64(s):
    """Convert the last 16 hex characters of ``s`` into a 64-bit integer id.

    The trailing (up to) 16 characters are parsed as base-16, wrapped in
    ``np.uint64`` and then converted to ``np.int64``.  Despite the function
    name, the returned value is therefore a *signed* ``np.int64``.

    NOTE(review): ids with the top bit set presumably wrap to negative values
    in the uint64 -> int64 conversion; confirm with the numpy version in use.
    """
    tail = s[-16:]
    unsigned = np.uint64(int(tail, 16))
    return np.int64(unsigned)

In [2]:
# Cell 2 – load transactions and keep only the history window
tx = pd.read_csv(
    '../data/input_data/transactions_train.csv',
    dtype={'t_dat': 'string', 'customer_id': 'string', 'article_id': 'int32'},
    usecols=['t_dat', 'customer_id', 'article_id'],
)
# Replace the hex customer id string with its 64-bit integer encoding.
tx['customer_id'] = tx['customer_id'].apply(hex_last16_to_uint64)
tx['t_dat'] = pd.to_datetime(tx['t_dat'])

# History window is (start_ts, cut_ts]:
#   cut_ts   = newest transaction date minus EXCLUDE_LAST_DAYS days
#   start_ts = cut_ts minus HISTORY_WEEKS weeks
last_ts = tx['t_dat'].max()
cut_ts = last_ts - pd.Timedelta(days=EXCLUDE_LAST_DAYS)
start_ts = cut_ts - pd.Timedelta(weeks=HISTORY_WEEKS)
tx = tx.loc[tx['t_dat'].gt(start_ts) & tx['t_dat'].le(cut_ts)].copy()

In [3]:
# Cell 3 – load customer ages & bucketize
cust = pd.read_csv(
    '../data/input_data/customers.csv',
    dtype={'customer_id': 'string'},
)
# Same integer encoding of the customer id as used for the transactions.
cust['customer_id'] = cust['customer_id'].apply(hex_last16_to_uint64)
# Missing ages become the sentinel -1, which age_bucket maps to bucket -1.
cust['age'] = cust['age'].fillna(-1)

def age_bucket(a):
    """Return the age-bucket id for age ``a``.

    Negative ages (the "unknown" sentinel) map to -1.  Otherwise the bucket is
    0 for a < 18, then 1..5 for [18, 25), [25, 35), [35, 45), [45, 55),
    [55, 65), and 6 for everything else (65 and above).
    """
    if a < 0:
        return -1
    # Exclusive upper bounds of buckets 0..5, in ascending order.
    for bucket_id, upper in enumerate((18, 25, 35, 45, 55, 65)):
        if a < upper:
            return bucket_id
    return 6
# Ages are NaN-free after the fillna above, so the int16 cast is safe; each age is
# mapped to its bucket id and stored compactly as int8.
cust['age_bucket'] = (
    cust['age']
    .astype('int16')
    .apply(age_bucket)
    .astype('int8')
)
# Two-column customer -> bucket lookup used by the joins below.
cust_bucket = cust[['customer_id', 'age_bucket']]

In [4]:
# Cell 4 – tag each transaction with the buyer's age bucket
# Left join: every transaction row is kept, matched on the integer customer id.
tx = pd.merge(tx, cust_bucket, how='left', on='customer_id')

In [5]:
# Cell 5 – time-decayed popularity per (age_bucket, article)
# Age of every purchase in whole days, measured from the newest date in the window.
max_date = tx['t_dat'].max()
tx['days_old'] = (max_date - tx['t_dat']).dt.days
# Hyperbolic decay 1 / (days_old + 1): weight 1.0 on the newest day,
# 0.5 one day earlier, 1/11 (~0.09) ten days earlier.
tx['pop_weight'] = 1.0 / tx['days_old'].add(1)

# Score of an article within a bucket = sum of the decayed weights of its purchases.
pop = (
    tx.groupby(['age_bucket', 'article_id'])['pop_weight']
      .sum()
      .reset_index(name='score')
)

# Order each bucket by descending score, rank (ties broken by row order),
# and keep only the AGE_TOP_PER_BUCKET best articles per bucket.
pop = pop.sort_values(['age_bucket', 'score'], ascending=[True, False])
pop['rank'] = (
    pop.groupby('age_bucket')['score']
       .rank(method='first', ascending=False)
)
pop = pop.loc[pop['rank'].le(AGE_TOP_PER_BUCKET)].drop(columns='rank')

In [6]:
# Cell 6 – build user candidates
# One row per distinct (customer_id, age_bucket) pair.
user_age = cust_bucket.drop_duplicates()
# Inner join on the bucket: each customer receives the top articles of their bucket.
# Customers whose bucket has no popular articles simply produce no rows, so there
# is no need for a left join followed by dropna (which would also upcast
# article_id to float64 whenever a bucket is missing and scan the whole
# exploded frame for NaNs).
cand = user_age.merge(pop, on='age_bucket', how='inner')

# Store the popularity score as a compact float32 'value' column.
cand['value'] = cand['score'].astype('float32')
cand = cand.drop(columns=['score']).drop_duplicates(['customer_id','article_id'])

# Cap per user (redundant if N_PER_USER == AGE_TOP_PER_BUCKET, but safe):
# order each customer's rows by descending value and keep the first N_PER_USER.
cand = cand.sort_values(['customer_id','value'], ascending=[True,False])
cand['rnk'] = cand.groupby('customer_id').cumcount()+1
cand = cand[cand['rnk'] <= N_PER_USER].drop(columns=['rnk'])

# Label identifying which candidate generator produced these rows.
cand['window_type'] = 'age_bucket_pop'

In [7]:
# Cell 7 – add hex id & save
# Read the hex customer ids from the sample submission and compute the same
# integer encoding that cand['customer_id'] uses, so the two can be joined.
sub = pd.read_csv(
    '../data/input_data/sample_submission.csv',
    dtype={'customer_id': 'string'},
    usecols=['customer_id'],
)
sub['customer_id_int'] = sub['customer_id'].apply(hex_last16_to_uint64)

# Both frames have a 'customer_id' column, so the merge suffixes them:
# customer_id_x is the integer id from cand, customer_id_y the hex string from sub.
cand = pd.merge(
    cand,
    sub[['customer_id', 'customer_id_int']],
    how='left',
    left_on='customer_id',
    right_on='customer_id_int',
)
cand = cand.drop(columns=['customer_id_int']).rename(
    columns={'customer_id_x': 'customer_id', 'customer_id_y': 'customer_id_hex'}
)

# Pin the output dtypes before writing.
cand['article_id'] = cand['article_id'].astype('int32')
cand['value'] = cand['value'].astype('float32')

# Final guard against repeated (customer, article) pairs, then persist.
cand = cand.drop_duplicates(subset=['customer_id', 'article_id'])
cand.to_parquet('../data/outputs/candidates_age_bucket_pop.parquet', index=False)

# Release the large frames; gc.collect() stays last so its return value is the cell output.
del tx, cust, cust_bucket, pop, cand, sub
gc.collect()

0