In [28]:
# Cell 1

import numpy as np
import pandas as pd
from datetime import timedelta
import gc

import cudf

In [29]:
# Cell 2

N = 50

In [30]:
# Cell 3

df = cudf.read_csv(
    '../data/input_data/transactions_train.csv',
    usecols=['t_dat', 'customer_id', 'article_id'],
    dtype={'t_dat': 'string', 'customer_id': 'string', 'article_id': 'int32'},
)

# Map customer_id to int64 (same as other pipelines)
df['customer_id'] = df['customer_id'].str[-16:].str.hex_to_int().astype('int64')

df['t_dat'] = cudf.to_datetime(df['t_dat'])
last_ts = df['t_dat'].max()

# define cutoff for training history (exclude last 7 days)
cut_ts = last_ts - pd.Timedelta(days=7)

# Filter to history only (everything up to and including cut_ts)
df = df[df['t_dat'] <= cut_ts]

# Week / biweek / month cutoffs based on cut_ts (not last_ts)
week_start   = cut_ts - pd.Timedelta(days=7)
biweek_start = cut_ts - pd.Timedelta(days=14)
month_start  = cut_ts - pd.Timedelta(days=30)

In [31]:
# Cell 4

# Compute "last date of the week" bucket (ldbw) on CPU using pandas
tmp = df[['t_dat']].copy().to_pandas()
tmp['dow'] = tmp['t_dat'].dt.dayofweek
tmp['ldbw'] = tmp['t_dat'] - pd.TimedeltaIndex(tmp['dow'] - 1, unit='D')
tmp.loc[tmp['dow'] >= 2, 'ldbw'] = (
    tmp.loc[tmp['dow'] >= 2, 'ldbw']
    + pd.TimedeltaIndex(np.ones(len(tmp.loc[tmp['dow'] >= 2])) * 7, unit='D')
)

df['ldbw'] = tmp['ldbw'].values
del tmp
gc.collect()

  tmp['ldbw'] = tmp['t_dat'] - pd.TimedeltaIndex(tmp['dow'] - 1, unit='D')
  + pd.TimedeltaIndex(np.ones(len(tmp.loc[tmp['dow'] >= 2])) * 7, unit='D')


26

In [32]:
# Cell 5

# Weekly sales count per (ldbw, article_id)
weekly_sales = df.drop('customer_id', axis=1).groupby(['ldbw', 'article_id']).count().reset_index()
weekly_sales = weekly_sales.rename(columns={'t_dat': 'count'})

# Merge weekly count into df
df = df.merge(weekly_sales, on=['ldbw', 'article_id'], how='left')

# For each article, get count in the last week (target week = week ending at cut_ts)
weekly_sales = weekly_sales.reset_index().set_index('article_id')
df = df.merge(
    weekly_sales.loc[weekly_sales['ldbw'] == cut_ts, ['count']],
    on='article_id',
    suffixes=('', '_targ')
)

df['count_targ'].fillna(0, inplace=True)
del weekly_sales
gc.collect()

# Quotient = "importance" of this article in this week vs target week
df['quotient'] = df['count_targ'] / df['count']

# target_sales: sum of quotient per article over all weeks
target_sales = df.drop('customer_id', axis=1).groupby('article_id')['quotient'].sum()

# Global top-N articles
general_pred = target_sales.nlargest(N).index.to_pandas().tolist()
general_pred = ['0' + str(article_id) for article_id in general_pred]
general_pred_str = ' '.join(general_pred)

# Save so the ranker / submission notebook can reuse the same global list
pd.Series({'general_pred_str': general_pred_str}).to_json(
    '../data/outputs/general_pred_str.json'
)

del target_sales
gc.collect()


0

In [33]:
# Cell 6

# Move to pandas for easier numeric ops
tmp = df.copy().to_pandas()

# Days since transaction
tmp['x'] = ((last_ts - tmp['t_dat']) / np.timedelta64(1, 'D')).astype(int)
tmp['dummy_1'] = 1
tmp['x'] = tmp[['x', 'dummy_1']].max(axis=1)

# Recency weighting function
a, b, c, d = 2.5e4, 1.5e5, 2e-1, 1e3
tmp['y'] = a / np.sqrt(tmp['x']) + b * np.exp(-c * tmp['x']) - d

tmp['dummy_0'] = 0
tmp['y'] = tmp[['y', 'dummy_0']].max(axis=1)

# Final value per transaction
tmp['value'] = tmp['quotient'] * tmp['y']

# Aggregate per (customer_id, article_id)
tmp = tmp.groupby(['customer_id', 'article_id']).agg({'value': 'sum'}).reset_index()

# Keep reasonably candidates
tmp = tmp.loc[tmp['value'] > 20]

# Rank per customer and keep top-N
tmp['rank'] = tmp.groupby('customer_id')['value'].rank('dense', ascending=False)
tmp = tmp.loc[tmp['rank'] <= N]

# Sorted candidate list
purchase_df = tmp.sort_values(['customer_id', 'value'], ascending=False).reset_index(drop=True)
purchase_df = cudf.DataFrame(purchase_df)

In [34]:
# Cell 7

# Base candidates (customer_id, article_id, value)
candidates = purchase_df[['customer_id', 'article_id', 'value']].copy()
candidates = candidates.sort_values(['customer_id', 'value'], ascending=False)

# Build recency window_type using last purchase timestamp per (customer, article)
df_min = df[['customer_id', 'article_id', 't_dat']].to_pandas()
df_min = df_min.sort_values('t_dat').drop_duplicates(['customer_id', 'article_id'], keep='last')

candidates = candidates.to_pandas()
candidates = candidates.merge(df_min, on=['customer_id', 'article_id'], how='left')

# Classify into window_type: weekly / biweekly / monthly / older
candidates['window_type'] = 'older'
candidates.loc[candidates['t_dat'] >= week_start, 'window_type'] = 'weekly'
candidates.loc[
    (candidates['t_dat'] < week_start)
    & (candidates['t_dat'] >= biweek_start),
    'window_type'
] = 'biweekly'
candidates.loc[
    (candidates['t_dat'] < biweek_start)
    & (candidates['t_dat'] >= month_start),
    'window_type'
] = 'monthly'

# Keep only the needed columns
candidates = candidates[['customer_id', 'article_id', 'value', 'window_type']]

In [35]:
# Cell 8

import cudf

candidates = cudf.DataFrame(candidates)

# Map int customer_id back to hex string form for consistency with sample_submission
customers = cudf.read_csv(
    '../data/input_data/sample_submission.csv',
    usecols=['customer_id'],
    dtype={'customer_id': 'string'}
)
customers['customer_id_int'] = customers['customer_id'].str[-16:].str.hex_to_int().astype('int64')

# Merge so that we keep the int id as 'customer_id' and add hex as 'customer_id_hex'
candidates = candidates.merge(
    customers[['customer_id', 'customer_id_int']],
    left_on='customer_id',
    right_on='customer_id_int',
    how='left'
)

candidates = candidates.rename(
    columns={'customer_id_x': 'customer_id', 'customer_id_y': 'customer_id_hex'}
)
candidates = candidates.drop('customer_id_int', axis=1)

del customers
gc.collect()

# Ensure dtypes are as expected for downstream processing
candidates['article_id'] = candidates['article_id'].astype('int32')
candidates['value']      = candidates['value'].astype('float32')

# Deduplicate just in case
candidates = candidates.drop_duplicates(['customer_id', 'article_id'])

candidates.head()

Unnamed: 0,customer_id,article_id,value,window_type,customer_id_hex
0,9162379705966698872,883307004,2436.140625,biweekly,1bd6056682c35cc892c9db3165c4b3d8d3b208079cee77...
1,9162379705966698872,748355003,2133.914307,older,1bd6056682c35cc892c9db3165c4b3d8d3b208079cee77...
2,9162379705966698872,853839003,1838.798462,older,1bd6056682c35cc892c9db3165c4b3d8d3b208079cee77...
3,9162379705966698872,854951003,652.056213,older,1bd6056682c35cc892c9db3165c4b3d8d3b208079cee77...
4,9162379705966698872,884081001,468.520721,older,1bd6056682c35cc892c9db3165c4b3d8d3b208079cee77...


In [36]:
# Cell 9

#candidates.to_pandas().to_csv('../data/outputs/candidates_weekly_trending.csv', index=False)
candidates.to_pandas().to_parquet('../data/outputs/candidates_weekly_trending.parquet', index=False)

del df, df_min, purchase_df, candidates
gc.collect()

0