In [1]:
import pandas as pd
import pandas as pd
import numpy as np
import os.path as path
from typing import Dict, Any, List, Tuple, Optional
import pickle
from itertools import chain
from pathlib import Path
from random import Random
from tqdm import tqdm


def transaction_to_session(
    df: pd.DataFrame,
    cast: bool = True,
    save_path: Optional[str] = None,
    pkl_name: Optional[str] = None,
) -> Dict[Any, Dict[str, list]]:
    """
    Turn transaction records into session-based data (one entry per customer).

    Rows are grouped by ``customer_id`` in order of first appearance
    (``sort=False``); within a customer the lists keep the row order of ``df``.

    Args:
    - df        : DataFrame, must contain cols:
                  [t_dat, customer_id, article_id, price, sales_channel_id]
    - cast      : bool, default True. Cast article_id -> int32, price -> float32,
                  sales_channel_id -> uint8 and parse t_dat with pd.to_datetime.
                  The caller's DataFrame is not modified.
    - save_path : directory to write the pickle into (created if missing).
                  Default None: nothing is written.
    - pkl_name  : file name without the ".pkl" extension. Required when
                  save_path is given.

    Return:
    - sessions : {
        customer_id: {
            'article_id'      : [...],
            't_dat'           : [...],
            'price'           : [...],
            'sales_channel_id': [...]
        },
        ...
     }

    Raises:
    - ValueError : save_path is set but pkl_name is None.
    """
    # Validate the output arguments first so a bad call fails before the
    # (potentially expensive) groupby instead of after it.
    if save_path is not None and pkl_name is None:
        raise ValueError("pkl_name must be provided if save_path is set.")

    if cast:
        # astype() and assign() both return new frames, so the input is never
        # mutated and no extra .copy() is needed.
        df = df.astype({
            'article_id'      : 'int32',
            'price'           : 'float32',
            'sales_channel_id': 'uint8'
        }).assign(t_dat=lambda frame: pd.to_datetime(frame['t_dat']))

    session_df = (
        df.groupby('customer_id', sort=False)
          .agg({
              'article_id'      : list,
              't_dat'           : list,
              'price'           : list,
              'sales_channel_id': list
          })
    )

    sessions = session_df.to_dict(orient='index')

    if save_path is not None:
        out_dir = Path(save_path)
        out_dir.mkdir(parents=True, exist_ok=True)
        with open(out_dir / f"{pkl_name}.pkl", "wb") as f:
            pickle.dump(sessions, f, protocol=pickle.HIGHEST_PROTOCOL)
    return sessions



def baseline_transformation(
    sessions: Dict[str, Dict[str, List[int]]],
    num_neg: int = 99,
    seed: int = 42,
    user_session_path: Optional[str] = None,
    testing_path: Optional[str] = None,
) -> Tuple[Dict[str, List[int]], Dict[str, List[int]]]:
    """
    Turn a session dict into:
    1. user_session  : {uid: [iid1, iid2, ...]}             full item sequence per user
    2. testing_data  : {uid: [neg1..negN, test_item]}       for leave-one-out test

    The test item is the last element of each user's sequence. Negatives are
    drawn without replacement from the set of all items seen in ``sessions``,
    excluding every item the user interacted with.

    Args:
    ----
    - sessions          : dict generated by func transaction_to_session
                          (only the 'article_id' list of each record is read)
    - num_neg           : int, number of negatives sampled per user
    - seed              : random seed for the sampler
    - user_session_path : file path for saving the user_session pickle, default None
    - testing_path      : file path for saving the testing_data pickle, default None

    Return
    ------
    -   user_session : Dict[str, List[int]]
            { uid: [iid_1, iid_2, ..., iid_n] }
            The lists are the same objects as in ``sessions`` (not copies) and
            still contain the held-out test item.
    -   testing_data : Dict[str, List[int]]
            { uid: [neg_1, ..., neg_num_neg, test_item] }

    Raises
    ------
    -   ValueError : a user has fewer than ``num_neg`` items they have not
                     interacted with, so distinct negatives cannot be drawn.
    -   IndexError : a user's 'article_id' list is empty.
    """

    rng = Random(seed)

    user_session: Dict[str, List[int]] = {}
    all_items_set = set()

    for uid, rec in sessions.items():
        items = rec["article_id"]
        user_session[uid] = items
        all_items_set.update(items)

    # Build the candidate pool once (it is identical for every user) and as
    # plain Python ints so negatives have the same type as a pickled item id.
    item_pool: List[int] = [int(item) for item in all_items_set]
    # Oversample 3x so one draw usually survives the positive filter, but never
    # ask random.sample for more elements than the pool holds.
    draw_size = min(num_neg * 3, len(item_pool))

    testing_data: Dict[str, List[int]] = {}

    for uid, items in tqdm(user_session.items(), desc="Creating Data...",unit= " item"):
        pos_item = items[-1]
        positives = set(items)

        # Every positive is in the pool, so this is the exact number of
        # eligible negatives for this user.
        available = len(item_pool) - len(positives)
        if num_neg > available:
            raise ValueError(
                f"user {uid!r}: cannot draw {num_neg} distinct negatives, "
                f"only {available} unseen items exist"
            )

        negs: List[int] = []
        if num_neg > 0:
            for cand in rng.sample(item_pool, k=draw_size):
                if cand not in positives:
                    negs.append(cand)
                    if len(negs) == num_neg:
                        break

            if len(negs) < num_neg:
                # Rare: the oversampled draw was mostly positives. Finish from
                # the explicit remainder so the result has no duplicates and
                # the loop is guaranteed to terminate.
                taken = positives.union(negs)
                remaining = [item for item in item_pool if item not in taken]
                negs.extend(rng.sample(remaining, k=num_neg - len(negs)))

        testing_data[uid] = negs + [pos_item]

    def _dump(obj: dict, out_path: Optional[str]):
        """Pickle ``obj`` to ``out_path`` (creating parent dirs); no-op if the path is falsy."""
        if out_path:
            Path(out_path).parent.mkdir(parents=True, exist_ok=True)
            with open(out_path, "wb") as f:
                pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

    _dump(user_session, user_session_path)
    _dump(testing_data, testing_path)

    return user_session, testing_data



In [3]:
"""main"""
# Input transaction CSVs and the pickle base names they are saved under.
# The two lists are paired by position.
transactions_list = [
    'transactions_5_4_30_mapping.csv',
    'transactions_5_6_30_mapping.csv',
]
saving_name = [
    'sessions_5_4_30_mapping',
    'sessions_5_6_30_mapping',
]

for i, file in enumerate(transactions_list):
    # Read one transaction file with integer id columns and parsed dates.
    trans = pd.read_csv(
        file,
        parse_dates=['t_dat'],
        dtype={
            'customer_id': 'int',
            'article_id': 'int',
            'sales_channel_id': 'int',
        },
    )

    # Build the per-customer sessions and pickle them into the working
    # directory; `sessions` keeps only the result of the last file.
    sessions = transaction_to_session(
        df=trans,
        cast=True,
        save_path=".",
        pkl_name=saving_name[i],
    )

# user_session, testing_data = baseline_transformation(
#     sessions=sessions,
#     num_neg= 99,
#     seed= 42,
#     user_session_path=r"C:\113-2-WM-Final-Project\data_new\user_session_5_4.pkl",
#     testing_path=r"C:\113-2-WM-Final-Project\data_new\test_data_5_4.pkl"
# )

# for k,v in user_session.items():
#     print(f"id:{k} : {v}")
#     break

# file_list = [r"C:\113-2-WM-Final-Project\data\sessions_5_4.pkl",r"C:\113-2-WM-Final-Project\data\sessions_5_6.pkl"]
# user_saving_list = [r"user_session_5_4.pkl",r"user_session_5_6.pkl"]
# testing_saving_list = [r"testing_data_5_4.pkl",r"testing_data_5_6.pkl"]
# for i,p in enumerate(file_list):
#     with open(p, "rb") as f:
#         sessions = pickle.load(f)
    
#     user_session, testing_data = baseline_transformation(
#         sessions=sessions,
#         num_neg= 99,
#         seed= 42,
#         user_session_path= user_saving_list[i],
#         testing_path= testing_saving_list[i]
#     )


In [4]:
# Test: reload one pickled session dict and print the first few customers.
PREVIEW_CUSTOMERS = 4  # how many customers to print before stopping

# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook (or another trusted source) wrote.
with open(r"sessions_5_4_30_mapping.pkl", "rb") as f:
    sessions_load = pickle.load(f)

for shown, (customer, record) in enumerate(sessions_load.items(), start=1):
    print(f"customer: {customer}")
    for field, values in record.items():
        print(f"{field} :")
        for value in values:
            print(value, end=" ")
        print("\n")
    if shown == PREVIEW_CUSTOMERS:
        break

customer: 724
article_id :
36060 18685 30738 29410 

t_dat :
2018-09-20 00:00:00 2018-09-20 00:00:00 2018-09-20 00:00:00 2018-09-20 00:00:00 

price :
0.02744067832827568 0.01981355994939804 0.015237288549542427 0.01981355994939804 

sales_channel_id :
1 1 1 1 

customer: 1383
article_id :
25542 31944 35885 2760 15542 15594 44871 25538 17428 28308 26539 

t_dat :
2018-09-20 00:00:00 2018-09-20 00:00:00 2018-09-20 00:00:00 2018-09-20 00:00:00 2018-09-20 00:00:00 2018-09-20 00:00:00 2018-09-20 00:00:00 2018-09-20 00:00:00 2018-09-20 00:00:00 2018-09-20 00:00:00 2018-09-20 00:00:00 

price :
0.016932202503085136 0.010152542032301426 0.02540677972137928 0.016932202503085136 0.033881355077028275 0.011847457848489285 0.02540677972137928 0.016932202503085136 0.032186441123485565 0.04828813672065735 0.033881355077028275 

sales_channel_id :
2 2 2 2 2 2 2 2 2 2 2 

customer: 1386
article_id :
6641 40723 12618 41391 39645 36492 21375 

t_dat :
2018-09-20 00:00:00 2018-09-20 00:00:00 2018-10-07 0