In [None]:
import sys
sys.path.append("../../")

import pandas as pd
from pathlib import Path
from src.logger import Logger
from src.util import read_parquet_from_csv

from torch.nn.utils.rnn import pad_sequence
import torch

In [None]:
class Config:
    """Notebook-wide settings read by the cells below."""
    # When True, the data-loading cell keeps only a random subset of users.
    debug = True
    # Passed as `random_state` when sampling users in debug mode.
    seed = 8823

In [None]:
# Base directory of the project (hard-coded absolute path).
HOME = Path("/workspace")
RESOURCES = HOME / "resources"
# Directory the input datasets are read from.
INPUT = RESOURCES / "input"

config = Config()
# Logger is the project-local class from src.logger; presumably a logging wrapper — TODO confirm.
logger = Logger(__name__)

In [None]:
# Load the task-1 dataset and the cell POI-category table.
# NOTE(review): read_parquet_from_csv is a project helper; presumably it caches the
# CSV as parquet under `dirpath` — confirm against src.util.
raw_train_df = read_parquet_from_csv(filepath=INPUT / "task1_dataset.csv.gz", dirpath=INPUT)
poi_df = read_parquet_from_csv(filepath=INPUT / "cell_POIcat.csv.gz", dirpath=INPUT)

if config.debug:
    # Sample from the *unique* user ids so the debug subset contains the requested
    # number of distinct users (sampling rows would return duplicate uids), and
    # clamp the sample size so small datasets do not raise.
    unique_uids = raw_train_df["uid"].drop_duplicates()
    n_debug_users = min(100, len(unique_uids))
    user_ids = unique_uids.sample(n_debug_users, random_state=config.seed).tolist()
    raw_train_df = raw_train_df[raw_train_df["uid"].isin(user_ids)].reset_index(drop=True)

raw_train_df.head()

In [None]:


class GroupedDiffFeatureExtractor():
    """Compute per-group lagged differences of selected columns.

    For every interval ``k`` in ``intervals`` and every column ``v`` in
    ``group_values``, the extractor produces a column named
    ``f_{v}_grpby_{group_key}_diff_{k}`` holding ``v`` minus the value of ``v``
    ``k`` rows earlier within the same ``group_key`` group.
    """

    def __init__(self, group_key:str="uid", group_values:list[str]=["t", "d"], intervals:list[int]=[1,2]):
        """Store the grouping configuration.

        Args:
            group_key: Column to group by.
            group_values: Columns to difference inside each group.
            intervals: Row offsets passed to ``diff``.
        """
        self.group_key = group_key
        # Copy the incoming sequences: the signature defaults are shared list
        # objects, so storing them directly would let a mutation on one
        # instance leak into every later default-constructed instance.
        self.group_values = list(group_values)
        self.intervals = list(intervals)

    def __call__(self, df):
        """Return a DataFrame of diff features aligned to ``df``'s index.

        Args:
            df: Frame containing ``group_key`` and all ``group_values`` columns.
                Row order determines what "previous row" means, so sort first.

        Returns:
            A DataFrame with one column per (interval, value) pair, ordered by
            interval and then by value. Rows without a predecessor at the
            requested offset inside their group are NaN. With no intervals
            configured, an empty frame sharing ``df``'s index is returned.
        """
        if not self.intervals:
            # pd.concat raises on an empty list; return an empty, index-aligned frame.
            return pd.DataFrame(index=df.index)

        grouped = df.groupby(self.group_key)[self.group_values]
        diff_frames = []
        for interval in self.intervals:
            renames = {v: f"{v}_grpby_{self.group_key}_diff_{interval}" for v in self.group_values}
            diff_frames.append(grouped.diff(interval).rename(columns=renames))
        out_df = pd.concat(diff_frames, axis=1)
        return out_df.add_prefix("f_")
    
    
def sort_df(df):
    """Return a copy of ``df`` ordered by uid, then d, then t, with a fresh 0..n-1 index."""
    sort_keys = ["uid", "d", "t"]
    ordered = df.sort_values(by=sort_keys)
    # Discard the pre-sort index so positions match the new row order.
    return ordered.reset_index(drop=True)


def make_features(df):
    """Sort ``df`` and append the engineered feature columns.

    Args:
        df: Raw frame containing at least the ``uid``, ``d`` and ``t`` columns.

    Returns:
        The sorted frame with the outputs of every feature extractor
        concatenated column-wise (feature columns are prefixed with ``f_``).
    """
    # Operate on the argument, not on a notebook global, so the function works
    # on a fresh kernel and gives the same result when the cell is re-run.
    df = sort_df(df)
    funcs = [
        GroupedDiffFeatureExtractor(
            group_key="uid",
            group_values=["t", "d"],
            intervals=[1, 2],
        )
    ]
    features_df = pd.concat([df] + [func(df) for func in funcs], axis=1)
    return features_df
    
        
    
# Build the training frame: sorted rows plus the engineered `f_` feature columns.
train_df = make_features(raw_train_df)
train_df.head()

In [None]:

def make_sequences(df:pd.DataFrame, group_key:str, group_values:list[str]):
    """Split ``df`` into one tensor per ``group_key`` group.

    Args:
        df: Source frame.
        group_key: Column whose distinct values define the groups.
        group_values: Columns to keep in every tensor, in this order.

    Returns:
        A list of tensors, one per group in ``groupby`` iteration order, each
        built from that group's rows restricted to ``group_values``.
    """
    sequences = []
    for _key, rows in df.groupby(group_key):
        values = rows[group_values].to_numpy()
        sequences.append(torch.tensor(values))
    return sequences
    
# Engineered feature columns are identified by their "f_" prefix.
feature_names = [x for x in train_df.columns if x.startswith("f_")]
# One tensor per uid: model inputs (feature columns) and targets (x, y columns).
feature_seqs = make_sequences(df=train_df, group_key="uid", group_values=feature_names)
target_seqs = make_sequences(df=train_df, group_key="uid", group_values=["x", "y"])


In [None]:
class TrainDataset(torch.utils.data.Dataset):
    """Dataset that serves one stored feature sequence per index."""

    def __init__(self, feature_seqs):
        """Keep a reference to ``feature_seqs``, an indexable collection of sequences."""
        self.feature_seqs = feature_seqs

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.feature_seqs)

    def __getitem__(self, index: int) -> dict[str, torch.Tensor]:
        """Return the sequence at ``index`` wrapped in a dict under ``"feature_seqs"``."""
        feature_seqs = self.feature_seqs[index]
        return {"feature_seqs":feature_seqs}


def collate_fn(batch):
    """Pad a batch of variable-length sequences to a common length.

    Args:
        batch: List of dataset items, each a dict holding a sequence under
            the ``"feature_seqs"`` key.

    Returns:
        A dict with ``"feature_seqs"``, the batch-first tensor produced by
        ``pad_sequence``, and ``"lengths"``, the original length of every
        sequence in batch order.
    """
    tensors = []
    lengths = []
    for sample in batch:
        seq = sample["feature_seqs"]
        # Record the length before padding so it can be recovered later.
        lengths.append(len(seq))
        tensors.append(torch.as_tensor(seq))
    padded = pad_sequence(tensors, batch_first=True)
    return {"feature_seqs":padded, "lengths":lengths}

# Smoke-test the pipeline: batch the sequences five at a time through collate_fn.
ds = TrainDataset(feature_seqs=feature_seqs)
dl = torch.utils.data.DataLoader(ds, batch_size=5, collate_fn=collate_fn)

# For each batch, print the original sequence lengths next to the length of
# every row of the padded tensor.
for x in dl:
    print(x["lengths"], [len(x_) for x_ in x["feature_seqs"]])