In [2]:
import gzip
import glob
import numpy as np
import os
import json
from sklearn.model_selection import train_test_split

def parse_yahoo_r6_line(line):
    """Parse one pipe-delimited event line of the Yahoo R6 click log.

    The line is expected to look like::

        <timestamp> id-<displayed> <click> |user <f1> <f2> ... |id-<a1> |id-<a2> ...

    Args:
        line: One raw text line; surrounding whitespace (including the
            trailing newline) is ignored.

    Returns:
        A tuple ``(timestamp, displayed_article, click, user_features,
        pool_article_ids)``: three ints, the list of integer user feature
        IDs, and the list of integer article IDs in the candidate pool.

    Raises:
        IndexError: If the header has fewer than three fields or the line
            has no section after the header.
        ValueError: If a field that should be numeric cannot be parsed.
    """
    parts = line.strip().split('|')

    # Header section: "<timestamp> id-<article> <click>"
    header_part = parts[0].strip().split()
    timestamp = int(header_part[0])
    displayed_article = int(header_part[1].replace('id-', ''))
    click = int(header_part[2])

    # user_part[0] = 'user', the rest are user features (categorical IDs)
    user_part = parts[1].strip().split()
    user_features = list(map(int, user_part[1:]))

    pool_article_ids = []
    for article_section in parts[2:]:
        article_id_str = article_section.strip()
        if not article_id_str:
            # A trailing or doubled '|' yields an empty section that carries
            # no article id; skip it instead of failing on int('').
            continue
        pool_article_ids.append(int(article_id_str.replace('id-', '')))

    return timestamp, displayed_article, click, user_features, pool_article_ids

# Location of the raw, gzip-compressed click logs
data_path = "/shared/share_mala/Leon/CB_dataset/dataset/"
gz_pattern = os.path.join(data_path, "ydata-fp-td-clicks-v2_0.*.gz")
# Sorted so the files are always visited in the same lexicographic order
gz_files = sorted(glob.glob(gz_pattern))

# Per-event accumulators; they start empty here and are filled later
all_contexts = []
all_actions = []
all_rewards = []
all_pool_article_ids = []

# First pass: scan every event to find the largest user feature ID,
# which determines the width of the one-hot context vector.
max_user_feature_id = 0
for gz_file in gz_files:
    with gzip.open(gz_file, 'rt') as f:
        for line in f:
            # Only lines that are non-blank, begin with a digit and carry
            # a '|user' section are treated as events.
            is_event = bool(line.strip()) and line[0].isdigit() and '|user' in line
            if not is_event:
                continue
            user_features = parse_yahoo_r6_line(line)[3]
            if user_features:
                # Keep the running maximum across all files
                max_user_feature_id = max(max_user_feature_id, max(user_features))

print("Maximum user feature ID found:", max_user_feature_id)

# Second pass: One-hot encode
# Re-reads every file and appends one entry per event to the accumulators:
# the multi-hot user vector, the displayed article, the click and the pool.
for gz_file in gz_files:
    print("Processing file:", gz_file)
    with gzip.open(gz_file, 'rt') as f:
        for line in f:
            # Same event filter as the first pass: non-blank, starts with a
            # digit, and contains a '|user' section.
            if not line.strip() or not line[0].isdigit():
                continue
            if '|user' not in line:
                continue

            timestamp, displayed_article, click, user_features, pool_article_ids = parse_yahoo_r6_line(line)

            # Create a one-hot vector for user features.
            # Feature ID k is stored at index k - 1, so IDs must start at 1.
            one_hot_user = np.zeros(max_user_feature_id, dtype=np.float32)
            for uf in user_features:
                if uf < 1:
                    # uf - 1 would be negative and NumPy would wrap it to the
                    # end of the vector, silently setting the wrong feature.
                    raise ValueError(
                        f"User feature ID {uf} in {gz_file} is below 1; "
                        "one-hot encoding expects 1-based feature IDs"
                    )
                one_hot_user[uf - 1] = 1.0

            all_contexts.append(one_hot_user)
            all_actions.append(displayed_article)
            all_rewards.append(click)
            all_pool_article_ids.append(pool_article_ids)

# Stack the per-event lists into arrays: contexts, displayed articles, clicks
X = np.array(all_contexts, dtype=np.float32)
A = np.array(all_actions, dtype=np.int32)
Y = np.array(all_rewards, dtype=np.int32)

# Pad the candidate articles
# Pools differ in length, so shorter ones are right-padded with -1 to the
# longest pool's length to form a rectangular array.
if all_pool_article_ids:
    max_articles = max(len(p) for p in all_pool_article_ids)
    padded_article_ids = [p + [-1]*(max_articles - len(p)) for p in all_pool_article_ids]
    P = np.array(padded_article_ids, dtype=np.int32)
else:
    # No events at all: keep P a well-typed empty 2-D array
    P = np.empty((0,0), dtype=np.int32)

# Output location for the processed dataset
out_dir = "/shared/share_mala/Leon/yahoo_cb"

# Fail before touching the disk: with no events the range report below
# cannot be computed, and writing an empty mapping / index array first
# would overwrite any previously generated outputs.
if A.size == 0:
    raise ValueError("No events were parsed; refusing to write an empty article mapping")

# Create output directory
os.makedirs(out_dir, exist_ok=True)

# Create article ID to index mapping.
# np.unique returns the distinct IDs already sorted, and return_inverse
# gives, for every element of A, its position in that sorted array, which
# is exactly the contiguous index assigned by the mapping.
unique_article_ids, inverse = np.unique(A, return_inverse=True)
unique_articles = list(unique_article_ids)
# int(...) because NumPy integer scalars are not valid JSON object keys
article_to_idx = {int(article_id): idx for idx, article_id in enumerate(unique_articles)}

# Save the mapping
with open(os.path.join(out_dir, "article_to_idx.json"), 'w') as f:
    json.dump(article_to_idx, f)

# Convert A to use indices
A_indexed = inverse.astype(np.int32)

# Save the indexed version
np.save(os.path.join(out_dir, "A_indexed.npy"), A_indexed)

print(f"Number of unique articles: {len(unique_articles)}")
print(f"Original A range: [{A.min()}, {A.max()}]")
print(f"Indexed A range: [0, {len(unique_articles)-1}]")

# Build the model input: the user context columns followed by one extra
# trailing column holding the indexed displayed article.
# NOTE(review): X is float32 and A_indexed is int32, so column_stack is
# expected to promote the result to float64 - confirm this is intended.
X_final = np.column_stack((X, A_indexed))

# Shuffled 80/20 train-test split with a fixed seed for reproducibility
X_train, X_test, Y_train, Y_test = train_test_split(
    X_final,
    Y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# All four splits are written under <out_dir>/splits as <name>.npy
split_dir = os.path.join(out_dir, "splits")
os.makedirs(split_dir, exist_ok=True)

named_splits = (
    ("X_train", X_train),
    ("X_test", X_test),
    ("Y_train", Y_train),
    ("Y_test", Y_test),
)
for split_name, split_array in named_splits:
    np.save(os.path.join(split_dir, f"{split_name}.npy"), split_array)

# Report the shape of every saved split
print("\nFinal dataset statistics:")
for split_name, split_array in named_splits:
    print(f"{split_name} shape: {split_array.shape}")

Maximum user feature ID found: 136
Processing file: /shared/share_mala/Leon/CB_dataset/dataset/ydata-fp-td-clicks-v2_0.20111002.gz
Processing file: /shared/share_mala/Leon/CB_dataset/dataset/ydata-fp-td-clicks-v2_0.20111003.gz
Processing file: /shared/share_mala/Leon/CB_dataset/dataset/ydata-fp-td-clicks-v2_0.20111004.gz
Processing file: /shared/share_mala/Leon/CB_dataset/dataset/ydata-fp-td-clicks-v2_0.20111005.gz
Processing file: /shared/share_mala/Leon/CB_dataset/dataset/ydata-fp-td-clicks-v2_0.20111006.gz
Processing file: /shared/share_mala/Leon/CB_dataset/dataset/ydata-fp-td-clicks-v2_0.20111007.gz
Processing file: /shared/share_mala/Leon/CB_dataset/dataset/ydata-fp-td-clicks-v2_0.20111008.gz
Processing file: /shared/share_mala/Leon/CB_dataset/dataset/ydata-fp-td-clicks-v2_0.20111009.gz
Processing file: /shared/share_mala/Leon/CB_dataset/dataset/ydata-fp-td-clicks-v2_0.20111010.gz
Processing file: /shared/share_mala/Leon/CB_dataset/dataset/ydata-fp-td-clicks-v2_0.20111011.gz
Proce