In [1]:
# Install required libraries
!pip -q install -U datasets huggingface_hub

# Imports 
import re
import random
from pprint import pprint

from datasets import load_dataset, DatasetDict
from huggingface_hub import login, whoami, create_repo


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/561.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.1/561.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m561.5/561.5 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Login to Hugging Face Hub
from getpass import getpass

HF_TOKEN = getpass("Paste your Hugging Face token: ")
login(token=HF_TOKEN)

me = whoami()
hf_username = me["name"]
print("Logged in as:", hf_username)


Paste your Hugging Face token: ··········
Logged in as: mohammad-shirkhani


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# Configuration
DATASET_ID = "mohammad-shirkhani/social_movielens_new"  # source dataset
REPO_NAME   = "social_movielens_compress"               
SEED        = 42                                        
K_PATHS     = 5   # keep 5 meta paths per row

random.seed(SEED)
print(f"Source: {DATASET_ID}  ->  Target repo: {hf_username}/{REPO_NAME}")


Source: mohammad-shirkhani/social_movielens_new  ->  Target repo: mohammad-shirkhani/social_movielens_compress


In [4]:
# Load dataset splits
ds_raw = None
try:
    
    ds_raw = load_dataset(DATASET_ID)
    print("Splits found:", list(ds_raw.keys()))
except Exception as e:
    raise RuntimeError(f"Failed to load dataset {DATASET_ID}: {e}")

if "validation" not in ds_raw and "val" in ds_raw:
    ds_raw = DatasetDict({
        "train": ds_raw["train"],
        "validation": ds_raw["val"],
        "test": ds_raw["test"],
    })
elif "validation" not in ds_raw and "val" not in ds_raw:
    ds_raw = DatasetDict({
        "train": load_dataset(DATASET_ID, split="train"),
        "validation": load_dataset(DATASET_ID, split="validation"),
        "test": load_dataset(DATASET_ID, split="test"),
    })

print("Final splits:", list(ds_raw.keys()))
for name, ds in ds_raw.items():
    print(name, "num_rows:", len(ds), "columns:", ds.column_names)


README.md:   0%|          | 0.00/918 [00:00<?, ?B/s]

data/train-00000-of-00003.parquet:   0%|          | 0.00/65.1M [00:00<?, ?B/s]

data/train-00001-of-00003.parquet:   0%|          | 0.00/65.1M [00:00<?, ?B/s]

data/train-00002-of-00003.parquet:   0%|          | 0.00/65.1M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/24.4M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/24.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/80000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Splits found: ['train', 'validation', 'test']
Final splits: ['train', 'validation', 'test']
train num_rows: 80000 columns: ['user', 'item', 'answer', 'paths']
validation num_rows: 10000 columns: ['user', 'item', 'answer', 'paths']
test num_rows: 10000 columns: ['user', 'item', 'answer', 'paths']


In [5]:
# Peek a couple of rows to understand schema
for split in ["train", "validation", "test"]:
    print(f"\n--- {split.upper()} SAMPLE ROW ---")
    sample = ds_raw[split][0]
    pprint({k: (type(v).__name__, (list(v.keys()) if isinstance(v, dict) else None)) for k, v in sample.items()})
    # Show lengths and an example of paths
    try:
        print("paths count:", len(sample["paths"]))
        print("first path:", sample["paths"][0])
    except Exception as e:
        print("No 'paths' preview due to:", e)



--- TRAIN SAMPLE ROW ---
{'answer': ('float', None),
 'item': ('dict', ['MovieID', 'Title', 'ReleaseDate', 'Genres']),
 'paths': ('list', None),
 'user': ('dict', ['UserID', 'Age', 'Gender', 'Occupation'])}
paths count: 30
first path: User{UserID 919, Age 25, Gender Male, Occupation other} -> usersim -> User{UserID 770, Age 28, Gender Male, Occupation student} -> user_item_4 (rating=4) -> Item{MovieID 288, Title "Scream (1996)", Release Date 20-Dec-1996, Genres Horror, Thriller} -> item_user_1 (rating=1) -> User{UserID 557, Age 30, Gender Female, Occupation writer} -> user_item_4 (rating=4) -> Item{MovieID 750, Title "Amistad (1997)", Release Date 18-Dec-1997, Genres Drama}

--- VALIDATION SAMPLE ROW ---
{'answer': ('float', None),
 'item': ('dict', ['MovieID', 'Title', 'ReleaseDate', 'Genres']),
 'paths': ('list', None),
 'user': ('dict', ['UserID', 'Age', 'Gender', 'Occupation'])}
paths count: 30
first path: User{UserID 758, Age 27, Gender Male, Occupation student} -> usersim -> Use

In [6]:
# Define regex patterns and row-wise transform

USER_LEAD_PATTERN  = re.compile(r'^\s*User\{[^}]*\}')
ITEM_TAIL_PATTERN  = re.compile(r'Item\{[^}]*\}\s*$')

def compress_paths_for_row(example, idx, k=K_PATHS, seed=SEED):
    """
    For a single example (row):
      - Randomly pick k paths (or fewer if not enough).
      - Replace the first leading 'User{...}' with 'user_question'.
      - Replace the trailing 'Item{...}' with 'item_question'.
      - Write back to 'paths'.
    """
    paths = example.get("paths", None)
    if not isinstance(paths, list) or len(paths) == 0:
        # Nothing to do
        return {"paths": paths}

    # Deterministic randomness per row for reproducibility:
    rng = random.Random(seed + idx)
    k_eff = min(k, len(paths))
    picked = rng.sample(paths, k=k_eff)

    processed = []
    for s in picked:
        if not isinstance(s, str):
            s = str(s)

        # Normalize whitespace at ends
        t = s.strip()

        # Replace leading User{...}
        t = USER_LEAD_PATTERN.sub("user_question", t)

        # Replace trailing Item{...}
        t = ITEM_TAIL_PATTERN.sub("item_question", t)

        processed.append(t)

    return {"paths": processed}


In [7]:
# Apply transformation to every split
ds_proc = DatasetDict()
for split in ["train", "validation", "test"]:
    print(f"Processing split: {split}...")
    ds_proc[split] = ds_raw[split].map(
        compress_paths_for_row,
        with_indices=True,
        desc=f"Compressing paths ({split})",
    )

print("Done.")
for name, ds in ds_proc.items():
    # Sanity checks
    row0 = ds[0]
    print(f"\n{name}: rows={len(ds)}")
    if isinstance(row0.get("paths", None), list):
        print("row0 paths count:", len(row0["paths"]))
        print("row0 first path:", row0["paths"][0])


Processing split: train...


Compressing paths (train):   0%|          | 0/80000 [00:00<?, ? examples/s]

Processing split: validation...


Compressing paths (validation):   0%|          | 0/10000 [00:00<?, ? examples/s]

Processing split: test...


Compressing paths (test):   0%|          | 0/10000 [00:00<?, ? examples/s]

Done.

train: rows=80000
row0 paths count: 5
row0 first path: user_question -> user_item_2 (rating=2) -> Item{MovieID 878, Title "That Darn Cat! (1997)", Release Date 14-Feb-1997, Genres Children's, Comedy, Mystery} -> item_user_4 (rating=4) -> User{UserID 810, Age 55, Gender Female, Occupation other} -> usersim -> User{UserID 724, Age 31, Gender Male, Occupation executive} -> user_item_2 (rating=2) -> item_question

validation: rows=10000
row0 paths count: 5
row0 first path: user_question -> user_item_3 (rating=3) -> Item{MovieID 1244, Title "Metro (1997)", Release Date 17-Jan-1997, Genres Action} -> item_user_4 (rating=4) -> User{UserID 332, Age 20, Gender Male, Occupation student} -> user_item_5 (rating=5) -> Item{MovieID 562, Title "Quick and the Dead, The (1995)", Release Date 01-Jan-1995, Genres Action, Adventure, Western} -> itemsim -> item_question

test: rows=10000
row0 paths count: 5
row0 first path: user_question -> usersim -> User{UserID 643, Age 39, Gender Male, Occupation

In [8]:
# Extra validation: ensure every path begins with 'user_question' and ends with 'item_question'
def check_row_paths(paths):
    if not isinstance(paths, list) or len(paths) == 0:
        return True
    ok = True
    for p in paths:
        if not isinstance(p, str):
            ok = False
            break
        if not p.strip().startswith("user_question"):
            ok = False
            break
        if not p.strip().endswith("item_question"):
            ok = False
            break
    return ok

for split in ["train", "validation", "test"]:
    sample_ok = check_row_paths(ds_proc[split][0]["paths"])
    print(f"{split} first-row check:", "OK" if sample_ok else "NOT OK")


train first-row check: OK
validation first-row check: OK
test first-row check: OK


In [9]:
repo_id = f"{hf_username}/{REPO_NAME}"
print("Target repo_id:", repo_id)

create_repo(repo_id, repo_type="dataset", exist_ok=True, private=False)

# Push all splits in one shot
ds_proc.push_to_hub(
    repo_id=repo_id,
    commit_message="Add compressed version: 5 random meta paths with user_question/item_question replacements.",
)

print("Pushed successfully!")
print(f"View on the Hub: https://huggingface.co/datasets/{repo_id}")


Target repo_id: mohammad-shirkhani/social_movielens_compress


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/80 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  29%|##8       | 9.49MB / 33.3MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  63%|######3   | 2.63MB / 4.15MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  89%|########9 | 3.72MB / 4.16MB            

Pushed successfully!
View on the Hub: https://huggingface.co/datasets/mohammad-shirkhani/social_movielens_compress
