In [1]:
# Install required libraries 
!pip -q install -U datasets huggingface_hub

# Imports 
import re
from pprint import pprint

from datasets import load_dataset, DatasetDict
from huggingface_hub import login, whoami, create_repo


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/561.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m307.2/561.5 kB[0m [31m9.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m561.5/561.5 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Authenticate this session against the Hugging Face Hub
from getpass import getpass

# The token is typed at a hidden prompt, so it is never written into the notebook source.
HF_TOKEN = getpass("Paste your Hugging Face token: ")
login(token=HF_TOKEN)

# Ask the Hub who the token belongs to and echo the account as a sanity check.
me = whoami()
account_label = me.get("name") or me.get("displayName")
print("Logged in as:", account_label)


Paste your Hugging Face token: ··········
Logged in as: mohammad-shirkhani


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# Configuration: where the dataset is read from and where the processed copy is pushed
SOURCE_DATASET_ID = "mohammad-shirkhani/social_movielens_new"
TARGET_REPO_ID = "mohammad-shirkhani/social_movielens_new2"

# Echo both ids so the recorded output documents what this run used.
for label, repo_id in (("Source:", SOURCE_DATASET_ID), ("Target repo:", TARGET_REPO_ID)):
    print(label, repo_id)


Source: mohammad-shirkhani/social_movielens_new
Target repo: mohammad-shirkhani/social_movielens_new2


In [4]:
# Fetch every split published under the source dataset id
ds_raw = load_dataset(SOURCE_DATASET_ID)
print("Splits found on source:", list(ds_raw.keys()))

# If the source names its validation split "val", rebuild the dict so that the
# rest of the notebook can rely on the key "validation".
has_short_val_name = "val" in ds_raw and "validation" not in ds_raw
if has_short_val_name:
    renamed_splits = {
        "train": ds_raw["train"],
        "validation": ds_raw["val"],
        "test": ds_raw["test"],
    }
    ds_raw = DatasetDict(renamed_splits)

print("Final splits used:", list(ds_raw.keys()))

# One summary line per split: row count and column names.
for split_name, split_ds in ds_raw.items():
    print(split_name, "rows:", len(split_ds), "columns:", split_ds.column_names)


README.md:   0%|          | 0.00/918 [00:00<?, ?B/s]

data/train-00000-of-00003.parquet:   0%|          | 0.00/65.1M [00:00<?, ?B/s]

data/train-00001-of-00003.parquet:   0%|          | 0.00/65.1M [00:00<?, ?B/s]

data/train-00002-of-00003.parquet:   0%|          | 0.00/65.1M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/24.4M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/24.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/80000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Splits found on source: ['train', 'validation', 'test']
Final splits used: ['train', 'validation', 'test']
train rows: 80000 columns: ['user', 'item', 'answer', 'paths']
validation rows: 10000 columns: ['user', 'item', 'answer', 'paths']
test rows: 10000 columns: ['user', 'item', 'answer', 'paths']


In [5]:
# Inspect the first row of each split to learn the schema
for split in ("train", "validation", "test"):
    print(f"\n--- {split.upper()} SAMPLE ROW ---")
    sample = ds_raw[split][0]

    # Per field: the Python type name, plus the nested keys when the value is a dict.
    field_summary = {}
    for field, value in sample.items():
        nested_keys = list(value.keys()) if isinstance(value, dict) else None
        field_summary[field] = (type(value).__name__, nested_keys)
    pprint(field_summary)

    # Best-effort preview: any failure is reported as a message instead of stopping the cell.
    try:
        row_paths = sample["paths"]
        print("paths count:", len(row_paths))
        print("first path:", row_paths[0])
    except Exception as e:
        print("No 'paths' preview due to:", e)



--- TRAIN SAMPLE ROW ---
{'answer': ('float', None),
 'item': ('dict', ['MovieID', 'Title', 'ReleaseDate', 'Genres']),
 'paths': ('list', None),
 'user': ('dict', ['UserID', 'Age', 'Gender', 'Occupation'])}
paths count: 30
first path: User{UserID 919, Age 25, Gender Male, Occupation other} -> usersim -> User{UserID 770, Age 28, Gender Male, Occupation student} -> user_item_4 (rating=4) -> Item{MovieID 288, Title "Scream (1996)", Release Date 20-Dec-1996, Genres Horror, Thriller} -> item_user_1 (rating=1) -> User{UserID 557, Age 30, Gender Female, Occupation writer} -> user_item_4 (rating=4) -> Item{MovieID 750, Title "Amistad (1997)", Release Date 18-Dec-1997, Genres Drama}

--- VALIDATION SAMPLE ROW ---
{'answer': ('float', None),
 'item': ('dict', ['MovieID', 'Title', 'ReleaseDate', 'Genres']),
 'paths': ('list', None),
 'user': ('dict', ['UserID', 'Age', 'Gender', 'Occupation'])}
paths count: 30
first path: User{UserID 758, Age 27, Gender Male, Occupation student} -> usersim -> Use

In [6]:
# Anchor-rewriting patterns applied to each path string.
# '^' and '$' restrict the match to the User{...} that opens the string and the
# Item{...} that closes it, so User/Item nodes in the middle of a path are kept.

USER_LEAD_PATTERN = re.compile(r'^\s*User\{[^}]*\}')
ITEM_TAIL_PATTERN = re.compile(r'Item\{[^}]*\}\s*$')


def transform_paths(paths_list):
    """Rewrite the two end anchors of every path in ``paths_list``.

    Each element is converted with ``str()``, stripped of surrounding
    whitespace, and then:
      - a leading ``User{...}`` becomes ``'user_question'``
      - a trailing ``Item{...}`` becomes ``'item_question'``

    A path that matches neither pattern is kept (stripped) as it is.

    Returns a new list of strings. If ``paths_list`` is not a list, or is an
    empty list, it is returned unchanged.
    """
    if not (isinstance(paths_list, list) and paths_list):
        return paths_list

    return [
        ITEM_TAIL_PATTERN.sub(
            "item_question",
            USER_LEAD_PATTERN.sub("user_question", str(raw_path).strip()),
        )
        for raw_path in paths_list
    ]


def process_row(example):
    """Build the replacement ``paths`` column for one dataset row.

    Reads ``example["paths"]`` (``None`` when the key is absent) and returns
    ``{"paths": <transformed value>}``.
    """
    return {"paths": transform_paths(example.get("paths"))}


In [7]:
# Run the anchor replacement over every row of every split (all paths are kept)
ds_proc = DatasetDict()
for split in ("train", "validation", "test"):
    print(f"Processing split: {split}...")
    progress_label = f"Replacing user/item anchors in paths ({split})"
    ds_proc[split] = ds_raw[split].map(process_row, desc=progress_label)

print("Done.")

# Show the first processed row of each split as a quick visual confirmation.
for split_name, split_ds in ds_proc.items():
    row0 = split_ds[0]
    print(f"\n{split_name}: rows={len(split_ds)}")
    first_row_paths = row0.get("paths", None)
    if isinstance(first_row_paths, list) and len(first_row_paths) > 0:
        print("row0 paths count:", len(first_row_paths))
        print("row0 first path:", first_row_paths[0])


Processing split: train...


Replacing user/item anchors in paths (train):   0%|          | 0/80000 [00:00<?, ? examples/s]

Processing split: validation...


Replacing user/item anchors in paths (validation):   0%|          | 0/10000 [00:00<?, ? examples/s]

Processing split: test...


Replacing user/item anchors in paths (test):   0%|          | 0/10000 [00:00<?, ? examples/s]

Done.

train: rows=80000
row0 paths count: 30
row0 first path: user_question -> usersim -> User{UserID 770, Age 28, Gender Male, Occupation student} -> user_item_4 (rating=4) -> Item{MovieID 288, Title "Scream (1996)", Release Date 20-Dec-1996, Genres Horror, Thriller} -> item_user_1 (rating=1) -> User{UserID 557, Age 30, Gender Female, Occupation writer} -> user_item_4 (rating=4) -> item_question

validation: rows=10000
row0 paths count: 30
row0 first path: user_question -> usersim -> User{UserID 268, Age 24, Gender Male, Occupation engineer} -> user_item_1 (rating=1) -> Item{MovieID 743, Title "Crow: City of Angels, The (1996)", Release Date 30-Aug-1996, Genres Action, Thriller} -> item_user_4 (rating=4) -> User{UserID 472, Age 24, Gender Male, Occupation student} -> user_item_5 (rating=5) -> item_question

test: rows=10000
row0 paths count: 30
row0 first path: user_question -> usersim -> User{UserID 303, Age 19, Gender Male, Occupation student} -> user_item_3 (rating=3) -> Item{Movi

In [8]:
# Validation helper: a processed path must begin with 'user_question' and finish with 'item_question'
def check_row_paths(paths):
    """Return True when every path in ``paths`` carries both anchors.

    Each path is stripped of surrounding whitespace before it is checked.
    A value that is not a list, or an empty list, has nothing to check and
    is reported as True.
    """
    if not isinstance(paths, list) or len(paths) == 0:
        return True

    stripped_paths = (p.strip() for p in paths)
    return all(
        ps.startswith("user_question") and ps.endswith("item_question")
        for ps in stripped_paths
    )

# Quick spot check on the first row of each split.
for split in ["train", "validation", "test"]:
    ok = check_row_paths(ds_proc[split][0]["paths"])
    print(f"{split} first-row check:", "OK" if ok else "NOT OK")

# The spot check alone cannot notice a malformed path in any other row, so also
# sweep every row of every split and report how many rows fail the anchor check.
for split in ["train", "validation", "test"]:
    bad_rows = sum(1 for row in ds_proc[split] if not check_row_paths(row["paths"]))
    print(
        f"{split} all-rows check:",
        "OK" if bad_rows == 0 else f"NOT OK ({bad_rows} of {len(ds_proc[split])} rows failing)",
    )


train first-row check: OK
validation first-row check: OK
test first-row check: OK


In [9]:
# Create the public target dataset repo; exist_ok=True is passed so a repo that
# already exists is accepted.
create_repo(
    TARGET_REPO_ID,
    repo_type="dataset",
    private=False,
    exist_ok=True,
)

# Upload all processed splits in one push.
commit_note = "Add full-path version: all meta paths with user_question/item_question replacements."
ds_proc.push_to_hub(repo_id=TARGET_REPO_ID, commit_message=commit_note)

print("Pushed successfully!")
print(f"View on the Hub: https://huggingface.co/datasets/{TARGET_REPO_ID}")


Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/40 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   2%|1         | 1.58MB / 84.5MB            

Creating parquet from Arrow format:   0%|          | 0/40 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   1%|          |  525kB / 84.4MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :  10%|9         | 2.10MB / 21.1MB            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        :   0%|          | 30.5kB / 21.1MB            

Pushed successfully!
View on the Hub: https://huggingface.co/datasets/mohammad-shirkhani/social_movielens_new2
