In [None]:
# Install required libraries
!pip -q install --upgrade datasets huggingface_hub

In [None]:
# Imports 
import os
import random
from collections import Counter

from datasets import load_dataset, DatasetDict, concatenate_datasets, Value
from huggingface_hub import create_repo, notebook_login

# Opt in to accelerated Hub transfers (faster uploads when possible).
# NOTE(review): this assignment runs after `datasets` has already been imported;
# if huggingface_hub reads the flag at import time it arrives too late to matter.
# Also, the install cell only installs `datasets` and `huggingface_hub` -- confirm
# whether the `hf_transfer` package is expected to be present.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# One seed shared by every random step in this notebook.
SEED = 42
random.seed(SEED)

In [None]:
# Source dataset on the Hugging Face Hub; only its "train" split is loaded here.
src_name = "mohammad-shirkhani/social_movielens_new2"
train = load_dataset(src_name, split="train")

# Show the dataset summary and its column names.
print(train)
print(f"Columns: {train.column_names}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/909 [00:00<?, ?B/s]

data/train-00000-of-00002.parquet:   0%|          | 0.00/84.5M [00:00<?, ?B/s]

data/train-00001-of-00002.parquet:   0%|          | 0.00/84.4M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/21.1M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/21.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/80000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset({
    features: ['user', 'item', 'answer', 'paths'],
    num_rows: 80000
})
Columns: ['user', 'item', 'answer', 'paths']


In [None]:
# Sanity checks on the freshly loaded train split.

# 1) Every column used by the later cells must be present.
required_cols = {"user", "item", "answer", "paths"}
missing = required_cols - set(train.column_names)
assert not missing, f"Missing columns: {missing}"

# 2) Preview the first row; only the first two paths are printed.
ex0 = train[0]
print("Sample row 0 (truncated paths to 2):")
for field in ("user", "item", "answer"):
    print(f"{field}:", ex0[field])
print("paths (len/raw/truncated):", len(ex0["paths"]), ex0["paths"][:2])

Sample row 0 (truncated paths to 2):
user: {'UserID': 919, 'Age': 25, 'Gender': 'Male', 'Occupation': 'other'}
item: {'MovieID': 750, 'Title': 'Amistad (1997)', 'ReleaseDate': '18-Dec-1997', 'Genres': 'Drama'}
answer: 3.0
paths (len/raw/truncated): 30 ['user_question -> usersim -> User{UserID 770, Age 28, Gender Male, Occupation student} -> user_item_4 (rating=4) -> Item{MovieID 288, Title "Scream (1996)", Release Date 20-Dec-1996, Genres Horror, Thriller} -> item_user_1 (rating=1) -> User{UserID 557, Age 30, Gender Female, Occupation writer} -> user_item_4 (rating=4) -> item_question', 'user_question -> user_item_5 (rating=5) -> Item{MovieID 15, Title "Mr. Holland\'s Opus (1995)", Release Date 29-Jan-1996, Genres Drama} -> itemsim -> Item{MovieID 237, Title "Jerry Maguire (1996)", Release Date 13-Dec-1996, Genres Drama, Romance} -> item_user_1 (rating=1) -> User{UserID 639, Age 42, Gender Female, Occupation librarian} -> user_item_2 (rating=2) -> item_question']


In [None]:
# Randomly keep at most 20 paths per row (rows with fewer paths keep all of them).

def select_paths_per_row(example, idx, k=20):
    """Return a seeded random selection of at most ``k`` of the row's paths.

    Args:
        example: Row mapping; its "paths" entry is read. A missing, ``None`` or
            empty entry is treated as an empty list.
        idx: Row index, added to ``SEED`` so every row gets its own
            reproducible random generator.
        k: Maximum number of paths to keep.

    Returns:
        A dict with the single key "paths". When the row has more than ``k``
        paths, ``k`` of them are sampled; otherwise a shuffled copy of all of
        them is returned. The input list is not modified.
    """
    candidates = example.get("paths") or []
    row_rng = random.Random(SEED + idx)

    # More paths than requested: draw k of them without replacement.
    if len(candidates) > k:
        return {"paths": row_rng.sample(candidates, k)}

    # Otherwise keep everything, shuffling a copy so the input stays untouched.
    shuffled = candidates.copy()
    row_rng.shuffle(shuffled)
    return {"paths": shuffled}

# Apply the per-row sampler; `with_indices=True` passes each row's index as the
# second argument so the sampler can derive a per-row seed.
train_paths20 = train.map(
    select_paths_per_row,
    desc="Sampling 20 paths per row",
    with_indices=True,
)

# Sanity check: smallest and largest number of paths kept in any row.
lens = list(map(len, train_paths20["paths"]))
print("Min/Max path lengths after sampling:", min(lens), max(lens))

Sampling 20 paths per row:   0%|          | 0/80000 [00:00<?, ? examples/s]

Min/Max path lengths after sampling: 1 20


In [None]:
def to_int_answer(example):
    """Return ``{"answer": <int>}`` with the row's answer rounded to an integer.

    The value is first converted to ``float`` (so numeric strings are accepted)
    and then rounded with Python's built-in ``round``.
    """
    rating = float(example["answer"])
    return {"answer": int(round(rating))}

# Cast the 'answer' column to an integer type.
# Without an explicit schema the mapped column kept its float type (the recorded
# run printed 1.0 ... 5.0 after this step), so the target features are passed
# to `map` with 'answer' declared as int64.
answer_int_features = train_paths20.features.copy()
answer_int_features["answer"] = Value("int64")

train_int = train_paths20.map(
    to_int_answer,
    features=answer_int_features,
    desc="Casting 'answer' to int",
)
assert train_int.features["answer"].dtype == "int64", "'answer' column was not cast to int64"

# Every rating must be one of the five expected values.
distinct_vals = sorted(set(train_int["answer"]))
print("Distinct answer values:", distinct_vals)
assert set(distinct_vals).issubset({1,2,3,4,5}), "Unexpected 'answer' values found!"


Casting 'answer' to int:   0%|          | 0/80000 [00:00<?, ? examples/s]

Distinct answer values: [1.0, 2.0, 3.0, 4.0, 5.0]


In [None]:
# Number of training rows to keep for each answer value (1-5).
desired = {
    1: 3500,
    2: 3600,
    3: 4200,
    4: 4500,
    5: 4200,
}
# Guard against typos in the table above: the targets must add up to 20,000 rows.
assert sum(desired.values()) == 20000, "Desired counts must sum to 20,000."

# Helper
def keep_label(example, label):
    """Filter predicate: True when the row's "answer" equals ``label``.

    Both sides are converted with ``int`` before comparing, so a float answer
    such as 3.0 matches the label 3.
    """
    row_value = int(example["answer"])
    return row_value == int(label)

# Draw a fixed-size, seeded random subset for every answer value.
label_datasets = []
for label, n in desired.items():
    ds_label = train_int.filter(keep_label, fn_kwargs={"label": label}, desc=f"Filtering label={label}")
    print(f"Label={label} -> available rows: {ds_label.num_rows}")
    assert ds_label.num_rows >= n, f"Not enough rows for label={label} (need {n}, have {ds_label.num_rows})"
    # The seed is offset by the label so each subset uses its own shuffle order.
    ds_label = ds_label.shuffle(seed=SEED + label)
    ds_label = ds_label.select(range(n))
    label_datasets.append(ds_label)

# Merge the per-label subsets and shuffle so the answer values are interleaved.
train_20k = concatenate_datasets(label_datasets).shuffle(seed=SEED)
print("Final train size:", train_20k.num_rows)
assert train_20k.num_rows == sum(desired.values()), "Final train size does not match the desired total."

# Verify the distribution: print it, and fail loudly if it differs from the target.
cnt = Counter(train_20k["answer"])
print("Final counts per answer:", dict(sorted(cnt.items())))
assert dict(cnt) == desired, f"Final counts {dict(cnt)} do not match desired {desired}"


Filtering label=1:   0%|          | 0/80000 [00:00<?, ? examples/s]

Label=1 -> available rows: 4875


Filtering label=2:   0%|          | 0/80000 [00:00<?, ? examples/s]

Label=2 -> available rows: 9137


Filtering label=3:   0%|          | 0/80000 [00:00<?, ? examples/s]

Label=3 -> available rows: 21603


Filtering label=4:   0%|          | 0/80000 [00:00<?, ? examples/s]

Label=4 -> available rows: 27382


Filtering label=5:   0%|          | 0/80000 [00:00<?, ? examples/s]

Label=5 -> available rows: 17003
Final train size: 20000
Final counts per answer: {1.0: 3500, 2.0: 3600, 3.0: 4200, 4.0: 4500, 5.0: 4200}


In [None]:
# Check the number of paths per row in the final train set and show a small sample.
# No row may hold more than 20 paths; rows can hold fewer (the recorded run
# reported a minimum of 1), so the number of such rows is reported as well.

path_lengths = [len(p) for p in train_20k["paths"]]
print("Min/Max path length in final train:", min(path_lengths), max(path_lengths))
assert max(path_lengths) <= 20, "A row holds more than 20 paths."

short_rows = sum(1 for n_paths in path_lengths if n_paths < 20)
print(f"Rows with fewer than 20 paths: {short_rows} of {len(path_lengths)}")

# Print first 3 rows (fewer if the dataset is smaller); each row is fetched once.
for i in range(min(3, train_20k.num_rows)):
    row = train_20k[i]
    print(f"Row {i}: answer={row['answer']}, paths_len={len(row['paths'])}")


Min/Max path length in final train: 1 20
Row 0: answer=4.0, paths_len=20
Row 1: answer=5.0, paths_len=20
Row 2: answer=3.0, paths_len=20


In [None]:
# Wrap the sampled rows in a DatasetDict whose only split is "train".
final_ds = DatasetDict()
final_ds["train"] = train_20k
print(final_ds)


DatasetDict({
    train: Dataset({
        features: ['user', 'item', 'answer', 'paths'],
        num_rows: 20000
    })
})


In [None]:
# Authenticate with the Hugging Face Hub before creating the repo and uploading.
# `notebook_login` is imported with the other dependencies in the import cell at the top.
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload target and commit metadata.
HF_USERNAME   = "mohammad-shirkhani"
NEW_REPO_NAME = "social_movielens_custom"
PRIVATE       = False
# Rows hold at most 20 paths (some hold fewer), so the message says "up to 20".
COMMIT_MSG    = "Initial upload: 20k train rows, up to 20 paths each, stratified by answer."

# `create_repo` is imported with the other dependencies in the import cell at the top.
repo_id = f"{HF_USERNAME}/{NEW_REPO_NAME}"
# exist_ok=True lets this cell be re-run when the repo already exists.
create_repo(repo_id, repo_type="dataset", private=PRIVATE, exist_ok=True)
print("Target dataset repo:", repo_id, "| private:", PRIVATE)


Target dataset repo: mohammad-shirkhani/social_movielens_custom | private: False


In [None]:
# Push only the 'train' split (the sole split held by `final_ds`).
push_options = {
    "commit_message": COMMIT_MSG,
    "max_shard_size": "200MB",
    "private": PRIVATE,
}
final_ds.push_to_hub(repo_id, **push_options)

print(f"✅ Pushed successfully to: https://huggingface.co/datasets/{repo_id}")


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        : 100%|##########| 29.2MB / 29.2MB            

✅ Pushed successfully to: https://huggingface.co/datasets/mohammad-shirkhani/social_movielens_custom
