In [1]:
# # Social MovieLens: Build an LLM fine-tuning (FT) dataset from meta-paths
# This notebook:
# - Loads the meta-path JSONL files (one per split)
# - Uses MovieLens-100K metadata (users, items, genres)
# - Converts graph paths to natural-language text
# - Builds a Hugging Face `DatasetDict` with the splits train/validation/test
# - Pushes it to the Hub as `social_movielens`

!pip -q install datasets huggingface_hub tqdm pandas pyarrow


In [2]:
from pathlib import Path

# Google Drive locations (resolved under the /content/drive mount point)
DRIVE_ROOT = Path("/content/drive") / "MyDrive"

# Directory holding the per-split meta-path JSONL files
META_PATHS_DIR = DRIVE_ROOT.joinpath("COMPER_movie_lens_100k_meta_paths")

# Directory holding the index -> entity / type / relation mapping files
MAPPINGS_DIR = DRIVE_ROOT.joinpath("COMPER_movie_lens_100k_ix_mapping")

# Split name -> JSONL file name inside META_PATHS_DIR
JSONL_FILENAMES = dict(
    train="train_movielens100k_meta_paths.jsonl",
    validation="val_movielens100k_meta_paths.jsonl",
    test="test_movielens100k_meta_paths.jsonl",
)

# Upper bound on the number of paths kept per interaction; None keeps them all
MAX_PATHS_PER_INTERACTION = None

# Hugging Face dataset repo name
HF_DATASET_REPO = "social_movielens"


In [3]:
from google.colab import drive
drive.mount('/content/drive')

# Sanity checks: every split file and every mapping file must already be on Drive
for jsonl_name in JSONL_FILENAMES.values():
    jsonl_path = META_PATHS_DIR / jsonl_name
    assert jsonl_path.exists(), f"Missing {jsonl_path}. Please fix the path or copy files to Drive."

for mapping_name in ("ix_to_entity.dict", "ix_to_type.dict", "ix_to_relation.dict"):
    mapping_path = MAPPINGS_DIR / mapping_name
    assert mapping_path.exists(), f"Missing mapping file: {mapping_path}. Copy your mapping dir to Drive as instructed."

print("All required JSONLs and mapping files found ")


Mounted at /content/drive
All required JSONLs and mapping files found 


In [4]:
import urllib.request, zipfile, os, shutil

# Local (Colab VM) locations of the extracted MovieLens-100K folder and its archive
ML_DIR = Path("/content/ml-100k")
ZIP_PATH = Path("/content/ml-100k.zip")

if not ML_DIR.exists():
    if not ZIP_PATH.exists():
        print("Downloading MovieLens-100K …")
        # Download under a temporary name and rename only once the transfer has
        # finished, so an interrupted download never leaves a truncated
        # ZIP_PATH behind that later runs would mistake for a complete archive.
        partial_zip = ZIP_PATH.with_name(ZIP_PATH.name + ".part")
        urllib.request.urlretrieve(
            "https://files.grouplens.org/datasets/movielens/ml-100k.zip",
            partial_zip
        )
        os.replace(partial_zip, ZIP_PATH)
    print("Extracting …")
    with zipfile.ZipFile(ZIP_PATH, 'r') as zf:
        # The archive is expected to contain a top-level "ml-100k/" folder, so
        # extracting next to ML_DIR recreates ML_DIR itself.
        zf.extractall(ML_DIR.parent)

# Confirm required files exist
for name in ["u.user", "u.item", "u.genre"]:
    assert (ML_DIR / name).exists(), f"Missing {ML_DIR/name}"
print("MovieLens metadata ready ")


Downloading MovieLens-100K …
Extracting …
MovieLens metadata ready 


In [5]:
import pandas as pd

# u.genre -> genre|id ; build {id: label}
genre_df = pd.read_csv(
    ML_DIR / "u.genre", sep="|", header=None, names=["genre", "id"]
).dropna()  # discard rows where either field is missing
genre_map = {
    gid: label
    for gid, label in zip(genre_df["id"].astype(int), genre_df["genre"])
}

# u.user -> user_id|age|gender|occupation|zip_code
users_df = pd.read_csv(
    ML_DIR / "u.user",
    sep="|",
    header=None,
    names=["user_id", "age", "gender", "occupation", "zip"],
)
gender_labels = {"M": "Male", "F": "Female"}
users_df["GenderText"] = users_df["gender"].map(gender_labels).fillna("Unknown")

# raw user id -> {UserID, Age, Gender, Occupation}
users_meta = {}
for user in users_df.itertuples(index=False):
    uid = int(user.user_id)
    users_meta[uid] = {
        "UserID": uid,
        "Age": None if pd.isna(user.age) else int(user.age),
        "Gender": user.GenderText,
        "Occupation": "Unknown" if pd.isna(user.occupation) else str(user.occupation),
    }

# u.item -> movie_id|title|release_date|video_release_date|IMDb_URL|[one flag per genre...]
# Missing release dates are turned into "Unknown" when items_meta is built.
item_cols = ["MovieID", "Title", "ReleaseDate", "VideoRelease", "IMDbURL"] + [
    f"g{i}" for i in range(len(genre_map))
]

items_df = pd.read_csv(
    ML_DIR / "u.item",
    sep="|",
    header=None,
    encoding="latin-1",
    names=item_cols,
)

def row_genres_text(row):
    """Return the comma-separated genre labels whose flag is set on `row`.

    `row` must offer a mapping-style `.get`; the flag for genre id `i` is read
    from key `g{i}` (a missing key counts as 0). When no flag equals 1 the
    result is the literal "unknown".
    """
    labels = [
        genre_map[gid]
        for gid in range(len(genre_map))
        if int(row.get(f"g{gid}", 0)) == 1
    ]
    return ", ".join(labels or ["unknown"])

# raw movie id -> {MovieID, Title, ReleaseDate, Genres}; missing titles and
# release dates fall back to "Movie <id>" and "Unknown" respectively.
items_meta = {
    int(rec.MovieID): {
        "MovieID": int(rec.MovieID),
        "Title": f"Movie {int(rec.MovieID)}" if pd.isna(rec.Title) else str(rec.Title),
        "ReleaseDate": "Unknown" if pd.isna(rec.ReleaseDate) else str(rec.ReleaseDate),
        "Genres": row_genres_text(rec),
    }
    for _, rec in items_df.iterrows()
}

print(f"Users loaded: {len(users_meta)}; Items loaded: {len(items_meta)}; Genres: {len(genre_map)} ")


Users loaded: 943; Items loaded: 1682; Genres: 19 


In [6]:
import pickle

def _load_mapping(filename):
    """Unpickle and return one mapping file stored under MAPPINGS_DIR."""
    # NOTE: pickle can execute arbitrary code on load; only use trusted files.
    with open(MAPPINGS_DIR / filename, "rb") as handle:
        return pickle.load(handle)


ix_to_entity = _load_mapping("ix_to_entity.dict")  # maps entity_ix -> (raw_id, type_int)
ix_to_type = _load_mapping("ix_to_type.dict")
ix_to_relation = _load_mapping("ix_to_relation.dict")

# Peek at the loaded mappings: smallest entity key, all types, first six relations
example_key = sorted(ix_to_entity.keys())[0]
print("Example ix_to_entity:", example_key, "->", ix_to_entity[example_key])
print("Types:", ix_to_type)
relation_sample = {}
for rel_key in sorted(ix_to_relation)[:6]:
    relation_sample[rel_key] = ix_to_relation[rel_key]
print("Relations (sample):", relation_sample)


Example ix_to_entity: 0 -> (np.int64(1), 0)
Types: {0: 'user', 1: 'item', 2: '#PAD_TOKEN'}
Relations (sample): {0: 'user_item_1', 1: 'user_item_2', 2: 'user_item_3', 3: 'user_item_4', 4: 'user_item_5', 5: 'item_user_1'}


In [7]:
# Normalize relation labels to exactly the tokens:
# - 'usersim' for user_sim
# - 'itemsim' for item_sim
# - For user_item_k / item_user_k, keep their names and also attach (rating=k)
# - If an edge name is unknown or empty, use 'link'

def relation_to_text(rel_ix: int) -> str:
    """Translate a relation index into the token used in rendered paths.

    Looks `rel_ix` up in `ix_to_relation` and returns:
      * "link" for a missing/empty name or the "#END_RELATION" / "#PAD_TOKEN" markers,
      * "usersim" / "itemsim" for "user_sim" / "item_sim",
      * "<prefix><k> (rating=<k>)" for names starting with "user_item_" or
        "item_user_" whose last "_"-separated piece parses as an integer k,
      * the name unchanged in every other case.
    """
    label = ix_to_relation.get(rel_ix)
    if not label or label in ("#END_RELATION", "#PAD_TOKEN"):
        return "link"

    for raw, token in (("user_sim", "usersim"), ("item_sim", "itemsim")):
        if label == raw:
            return token

    for prefix in ("user_item_", "item_user_"):
        if label.startswith(prefix):
            try:
                rating = int(label.rsplit("_", 1)[-1])
            except Exception:
                # Suffix is not a number: keep the relation name untouched.
                return label
            return f"{prefix}{rating} (rating={rating})"

    return label


In [12]:
# Invert ix_to_type so type names can be resolved to their integer ids
type_name_to_id = {name: ix for ix, name in ix_to_type.items()}

# Fall back to 0 / 1 when the mapping has no 'user' / 'item' entry
USER_TYPE_ID = type_name_to_id.get("user", 0)
ITEM_TYPE_ID = type_name_to_id.get("item", 1)
# None when the mapping defines no padding type
PAD_TYPE_ID = type_name_to_id.get("#PAD_TOKEN")

# Memo of rendered entity strings, to speed up repeated path rendering
_entity_text_cache = dict()

def describe_user(raw_user_id: int) -> str:
    """Render a user as `User{...}` text.

    Uses the `users_meta` entry for `raw_user_id`; when there is no (or an
    empty) entry only the id is rendered.
    """
    record = users_meta.get(raw_user_id)
    if record:
        fields = ", ".join(
            f"{key} {record[key]}"
            for key in ("UserID", "Age", "Gender", "Occupation")
        )
        return "User{" + fields + "}"
    return f"User{{UserID {raw_user_id}}}"

def describe_item(raw_item_id: int) -> str:
    """Render a movie as `Item{...}` text.

    Uses the `items_meta` entry for `raw_item_id`; when there is no (or an
    empty) entry only the id is rendered.
    """
    record = items_meta.get(raw_item_id)
    if record:
        template = 'Item{{MovieID {0}, Title "{1}", Release Date {2}, Genres {3}}}'
        return template.format(
            record["MovieID"],
            record["Title"],
            record["ReleaseDate"],
            record["Genres"],
        )
    return f"Item{{MovieID {raw_item_id}}}"

def entity_ix_to_text(entity_ix: int, type_ix: int) -> str:
    """Render one graph node as text, memoising the result.

    Args:
        entity_ix: Key into `ix_to_entity`.
        type_ix: Node type stored next to the entity in the path. It only
            participates in the cache key; rendering uses the type recorded
            in `ix_to_entity`.

    Returns:
        The `describe_user` / `describe_item` text for user / item nodes,
        "Entity{ID <raw_id>}" for any other recorded type, or "" when the
        index is absent from `ix_to_entity` or maps to a bare string
        (presumably a padding token -- confirm against the mapping builder).
    """
    key = (entity_ix, type_ix)
    if key in _entity_text_cache:
        return _entity_text_cache[key]

    val = ix_to_entity.get(entity_ix)

    # Unknown index or string sentinel: there is nothing to render. Callers
    # interpret "" as "drop this path", so a missing index must not raise
    # (previously `None` reached the tuple unpacking below and crashed).
    if val is None or isinstance(val, str):
        _entity_text_cache[key] = ""
        return ""

    raw_id, recorded_type = val  # (raw_id, type_ix_from_build)

    if recorded_type == USER_TYPE_ID:
        s = describe_user(int(raw_id))
    elif recorded_type == ITEM_TYPE_ID:
        s = describe_item(int(raw_id))
    else:
        # Unknown type
        s = f"Entity{{ID {raw_id}}}"

    _entity_text_cache[key] = s
    return s



In [13]:
def path_triplets_to_text(triplets, true_len: int) -> str:
    """Render one meta-path as "node -> relation -> node -> ..." text.

    Args:
        triplets: list of [entity_ix, type_ix, relation_to_next_ix], padded
            to a maximum length.
        true_len: number of leading triplets that form the real path.

    Returns:
        The rendered path, or "" when `true_len` is not positive, when any
        node inside the true length has the PAD type, or when a node renders
        to an empty string.
    """
    hops = int(true_len)
    if hops <= 0:
        return ""

    pad_known = PAD_TYPE_ID is not None

    # A path whose very first node is padding is dropped outright
    head_type = int(triplets[0][1])
    if pad_known and head_type == PAD_TYPE_ID:
        return ""

    segments = []
    final = hops - 1
    for pos in range(hops):
        node_ix, node_type, edge_ix = triplets[pos]

        # Padding inside the true length invalidates the whole path
        if pad_known and int(node_type) == PAD_TYPE_ID:
            return ""

        label = entity_ix_to_text(int(node_ix), int(node_type))
        if not label:
            # Empty means PAD or unknown; drop the path
            return ""

        if pos == final:
            segments.append(label)
        else:
            # Every node but the last is followed by its outgoing relation
            segments.append(f"{label} -> {relation_to_text(int(edge_ix))} -> ")

    return "".join(segments)



In [14]:
import json

def build_record(line: str):
    """Turn one JSONL line into a dataset record.

    Expected JSON object:
      {
        'user_idx': int,
        'item_idx': int,
        'rating': float,
        'paths': [{'triplets': [[e,t,r],...], 'len': int}, ...]   (optional)
      }

    Returned dict:
      user:   {UserID, Age, Gender, Occupation}
      item:   {MovieID, Title, ReleaseDate, Genres}
      answer: the rating as a float
      paths:  list of non-empty path strings in natural text
    """
    payload = json.loads(line)
    user_idx = int(payload["user_idx"])
    item_idx = int(payload["item_idx"])
    answer = float(payload["rating"])

    # Entity indices -> raw MovieLens ids (the recorded types are not needed here)
    user_raw, _user_type = ix_to_entity[user_idx]
    item_raw, _item_type = ix_to_entity[item_idx]
    user_raw = int(user_raw)
    item_raw = int(item_raw)

    # Metadata structs, with placeholder structs for ids missing from the tables
    if user_raw in users_meta:
        user_struct = users_meta[user_raw]
    else:
        user_struct = {
            "UserID": user_raw, "Age": None, "Gender": "Unknown", "Occupation": "Unknown"
        }

    if item_raw in items_meta:
        item_struct = items_meta[item_raw]
    else:
        item_struct = {
            "MovieID": item_raw, "Title": f"Movie {item_raw}", "ReleaseDate": "Unknown", "Genres": "unknown"
        }

    # Optionally cap the number of paths, then render and keep the non-empty ones
    candidates = payload.get("paths", [])
    if MAX_PATHS_PER_INTERACTION is not None:
        candidates = candidates[:MAX_PATHS_PER_INTERACTION]

    rendered = []
    for candidate in candidates:
        text = path_triplets_to_text(candidate["triplets"], candidate["len"])
        if text:
            rendered.append(text)

    return {
        "user": user_struct,
        "item": item_struct,
        "answer": answer,
        "paths": rendered
    }


In [15]:
from datasets import Dataset, DatasetDict, Features, Value, Sequence, Array2D, Array3D, ClassLabel, load_dataset

# Exact dataset schema: two metadata structs, the rating, and the path strings
user_schema = {
    "UserID": Value("int32"),
    "Age": Value("int32"),
    "Gender": Value("string"),
    "Occupation": Value("string"),
}
item_schema = {
    "MovieID": Value("int32"),
    "Title": Value("string"),
    "ReleaseDate": Value("string"),
    "Genres": Value("string"),
}
features = {
    "user": user_schema,
    "item": item_schema,
    "answer": Value("float32"),
    "paths": Sequence(Value("string")),
}

hf_features = Features(features)

def gen_split(path: Path):
    """Yield one `build_record` result per non-blank line of a JSONL file.

    Args:
        path: Location of the JSONL file; must provide `.open()`.

    Yields:
        The dict produced by `build_record` for each line.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) are not JSON documents
            # and would make json.loads fail, so skip them.
            if not line.strip():
                continue
            yield build_record(line)

# Materialise one Dataset per split from its JSONL file
splits = {}
for split_name, jsonl_name in JSONL_FILENAMES.items():
    jsonl_path = META_PATHS_DIR / jsonl_name
    # The default argument freezes the current path inside the lambda, so each
    # generator reads its own file.
    splits[split_name] = Dataset.from_generator(
        lambda p=jsonl_path: gen_split(p),
        features=hf_features
    )
    print(split_name, splits[split_name])

dataset = DatasetDict(splits)
dataset


Generating train split: 0 examples [00:00, ? examples/s]

train Dataset({
    features: ['user', 'item', 'answer', 'paths'],
    num_rows: 80000
})


Generating train split: 0 examples [00:00, ? examples/s]

validation Dataset({
    features: ['user', 'item', 'answer', 'paths'],
    num_rows: 10000
})


Generating train split: 0 examples [00:00, ? examples/s]

test Dataset({
    features: ['user', 'item', 'answer', 'paths'],
    num_rows: 10000
})


DatasetDict({
    train: Dataset({
        features: ['user', 'item', 'answer', 'paths'],
        num_rows: 80000
    })
    validation: Dataset({
        features: ['user', 'item', 'answer', 'paths'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['user', 'item', 'answer', 'paths'],
        num_rows: 10000
    })
})

In [16]:
import random

# Fixed seed so the preview shows the same rows after Restart & Run All
PREVIEW_SEED = 42
preview_rng = random.Random(PREVIEW_SEED)

for split in ["train", "validation", "test"]:
    ds = dataset[split]
    if len(ds) == 0:
        # randrange(0) would raise; report the empty split instead
        print(f"\n=== {split.upper()} is empty ===")
        continue
    idx = preview_rng.randrange(len(ds))
    row = ds[idx]
    print(f"\n=== {split.upper()} SAMPLE @ {idx} ===")
    print("user :", row["user"])
    print("item :", row["item"])
    print("answer :", row["answer"])
    print("paths[0] :", row["paths"][0] if row["paths"] else "(no path)")



=== TRAIN SAMPLE @ 10391 ===
user : {'UserID': 125, 'Age': 30, 'Gender': 'Male', 'Occupation': 'lawyer'}
item : {'MovieID': 585, 'Title': 'Son in Law (1993)', 'ReleaseDate': '01-Jan-1993', 'Genres': 'Comedy'}
answer : 4.0
paths[0] : User{UserID 125, Age 30, Gender Male, Occupation lawyer} -> user_item_2 (rating=2) -> Item{MovieID 383, Title "Flintstones, The (1994)", Release Date 01-Jan-1994, Genres Children's, Comedy} -> item_user_1 (rating=1) -> User{UserID 336, Age 23, Gender Male, Occupation salesman} -> user_item_1 (rating=1) -> Item{MovieID 577, Title "Coneheads (1993)", Release Date 01-Jan-1993, Genres Comedy, Sci-Fi} -> itemsim -> Item{MovieID 585, Title "Son in Law (1993)", Release Date 01-Jan-1993, Genres Comedy}

=== VALIDATION SAMPLE @ 626 ===
user : {'UserID': 60, 'Age': 50, 'Gender': 'Male', 'Occupation': 'healthcare'}
item : {'MovieID': 525, 'Title': 'Big Sleep, The (1946)', 'ReleaseDate': '01-Jan-1946', 'Genres': 'Film-Noir, Mystery'}
answer : 5.0
paths[0] : User{UserI

In [17]:
from huggingface_hub import notebook_login
# Authenticate with the Hugging Face Hub; in a notebook this renders a login widget.
# Presumably required before the dataset can be pushed -- confirm for your setup.
notebook_login()  


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
from huggingface_hub import whoami

# Upload every split to the Hub as a public dataset repo
dataset.push_to_hub(HF_DATASET_REPO, private=False)

# Report the real repo URL: a bare repo name lives under the logged-in user's
# namespace, so resolve it instead of printing a "<your-username>" placeholder.
if "/" in HF_DATASET_REPO:
    pushed_repo_id = HF_DATASET_REPO
else:
    pushed_repo_id = f"{whoami()['name']}/{HF_DATASET_REPO}"
print(f"Pushed to https://huggingface.co/datasets/{pushed_repo_id}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Uploading the dataset shards:   0%|          | 0/3 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/27 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Pushed to https://huggingface.co/datasets/<your-username>/social_movielens
