In [25]:
import io
import json

import numpy as np
import sagemaker
import boto3
from torch.backends.opt_einsum import strategy

# 2) Execution role and artifact bucket. The literals below are the defaults;
#    set SAGEMAKER_ROLE_ARN / SAGEMAKER_BUCKET to override them without
#    editing the notebook.
import os

role = os.environ.get("SAGEMAKER_ROLE_ARN", "arn:aws:iam::371087393859:role/defaultrole")
bucket = os.environ.get("SAGEMAKER_BUCKET", "ir-sagemaker")

# boto3 session bound to a named local profile and a fixed region.
session = boto3.Session(profile_name="lprofile", region_name="us-east-1")

# SageMaker session built on that boto3 session, with `bucket` as its default bucket.
sm_session = sagemaker.Session(boto_session=session, default_bucket=bucket)

INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials


In [26]:

# Echo the resolved role, bucket and region so a misconfiguration is visible
# before any job is launched.
print("sagemaker role arn:", role)
print("sagemaker bucket:", sm_session.default_bucket())
print("sagemaker session region:", sm_session.boto_region_name)


sagemaker role arn: arn:aws:iam::371087393859:role/defaultrole
sagemaker bucket: ir-sagemaker
sagemaker session region: us-east-1


In [27]:
import sys

# Show the interpreter version the notebook kernel is running.
kernel_python_version = sys.version
print(kernel_python_version)

3.11.13 (main, Jun  5 2025, 13:12:00) [GCC 11.2.0]


In [28]:
from sagemaker.s3 import S3Uploader
# Resolve the bucket from the SageMaker session and derive the dataset URIs.
bucket = sm_session.default_bucket()
prefix = "modernbert"

# These URIs let the notebook skip the upload cell when the data is already
# in S3, so they must match the upload destinations exactly.
train_uri = f"s3://{bucket}/{prefix}/train/train.jsonl"
val_uri   = f"s3://{bucket}/{prefix}/val/val.jsonl"
test_uri  = f"s3://{bucket}/{prefix}/test/test.jsonl"
# The target set is uploaded to, and later read from, "target_set/" (not "target/").
target_set_uri  = f"s3://{bucket}/{prefix}/target_set/target_set.jsonl"

In [5]:
def _upload(local_file, s3_folder):
    """Upload one local file beneath s3://{bucket}/{prefix}/{s3_folder} and return S3Uploader.upload's result."""
    destination = f"s3://{bucket}/{prefix}/{s3_folder}"
    return S3Uploader.upload(local_file, destination)


# Push every dataset split to S3, in the same order as before.
train_uri = _upload("modernbert/data/train/train.jsonl", "train")
val_uri = _upload("modernbert/data/val/val.jsonl", "val")
test_uri = _upload("modernbert/data/test/test.jsonl", "test")
target_set_uri = _upload("modernbert/data/target/target_set.jsonl", "target_set")

In [18]:
from sagemaker.huggingface import HuggingFace

# Regexes used to scrape metrics from the training log. Each pattern targets a
# "'name': value" pair; the OUTER capture group is the metric value.
#
# The value pattern accepts plain decimals and scientific notation. The earlier
# "(.|e\-)" form used an unescaped "." and stopped before the exponent, so a
# logged learning rate such as 1.72e-05 was recorded as 1.72.
_METRIC_VALUE = r"(-?[0-9]+\.?[0-9]*([eE][-+]?[0-9]+)?)"

_METRIC_NAMES = [
    "loss",
    "learning_rate",
    "eval_loss",
    "eval_accuracy",
    "eval_f1",
    "eval_precision",
    "eval_recall",
    "eval_runtime",
    "eval_samples_per_second",
    "epoch",
]

metric_definitions = [
    {"Name": name, "Regex": f"'{name}': {_METRIC_VALUE},?"}
    for name in _METRIC_NAMES
]

# Hyperparameters handed to the estimator (see `hyperparameters=hyper`).
# NOTE(review): max_steps=2 looks like a smoke-test setting — confirm before a full run.
hyper = dict(
    learning_rate=3e-5,
    num_train_epochs=1,
    max_steps=2,
    temperature=0.05,
    deepspeed="ds_zero3.json",
)

# Hugging Face training job definition: runs train_sm.py from the local
# "modernbert" directory on a single ml.g5.12xlarge with MPI enabled.
est = HuggingFace(
    entry_point="train_sm.py",
    source_dir="modernbert",
    role=role,
    instance_type="ml.g5.12xlarge",
    instance_count=1,
    distribution={"mpi": {"enabled": True}},
    transformers_version="4.49.0", pytorch_version="2.5.1", py_version="py311",
    hyperparameters=hyper,
    metric_definitions=metric_definitions,
    environment={
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
        "NCCL_DEBUG": "INFO"
    },
    output_path=f"s3://{bucket}/{prefix}/outputs",
    # Use the session configured at the top of the notebook (profile, region,
    # default bucket) instead of an implicitly created default session.
    sagemaker_session=sm_session,
)

In [19]:
# Start training with the three dataset URIs passed as named inputs.
input_channels = {"train": train_uri, "val": val_uri, "test": test_uri}
est.fit(input_channels)

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-pytorch-training-2025-08-13-17-23-57-467


2025-08-13 17:23:59 Starting - Starting the training job
2025-08-13 17:23:59 Pending - Training job waiting for capacity......
2025-08-13 17:24:45 Pending - Preparing the instances for training...
2025-08-13 17:25:16 Downloading - Downloading the training image...........................
2025-08-13 17:29:49 Training - Training image download completed. Training in progress....[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34mCUDA compat package should be installed for NVIDIA driver smaller than 550.163.01[0m
[34mCurrent installed NVIDIA driver version is 570.172.08[0m
[34mSkipping CUDA compat setup as newer NVIDIA driver is installed[0m
[34m2025-08-13 17:30:30,042 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2025-08-13 17:30:30,080 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-08-13 17

In [46]:
from sagemaker.analytics import TrainingJobAnalytics

# Load the metrics recorded for one specific, named training job into a DataFrame.
analytics_job_name = 'huggingface-pytorch-training-2025-07-30-03-59-45-613'
job_analytics = TrainingJobAnalytics(training_job_name=analytics_job_name)
df = job_analytics.dataframe()
print(df)




    timestamp              metric_name      value
0         0.0                     loss   3.360500
1       180.0                     loss   2.731600
2         0.0            learning_rate   1.721739
3       180.0            learning_rate   4.173913
4         0.0            eval_accuracy   0.031250
5         0.0                  eval_f1   0.019156
6         0.0           eval_precision   0.022735
7         0.0              eval_recall   0.031250
8         0.0             eval_runtime  21.798900
9         0.0  eval_samples_per_second  22.157000
10        0.0                    epoch   0.430000
11      180.0                    epoch   0.870000
12      240.0                    epoch   1.000000


#### Model Evaluation

In [13]:
from sagemaker.huggingface import HuggingFaceModel
from sklearn.metrics import f1_score
import numpy as np
import tqdm
import faiss
import uuid
import sagemaker
import torch

# Report the versions of the libraries used by the evaluation cells.
for lib_label, lib_module in (("numpy", np), ("tqdm", tqdm), ("torch", torch), ("faiss", faiss)):
    print(f"{lib_label} version:", lib_module.__version__)

numpy version: 1.26.4
tqdm version: 4.67.1
torch version: 2.8.0+cu128
faiss version: 1.9.0


In [14]:
# Build the batch-transform input: one JSON line per sentence of the target set.
s3 = boto3.client("s3")
model_data = f"s3://{bucket}/{prefix}/outputs/huggingface-pytorch-training-2025-08-13-17-23-57-467/output/model.tar.gz"
run_id = uuid.uuid4().hex  # unique id so repeated runs do not overwrite each other
input_key = f"{prefix}/target_set/target_set.jsonl"
output_key = f"{prefix}/batch-output/{run_id}.jsonl"

body = s3.get_object(Bucket=bucket, Key=input_key)["Body"].read().decode("utf-8")

out_buf = io.StringIO()
# Split on "\n" only. str.splitlines() also breaks on U+2028, U+2029, U+0085
# and similar characters, which can appear unescaped inside a JSON string and
# would cut a record in half.
for line in body.split("\n"):
    if not line.strip():
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
    rec = json.loads(line)
    doc_id = rec["doc_id"]
    # Each element of "case_text" becomes its own record; sent_id is its position.
    for i, sent in enumerate(rec["case_text"]):
        out_buf.write(json.dumps({"doc_id": doc_id, "sent_id": i, "inputs": sent}) + "\n")

s3.put_object(Bucket=bucket, Key=output_key, Body=out_buf.getvalue().encode("utf-8"))
sentences_set_uri = f"s3://{bucket}/{output_key}"

In [None]:
# Embed every sentence with a batch transform job and collect the vectors.
output_prefix = f"{prefix}/embedded-output/{run_id}"

# create model class
huggingface_model = HuggingFaceModel(
    model_data=model_data,
    role=role,
    transformers_version="4.49.0",
    pytorch_version="2.6.0",
    py_version="py312",
    entry_point="inference.py",
    source_dir="code",
    env={
        "TOWER": "suffix",
        "POOLING": "mean",
        "NORMALIZE": "true",
        "MAX_LEN": "512",
    },
    # Use the session configured at the top of the notebook instead of an
    # implicitly created default session.
    sagemaker_session=sm_session,
)

transformer = huggingface_model.transformer(
    instance_count=1,
    instance_type="ml.g5.12xlarge",
    output_path=f"s3://{bucket}/{output_prefix}",
    assemble_with="Line",
    accept="application/jsonlines",
    env={'INFERENCE_PREFERRED_MODE': 'embedding'},
    strategy="SingleRecord",
)

transformer.transform(
    data=sentences_set_uri,
    content_type="application/jsonlines",
    split_type="Line",
    # Only the sentence text is sent to the model ...
    input_filter="$.inputs",
    # ... and the prediction is joined back onto the input record, so each
    # output line still carries doc_id / sent_id next to "SageMakerOutput".
    # With join_source=None the output holds the bare prediction only and the
    # parsing loop below cannot find those keys.
    join_source="Input",
)
transformer.wait()

# Collect every ".out" object the job wrote; paginate so listings longer than
# one page are not silently truncated.
out_keys = [
    o["Key"]
    for page in s3.get_paginator("list_objects_v2").paginate(Bucket=bucket, Prefix=output_prefix)
    for o in page.get("Contents", [])
    if o["Key"].endswith(".out")
]
assert out_keys, f"No output files under s3://{bucket}/{output_prefix}"

sent_embeddings = {}

for k in out_keys:
    obj = s3.get_object(Bucket=bucket, Key=k)
    for raw in obj["Body"].iter_lines():
        if not raw:
            continue
        rec = json.loads(raw)

        # Joined record: input metadata plus the prediction under "SageMakerOutput".
        emb = rec["SageMakerOutput"]     # shape depends on your model/inference.py

        # Build a stable key (doc_id + sent_id)
        key = f'{rec["doc_id"]}:{rec["sent_id"]}'
        sent_embeddings[key] = np.array(emb, dtype="float32")

[35m2025-08-14T23:39:36,280 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - [transform_fn] data sample (bytes)=bytearray(b'"That internal contradiction is incompatible with the obligation to state reasons under Article 190 of the EC Treaty (no')[0m
[35m2025-08-14T23:39:36,280 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - [transform_fn] text when it is text after to_text That internal contradiction is incompatible with the obligation to state reasons under Article 190 of the EC Treaty (now Article 253 EC).[0m
[35m2025-08-14T23:39:36,299 [INFO ] W-model-1-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - embedding returned <class 'list'>[0m
[35m2025-08-14T23:39:36,300 [WARN ] W-model-1-stderr com.amazonaws.ml.mms.wlm.WorkerLifeCycle - [dbg] body_head=[-0.03704202175140381, -0.02284402772784233, -0.019695455208420753, 0.04888606071472168, 0.0019460903713479638, -0.012751004658639431, -0.027861300855875015, -0.015160726383328438, -0.01675632

In [None]:
# sent_embeddings is a dict keyed by "doc_id:sent_id"; a dict cannot be sliced,
# so preview the first ten entries by taking them from its items instead.
for preview_key, preview_vec in list(sent_embeddings.items())[:10]:
    print(preview_key, preview_vec.shape, preview_vec[:5])

In [None]:
# Build an inner-product FAISS index whose 64-bit ids encode (document, sentence).
doc_to_int, int_to_doc = {}, {}
next_doc_int = 1

ids = []
vecs = []

for key, vec in sent_embeddings.items():
    # Keys are built as f"{doc_id}:{sent_id}", so the sentence index is the
    # LAST field. Split from the right so a doc_id that itself contains ":"
    # is not torn apart.
    doc_id, sent_id_str = key.rsplit(":", 1)
    sent_id = int(sent_id_str)

    # Assign each new document the next small integer (starting at 1).
    if doc_id not in doc_to_int:
        doc_to_int[doc_id] = next_doc_int
        int_to_doc[next_doc_int] = doc_id
        next_doc_int += 1

    did = np.int64(doc_to_int[doc_id])
    fid = (did << np.int64(32)) | np.int64(sent_id)   # pack (doc, sent) into 64-bit
    ids.append(fid)
    vecs.append(vec.astype("float32"))

X = np.vstack(vecs).astype("float32")

# (optional) cosine via inner product
faiss.normalize_L2(X)

base = faiss.IndexFlatIP(X.shape[1])
index = faiss.IndexIDMap2(base)
index.add_with_ids(X, np.asarray(ids, dtype="int64"))

In [None]:

# Flatten every record's sentences into one list. sent_doc_ids runs parallel to
# all_sentences; sent_offsets[i] is the running sentence count after record i.
all_sentences, sent_doc_ids, sent_offsets = [], [], []
for rec in test_records:
    doc_sentences = rec["sentences"]
    all_sentences.extend(doc_sentences)
    sent_doc_ids.extend(rec["doc_id"] for _ in doc_sentences)
    sent_offsets.append(len(all_sentences))

# split into minibatches
def batched(lst, batch_size):
    """Yield consecutive slices of ``lst`` holding at most ``batch_size`` items each.

    The final slice is shorter when ``len(lst)`` is not a multiple of
    ``batch_size``; an empty ``lst`` yields nothing.
    """
    yield from (
        lst[offset:offset + batch_size]
        for offset in range(0, len(lst), batch_size)
    )

# Encode all sentences in minibatches of 64 and index them for inner-product search.
all_vecs = []
for batch in tqdm.tqdm(list(batched(all_sentences, 64)), desc="encode sentences"):
    # NOTE(review): `transformer` is bound earlier in this notebook to a batch
    # Transformer, whose transform() is called there with an S3 URI and its
    # return value is not used. Passing a dict and stacking the result looks
    # like a leftover from a real-time predictor — confirm which object should
    # produce the embeddings here.
    out = transformer.transform({"inputs": batch})
    all_vecs.append(np.array(out))

all_vecs = np.vstack(all_vecs).astype("float32")

# build FAISS index
d = all_vecs.shape[1]
index = faiss.IndexFlatIP(d)
index.add(all_vecs)

# `.shape` is already the dimensions tuple; `.shape.shape` raised AttributeError.
print(all_vecs.shape)

In [None]:

# Evaluation: for every test record, search the sentence index with the prefix
# embedding and score where the gold "positive" sentence lands.

k_list = [1, 5, 10, 20]
k_max = max(k_list)

hits_at_k = {k: 0 for k in k_list}
f1_gold, f1_pred = [], []

start = 0
# Pair each record with its end offset instead of pop(0)-ing sent_offsets:
# popping emptied the list, so the cell could only ever be run once.
for rec, end in tqdm.tqdm(zip(test_records, sent_offsets), total=len(test_records), desc="evaluate"):
    doc_start, start = start, end  # this record owns all_sentences[doc_start:end]

    # NOTE(review): prefix_predictor is not defined in the visible part of the
    # notebook — confirm it is created before this cell runs.
    prefix_vec = np.array(
        prefix_predictor.predict({"inputs": [rec["prefix"]]}),
        dtype="float32",
    )

    # Check this masking because it is not the same exactly
    mask_same_sentence = np.array(
        [sent in rec["prefix"] for sent in rec["sentences"]], dtype=bool
    )

    # global ids of this record's sentences that are NOT part of the prefix;
    # a set keeps the membership tests below O(1)
    keep_ids = set((np.where(~mask_same_sentence)[0] + doc_start).tolist())

    # faiss index
    D, I = index.search(prefix_vec, k_max)
    retrieved = I[0].tolist()

    # rank candidates in keep_ids first, then pad with the remaining hits
    valid = [i for i in retrieved if i in keep_ids][:k_max]
    if len(valid) < k_max:
        extra = [i for i in retrieved if i not in keep_ids]
        valid.extend(extra[: k_max - len(valid)])

    # Compute metrics
    for k in k_list:
        if any(all_sentences[i] == rec["positive"] for i in valid[:k]):
            hits_at_k[k] += 1

    # Binary F1 inputs over the full ranked list; truncated per k below.
    pred1 = [1 if all_sentences[i] == rec["positive"] else 0 for i in valid]
    f1_gold.append([1] + [0] * (len(pred1) - 1))
    f1_pred.append(pred1)

# Aggregate metrics
n = len(test_records)
recall = {k: hits_at_k[k] / n for k in k_list}

# macro F1 at each cutoff: score only the top-k of every ranked list. The
# previous version averaged the same full-length scores for every k, so all
# entries of the dict were identical.
f1_macro = {
    k: float(np.mean([
        f1_score(g[:k], p[:k], zero_division=0) for g, p in zip(f1_gold, f1_pred)
    ]))
    for k in k_list
}

print("Recall:", recall)
print("Macro F1:", f1_macro)

In [None]:
import sys
import os

# Call the inference handlers directly in the notebook kernel.
module_directory = os.path.abspath('code/')

# Make the modules in that directory importable.
sys.path.append(module_directory)

from inference import model_fn, transform_fn

m = model_fn("model")
body = json.dumps({"inputs": "A quick test sentence."})
handler_result = transform_fn(m, body, "application/json", "application/json")
out, ctype = handler_result
# Show the returned content type and the first 80 items of the payload.
print(ctype, out[:80], "...")

In [30]:
import boto3, json

# Check that every non-empty line of the generated batch input parses as JSON;
# report the first offending line, if any, and stop there.
key = f"{prefix}/batch-output/{run_id}.jsonl"

s3 = boto3.client("s3")
obj = s3.get_object(Bucket=bucket, Key=key)

all_lines_valid = True
for i, raw in enumerate(obj["Body"].iter_lines(), 1):
    if raw:
        try:
            json.loads(raw)
        except Exception as e:
            print(f"Bad line {i}: {raw[:80]!r} -> {e}")
            all_lines_valid = False
            break

if all_lines_valid:
    print("All lines are valid JSON.")


All lines are valid JSON.
