In [2]:
# hf_to_s3.py

from dotenv import load_dotenv
import os
import shutil
import s3fs
from datasets import load_dataset_builder

# 0. load .env
load_dotenv()

# 1. S3 creds + endpoint/region
storage_options = {
    "key": os.getenv("AWS_ACCESS_KEY_ID"),
    "secret": os.getenv("AWS_SECRET_ACCESS_KEY"),
    "client_kwargs": {
        "endpoint_url": os.getenv("AWS_ENDPOINT_URL"),
        "region_name": "US-KS-2"
    }
}
fs = s3fs.S3FileSystem(**storage_options)

# 2. paths & IDs
dataset_id = "mksethi/eli5_sae_features"
local_dir  = "./eli5_sae_features_parquet"
s3_prefix  = "s3://twzsyl285j/eli5_sae_features_parquet/"

# 3. download locally
print("Downloading to local disk…")
builder = load_dataset_builder(dataset_id)
builder.download_and_prepare(local_dir, file_format="parquet")

# 4. upload every file under local_dir to S3
print("Uploading to S3…")
for root, _, files in os.walk(local_dir):
    for fname in files:
        if not fname.endswith(".parquet"):
            continue
        local_path = os.path.join(root, fname)
        rel_path   = os.path.relpath(local_path, local_dir).replace(os.sep, "/")
        s3_path    = f"{s3_prefix}{rel_path}"

        # skip if already uploaded
        if fs.exists(s3_path):
            print("✅ already in S3:", rel_path)
            continue

        # upload in one shot to avoid multipart errors
        print("Uploading:", rel_path)
        fs.put(local_path, s3_path, use_put=True)

# 5. clean up local
print("Removing local files…")
shutil.rmtree(local_dir)

print("All done ✅")


Downloading to local disk…


Repo card metadata block was not found. Setting CardData to empty.


Resolving data files:   0%|          | 0/55 [00:00<?, ?it/s]

Uploading to S3…
✅ already in S3: eli5_sae_features-train-00026-of-00055.parquet
✅ already in S3: eli5_sae_features-train-00018-of-00055.parquet
✅ already in S3: eli5_sae_features-train-00043-of-00055.parquet
✅ already in S3: eli5_sae_features-train-00039-of-00055.parquet
✅ already in S3: eli5_sae_features-train-00007-of-00055.parquet
✅ already in S3: eli5_sae_features-train-00048-of-00055.parquet
✅ already in S3: eli5_sae_features-train-00013-of-00055.parquet
✅ already in S3: eli5_sae_features-train-00032-of-00055.parquet
✅ already in S3: eli5_sae_features-train-00008-of-00055.parquet
✅ already in S3: eli5_sae_features-train-00036-of-00055.parquet
✅ already in S3: eli5_sae_features-train-00053-of-00055.parquet
✅ already in S3: eli5_sae_features-train-00017-of-00055.parquet
✅ already in S3: eli5_sae_features-train-00029-of-00055.parquet
✅ already in S3: eli5_sae_features-train-00003-of-00055.parquet
✅ already in S3: eli5_sae_features-train-00022-of-00055.parquet
✅ already in S3: eli5_s

OSError: [Errno 5] An error occurred (524) when calling the UploadPart operation: 

In [2]:
from transformers import AutoModel, AutoTokenizer


model = AutoModel.from_pretrained("mksethi/gpt2-query2sae", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("gpt2")   

In [15]:
text = "What is the weather in Hong Kong Like?"

tokenizer.add_special_tokens({'pad_token': '[PAD]'})
encoded = tokenizer(text, max_length=256, padding='max_length', return_tensors='pt')

In [11]:
len(encoded)

2

256

In [16]:
out = model(**encoded)

In [18]:
with torch.no_grad():
    model(**input)

NameError: name 'torch' is not defined