In [12]:
import torch
import transformers
import os
import shutil

from transformers import AutoTokenizer
from argparse import Namespace

In [13]:
# change path_to_model_on_disk and model_ckpt

# Options for path_to_model_on_disk:
# "/data3/mmendieta/models/xlmt_finetuned_twitter/worldly-blaze-2/epoch_14"
# "/data3/mmendieta/models/smallLabse_finetuned_twitter/electric-glitter-32/epoch_15"
# "/data3/mmendieta/models/labse_finetuned_twitter/dazzling-violet-5/epoch_19"
# "/data4/mmendieta/models/ml-e5-large_finetuned_twitter_all_labels/"

# Options for model_ckpt
# "cardiffnlp/twitter-xlm-roberta-base"
# "setu4993/LaBSE"
# "setu4993/smaller-LaBSE"
# "intfloat/multilingual-e5-large"  # Hub
# /data3/mmendieta/models/ml_e5_large  # local

config = {
    "cuda_device": 3,
    "path_to_model_on_disk": "/data4/mmendieta/models/labse_finetuned_twitter_all_labels/legendary-eon-1/epoch_19/", 
    "model_ckpt": "setu4993/LaBSE",
    "max_length": 32
}

args = Namespace(**config)

In [14]:
# Instantiate the tokenizer
model_ckpt = args.model_ckpt
tokenizer = AutoTokenizer.from_pretrained(model_ckpt,
                                         model_max_length=args.max_length
                                         )

In [15]:
tokenizer.save_pretrained(args.path_to_model_on_disk)

('/data4/mmendieta/models/labse_finetuned_twitter_all_labels/legendary-eon-1/epoch_19/tokenizer_config.json',
 '/data4/mmendieta/models/labse_finetuned_twitter_all_labels/legendary-eon-1/epoch_19/special_tokens_map.json',
 '/data4/mmendieta/models/labse_finetuned_twitter_all_labels/legendary-eon-1/epoch_19/vocab.txt',
 '/data4/mmendieta/models/labse_finetuned_twitter_all_labels/legendary-eon-1/epoch_19/added_tokens.json',
 '/data4/mmendieta/models/labse_finetuned_twitter_all_labels/legendary-eon-1/epoch_19/tokenizer.json')

### Hugging Face hub

In [5]:
from huggingface_hub import notebook_login, Repository, get_full_repo_name
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
from huggingface_hub import get_full_repo_name, list_repo_refs, HfApi

In [7]:
# --- Configuration for your repository and checkpoint ---
# Your local directory that will act as the Git repository root
LOCAL_REPO_PATH = args.path_to_model_on_disk

# Your Hugging Face Hub repository ID
HUB_REPO_ID = "m2im/ml-e5-large_finetuned_violence_twitter_all_labels"

# The full path to your epoch_18 checkpoint (source of model files)
CHECKPOINT_PATH = os.path.join(LOCAL_REPO_PATH, "fanciful-sunset-7", "epoch_18")

# Commit message for your push
COMMIT_MESSAGE = "Update main branch with recovered epoch_18 checkpoint and existing tokenizer"

# The target branch on the Hub
BRANCH_TO_PUSH_TO = "main"

# --- CRITICAL: CONFIRM THIS PATH IS CORRECT AFTER YOUR RECOVERY (PART 1) ---
# This is the exact path where you confirmed 'config.json' and 'pytorch_model.bin' are located.
RECOVERED_MODEL_SOURCE = "/data4/mmendieta/recovered_models/fanciful-sunset-7/epoch_18"

In [8]:
# --- Step 1: Initialize Repository object for the existing local directory ---
# As per your request, we are NOT clearing or re-cloning LOCAL_REPO_PATH.
# We are assuming it's already a valid Git repository connected to the Hub.
print(f"\nInitializing Git repository object for existing local directory: '{LOCAL_REPO_PATH}'...")
hf_repo = Repository(
    local_dir=LOCAL_REPO_PATH,
    # No clone_from or revision, as we're working with an existing local repo
)
print("Git repository object initialized.")
print(f"Current local branch: {hf_repo.current_branch}")


Initializing Git repository object for existing local directory: '/data4/mmendieta/models/ml-e5-large_finetuned_twitter_all_labels'...


For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.


Git repository object initialized.
Current local branch: main


In [9]:
# --- Step 2: Ensure all necessary files (Model & Tokenizer) are in LOCAL_REPO_PATH ---

# First, copy the recovered MODEL files from the checkpoint
print(f"\nCopying recovered MODEL files from '{RECOVERED_MODEL_SOURCE}' to '{LOCAL_REPO_PATH}'...")
model_files_to_copy = [
    "config.json",
    "pytorch_model.bin", # Or 'model.safetensors', if that's what your model saved
]
for filename in model_files_to_copy:
    src_file = os.path.join(RECOVERED_MODEL_SOURCE, filename)
    dst_file = os.path.join(LOCAL_REPO_PATH, filename)
    if os.path.exists(src_file):
        shutil.copy2(src_file, dst_file)
        print(f"Copied MODEL file: {filename}")
    else:
        print(f"WARNING: MODEL file '{filename}' not found in '{src_file}'. This might result in an incomplete model on the Hub.")


# Second, explicitly state that we are using the existing TOKENIZER files.
print(f"\nUsing existing TOKENIZER files already present in '{LOCAL_REPO_PATH}' as requested.")
print("Please ensure all necessary tokenizer files (tokenizer.json, tokenizer_config.json,")
print("special_tokens_map.json, sentencepiece.bpe.model, and added_tokens.json if applicable)")
print("are already correctly present in this directory for the push.")


print("All necessary files (model + existing tokenizer) prepared in local repository for pushing.")

# --- DEBUGGING STEP: Print contents of LOCAL_REPO_PATH after all file preparation ---
print(f"\n--- Contents of '{LOCAL_REPO_PATH}' AFTER all file preparation (BEFORE git add/commit/push) ---")
os.system(f"ls -l {LOCAL_REPO_PATH}")
print("--------------------------------------------------------------------------------------------------")


Copying recovered MODEL files from '/data4/mmendieta/recovered_models/fanciful-sunset-7/epoch_18' to '/data4/mmendieta/models/ml-e5-large_finetuned_twitter_all_labels'...
Copied MODEL file: config.json
Copied MODEL file: pytorch_model.bin

Using existing TOKENIZER files already present in '/data4/mmendieta/models/ml-e5-large_finetuned_twitter_all_labels' as requested.
Please ensure all necessary tokenizer files (tokenizer.json, tokenizer_config.json,
special_tokens_map.json, sentencepiece.bpe.model, and added_tokens.json if applicable)
are already correctly present in this directory for the push.
All necessary files (model + existing tokenizer) prepared in local repository for pushing.

--- Contents of '/data4/mmendieta/models/ml-e5-large_finetuned_twitter_all_labels' AFTER all file preparation (BEFORE git add/commit/push) ---
total 2209016
-rw-rw-r-- 1 mmendieta mmendieta       2610 Jul 26 12:58 config.json
-rw-rw-r-- 1 mmendieta mmendieta 2239862893 Jul 26 13:01 pytorch_model.bin
-r

In [10]:
# --- Step 3: Add, Commit, and Push the changes to the Hub ---
print("\nAdding files to Git staging area...")
hf_repo.git_add(auto_lfs_track=True) # Automatically tracks large files with LFS
print("Committing changes...")
hf_repo.git_commit(COMMIT_MESSAGE)
print("Pushing changes to Hugging Face Hub...")
hf_repo.git_push(blocking=True) # blocking=True waits for the push to complete
print("Changes pushed successfully to Hugging Face Hub!")


Adding files to Git staging area...
Committing changes...
Pushing changes to Hugging Face Hub...


To https://huggingface.co/m2im/ml-e5-large_finetuned_violence_twitter_all_labels
   802c163..9d9b915  main -> main



Changes pushed successfully to Hugging Face Hub!


In [11]:
# --- Step 4: Verify the push (Optional) ---
# Use list_repo_refs to confirm the main branch's updated state
print(f"\nVerifying repository refs for {HUB_REPO_ID}...")
try:
    refs = list_repo_refs(HUB_REPO_ID)
    print("Branches on Hub:")
    for branch in refs.branches:
        # Corrected attribute: use 'target_commit' instead of 'target_commit_oid'
        print(f"- {branch.name} (Commit ID: {branch.target_commit})")
    print("\nTags on Hub:")
    for tag in refs.tags:
        # Corrected attribute: use 'target_commit' instead of 'target_commit_oid'
        print(f"- {tag.name} (Commit ID: {tag.target_commit})")
except Exception as e:
    print(f"Could not list repository refs: {e}")
    print("Ensure the repository ID is correct and you have read access.")


Verifying repository refs for m2im/ml-e5-large_finetuned_violence_twitter_all_labels...
Branches on Hub:
- main (Commit ID: 9d9b91526c272e0866a85882043a19ff3c9c082e)
- fanciful-sunset-7 (Commit ID: 428e979290d71390ac7c7a3c3dfc81aa4136518d)

Tags on Hub:
