In [1]:
from huggingface_hub import notebook_login, Repository, list_repo_refs
import os
import shutil # Import shutil for potential directory cleanup if needed

In [2]:
# --- Configuration ---
# Hub repository (owner/name) that the later cells inspect and clone.
HUB_REPO_ID: str = "m2im/ml-e5-large_finetuned_violence_twitter_all_labels"

# Local destination for the recovery clone.
# IMPORTANT: point this at a NEW, EMPTY directory. Do NOT reuse the earlier
# working copy '/data4/mmendieta/models/ml-e5-large_finetuned_twitter_all_labels',
# because its git history might be partially modified.
RECOVERY_DIR: str = "/data4/mmendieta/recovered_models"  # a new, distinct path

In [3]:
# --- Step 1: List all branches and tags on your Hub repository ---
# Fetches every ref of HUB_REPO_ID from the Hub and prints each branch and tag
# together with its target commit, so the revision holding the checkpoints can
# be picked for Step 2.
print(f"\nListing branches and tags for {HUB_REPO_ID} on Hugging Face Hub...")

# Only the Hub call is guarded: a failure while printing should surface as its
# own error instead of being reported as a repository-access problem.
try:
    refs = list_repo_refs(HUB_REPO_ID)
except Exception as e:
    print(f"Error listing repository refs: {e}")
    print("Please ensure your HUB_REPO_ID is correct and you have read access.")
    # Re-raise rather than calling exit(): inside a notebook exit() ends the
    # kernel session (losing all state), and in a plain script it would exit
    # with status 0. Re-raising halts this cell (and "Run All") with the
    # original traceback intact.
    raise

print("\n--- Branches on Hub ---")
if refs.branches:
    for branch in refs.branches:
        print(f"- Branch: {branch.name} (Latest Commit: {branch.target_commit})")
else:
    print("No branches found.")

print("\n--- Tags on Hub ---")
if refs.tags:
    for tag in refs.tags:
        print(f"- Tag: {tag.name} (Commit: {tag.target_commit})")
else:
    print("No tags found.")

print("\n*******************************************************************************")
print("Review the branches and tags above carefully.")
print("Identify the one that likely contains your checkpoints (e.g., 'fanciful-sunset-7' or 'epoch_18').")
print("*******************************************************************************")


Listing branches and tags for m2im/ml-e5-large_finetuned_violence_twitter_all_labels on Hugging Face Hub...

--- Branches on Hub ---
- Branch: main (Latest Commit: 802c1633d891880a2c741461f554d76786ad70b0)
- Branch: fanciful-sunset-7 (Latest Commit: 428e979290d71390ac7c7a3c3dfc81aa4136518d)

--- Tags on Hub ---
No tags found.

*******************************************************************************
Review the branches and tags above carefully.
Identify the one that likely contains your checkpoints (e.g., 'fanciful-sunset-7' or 'epoch_18').
*******************************************************************************


In [7]:
# --- Step 2: Clone the specific branch/tag containing your checkpoints ---
# !!! REPLACE "fanciful-sunset-7" BELOW with the ACTUAL branch or tag name you identified !!!
# For example, if 'epoch_18' appeared in the 'Tags on Hub' list, use 'epoch_18' here.
REVISION_TO_RECOVER = "fanciful-sunset-7" # <--- YOU MUST UPDATE THIS LINE!

print(f"\nAttempting to clone revision '{REVISION_TO_RECOVER}' from '{HUB_REPO_ID}' to '{RECOVERY_DIR}'...")

# Prepare the recovery directory: it must exist and be empty before cloning.
if os.path.exists(RECOVERY_DIR):
    if os.listdir(RECOVERY_DIR): # Check if directory is not empty
        print(f"WARNING: Recovery directory '{RECOVERY_DIR}' is not empty.")
        # strip() so that answers such as "y " are not treated as a refusal.
        user_choice = input("Do you want to clear its contents before cloning? (y/n): ").strip().lower()
        if user_choice == 'y':
            print(f"Clearing contents of '{RECOVERY_DIR}'...")
            for item in os.listdir(RECOVERY_DIR):
                item_path = os.path.join(RECOVERY_DIR, item)
                # Only real directories go through rmtree: rmtree refuses to
                # operate on a symlink, and following one would delete files
                # outside RECOVERY_DIR. Everything else (regular files,
                # symlinks including broken ones, special files) is unlinked,
                # so nothing is silently left behind.
                if os.path.isdir(item_path) and not os.path.islink(item_path):
                    shutil.rmtree(item_path)
                else:
                    os.remove(item_path)
            print("Directory cleared.")
        else:
            # Raise instead of exit(): in a notebook exit() ends the kernel
            # session, whereas an exception just stops this cell.
            raise RuntimeError(
                "Aborting to prevent accidental data loss. Please choose an empty directory or clear it manually."
            )
else:
    os.makedirs(RECOVERY_DIR, exist_ok=True) # Create the directory if it doesn't exist

# NOTE(review): the recorded output of this cell includes a notice pointing to
# the huggingface_hub "git_vs_http" guide, which suggests `Repository` is
# deprecated -- confirm before upgrading huggingface_hub.
try:
    recovered_repo = Repository(
        local_dir=RECOVERY_DIR,
        clone_from=HUB_REPO_ID,
        revision=REVISION_TO_RECOVER # Clone the specific branch or tag
    )
except Exception as e:
    print(f"Error cloning repository: {e}")
    print(f"Please double-check the 'REVISION_TO_RECOVER' name ('{REVISION_TO_RECOVER}') and ensure it's a valid branch or tag on your Hub repository.")
    print("Also ensure you have network access and write permissions for the 'RECOVERY_DIR'.")
    # Re-raise so the failure is visible to "Run All" / automated execution
    # and the full traceback is kept, instead of the cell finishing "cleanly".
    raise

print(f"\nSuccessfully cloned revision '{REVISION_TO_RECOVER}' to '{RECOVERY_DIR}'.")
print(f"The recovered Git branch is now: {recovered_repo.current_branch}")

# --- Step 3: Guide the user to find the files within the recovered directory ---
print("\n--- Next Steps to find your Checkpoints ---")
print("1. Navigate to the recovered directory in your terminal:")
print(f"   `cd {RECOVERY_DIR}`")
print("2. List all contents recursively to find your checkpoints. They are likely nested:")
print("   `ls -lR`")
print("   Look for 'config.json', 'pytorch_model.bin' (or 'model.safetensors'),")
print("   and all your tokenizer files (e.g., 'tokenizer.json', 'special_tokens_map.json').")
print("   Based on your previous screenshots, they might be inside a 'fanciful-sunset-7/epoch_18/' structure within this recovered directory.")
print("3. Once located, you can copy the desired checkpoint files back to your original working directory for use or to push to the 'main' branch.")

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/m2im/ml-e5-large_finetuned_violence_twitter_all_labels into local empty directory.



Attempting to clone revision 'fanciful-sunset-7' from 'm2im/ml-e5-large_finetuned_violence_twitter_all_labels' to '/data4/mmendieta/recovered_models'...


Download file tokenizer.json:   0%|          | 12.7k/16.3M [00:00<?, ?B/s]

Download file sentencepiece.bpe.model:   0%|          | 24.0k/4.83M [00:00<?, ?B/s]

Checked out fanciful-sunset-7 from fanciful-sunset-7.
Branch 'fanciful-sunset-7' set up to track remote branch 'fanciful-sunset-7' from 'origin'.




Successfully cloned revision 'fanciful-sunset-7' to '/data4/mmendieta/recovered_models'.
The recovered Git branch is now: fanciful-sunset-7

--- Next Steps to find your Checkpoints ---
1. Navigate to the recovered directory in your terminal:
   `cd /data4/mmendieta/recovered_models`
2. List all contents recursively to find your checkpoints. They are likely nested:
   `ls -lR`
   Look for 'config.json', 'pytorch_model.bin' (or 'model.safetensors'),
   and all your tokenizer files (e.g., 'tokenizer.json', 'special_tokens_map.json').
   Based on your previous screenshots, they might be inside a 'fanciful-sunset-7/epoch_18/' structure within this recovered directory.
3. Once located, you can copy the desired checkpoint files back to your original working directory for use or to push to the 'main' branch.
