In [1]:
import os
import time
from pathlib import Path

import boto3
import botocore
import torch
from huggingface_hub import login
from huggingface_hub import notebook_login
from huggingface_hub import snapshot_download
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

In [2]:
# Some models require a Hugging Face login; uncomment the following line to be prompted for your access token.

#notebook_login()

# Some models also require you to log in to Hugging Face in your browser and accept the model's usage policy.

In [3]:
# Set up the variables used throughout this notebook

In [4]:
# Local root for downloaded models, created next to the notebook's working directory.
models_dir = Path.cwd().joinpath("models")
models_dir.mkdir(parents=True, exist_ok=True)

# Hugging Face repo id is "<organisation>/<model>".
model_prefix, model_name = "BAAI", "bge-reranker-v2-m3"
full_model_name = "/".join((model_prefix, model_name))

# NOTE(review): save_path stops at the organisation folder (models/BAAI), not a
# per-model folder, so a second model from the same organisation would be
# downloaded into the same directory - confirm this is intended.
save_path = models_dir.joinpath(model_prefix)

In [5]:
#Download model

In [6]:
# Fetch every file of the Hugging Face repo `full_model_name` into `save_path`.
# The call's return value (the local folder path) is displayed as the cell output.
# snapshot_download is imported together with the other imports in the first cell.
snapshot_download(repo_id=full_model_name, local_dir=save_path)

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

BEIR-bge-en-v1.5.png:   0%|          | 0.00/56.4k [00:00<?, ?B/s]

BEIR-e5-mistral.png:   0%|          | 0.00/40.2k [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

llama-index.png:   0%|          | 0.00/106k [00:00<?, ?B/s]

miracl-bge-m3.png:   0%|          | 0.00/52.0k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/795 [00:00<?, ?B/s]

CMTEB-retrieval-bge-zh-v1.5.png:   0%|          | 0.00/51.5k [00:00<?, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

'/opt/app-root/src/openshift-ai-samples/model_prep/models/BAAI'

In [7]:
#Get the s3 connection parameters (these are entered in the workbench setup screens)

In [8]:
# Read the S3 connection settings from the environment in one pass.
# A variable that is not set yields None rather than raising.
(
    aws_access_key_id,
    aws_secret_access_key,
    endpoint_url,
    region_name,
    bucket_name,
) = (
    os.environ.get(variable)
    for variable in (
        'AWS_ACCESS_KEY_ID',
        'AWS_SECRET_ACCESS_KEY',
        'AWS_S3_ENDPOINT',
        'AWS_DEFAULT_REGION',
        'AWS_S3_BUCKET',
    )
)

In [9]:
# boto3 session carrying the access key pair read from the environment;
# every other session option is left unset.
session = boto3.Session(
    aws_secret_access_key=aws_secret_access_key,
    aws_access_key_id=aws_access_key_id,
)

In [10]:
# S3 resource bound to the configured endpoint and region; requests are signed
# with signature version 's3v4'.
s3_signing_config = botocore.client.Config(signature_version='s3v4')
s3_resource = session.resource(
    's3',
    region_name=region_name,
    endpoint_url=endpoint_url,
    config=s3_signing_config,
)

# Handle on the target bucket, used by the helper functions below.
bucket = s3_resource.Bucket(bucket_name)
                        
def upload_directory_to_s3(local_directory, s3_prefix, target_bucket=None):
    """Upload every file under ``local_directory`` to S3, keeping the tree layout.

    Each file is stored under ``s3_prefix`` followed by its path relative to
    ``local_directory``. Keys are always built with forward slashes, regardless
    of the local operating system's path separator.

    Args:
        local_directory: Root directory to walk (``str`` or path-like).
        s3_prefix: Key prefix prepended to each file's relative path.
        target_bucket: Object exposing ``upload_file(path, key)``. When omitted,
            the notebook-level ``bucket`` is used.

    Returns:
        None. Each transfer is printed as ``<local path> -> <s3 key>``.
    """
    import posixpath  # function-scope import so this cell has no extra top-level dependency

    destination = bucket if target_bucket is None else target_bucket
    for root, _dirs, files in os.walk(local_directory):
        for filename in files:
            file_path = os.path.join(root, filename)
            relative_path = os.path.relpath(file_path, local_directory)
            # os.path.join would insert "\\" on Windows; S3 keys use "/" as the
            # delimiter, so normalise the relative path and join POSIX-style.
            s3_key = posixpath.join(s3_prefix, relative_path.replace(os.sep, "/"))
            print(f"{file_path} -> {s3_key}")
            destination.upload_file(file_path, s3_key)
                                
def list_objects(prefix):
    """Print the key of every object in ``bucket`` whose key starts with ``prefix``."""
    matching_objects = bucket.objects.filter(Prefix=prefix)
    for s3_object in matching_objects.all():
        print(s3_object.key)

In [11]:
#Upload the model to an S3 bucket within the cluster

In [12]:
# Upload everything under save_path; keys are prefixed with the model name.
upload_directory_to_s3(local_directory=save_path, s3_prefix=model_name)

/opt/app-root/src/openshift-ai-samples/model_prep/models/BAAI/tokenizer_config.json -> bge-reranker-v2-m3/tokenizer_config.json
/opt/app-root/src/openshift-ai-samples/model_prep/models/BAAI/config.json -> bge-reranker-v2-m3/config.json
/opt/app-root/src/openshift-ai-samples/model_prep/models/BAAI/.gitattributes -> bge-reranker-v2-m3/.gitattributes
/opt/app-root/src/openshift-ai-samples/model_prep/models/BAAI/sentencepiece.bpe.model -> bge-reranker-v2-m3/sentencepiece.bpe.model
/opt/app-root/src/openshift-ai-samples/model_prep/models/BAAI/README.md -> bge-reranker-v2-m3/README.md
/opt/app-root/src/openshift-ai-samples/model_prep/models/BAAI/special_tokens_map.json -> bge-reranker-v2-m3/special_tokens_map.json
/opt/app-root/src/openshift-ai-samples/model_prep/models/BAAI/tokenizer.json -> bge-reranker-v2-m3/tokenizer.json
/opt/app-root/src/openshift-ai-samples/model_prep/models/BAAI/model.safetensors -> bge-reranker-v2-m3/model.safetensors
/opt/app-root/src/openshift-ai-samples/model_pre