### GPU Check

In [6]:
# Shell escape: print the nvidia-smi status table for the visible GPU(s).
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Fri May 17 08:19:42 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.171.04             Driver Version: 535.171.04   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A30                     Off | 00000000:02:00.0 Off |                   On |
| N/A   30C    P0              28W / 165W |   1206MiB / 24576MiB |     N/A      Default |
|                                         |                      |              Enabled |
+-----------------------------------------+----------------------+----------------------+

+------------------------------------------------------------------

### Sharded Loading from Storage

In [None]:
import torch
import os
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from safetensors import safe_open
import constants

# Hub identifiers for the checkpoint and the matching cache folder name.
model_name = "gemma-2b-it"
model_id = f"google/{model_name}"
model_dir = f"models--google--{model_name}"

print(f"Loading model configuration and tokenizer for {model_id}")
# Config and tokenizer are both resolved through the local cache directory.
config = AutoConfig.from_pretrained(
    model_id,
    cache_dir=constants.CACHE_DIR_LOCAL,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    cache_dir=constants.CACHE_DIR_LOCAL,
)

# Build the architecture from the config alone; no checkpoint file is read here.
model = AutoModelForCausalLM.from_config(config)

print(f"Model Initialized without weights: {model}")


# Paths of the two safetensors shards inside the cached snapshot directory.
shard_local_files = [
    os.path.join(
        constants.CACHE_DIR_LOCAL,
        model_dir,
        "snapshots",
        "de144fb2268dee1066f515465df532c05e699d48",
        f"model-0000{shard_number}-of-00002.safetensors",
    )
    for shard_number in (1, 2)
]

# Helper function to load a shard and move to GPU
def load_shard_to_gpu(model, shard_file, device):
    """Copy every tensor stored in one safetensors shard into ``model``.

    Args:
        model: Module whose ``state_dict()`` entries receive the shard's values.
        shard_file: Path of the safetensors file to read.
        device: Device each tensor is moved to before it is copied.

    Tensors whose name is not present in the model's state dict are reported
    with a warning and skipped without being read.
    """
    # Build the state dict once. state_dict() walks the whole module tree, so
    # calling it twice per tensor made the loop needlessly expensive.
    state = model.state_dict()
    with safe_open(shard_file, framework="pt") as f:
        for name in f.keys():
            target = state.get(name)
            if target is None:
                print(f"Warning: Tensor {name} not found in model state_dict")
                continue
            tensor = f.get_tensor(name).to(device)
            # copy_ writes into the model's existing tensor in place, so the
            # model stays on the device it already lives on.
            # NOTE(review): nothing here moves `model` itself to `device` --
            # confirm whether the caller is expected to do that beforehand.
            target.copy_(tensor)

# Use CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Loading model shards to device: {device}")

# Shards are processed strictly one after another.
for shard_file in shard_local_files:
    print(f"Loading shard: {shard_file}")
    load_shard_to_gpu(model, shard_file, device)

### Sharded Loading from Hugging Face Repository

In [1]:
import os
import shutil

import requests
import torch
from safetensors import safe_open
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

import constants
from hf_token import TOKEN

model_name = "gemma-1.1-2b-it"
model_id = "google/" + model_name
model_dir = "models--google--" + model_name
shards = 2
# Shard counters are formatted as 5-digit, zero-padded numbers
# (e.g. 00001-of-00002). A hard-coded "0000" prefix would yield six digits
# as soon as an index or the shard count reaches 10.
# URLs for the shard files
shard_urls = [
    f"https://huggingface.co/google/{model_name}/resolve/main/model-{i + 1:05d}-of-{shards:05d}.safetensors"
    for i in range(shards)
]
# Local download targets, one per URL.
# NOTE(review): these names end in .bin although the URLs point at
# .safetensors files and the loader opens them with safe_open -- consider
# renaming them to match.
shard_local_files = [
    os.path.join(constants.CACHE_DIR_LOCAL, model_dir, "snapshots", "bf4924f313df5166dee1467161e886e55f2eb4d4", f"pytorch_model-{i + 1:05d}-of-{shards:05d}.bin")
    for i in range(shards)
]


print(f"Loading model configuration and tokenizer for {model_id}")
# Config and tokenizer are both resolved through the local cache directory.
config = AutoConfig.from_pretrained(
    model_id,
    cache_dir=constants.CACHE_DIR_LOCAL,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    cache_dir=constants.CACHE_DIR_LOCAL,
)

# Build the architecture from the config alone; no checkpoint file is read here.
model = AutoModelForCausalLM.from_config(config)

print("Model Initialized without weights")



# Helper function to download a shard from URL
def download_shard(url, local_file):
    """Stream one shard from ``url`` into ``local_file``.

    Args:
        url: HTTP(S) address of the shard file.
        local_file: Destination path; its parent directory is created if it
            does not exist yet.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error page is never written to disk in place of the shard.
    """
    print(f"Downloading shard from {url}")
    headers = {"Authorization": f"Bearer {TOKEN}"}

    target_dir = os.path.dirname(local_file)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # The context manager releases the connection even if the copy fails.
    with requests.get(url, headers=headers, stream=True) as response:
        response.raise_for_status()
        # response.raw bypasses content decoding by default; enable it so a
        # gzip/deflate transfer encoding is undone before writing.
        response.raw.decode_content = True
        with open(local_file, 'wb') as f:
            shutil.copyfileobj(response.raw, f)
    print(f"Shard downloaded to {local_file}")

# Helper function to load shard to GPU and clean up
def load_shard_to_gpu(model, local_file, device):
    """Copy one safetensors shard into ``model`` and delete the shard file.

    Args:
        model: Module whose ``state_dict()`` entries receive the shard's values.
        local_file: Path of the safetensors file to read; it is removed after
            all tensors have been copied.
        device: Device each tensor is moved to before it is copied.

    Tensors whose name is not present in the model's state dict are reported
    with a warning and skipped without being read.
    """
    print(f"Loading shard from {local_file} to device: {device}")
    # Build the state dict once. state_dict() walks the whole module tree, so
    # calling it twice per tensor made the loop needlessly expensive.
    state = model.state_dict()
    with safe_open(local_file, framework="pt") as f:
        for name in f.keys():
            target = state.get(name)
            if target is None:
                print(f"Warning: Tensor {name} not found in model state_dict")
                continue
            tensor = f.get_tensor(name).to(device)
            # copy_ writes into the model's existing tensor in place, so the
            # model stays on the device it already lives on.
            # NOTE(review): nothing here moves `model` itself to `device` --
            # confirm whether the caller is expected to do that beforehand.
            target.copy_(tensor)
    # Remove the local shard file to save space
    print(f"Removing local shard file {local_file}")
    os.remove(local_file)

# Use CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Loading model shards to device: {device}")

# Handle the shards one at a time: download, load into the model, then the
# loader deletes the local file before the next shard is fetched.
for url, local_file in zip(shard_urls, shard_local_files):
    print(f"Processing shard: {url}")
    download_shard(url, local_file)
    load_shard_to_gpu(model, local_file, device)


Loading model configuration and tokenizer for google/gemma-1.1-2b-it
Model Initialized without weights
Loading model shards to device: cuda
Processing shard: https://huggingface.co/google/gemma-1.1-2b-it/resolve/main/model-00001-of-00002.safetensors
Downloading shard from https://huggingface.co/google/gemma-1.1-2b-it/resolve/main/model-00001-of-00002.safetensors
Shard downloaded to /homes/pu22/.cache/huggingface/hub/models--google--gemma-1.1-2b-it/snapshots/bf4924f313df5166dee1467161e886e55f2eb4d4/pytorch_model-00001-of-00002.bin
Loading shard from /homes/pu22/.cache/huggingface/hub/models--google--gemma-1.1-2b-it/snapshots/bf4924f313df5166dee1467161e886e55f2eb4d4/pytorch_model-00001-of-00002.bin to device: cuda
Removing local shard file /homes/pu22/.cache/huggingface/hub/models--google--gemma-1.1-2b-it/snapshots/bf4924f313df5166dee1467161e886e55f2eb4d4/pytorch_model-00001-of-00002.bin
Processing shard: https://huggingface.co/google/gemma-1.1-2b-it/resolve/main/model-00002-of-00002.saf

In [5]:
# The model was built with from_config; torch modules start in training mode,
# so switch to eval mode explicitly before running inference.
model.eval()

inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)

with torch.no_grad():
    # Unpack the whole encoding so attention_mask is passed along with
    # input_ids instead of being dropped.
    outputs = model.generate(**inputs, max_new_tokens=20, do_sample=False)

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

Hello, my name is [Your Name] and I am reaching out to you today to inquire about your services.

I
