In [2]:

import argparse
import os

# Restrict this process to a single GPU. Devices are numbered in PCI bus order
# and only the device with id 4 is made visible, which is why the code further
# down addresses it as 'cuda:0'. These assignments sit before `import torch`
# so the variables are already set when torch is loaded.
# NOTE(review): the id "4" is presumably specific to the original machine —
# adjust it for the host you run on.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "4"

import torch
from transformers import (AutoModelForSequenceClassification,
                          AutoModelForTokenClassification, 
                          AutoModelForCausalLM,
                          AutoModelForMaskedLM,
                          AutoModel,
                          AutoTokenizer,
)
import yaml
from tqdm import tqdm
import numpy as np

import sys
sys.path.append("../")

from data_utils.model_utils import count_trainable_parameters, freeze_model, unfreeze_model
from thop import profile # for flops calc

In [6]:
# function to get flops n params
def get_flops_and_params(model_name, input_ids=None):
    """Profile one forward pass of a sequence-classification model with thop.

    Args:
        model_name: Model identifier (hub id or path) handed to
            ``AutoModelForSequenceClassification.from_pretrained``. Names
            containing "llama" (case-insensitive) are loaded in bfloat16.
        input_ids: Optional tensor of token ids used as the profiling input.
            Defaults to a fixed sequence of 8 token ids in a batch of one.
            The reported operation count depends on this input's length.

    Returns:
        tuple: ``(flops, params)``. ``flops`` is the operation count reported
        by ``thop.profile`` formatted as a string in scientific notation with
        two decimals (e.g. ``"5.18e+10"``); ``params`` is thop's parameter
        count, returned as-is.

    NOTE(review): thop's README labels the first value returned by
    ``profile`` as MACs — confirm before comparing against FLOP figures
    produced by other tools.
    """
    # Case-insensitive match: a name such as "SomeOrg/Llama-2-7b-hf" contains
    # "Llama" but not "llama" and would otherwise be loaded in full precision.
    if "llama" in model_name.lower():
        print("Got llama model, loading in half precision")
        model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
        )
    else:
        model = AutoModelForSequenceClassification.from_pretrained(model_name)

    if input_ids is None:
        # Default dummy input: one sequence of 8 token ids.
        input_ids = torch.tensor([[101, 2023, 2003, 1037, 2047, 2814, 1012, 102]])

    flops, params = profile(model, inputs=(input_ids,))
    # Convert the operation count to scientific notation.
    flops = "{:.2e}".format(flops)
    return flops, params

# Measure GPU memory for several models at once: takes a list of model names
# and returns a dictionary mapping each model name to the GPU memory it needs.

def get_gpu_memory_needed(model_names):
    """Measure the GPU memory allocated by loading each model onto 'cuda:0'.

    Each model is loaded with ``AutoModel.from_pretrained``, moved to the
    device, and the growth of ``torch.cuda.memory_allocated`` caused by that
    model is recorded. The model is released before the next one is loaded so
    measurements do not influence each other.

    FLOPs and parameter counts are not computed here; call
    ``get_flops_and_params`` separately when those are needed (doing it inside
    this loop would load every model a second time only to discard the result).

    Args:
        model_names: Iterable of model identifiers (hub ids or paths). Names
            containing "llama" (case-insensitive) are loaded in bfloat16 with
            ``device_map="auto"``.

    Returns:
        dict: Maps each model name to the memory it allocated on the device,
        in GiB (bytes / 1024**3). Empty when ``model_names`` is empty.
    """
    import gc  # local import: only needed for the per-model cleanup below

    device = torch.device('cuda:0')
    gpu_memory_needed = {}
    for model_name in tqdm(model_names):
        # Baseline so tensors already living on the device are not attributed
        # to this model.
        baseline_bytes = torch.cuda.memory_allocated(device.index)

        # Case-insensitive match: a name such as "SomeOrg/Llama-2-7b-hf" has
        # "Llama" but not "llama" and would otherwise load in full precision.
        if "llama" in model_name.lower():
            print("Got llama model, loading in half precision")
            model = AutoModel.from_pretrained(
                model_name,
                torch_dtype=torch.bfloat16,
                device_map="auto",
            )
        else:
            model = AutoModel.from_pretrained(model_name)
        model.to(device)

        model_bytes = torch.cuda.memory_allocated(device.index) - baseline_bytes
        gpu_memory_needed[model_name] = model_bytes / 1024**3

        # Drop the model before the next iteration. gc.collect() is there so
        # that modules kept alive only by reference cycles are released too;
        # empty_cache() hands the freed blocks back to the driver.
        del model
        gc.collect()
        torch.cuda.empty_cache()
    return gpu_memory_needed

In [3]:
# Checkpoints to measure. The commented-out entries are further candidates
# that can be re-enabled to include them in the run.
model_names = [
    # "nlpie/bio-mobilebert",
    # "nlpie/tiny-biobert",
    # "roberta-base",
    # "nlpie/distil-biobert",
    # "dmis-lab/biobert-v1.1",
    "meta-llama/Llama-2-7b-hf",
]

# Dictionary of model name -> GPU memory figure returned by the helper.
gpu_memory_needed = get_gpu_memory_needed(model_names)

  0%|          | 0/1 [00:00<?, ?it/s]

Got llama model, loading in half precision




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:08<00:00,  8.91s/it]


In [4]:
# Show the measured footprint per model (values are bytes / 1024**3, i.e. GiB).
gpu_memory_needed 

{'meta-llama/Llama-2-7b-hf': 12.369651794433594}

In [5]:
# Persist the measurements as JSON in the parent of the working directory.
import json

with open('../gpu_memory_needed.json', 'w') as out_file:
    out_file.write(json.dumps(gpu_memory_needed))

In [3]:
# Load the Llama-2-7B tokenizer so its padding configuration can be inspected below.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

In [7]:
# Print a marker when the tokenizer has no padding token id set.
if tokenizer.pad_token_id is None:
    print("None")

None


In [3]:
# flops

In [4]:
import torch
from transformers import BertModel
from thop import profile

In [4]:
# !pip install thop

In [8]:

# # Create a sample input
# input_ids = torch.tensor([[101, 2023, 2003, 1037, 2047, 2814, 1012, 102]])

# # Use the thop library to profile the model
# flops, params = profile(model, inputs=(input_ids,))

# # Print the estimated FLOPs and number of parameters
# print(f"FLOPs: {flops}")
# print(f"Number of parameters: {params}")




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Profile Llama-2-7B; the result is (formatted operation count, parameter count).
get_flops_and_params("meta-llama/Llama-2-7b-hf")

Got llama model, loading in half precision


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

[INFO] Register count_linear() for <class 'torch.nn.modules.linear.Linear'>.


('5.18e+10', 6476005376.0)