In [1]:
import argparse
import pprint
import sys
import os
#os.environ["HUGGINGFACE_HUB_CACHE"]="/submission/hf_cache"
import re, json, requests
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, BitsAndBytesConfig
#####
from transformers import LlamaForCausalLM, CodeLlamaTokenizer
from transformers import LlamaForCausalLM
import colorama
colorama.init()

####
import torch
from huggingface_hub import login
# from human_eval.data import write_jsonl
import time
from optimum.bettertransformer import BetterTransformer
from transformers import LlamaForCausalLM
from transformers import LlamaTokenizer
import datetime
import pytz
import subprocess
import pynvml
pynvml.nvmlInit()

#########################################
from fastapi import FastAPI

import logging

# Lit-GPT imports
import sys
import time
from pathlib import Path
import json


import lightning as L
import torch

torch.set_float32_matmul_precision("high")

from lit_gpt import GPT, Tokenizer, Config
from lit_gpt.utils import lazy_load, quantization

# Toy submission imports
from helper import toysubmission_generate
from api import (
    ProcessRequest,
    ProcessResponse,
    TokenizeRequest,
    TokenizeResponse,
    Token,
    DecodeRequest,
    DecodeResponse
)





In [2]:

##################
# Authenticate against the Hugging Face Hub.
# SECURITY: never embed an access token in the notebook. The token is read from
# the environment instead (HF_TOKEN, falling back to HUGGING_FACE_HUB_TOKEN).
# NOTE(review): any token that was ever committed with this notebook should be
# treated as leaked and revoked in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
if hf_token:
    login(token=hf_token)
    print("Huggingface login done")
else:
    # Do not prompt interactively; rely on previously cached credentials, if any.
    print("HF_TOKEN is not set: skipping explicit Huggingface login")
# sys.exit(0)
start = time.time()
print(f"Starting time is: {start}")

# print("Environment variables are: ", os.environ)

# This notebook requires a CUDA device; fail early with a readable message.
device = "cuda" if torch.cuda.is_available() else "cpu"
assert device == 'cuda', "A CUDA-capable GPU is required to run this notebook"

# torch builds without the MPS backend do not expose `torch.backends.mps`;
# probe for it explicitly instead of swallowing every exception.
mps_backend = getattr(torch.backends, "mps", None)
if mps_backend is not None and mps_backend.is_available():
    device = "mps"
print(f"Device is: {device}")

if device == "cuda":
    # Diagnostic dump of the CUDA environment (cuDNN version, per-device name and memory).
    print("___________________________________________")
    print('__CUDNN VERSION:', torch.backends.cudnn.version())
    print('__Number CUDA Devices:', torch.cuda.device_count())
    for _i in range(torch.cuda.device_count()):
        print("Stats for device: ", _i)
        print('__CUDA Device Name:', torch.cuda.get_device_name(_i))
        print('__CUDA Device Total Memory [GB]:', torch.cuda.get_device_properties(_i).total_memory / 1e9)
    print("___________________________________________")

######################################


def print_time():
    """Print the current wall-clock time converted to the Asia/Calcutta timezone.

    The timestamp is printed (flushed immediately) in the format
    ``%Y-%m-%d %H:%M:%S %Z%z``. Nothing is returned.
    """
    # Target timezone for the printed timestamp
    desired_timezone = pytz.timezone('Asia/Calcutta')

    # Take an *aware* UTC timestamp directly. `datetime.utcnow()` is deprecated
    # and returns a naive value that had to be patched with tzinfo afterwards.
    current_time_utc = datetime.datetime.now(pytz.utc)

    # Convert the UTC time to the desired timezone
    current_time_in_desired_timezone = current_time_utc.astimezone(desired_timezone)

    # Format and print the time
    formatted_time = current_time_in_desired_timezone.strftime('%Y-%m-%d %H:%M:%S %Z%z')
    print(f"Time in GMT+5.5 is: {formatted_time}", flush=True)


def print_gpu_memory_stats():
    """Print utilization and memory usage for every GPU, then the total memory used.

    The GPU count comes from ``nvidia-smi``; per-device figures come from NVML
    (``pynvml``). One dict is printed per GPU, followed by the summed used
    memory across all GPUs. Nothing is returned.
    """
    print("________________")

    def get_gpu_utilization(handle_now):
        # Returns the `gpu` field of NVML's utilization rates for this handle.
        info = pynvml.nvmlDeviceGetUtilizationRates(handle_now)
        return info.gpu

    def get_gpu_count():
        # Count GPUs by listing their indices through nvidia-smi.
        try:
            output = subprocess.check_output(["nvidia-smi", "--query-gpu=index", "--format=csv,noheader"])
        except (subprocess.CalledProcessError, OSError):
            # OSError covers a missing/non-executable nvidia-smi binary
            # (FileNotFoundError), which previously escaped as a crash.
            return 0
        # Ignore blank lines so that empty output yields 0 rather than 1.
        return sum(1 for line in output.splitlines() if line.strip())

    #############################################
    num_gpus = get_gpu_count()
    print_time()
    print("Number of available GPUs:", num_gpus)
    handles = [pynvml.nvmlDeviceGetHandleByIndex(curr_gpu_id) for curr_gpu_id in range(num_gpus)]
    TOT_MEM_CONSUMED = 0
    for curr_gpu_id, handle in enumerate(handles):
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        memory_used = info.used / 1024 ** 2  # Convert bytes to megabytes
        memory_total = info.total / 1024 ** 2  # Convert bytes to megabytes
        frac_used = memory_used / memory_total
        gpu_utilization = get_gpu_utilization(handle)
        gpu_dict = {"curr_gpu": curr_gpu_id,
                    "volatile_gpu_utils": gpu_utilization,
                    "total_gpu_mem": memory_total,
                    "used_gpu_mem": memory_used,
                    "frac_used_gpu_mem": frac_used
        }
        TOT_MEM_CONSUMED += memory_used
        print(gpu_dict)
    print("Total memory consumed is: ", TOT_MEM_CONSUMED)
    print("________________")
    
def fetch_llama_2_chat_prompt(initial_inst):
    """Wrap a user instruction in the LLAMA-2 chat template with a fixed system prompt.

    The returned string starts with ``<s>[INST] <<SYS>>``, carries the teaching
    assistant system prompt, and ends with ``{initial_inst} [/INST]``.
    """
    # links to reference: https://huggingface.co/blog/codellama#conversational-instructions , https://www.reddit.com/r/LocalLLaMA/comments/155po2p/comment/jsvn5md/?utm_source=share&utm_medium=web2x&context=3  , https://huggingface.co/blog/llama2#how-to-prompt-llama-2
    # The system-prompt lines are deliberately indented by four spaces: the
    # whitespace is part of the prompt text and is kept exactly as authored.
    system_section = (
        "<s>[INST] <<SYS>>\n"
        "    You are a helpful, respectful and honest teaching assistant for an introductory programming course in Matlab and C. Your current task is to answer student queries on Piazza. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n"
        "    If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n"
        "    <</SYS>>\n"
        "    \n"
    )
    user_section = f"{initial_inst} [/INST]"
    return system_section + user_section




def _ensure_embeddings_cover_tokenizer(model, tokenizer):
    """Grow the model's token embeddings only when the tokenizer needs more rows.

    Unconditionally resizing to ``vocab_size + 1`` appends a stray, untrained
    row whenever the checkpoint already contains the pad-token embedding
    (e.g. a fine-tuned checkpoint that was saved after a previous resize).
    """
    embedding_rows = model.get_input_embeddings().weight.shape[0]
    if len(tokenizer) > embedding_rows:
        model.resize_token_embeddings(len(tokenizer))


def get_model(
    load_8bit: bool = False,
    base_model: str = "bigcode/starcoder",
    load_model_in_4_bit = False):
    """Load a tokenizer and causal LM, then prepare the model for inference.

    The model family is chosen from substrings of ``base_model``:

    * ``meta-llama`` / ``anmolagarwal999`` / ``models_saved`` -> LLAMA-2 path.
      ``load_8bit`` is overwritten here: 8-bit loading is used only when
      ``"70b"`` appears in ``base_model``.
    * ``codellama`` -> CodeLLAMA path; 4-bit loading is used only when
      ``"34b"`` is in ``base_model`` and ``load_model_in_4_bit`` is true.
    * anything else -> treated as WizardCoder; reads the module-level ``device``.

    After loading, the model goes through ``BetterTransformer.transform``, is
    switched to half precision when not quantized, put in eval mode, and
    compiled with ``torch.compile`` when supported.

    Args:
        load_8bit: Request 8-bit loading (honoured in the WizardCoder path only).
        base_model: Hub id or local path of the checkpoint to load.
        load_model_in_4_bit: Request 4-bit loading for 34b CodeLLAMA checkpoints.

    Returns:
        A ``(tokenizer, model)`` tuple.

    Raises:
        ValueError: In the WizardCoder path when ``device`` is neither
            ``"cuda"`` nor ``"mps"``.
    """
    assert base_model, ("Please specify a --base_model, e.g. --base_model='bigcode/starcoder'"    )
    print("Base model being used: ", base_model)

    # Set when the weights are loaded through 4-bit quantization, so the later
    # half-precision cast can be skipped for them.
    loaded_in_4bit = False

    if "meta-llama" in base_model or "anmolagarwal999" in base_model or "models_saved" in base_model:
        print("Model family is LLAMA-2")
        # The caller's load_8bit is ignored for this family.
        load_8bit = "70b" in base_model
        try:
            tokenizer = LlamaTokenizer.from_pretrained(base_model)
        except Exception as tokenizer_error:
            # Fall back to a stock LLAMA-2 tokenizer when the checkpoint does
            # not ship a loadable one; report why instead of hiding the error.
            print("Force loading tokenizer of another model")
            print("Tokenizer load error was: ", tokenizer_error)
            tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-13b-hf")
        tokenizer.add_special_tokens(
            {
                "pad_token": "<PAD>",
            }
        )

        print("Tokenizer has been loaded.")
        model = LlamaForCausalLM.from_pretrained(
            base_model,
            return_dict=True,
            device_map="auto",
            load_in_8bit=load_8bit
        )
        # NOTE(review): an earlier comment here mentioned LEFT padding, but the
        # code asserts that the tokenizer pads on the RIGHT. Background on
        # LLAMA-2 padding: https://github.com/huggingface/transformers/issues/25022#issuecomment-1647573640
        # TODO: follow up issue here: https://github.com/huggingface/transformers/issues/26072
        assert(tokenizer.padding_side == 'right')
        _ensure_embeddings_cover_tokenizer(model, tokenizer)
        print("Value of load_8bit is: ", load_8bit)

    elif "codellama" in base_model:
        print("Model family is CodeLLAMA")
        tokenizer = AutoTokenizer.from_pretrained(base_model)

        ###############################################
        tokenizer.add_special_tokens(
            {
                "pad_token": "<PAD>",
            }
        )

        ############################################
        print("Tokenizer has been loaded.")

        if "34b" in base_model and load_model_in_4_bit:
            print("NOTE: Warning: LOADING MODEL IN 4BIT")
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16
            )
            model = AutoModelForCausalLM.from_pretrained(
                base_model,
                device_map="auto",
                quantization_config=quantization_config
            )
            loaded_in_4bit = True
        else:
            model = AutoModelForCausalLM.from_pretrained(
                base_model,
                device_map="auto",
                torch_dtype=torch.float16
            )
        #TODO: Know more about the padding
        _ensure_embeddings_cover_tokenizer(model, tokenizer)
        tokenizer.padding_side = 'right'

    else:
        print("Model family is WizardCoder")
        tokenizer = AutoTokenizer.from_pretrained(base_model)
        # Reuse the EOS token for padding instead of adding a new token.
        tokenizer.pad_token = tokenizer.eos_token
        print("Tokenizer has been loaded.")

        if device == "cuda":
            model = AutoModelForCausalLM.from_pretrained(
                base_model,
                load_in_8bit=load_8bit,
                torch_dtype=torch.float16,
                device_map="auto",
            )
        elif device == "mps":
            model = AutoModelForCausalLM.from_pretrained(
                base_model,
                device_map={"": device},
                torch_dtype=torch.float16,
            )
        else:
            # Previously `model` stayed unbound here and the function failed
            # later with an unrelated-looking UnboundLocalError.
            raise ValueError(f"Unsupported device for model loading: {device}")
        #NOTE: Added tokenizer padding preference to LEFT
        tokenizer.padding_side = 'left'

    ### Memory stats before BetterTransformer
    print("Memory stats before BetterTransformer")
    print_gpu_memory_stats()

    # adding BetterTransformer functionality
    # new_model = BetterTransformer.transform(model, keep_original_model=True if os.environ.get("AMLT_DIRSYNC_DIR", None) is None else False)
    new_model = BetterTransformer.transform(model, keep_original_model=False)
    # Drop the old reference before measuring memory again.
    del model
    print("Memory stats after deletion")
    print_gpu_memory_stats()
    model = new_model

    model.config.pad_token_id = tokenizer.pad_token_id

    print("Config pad token id is: ", model.config.pad_token_id )

    try:
        # Quantized weights (8-bit or 4-bit) must not be cast to fp16.
        if not load_8bit and not loaded_in_4bit:
            model.half()  # seems to fix bugs for some users.
    except Exception as E:
        print("ERROR: Some error occurred during model.half(): ", E)

    model.eval()
    if torch.__version__ >= "2" and sys.platform != "win32":
        model = torch.compile(model)

    print("Model distribution is: ", model.hf_device_map, flush=True)

    ####################################
    ### Memory stats after BetterTransformer
    print("Memory stats AFTER BetterTransformer")
    print_gpu_memory_stats()

    ######################################
    print("TOKENIZER stats are: ", tokenizer.__dict__, "\n#########\n")

    return tokenizer, model

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/anmol/.cache/huggingface/token
Login successful
Huggingface login done
Starting time is: 1697628931.6823015
Device is: cuda
___________________________________________


RuntimeError: cuDNN version incompatibility: PyTorch was compiled  against (8, 9, 2) but found runtime version (8, 5, 0). PyTorch already comes bundled with cuDNN. One option to resolving this error is to ensure PyTorch can find the bundled cuDNN.

In [6]:
# Checkpoint directory to load. Set the BASE_MODEL_PATH environment variable to
# point at a different checkpoint; the default is the fine-tuned checkpoint
# path this notebook was originally run with (user-specific absolute path).
base_model = os.environ.get(
    "BASE_MODEL_PATH",
    "/home/anmol/nips_challenge/efficiency_challenge_repo/code/00_starter_repo/neurips_llm_efficiency_challenge/sample-submissions/llama_recipes/models_saved/32_32_8fa6081f-d6df-4d0b-95d1-f209962c35b1/WHOLE_best_model_yet_epoch_8",
)

In [None]:
# Load the checkpoint referenced by `base_model` without 8-bit quantization,
# letting `device_map="auto"` decide the device placement.
model = LlamaForCausalLM.from_pretrained(
    base_model,
    return_dict=True,
    device_map="auto",
    load_in_8bit=False,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
!whoami

In [None]:
        ,
