In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os
from getpass import getpass

from huggingface_hub import login

# Authenticate this session against the Hugging Face Hub.
# SECURITY: an access token must never be written into the notebook source.
# It is read from the HF_TOKEN environment variable, with a hidden interactive
# prompt as fallback. Any token that was ever pasted here in plain text should
# be treated as leaked and revoked in the Hub account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=hf_token)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/rr4577/.cache/huggingface/token
Login successful


In [4]:
# Step 1: build the Accelerator with a Fully Sharded Data Parallel (FSDP) plugin.
from accelerate import Accelerator, FullyShardedDataParallelPlugin
from torch.distributed.fsdp.fully_sharded_data_parallel import (
    FullOptimStateDictConfig,
    FullStateDictConfig,
)

# Model and optimizer state dicts use the same settings:
# offload_to_cpu=True and rank0_only=False.
model_state_cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=False)
optim_state_cfg = FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False)

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=model_state_cfg,
    optim_state_dict_config=optim_state_cfg,
)
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [5]:
# Step 2: load the persisted FAISS vector store and run one similarity query
# against it as a sanity check.
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

print("loading vector_db ....")

# Embedding model handed to the store when loading it from disk.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
persist_directory = 'faiss_db/'
db = FAISS.load_local(persist_directory, embeddings)

print("quering question ....")
question = "What factors controls the ability of palladium cathods to attain high loading levels?"
result = db.similarity_search(question)

# Print every retrieved document with its position in the result list.
for i, doc in enumerate(result):
    print(f"Document {i}")
    print(doc.page_content)
    print(f"\n")

loading vector_db ....
quering question ....
Document 0
The ability of palladium cathodes to attain and maintain high loading levels, at high 
current density and for long times, is controlled by two factors: the condition of the 
electrochemical interface which allows the attainment of high deuterium activity; the defect 
density and mechanical condition of the bulk material which permits the Pd lattice to withstand 
and contain high bulk deuterium activities when these equilibrate to produce extreme pressures of 
deuterium gas inside closed incipient voids within the metal. 
Examples are given from a set of 26 intentionally similar current ramps which show 
three distinct Modes of loading performance: Mode A - a linear decrease of Pd resistance, 
beyond the resistance maximum, with logarithmic increase in electrochemical current; Mode B -an 
initial log-linear decrease on the Mode A trajectory, followed by a rapid increase in resistance 
when the resistance falls below a critical val

In [14]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, AutoConfig

# Quantization settings so the large model can be loaded with less GPU memory
# (requires the `bitsandbytes` library): 4-bit loading, "nf4" quantization type,
# double quantization enabled, bfloat16 as the compute dtype.
quant_kwargs = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
bnb_config = BitsAndBytesConfig(**quant_kwargs)

In [15]:
# Hub repository id of the checkpoint; reused for the config, model and tokenizer.
model_id = "meta-llama/Llama-2-7b-chat-hf"

In [17]:
# Load the model configuration, then the causal-LM weights quantized with
# `bnb_config`; `device_map="auto"` lets the loader decide device placement.
model_config = AutoConfig.from_pretrained(model_id)

# `trust_remote_code=True` was removed on purpose: the config above already loads
# without it, so the flag is not needed, and leaving it on would allow Python code
# shipped inside the model repository to be executed locally.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    config=model_config,
    quantization_config=bnb_config,
    device_map="auto",
)

Downloading shards: 100%|██████████| 2/2 [00:00<00:00, 24.32it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:21<00:00, 10.52s/it]


In [18]:
# Put the model in evaluation mode (torch.nn.Module.eval) before running inference.
# The call is the cell's last expression, so the returned module's repr is
# displayed as the cell output.
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm

In [19]:
# Name of the currently selected CUDA device when CUDA is available, else the CPU.
if torch.cuda.is_available():
    device = f'cuda:{torch.cuda.current_device()}'
else:
    device = 'cpu'

In [21]:
# NOTE(review): `device` is the current CUDA device computed in an earlier cell,
# while the model was placed by `device_map='auto'`; this message therefore reports
# the default device, not necessarily every device the model occupies — confirm.
print(f"Model loaded on {device}")

# Tokenizer loaded from the same repository id as the model.
tokenizer = AutoTokenizer.from_pretrained(model_id)

Model loaded on cuda:0


In [22]:
# Single-turn instruction prompt: the question and its supporting context are
# wrapped in [INST] ... [/INST] markers.
prompt = """[INST] What is range of ion energies in HELIS based on below context?  The ion beam installation HELIS (P.N. Lebedev Physics Institute, Moscow, Russia) represents an 
ion accelerator of light elements with atomic number in the range Z=1-54 with ion energies ranging from 0.5 
to 50 keV operating at deuteron current densities up to 2 A/cm2 and intended to perform a wide spectrum of 
physical experiments related to LENR [/INST]"""

# Tokenize and move the tensors to the device the model actually lives on,
# instead of a hardcoded 'cuda'.
encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
# Kept as the plain input-id tensor so the name means the same as before.
inputs_not_chat = encoded["input_ids"]

# Pass the attention mask explicitly; previously it was discarded and generate()
# had to infer it from the input ids.
generated_ids = model.generate(
    inputs_not_chat,
    attention_mask=encoded["attention_mask"],
    max_new_tokens=1000,
    do_sample=True,  # sampling: the generated text differs between runs
)
decoded = tokenizer.batch_decode(generated_ids)

In [23]:
# Print the whole list returned by batch_decode; the output below shows the
# decoded string starting with the original prompt.
print(decoded)

["<s> [INST] What is range of ion energies in HELIS based on below context?  The ion beam installation HELIS (P.N. Lebedev Physics Institute, Moscow, Russia) represents an \nion accelerator of light elements with atomic number in the range Z=1-54 with ion energies ranging from 0.5 \nto 50 keV operating at deuteron current densities up to 2 A/cm2 and intended to perform a wide spectrum of \nphysical experiments related to LENR [/INST]  Based on the context you provided, the range of ion energies in the HELIS ion beam installation is:\n\n* Ion energies ranging from 0.5 to 50 keV (0.5 to 50,000 electronvolts, eV)\n\nThis means that the ion beam can be used to study the properties of light elements with atomic numbers in the range of 1 to 54 (hydrogen to uranium), with ion energies spanning a wide range from low energies (0.5 keV) to high energies (50 keV). The deuteron current densities in the installation can reach up to 2 amperes per square centimeter (A/cm2), which is a relatively high

In [24]:
# Single-turn instruction prompt: the question followed by a context passage,
# wrapped in [INST] ... [/INST] markers.
prompt = """[INST] What factors controls the ability of palladium cathods to attain high loading levels given below context?
The ability of palladium cathodes to attain and maintain high loading levels, at high 
current density and for long times, is controlled by two factors: the condition of the 
electrochemical interface which allows the attainment of high deuterium activity; the defect 
density and mechanical condition of the bulk material which permits the Pd lattice to withstand 
and contain high bulk deuterium activities when these equilibrate to produce extreme pressures of 
deuterium gas inside closed incipient voids[/INST]"""

# Tokenize and move the tensors to the device the model actually lives on,
# instead of a hardcoded 'cuda'.
encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
# Kept as the plain input-id tensor so the name means the same as before.
inputs_not_chat = encoded["input_ids"]

# Pass the attention mask explicitly; previously it was discarded and generate()
# had to infer it from the input ids.
generated_ids = model.generate(
    inputs_not_chat,
    attention_mask=encoded["attention_mask"],
    max_new_tokens=1000,
    do_sample=True,  # sampling: the generated text differs between runs
)
decoded = tokenizer.batch_decode(generated_ids)

In [25]:
# Print only the first decoded sequence so its newlines render as line breaks
# rather than as escaped characters inside a list repr.
print(decoded[0])

<s> [INST] What factors controls the ability of palladium cathods to attain high loading levels given below context?
The ability of palladium cathodes to attain and maintain high loading levels, at high 
current density and for long times, is controlled by two factors: the condition of the 
electrochemical interface which allows the attainment of high deuterium activity; the defect 
density and mechanical condition of the bulk material which permits the Pd lattice to withstand 
and contain high bulk deuterium activities when these equilibrate to produce extreme pressures of 
deuterium gas inside closed incipient voids[/INST]  The ability of palladium cathodes to attain and maintain high loading levels at high current densities and for long times is controlled by two main factors:

1. Electrochemical interface conditions: The condition of the electrochemical interface between the palladium cathode and the electrolyte solution plays a crucial role in determining the high deuterium activi