In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

from huggingface_hub import login
login("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")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/gm3314/.cache/huggingface/token
Login successful


In [2]:
# Step1 : Accelerator
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [34]:
# Step2: Load the Vectordb and getting it to production.
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings

embedding = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
db = Chroma(persist_directory="./chroma_db", embedding_function=embedding)

print("loading vector_db ....")
db = Chroma(persist_directory="./chroma_db", embedding_function=embedding)

print("quering question ....")
question = "What factors controls the ability of palladium cathods to attain high loading levels?"
result = db.similarity_search(question, k=3)

for i in range(3):
    print(f"document {i}")
    print(result[i].page_content)
    print("")

loading vector_db ....
quering question ....
document 0
The ability of palladium cathodes to attain and maintain high loading levels, at high 
current density and for long times, is controlled by two factors: the condition of the 
electrochemical interface which allows the attainment of high deuterium activity; the defect 
density and mechanical condition of the bulk material which permits the Pd lattice to withstand 
and contain high bulk deuterium activities when these equilibrate to produce extreme pressures of 
deuterium gas inside closed incipient voids

document 1
"Quasi-Plasma" Transport Model in Deuterium 
Overloaded Palladium Cathodes

document 2
palladium cathode, 2mm¢> 
x 7.05mm, was a heat source, although the mechanism of the heat generation is still 
uncertain. During the whole period of this run, however, the phenomenon took place 
only once. Neither increase of neutron emission nor that of tritium concentration has 
been detected. Mass analysis showed that any traces of

In [25]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Activate 4-bit precision base model loading
use_4bit = True
# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"
# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"
# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)


model_name='mistralai/Mistral-7B-Instruct-v0.1'
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [26]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

Downloading config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/5.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [31]:
inputs_not_chat = tokenizer.encode_plus("""[INST] What is range of ion energies in HELIS based on below context?  The ion beam installation HELIS (P.N. Lebedev Physics Institute, Moscow, Russia) represents an 
ion accelerator of light elements with atomic number in the range Z=1-54 with ion energies ranging from 0.5 
to 50 keV operating at deuteron current densities up to 2 A/cm2 and intended to perform a wide spectrum of 
physical experiments related to LENR [/INST]""", return_tensors="pt")['input_ids'].to('cuda')

generated_ids = model.generate(inputs_not_chat, 
                               max_new_tokens=1000, 
                               do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [32]:
print(decoded)

["<s> [INST] What is range of ion energies in HELIS based on below context?  The ion beam installation HELIS (P.N. Lebedev Physics Institute, Moscow, Russia) represents an \nion accelerator of light elements with atomic number in the range Z=1-54 with ion energies ranging from 0.5 \nto 50 keV operating at deuteron current densities up to 2 A/cm2 and intended to perform a wide spectrum of \nphysical experiments related to LENR [/INST] Based on the provided information, the range of ion energies in HELIS is from 0.5 to 50 keV. However, it's important to note that ion energies are not usually directly related to atomic number. Ion energies are instead determined by the charge and mass of the ions being accelerated in the HELIS ion accelerator.</s>"]


In [35]:
inputs_not_chat = tokenizer.encode_plus("""[INST] What factors controls the ability of palladium cathods to attain high loading levels? [/INST]""", return_tensors="pt")['input_ids'].to('cuda')

generated_ids = model.generate(inputs_not_chat, 
                               max_new_tokens=1000, 
                               do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)
print(decoded)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['<s> [INST] What factors controls the ability of palladium cathods to attain high loading levels? [/INST] There are several factors that can control the ability of palladium cathods to attain high loading levels, including:\n\n1. Electrolyte composition: The composition of the electrolyte used in the cathodic reduction process can affect the loading capacity of palladium cathods. For example, electrolytes that contain high concentrations of oxygen or other electron acceptors can increase the loading capacity of palladium cathods.\n2. Temperature: The temperature at which reduced palladium cathods are formed can affect their loading capacity. Higher temperatures generally result in lower loading capacities, while lower temperatures can lead to higher loading capacities.\n3. Redox potential: The redox potential of the palladium cathods also plays a role in their loading capacity. Higher redox potentials can reduce palladium to lower oxidation states, resulting in higher loading capaciti

In [36]:
inputs_not_chat = tokenizer.encode_plus("""[INST] What factors controls the ability of palladium cathods to attain high loading levels given below context?
The ability of palladium cathodes to attain and maintain high loading levels, at high 
current density and for long times, is controlled by two factors: the condition of the 
electrochemical interface which allows the attainment of high deuterium activity; the defect 
density and mechanical condition of the bulk material which permits the Pd lattice to withstand 
and contain high bulk deuterium activities when these equilibrate to produce extreme pressures of 
deuterium gas inside closed incipient voids[/INST]""", return_tensors="pt")['input_ids'].to('cuda')

generated_ids = model.generate(inputs_not_chat, 
                               max_new_tokens=1000, 
                               do_sample=True)
decoded = tokenizer.batch_decode(generated_ids)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [38]:
print(decoded[0])

<s> [INST] What factors controls the ability of palladium cathods to attain high loading levels given below context?
The ability of palladium cathodes to attain and maintain high loading levels, at high 
current density and for long times, is controlled by two factors: the condition of the 
electrochemical interface which allows the attainment of high deuterium activity; the defect 
density and mechanical condition of the bulk material which permits the Pd lattice to withstand 
and contain high bulk deuterium activities when these equilibrate to produce extreme pressures of 
deuterium gas inside closed incipient voids[/INST] The two factors that control the ability of palladium cathodes to attain high loading levels are as follows:

1. Condition of the electrochemical interface: The ability of palladium cathodes to attain high deuterium activity is largely dependent on the condition of the electrochemical interface between the cathode and the electrolyte. A well-prepared and clean inte