In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import transformers
import torch
import json

SYS_PROMPT = """You are an assistant for answering questions.
You are given the extracted parts of a long document and a question. Don't make up an answer."""

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# use quantization to lower GPU usage                                                
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=bnb_config,
    cache_dir="~/data/models/transformers"
)
terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

FileNotFoundError: [Errno 2] No such file or directory: '../../blobs/7784fbf6342de338b736f884a49b08f270c5e9c8' -> '/home/louis/data/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e5e23bbe8e749ef0efcf16cad411a7d23bd23298/config.json'

In [5]:
formula_question = "What is the material studied in this paper? Format the answer as MATERIAL: {Chemical Formula}. If there are multiple materials, separate them with &. Just give a formula and do not provide any explanation. Here are some example outputs: 'MATERIAL: Ga3As4.5 & Al0.6Fe0.4 & TexS1-x & UF6', 'MATERIAL: PrOs4Sb12 & PrOs3Sb13'"
temp_question = "What is the critical temperature at zero-field of {MATERIAL}? Just give a number and do not provide any explanation. Format the answer as CRITICAL TEMPERATURE: {Number} K. Here are some example outputs: 'CRITICAL TEMPERATURE: 3 K', 'CRITICAL TEMPERATURE: 15.6 K'"
#field_question = "What is upper critical field of {MATERIAL}? Just give a number and do not provide any explanation. Format the answer as MAGNETIC FIELD: {Number} T."

In [9]:
def format_prompt(prompt, paper_text):
  PROMPT = f"Question: {prompt}\nContext: " + paper_text
  return PROMPT

def generate(formatted_prompt):
  formatted_prompt = formatted_prompt[:4000] # to avoid GPU OOM                      
  messages = [{"role":"system","content":SYS_PROMPT}, {"role":"user","content":formatted_prompt}]
  # tell the model to generate                                                       
  input_ids = tokenizer.apply_chat_template(
      messages,
      add_generation_prompt=True,
      return_tensors="pt"
  ).to(model.device)
  outputs = model.generate(
      input_ids,
      max_new_tokens=1024,
      eos_token_id=terminators,
      do_sample=True,
      temperature=0.6,
      top_p=0.9,
  )
  response = outputs[0][input_ids.shape[-1]:]
  return tokenizer.decode(response, skip_special_tokens=True)

In [10]:
import os

answers = {}
answers['questions'] = [formula_question, temp_question]#, field_question]

paper_source_directory = '/home/louis/research/pdf_processor/processed_data/superconductivity_processed/'
file_name = 'text.txt'

for i, directory in enumerate(os.listdir(paper_source_directory)):
    object_path = os.path.join(paper_source_directory, directory)
    print(i, object_path)
    if os.path.isdir(object_path) and not directory.startswith("."):
        
        paper_textfile = os.path.join(paper_source_directory, directory, 'text.txt')
        with open(paper_textfile) as f:
            paper_text = f.read()
            answers[directory] = []
            material_string = generate(format_prompt(formula_question, paper_text))
            print(material_string)
            answers[directory].append(material_string)
            materials = [out.strip() for out in material_string.split(":")[1].split("&")]
            q1list = []
            #q2list = []
            for material in materials:
                q1list.append(generate(format_prompt(temp_question.replace('{MATERIAL}', material), paper_text)))
                #q2list.append(generate(format_prompt(field_question.replace('{MATERIAL}', material), paper_text)))
            answers[directory].append(",".join(q.split(":")[1] for q in q1list))
            #answers[directory].append(",".join(q.split(":")[1] for q in q2list))
            print(answers[directory])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


0 /home/louis/research/pdf_processor/processed_data/superconductivity_processed/physrevb.73.245106


KeyboardInterrupt: 

In [None]:
with open('run3.json', 'w', encoding='utf-8') as f:
    json.dump(dict(sorted(answers.items())), f, ensure_ascii=False, indent=4)