In [None]:
!pip install -U sacremoses
!pip install accelerate



In [1]:
import json
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import concurrent.futures
from concurrent.futures import *
import re

In [None]:
import gdown

# Download expertqa from Google Drive using its ID
file_id = '1xLToa0J8Jyee1RU_mFDlWm40SUJ89nCe'
output_file = 'expertqa.jsonl'  # Choose the output file path

gdown.download(f'https://drive.google.com/uc?id={file_id}', output_file, quiet=False)


Downloading...
From: https://drive.google.com/uc?id=1xLToa0J8Jyee1RU_mFDlWm40SUJ89nCe
To: /content/expertqa.jsonl
100%|██████████| 30.1M/30.1M [00:00<00:00, 126MB/s]


'expertqa.jsonl'

In [2]:
def parse_jsonl_for_fields(file_path, fields):
    """
    Parses a JSONL file and returns dictionaries for specified fields.

    :param file_path: Path to the JSONL file.
    :param fields: List of fields to extract data for.
    :return: A dictionary where each key is a field and the value is another dictionary
             of question-answer pairs for that field.
    """
    data = {field: {} for field in fields}

    with open(file_path, 'r') as file:
        for line in file:
            line_data = json.loads(line)
            field = line_data.get('metadata', {}).get('field')
            question = line_data.get('question')

            for answer_key in line_data.get('answers', {}):
                revised_answer = line_data['answers'][answer_key].get('revised_answer_string')
                if field in fields and question and revised_answer:
                    data[field][question] = revised_answer
                    break # There is only one revised answer per question

            if field in fields and question and revised_answer:
                data[field][question] = revised_answer

    return data

In [3]:
parsed_data = parse_jsonl_for_fields('expertqa.jsonl', ["Healthcare / Medicine", "Law"])

In [None]:
# Display a small part of the data to verify
for field in parsed_data:
    print(f"Field: {field}, Number of entries: {len(parsed_data[field])}")
    for question in list(parsed_data[field].keys())[:2]:  # Displaying the first two entries for brevity
        print(f"  Question: {question}")
        print(f"  Answer: {parsed_data[field][question][:100]}...")  # Displaying first 100 characters of answer
    print("\n")

Field: Healthcare / Medicine, Number of entries: 504
  Question: What are signs and study findings that would indicate follicular lymphoma has transformed to diffuse large B-cell lymphoma?
  Answer: Signs that might indicate a transformation of follicular lymphoma (FL) to diffuse large B-cell lymph...
  Question: A patient with a history of heart failure now presents with newly diagnosed metastatic HER2+ breast cancer. What is her recommended first line of treatment and what additional information should be discussed with the patient given her history of heart failure?
  Answer: According to the web search results, the recommended first line of treatment for HER2+ metastatic br...


Field: Law, Number of entries: 103
  Question: How will the estate of an individual who dies without a will be distributed?
  Answer: When an individual dies without a will, their estate is distributed according to the intestacy rules...
  Question: What are the requirements for claiming inheritance as per 

In [4]:
path = "Mohammed-Altaf/Medical-ChatBot"
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = GPT2Tokenizer.from_pretrained(path)
model = GPT2LMHeadModel.from_pretrained(path).to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
outputs = {}

prompt_input = (
    "The conversation between human and AI assistant.\n"
    "[|Human|] {input}\n"
    "[|AI|]"
)

count = 0
field_data = parsed_data['Healthcare / Medicine']
for question, answer in field_data.items():
    count += 1
    if count > 379:
      sentence = prompt_input.format_map({'input': question})
      inputs = tokenizer(sentence, return_tensors="pt").to(device)

      with torch.no_grad():
          beam_output = model.generate(**inputs,
                                      min_new_tokens=1,
                                      max_length=512,
                                      num_beams=3,
                                      repetition_penalty=1.2,
                                      early_stopping=True,
                                      eos_token_id=198)
          generated_text = tokenizer.decode(beam_output[0], skip_special_tokens=True)
          response = re.search(r'\[\|AI\|\](.*?)$', generated_text, flags=re.DOTALL).group(1).strip()
          outputs[question] = response

      with open('output.json', 'a') as f:
          json.dump({question: response}, f)
          f.write('\n')

Setting `pad_token_id` to `eos_token_id`:198 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:198 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:198 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:198 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:198 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:198 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:198 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:198 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:198 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:198 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:198 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:198 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:198 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:198 for open-end generation.
Setting `pad_token_i