In [None]:
!pip install replicate

Collecting replicate
  Downloading replicate-0.26.0-py3-none-any.whl (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.0/40.0 kB[0m [31m481.9 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx<1,>=0.21.0 (from replicate)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.21.0->replicate)
  Downloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.21.0->replicate)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: h11, httpcore, httpx, replicate
Successfully installed h11-0.14.0 

In [None]:
import google.colab.drive as drive
import replicate
import json
import re
import os

In [None]:
class Dataset:
    """Load a PubTator-style relation file into a dictionary keyed by document id.

    Layout consumed by ``load``: documents are separated by a blank line; each
    document starts with ``<id>|t|<text>`` and is followed by relation rows made
    of the document id and three tab-separated fields (relation, chemical,
    disease).
    """

    def __init__(self, from_drive=True):
        """Create an empty dataset and optionally mount Google Drive.

        Args:
            from_drive: when truthy, Google Drive is mounted at
                ``/content/drive`` via ``google.colab.drive``.
        """
        self.dictionaryofdocs = {}
        self.from_drive = from_drive
        print(f"from_drive: {from_drive}")
        if from_drive:
            drive.mount('/content/drive')
            print("---------------Connected to your drive---------------")

    def load(self, url):
        """Read the file at ``url`` and parse every document in it.

        Args:
            url: path of the text file to read.

        Returns:
            ``self.dictionaryofdocs``: ``{doc_id: {"title": str, "rels": [
            {"relation": ..., "chemical": ..., "disease": ...}, ...]}}``.
            Documents parsed by earlier ``load`` calls on the same instance
            remain in the dictionary.

        Raises:
            IndexError: if a non-empty document has no ``|t|`` marker or a
                relation row has fewer than three tab-separated fields.
        """
        with open(url, "r") as fh:
            self.pubtaturtext = fh.read()

        # Build a filtered list instead of calling remove() while iterating:
        # removing during iteration skips the element after each removal, which
        # left empty chunks behind whenever two of them were adjacent.
        self.listofdocs = [
            doc for doc in re.split("\n\n", self.pubtaturtext) if doc.strip() != ''
        ]

        for doc in self.listofdocs:
            doc = doc.strip()
            # Split on the first marker only so a '|t|' occurring later in the
            # text cannot truncate the record.
            id_record = doc.split('|t|', 1)
            doc_id = id_record[0].strip()
            self.dictionaryofdocs[doc_id] = {}
            record = id_record[1]

            # The first piece is the text, every following piece is one relation row.
            # NOTE(review): splitting on the bare id assumes the id string never
            # occurs inside the text or the entity names themselves.
            title_rel = record.split(doc_id)
            self.dictionaryofdocs[doc_id]["title"] = title_rel[0].strip()

            self.dictionaryofdocs[doc_id]["rels"] = []
            for rel in title_rel[1:]:
                fields = rel.strip().split('\t')
                self.dictionaryofdocs[doc_id]["rels"].append({
                    "relation": fields[0],
                    "chemical": fields[1],
                    "disease": fields[2]
                })

        return self.dictionaryofdocs

In [None]:
class Makeprompt:
    """Turns a stored document into the text that is sent to the model."""

    def __init__(self, dictofdocuments):
        """Keep a reference to the ``{doc_id: {"title": ...}}`` mapping."""
        self.dictofdocuments = dictofdocuments

    def prompt(self, doc_id):
        """Return the stored title of ``doc_id``.

        The id and the title are also remembered on the instance as
        ``doc_id`` and ``title_as_prompt``.
        """
        self.doc_id = doc_id
        document = self.dictofdocuments[doc_id]
        self.title_as_prompt = document["title"]
        return self.title_as_prompt

In [None]:
class Llm:
    """Small wrapper around ``replicate.run`` holding the model name and its inputs."""

    def __init__(self, cloud_token):
        """Store the token and export it as ``REPLICATE_API_TOKEN``.

        No request is sent here; the printed banner only confirms that the
        environment variable has been set.
        """
        self.cloud_token = cloud_token
        os.environ["REPLICATE_API_TOKEN"] = cloud_token
        print("---------------Connected to Replicate---------------")

    def load_params(
        self,
        model,
        prompt_template,
        system_prompt,
        top_k=0,
        top_p=0.9,
        max_tokens=512,
        min_tokens=0,
        temperature=0.6,
        length_penalty=1,
        stop_sequences="<|end_of_text|>,<|eot_id|>",
        presence_penalty=1.15,
        log_performance_metrics=False,
    ):
        """Remember the model, the two-part prompt template and the sampling inputs.

        ``prompt_template`` is indexed as ``[left, right]`` by ``generate``;
        every other argument is copied verbatim into ``self.input``.
        """
        self.model = model
        self.prompt_template = prompt_template
        self.input = dict(
            top_k=top_k,
            top_p=top_p,
            max_tokens=max_tokens,
            min_tokens=min_tokens,
            temperature=temperature,
            system_prompt=system_prompt,
            length_penalty=length_penalty,
            stop_sequences=stop_sequences,
            presence_penalty=presence_penalty,
            log_performance_metrics=log_performance_metrics,
        )

    def generate(self, promptt):
        """Run the model on ``promptt`` and return the concatenated output text.

        The prompt is stored in ``self.input`` both on its own and wrapped
        between the two halves of the prompt template. ``load_params`` must
        have been called first.
        """
        request = self.input
        request["prompt"] = promptt
        request["prompt_template"] = self.prompt_template[0] + promptt + self.prompt_template[1]

        # replicate.run yields text pieces; they are glued into a single string.
        self.output = replicate.run(self.model, input=request)
        self.output = "".join(self.output)
        return self.output

In [None]:
class Pubtatur:
    """Write generated relations to a PubTator-style text file.

    Each document is written as ``<id>|t|<prompt>`` followed by one
    ``<id><TAB><relation><TAB><chemical><TAB><disease>`` row per relation and a
    blank line.
    """

    DRIVE_SAVE_PATH = "/content/drive/MyDrive/relation extraction/results.pubtatur.text"
    LOCAL_SAVE_PATH = "results.pubtatur.text"

    def __init__(self, results_as_dictionary, to_drive=True):
        """Store the results and the output destination.

        Args:
            results_as_dictionary: ``{doc_id: {"prompt": str, "rels": [
                {"relation": ..., "chemical": ..., "disease": ...}, ...]}}``.
            to_drive: when truthy the file is written below
                ``/content/drive``; Drive is NOT mounted by this class, so it
                must already be mounted. Otherwise the file goes to the
                current working directory.
        """
        self.results_as_dictionary = results_as_dictionary
        self.to_drive = to_drive

    def write(self):
        """Append every stored document to the output file and report where it went."""
        # Resolved once, before the loop, so the path is defined even when there
        # are no results (previously that case raised UnboundLocalError).
        save_path = self.DRIVE_SAVE_PATH if self.to_drive else self.LOCAL_SAVE_PATH

        # Append mode is kept on purpose: successive runs extend the same file.
        # Nothing is opened when there is nothing to write.
        if self.results_as_dictionary:
            with open(save_path, 'a') as file:
                for doc_id, result in self.results_as_dictionary.items():
                    file.write(f"{doc_id}|t|{result['prompt']}")
                    for rel in result["rels"]:
                        rel_type = rel["relation"]
                        chemi = rel["chemical"]
                        disea = rel["disease"]
                        file.write(f"\n{doc_id}\t{rel_type}\t{chemi}\t{disea}")
                    file.write("\n\n")

        if self.to_drive:
            print(f"\n\n---------------file as Pubtatur format saved in Google drive---------------\n---------------saved path={save_path}---------------")
        else:
            print("---------------file as Pubtatur format saved in current directory---------------")




In [None]:
class log:
    """Append one processed document to the run logs.

    The class is used like a function: instantiating it performs the write.
    The entry always goes to ``log.txt`` in the current directory and, when the
    Drive folder exists, to the copy on Google Drive as well.
    """

    LOCAL_LOG_PATH = "log.txt"
    DRIVE_LOG_PATH = "/content/drive/MyDrive/relation extraction/log.txt"

    def __init__(self, documentid, title_prompt, list_of_dict_of_rels):
        """Write the log entry.

        Args:
            documentid: id of the processed document.
            title_prompt: prompt text that was sent for this document.
            list_of_dict_of_rels: relations extracted for this document.
        """
        entry = (
            f"id: {documentid}"
            f"\nprompt: {title_prompt}"
            f"\nlist_of_dict_of_rels: {list_of_dict_of_rels}"
            "\n\n"
        )

        targets = [self.LOCAL_LOG_PATH]
        # The Drive copy is only attempted when its folder is present, so a run
        # without a mounted Drive no longer fails with FileNotFoundError after
        # the first document.
        if os.path.isdir(os.path.dirname(self.DRIVE_LOG_PATH)):
            targets.append(self.DRIVE_LOG_PATH)

        for path in targets:
            with open(path, "a") as logfile:
                logfile.write(entry)

In [None]:
# Task instructions plus two worked examples. The text is written once and reused
# below, so the copy embedded in the chat template and the copy passed as the
# `system_prompt` input are always the same string.
system_prompt = """
You are a knowledgeable physician. You are tasked with extracting chemical-disease relations from medical texts. CID stands for Chemical-induced-disease relationship. Given a medical text, extract relevant relations and present them in a structured format. each relation is expressed as 'CID***chemical***disease'. remember extract at least one relation. Some examples are provided below:

Example 1:
Text: "Electrocardiographic evidence of myocardial injury in psychiatrically hospitalizedcocaine abusers.The electrocardiograms (ECG) of 99 cocaine-abusing patients were compared with theECGs of 50 schizophrenic controls. Eleven of the cocaine abusers and none of thecontrols had ECG evidence of significant myocardial injury defined as myocardialinfarction, ischemia, and bundle branch block."
Relation(s):
CID***cocaine***myocardial infarction
CID***cocaine***bundle branch block

Example 2:
Text: "Lidocaine-induced cardiac asystole.Intravenous administration of a single 50-mg bolus of lidocaine in a 67-year-oldman resulted in profound depression of the activity of the sinoatrial andatrioventricular nodal pacemakers. The patient had no apparent associated conditionswhich might have predisposed him to the development of bradyarrhythmias; and, thus,this probably represented a true idiosyncrasy to lidocaine."
Relation(s):
CID***Lidocaine***cardiac asystole
"""

# Everything that precedes the document text: the system turn (wrapping the
# instructions above) and the opening of the user turn.
prompt_template_leftside = (
    "\n<|begin_of_text|>"
    "\n<|start_header_id|>system<|end_header_id|>"
    "\n\n"
    + system_prompt
    + "\n<|eot_id|>"
    "\n\n<|start_header_id|>user<|end_header_id|>"
    "\n\n\nText: "
)

# Everything that follows the document text: end of the user turn and the start
# of the assistant turn, primed with the "Relation(s):" header.
prompt_template_rightside = (
    "\n<|eot_id|>"
    "\n\n<|start_header_id|>assistant<|end_header_id|>"
    "\n\n\nRelation(s):\n\n"
)

# [left, right] halves; the document text is inserted between them.
prompt_template = [prompt_template_leftside, prompt_template_rightside]

In [None]:
class main:
    """Run the whole extraction pipeline; instantiating the class does the work.

    Steps: load the dataset, configure the model, send the text of each selected
    document to the model, parse the ``relation***chemical***disease`` lines it
    returns, log every document and finally write all results in PubTator style.
    """

    def __init__(self, dataset_url, cloudtoken, modelname, from_drive=True, prompt_template=prompt_template, system_prompt=system_prompt, start_index=490, end_index=500):
        """Execute the pipeline.

        Args:
            dataset_url: path of the PubTator-style input file.
            cloudtoken: Replicate API token.
            modelname: Replicate model identifier.
            from_drive: forwarded to ``Dataset`` (mount Google Drive first).
            prompt_template: ``[left, right]`` halves wrapped around each text.
            system_prompt: system prompt forwarded to the model inputs.
            start_index: position of the first document to process.
            end_index: position one past the last document to process. The
                defaults keep the previously hard-coded ``[490:500]`` window.
        """
        fromdrive = Dataset(from_drive)
        docs_as_dict = fromdrive.load(dataset_url)
        chat = Llm(cloud_token=cloudtoken)
        chat.load_params(model=modelname, prompt_template=prompt_template, system_prompt=system_prompt)

        make = Makeprompt(docs_as_dict)
        results_as_dict = {}

        selected_ids = list(docs_as_dict.keys())[start_index:end_index]
        for passed_docs, docid in enumerate(selected_ids):
            my_prompt = make.prompt(docid)
            # The model answers with one relation per line.
            llm_output = chat.generate(my_prompt).split('\n')

            results_as_dict[docid] = {
                "prompt": my_prompt,
                "rels": self._parse_relations(llm_output),
            }
            print(f"{passed_docs}, {docid}")
            log(documentid=docid, title_prompt=results_as_dict[docid]["prompt"], list_of_dict_of_rels=results_as_dict[docid]["rels"])

        make_pub = Pubtatur(results_as_dict)
        make_pub.write()

    @staticmethod
    def _parse_relations(lines):
        """Convert ``relation***chemical***disease`` lines into relation dictionaries.

        Lines without at least three ``***``-separated fields (blank lines, stray
        commentary from the model) are skipped instead of raising IndexError, so
        one malformed line no longer discards the results of the whole run.
        Surrounding whitespace is stripped from every field.
        """
        relations = []
        for line in lines:
            parts = [part.strip() for part in line.strip().split("***")]
            if len(parts) < 3:
                continue
            relations.append({
                "relation": parts[0],
                "chemical": parts[1],
                "disease": parts[2]
            })
        return relations

In [None]:
# Input file in PubTator-style layout, read from the mounted Google Drive.
dataset_url = "/content/drive/MyDrive/relation extraction/test.PubTator_edited.txt"
# Use a token already exported as REPLICATE_API_TOKEN so the secret does not have
# to be pasted into (and saved with) the notebook; the placeholder is the fallback.
token = os.environ.get("REPLICATE_API_TOKEN", "Replicate Token")
# Replicate model identifier passed to the model wrapper.
model = "meta/meta-llama-3-70b-instruct"

In [None]:
# Run the pipeline with the configuration defined in the previous cell.
main(
    dataset_url=dataset_url,
    cloudtoken=token,
    modelname=model,
)

from_drive: True
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
---------------Connected to your drive---------------
---------------Connected to Replicate---------------
['CID***fluvastatin***liver damage', 'CID***fluvastatin***acute liver injury', 'CID***fluvastatin***hepatic damage']


<__main__.main at 0x7ed630548820>