## Convert BigBIO to custom Tag format to allow processing by advanced NLP models like OPEN AI's GPT

In [40]:
# load Big BIO gold data to json
import json

# Load the JSON data
with open('../data/GraSSco/grascco_hpi_anno_2023_02_08/annotations/json/fine/long/all.json') as f:
    data = json.load(f)

In [41]:
##transform dataformat

def tag_entities(document, text=None, text_start_index = 0, entity_types=None):

    if text == None:
        text = '\n'.join([p['text'] for p in document['passages']])

    #  Dictionaries of entity types and their tags (open + close)

    entity_tags_open = {
        'Diagnosis_or_Pathology': '<Diagnosis_or_Pathology>',
        'Other_Finding': '<Other_Finding>',
        'Therapeutic': '<Therapeutic>',
        'Diagnostic': '<Diagnostic>',
        'Clinical_Drug': '<Clinical_Drug>',
        'External_Substance': '<External_Substance>',
        'Nutrient_or_Body_Substance': '<Nutrient_or_Body_Substance>'
    }

    entity_tags_close = {
        'Diagnosis_or_Pathology': '</Diagnosis_or_Pathology>',
        'Other_Finding': '</Other_Finding>',
        'Therapeutic': '</Therapeutic>',
        'Diagnostic': '</Diagnostic>',
        'Clinical_Drug': '</Clinical_Drug>',
        'External_Substance': '</External_Substance>',
        'Nutrient_or_Body_Substance': '</Nutrient_or_Body_Substance>'
    }


    if entity_types is None:
        entity_types = list(entity_tags_open.keys())
    elif not isinstance(entity_types, list):
        entity_types = [entity_types]


    entities = sorted([entity for entity in document['entities'] if entity['type'] in entity_types
                       and entity['offsets'][0][0] >= text_start_index and entity['offsets'][0][1] <= text_start_index + len(text)],
                      key=lambda e: e['offsets'][0][0], reverse=False)

    text_pos = 0
    offset = 0
    # Loop through all characters in the text
    while text_pos < len(text):
        
        # Check if there is an entity that needs to be tagged
        for entity in entities:
            if entity['offsets'][0][0]-text_start_index == text_pos and entity['type'] in entity_types:
                # Add opening tag
                tag = entity_tags_open[entity['type']]
                start = entity['offsets'][0][0]-text_start_index + offset
                text = text[:start] + tag + text[start:]

                offset += len(tag)
            if entity['offsets'][0][1]-text_start_index == text_pos:
                # Add closing tag
                tag = entity_tags_close[entity['type']]
                end = entity['offsets'][0][1]-text_start_index + offset
                text = text[:end] + tag + text[end:]
                
                offset += len(tag)
        text_pos+=1
    return text



### Convert complete GraSSco into Tag Schema

In [42]:
tagged_text = ""
for doc in data:
    tagged_text += tag_entities(doc)


# Write the tagged text to a file
with open('full_GraSSco.txt', 'w') as f:
    f.write(tagged_text)

## Sample 15 from Grassco

In [43]:
import random

random.seed(42)

entities = [p for d in data for p in d.get('passages', [])]

sampled_entities = random.sample(entities, k=15)


## Filter Sentences from all GraSSco by Entities, Nested Entities and Top Level Category

In [94]:
def has_entities(entities, sentence_offsets):
    for e in entities:
        return any(sentence_offsets[0][0] <= e['offsets'][0][0] and sentence_offsets[0][1] >= e['offsets'][0][1] for e in entities)
    
def has_sub_entities(entities, sentence_offsets):
    for e in entities:
        return any(
            sentence_offsets[0][0] <= e['offsets'][0][0] and sentence_offsets[0][1] >= e['offsets'][0][1]
            and (e['offsets'][0][0] > sentence_offsets[0][0] or e['offsets'][0][1] < sentence_offsets[0][1])
            for e in entities
        )

def has_enough_tokens_and_sub_entities(entities, sentence_offsets, sentence_text):
    for e in entities:
        return any(
            sentence_offsets[0][0] <= e['offsets'][0][0] and sentence_offsets[0][1] >= e['offsets'][0][1]
            and (e['offsets'][0][0] > sentence_offsets[0][0] or e['offsets'][0][1] < sentence_offsets[0][1])
            and sentence_text.count(" ") >= 2  # Counts spaces to check for >3 tokens; nltk is too slow
            for e in entities
        )


# Load the complete JSON data
with open('../data/GraSSco/grascco_hpi_anno_2023_02_08/annotations/json/fine/long/all.json') as f:
    data = json.load(f)

import random

random.seed(42)

sentences = [dict(p, document_id=d['document_id']) for d in data for p in d.get('passages', [])if has_enough_tokens_and_sub_entities(d.get('entities', []),p.get('offsets'),p.get('text'))] #only sentences with entities


In [95]:
sentences_with_entity_types = []

# Loop through passages with entities
for d in data:
    for p in d.get('passages', []):
        if has_enough_tokens_and_sub_entities(d.get('entities', []), p.get('offsets'), p.get('text')):
            sentence = dict(p, document_id=d['document_id'])
            entities_in_sentence = set()
            # Loop through entities in passage
            for entity in d['entities']:
                if any(o[0] >= p['offsets'][0][0] and o[1] <= p['offsets'][-1][1] for o in entity['offsets']):
                    entities_in_sentence.add(entity['type'])
                    sentence = dict(p, document_id=d['document_id'], entity_types=entities_in_sentence)
            # Append sentence with entity types to sentences list
            sentences_with_entity_types.append(sentence)

In [96]:
#entity_type_filter = None
#entity_type_filter = ['Diagnosis_or_Pathology', 'Other_Finding']
#entity_type_filter = ['Therapeutic', 'Diagnostic']
entity_type_filter = ['Clinical_Drug','External_Substance', 'Nutrient_or_Body_Substance']

def filter_sentences(sentences, entity_type=None):
    return  [s for s in sentences if entity_type == None or any(e in s['entity_types'] for e in entity_type)]


filtered_sentences = filter_sentences(sentences_with_entity_types, entity_type_filter)

### Sample 100 Sentence out of the filtered List

In [97]:
sampled_sentences_with_entity_types = random.sample(filtered_sentences, k=100)

In [98]:
# Apply data transformation to sampled data (sentences with tags)

# Define a helper function to tag entities for a given sentence and add it as a new column to the sentence
def tag_and_add(sentence, entity_type_filter=None):
    text = sentence['text']
    text_pos = sentence['offsets'][0][0]
    document_data = [d for d in data if d['document_id'] == sentence['document_id']][0]
    tagged_sentence = tag_entities(document_data, text, text_pos, entity_type_filter)
    sentence['tagged_sentence'] = tagged_sentence
    return sentence
# Tag entities
tagged_sentences = sampled_sentences_with_entity_types
for i in range(len(tagged_sentences)):
    tagged_sentences[i] = tag_and_add(tagged_sentences[i], entity_type_filter)

# Print the updated list of sentences
print(tagged_sentences)


[{'id': 19, 'type': 'sentence', 'text': 'MRT Oberbauch mit KM mit Kontrastmittel i.v. appliziert', 'offsets': [[550, 605]], 'document_id': 'Colon_Fake_H.tsv', 'entity_types': {'Diagnostic', 'Therapeutic', 'External_Substance'}, 'tagged_sentence': 'MRT Oberbauch mit <External_Substance>KM</External_Substance> mit <External_Substance>Kontrastmittel i.v.</External_Substance> appliziert'}, {'id': 45, 'type': 'sentence', 'text': '03.02.2028 Änderung der ABx:', 'offsets': [[3296, 3324]], 'document_id': 'Amanda_Alzheimer.tsv', 'entity_types': {'Clinical_Drug', 'Therapeutic'}, 'tagged_sentence': '03.02.2028 Änderung der <Clinical_Drug>ABx</Clinical_Drug>:'}, {'id': 48, 'type': 'sentence', 'text': 'Konsiliarbefunde : Psychiatrischer Konsiliarbefund v. 3.9.2024: Diagnose: Verd. a. beginnende organische Halluzinose. Therapie: neurolog. Konsil, bei Persistenz der visuell optischen Halluzination. Therapieversuch mit Risperdal.', 'offsets': [[3636, 3864]], 'document_id': 'Koenig.tsv', 'entity_types'

In [99]:
tagged_sentences_Substance = tagged_sentences

In [100]:
#extract Tags in separate column for readability
def extract_tagged_sentence(tagged_sentence):
    complete_tags = []
    for tag in tagged_sentence.split("</"):
        if "<" in tag:
            complete_tags.append("<" + tag.split("<")[1] + "</" + tag.split("<")[1].split(">")[0] + ">")
    return(complete_tags)

for i, tagged_sentence in enumerate(tagged_sentences):
    extracted_tags = extract_tagged_sentence(tagged_sentence['tagged_sentence'])
    tagged_sentences[i]['extracted_tags'] = extracted_tags


In [101]:
from tqdm import tqdm

processed_tagged_sentences = []
retry_sentences = tagged_sentences  # Initialize retry sentences with all sentences

prompt_mapping = {
    None: None,
    ('Diagnosis_or_Pathology', 'Other_Finding'): prompt_examples_finding,
    ('Therapeutic', 'Diagnostic'): prompt_examples_procedure,
    ('Clinical_Drug', 'External_Substance', 'Nutrient_or_Body_Substance'): prompt_examples_substance
}

while retry_sentences:
    for sentence in tqdm(retry_sentences, desc="Processing sentences"):
        try:
            processed_sentence = gpt_processing(sentence['tagged_sentence'], prompt_mapping[tuple(entity_type_filter)])
            sentence['processed_sentence'] = processed_sentence
            processed_tagged_sentences.append(sentence)
        except Exception as e:
            print(f"Error processing sentence: {e}")
            continue

    retry_sentences = [sentence for sentence in retry_sentences if sentence not in processed_tagged_sentences]



Retrying failed sentences:  43%|████▎     | 43/100 [06:01<04:10,  4.39s/it]

In [89]:
processed_tagged_sentences

[{'id': 24,
  'type': 'sentence',
  'text': 'Status: guter AZ/EZ.',
  'offsets': [[921, 941]],
  'document_id': 'Colon_Fake_G.tsv',
  'entity_types': {'Other_Finding'},
  'tagged_sentence': 'Status: <Other_Finding>guter AZ/EZ</Other_Finding>.',
  'extracted_tags': ['<Other_Finding>guter AZ/EZ</Other_Finding>'],
  'processed_sentence': 'Status: <Other_Finding>Gute Allgemein- und Ernährungszustände</Other_Finding>.'},
 {'id': 38,
  'type': 'sentence',
  'text': 'sp02 >90% BGA nach Pleurapunktion unverändert',
  'offsets': [[2796, 2841]],
  'document_id': 'Amanda_Alzheimer.tsv',
  'entity_types': {'Other_Finding', 'Therapeutic'},
  'tagged_sentence': '<Other_Finding>sp02 >90%</Other_Finding> <Other_Finding>BGA nach Pleurapunktion unverändert</Other_Finding>',
  'extracted_tags': ['<Other_Finding>sp02 >90%</Other_Finding>',
   '<Other_Finding>BGA nach Pleurapunktion unverändert</Other_Finding>'],
  'processed_sentence': '<Other_Finding>SpO2-Wert über 90%</Other_Finding> <Other_Finding>Unve

In [90]:
#remove everything behind /n (removes artifacts with Input / Output)
for i, cleaned_sentence in enumerate(processed_tagged_sentences):
    cleaned_sentence = cleaned_sentence['processed_sentence'].split("\n")[0]
    processed_tagged_sentences[i]['processed_sentence_cut_off'] = cleaned_sentence

In [91]:
import re

valid_tags = [
    'Diagnosis_or_Pathology',
    'Other_Finding',
    'Therapeutic',
    'Diagnostic',
    'Clinical_Drug',
    'External_Substance',
    'Nutrient_or_Body_Substance'
]

valid_tags = entity_type_filter


def remove_unwanted_tags(lst, valid_tags):
    # Define the regular expression pattern
    pattern = fr'<\/?(?![\/]?({"|".join(valid_tags)}))[^>]*>'

    # Remove unwanted tags from the list
    new_lst = re.sub(pattern, '', lst)
    return new_lst

#Apply unknown Tag removal
for i, processed_sentence in enumerate(processed_tagged_sentences):
    valid_text = remove_unwanted_tags(processed_sentence['processed_sentence_cut_off'],valid_tags)
    processed_tagged_sentences[i]['tag_cleaned_processed_sentence'] = valid_text


In [92]:
for i, processed_sentence in enumerate(processed_tagged_sentences):
    extracted_tags = extract_tagged_sentence(processed_sentence['tag_cleaned_processed_sentence'])
    processed_tagged_sentences[i]['processed_extracted_tags'] = extracted_tags

In [93]:
import pandas as pd


df = pd.DataFrame(processed_tagged_sentences)

df_expanded = df.explode('extracted_tags')
df_expanded

df_expanded.to_csv('df_100_diagnosis.csv')


In [41]:
import csv

# Combine the three lists (processed, tagged, sampled original) into one
combined_list = list(zip(processed_sentences, tagged_sentences, sampled_sentences))

# Write the combined list to a CSV file
with open('50_long_tokens_sampled_output.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['GPT_processed_sentence','tagged_sentence','original_passage'])  # Write the header to the CSV file
    writer.writerows(combined_list)

## OPEN AI Model application

In [4]:
#run GPT API calls to do conversion magic

import os
import openai
openai.organization = "org-83uYbbOlauNPHsw3VycskO8E"
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_key = "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"
openai.Model.list()

<OpenAIObject list at 0x10c400b80> JSON: {
  "data": [
    {
      "created": 1677532384,
      "id": "whisper-1",
      "object": "model",
      "owned_by": "openai-internal",
      "parent": null,
      "permission": [
        {
          "allow_create_engine": false,
          "allow_fine_tuning": false,
          "allow_logprobs": true,
          "allow_sampling": true,
          "allow_search_indices": false,
          "allow_view": true,
          "created": 1683912666,
          "group": null,
          "id": "modelperm-KlsZlfft3Gma8pI6A8rTnyjs",
          "is_blocking": false,
          "object": "model_permission",
          "organization": "*"
        }
      ],
      "root": "whisper-1"
    },
    {
      "created": 1649358449,
      "id": "babbage",
      "object": "model",
      "owned_by": "openai",
      "parent": null,
      "permission": [
        {
          "allow_create_engine": false,
          "allow_fine_tuning": false,
          "allow_logprobs": true,
         

In [6]:
# Example code to be edited
input = "Die <Therapeutic>Transplantat</Therapeutic>· und <Therapeutic>Entnahmeareale</Therapeutic> sollten <Therapeutic>täglich mehrfach mit <Clinical_Drug>fettenden Salben</Clinical_Drug> (z.B. <Clinical_Drug>Panthenol</Clinical_Drug>) gepflegt</Therapeutic> werden."

# Prompt for the edited code
instruction = f"Substantiviere die folgenden durch Tags markierte Entitäten und verschiebe dabei die Tags entsprechend, sodass die Zuweisung erhalten bleibt."

# Generate edited code using Davinci-Codex
response = openai.Edit.create(
    engine="text-davinci-edit-001",
    instruction=instruction,
    input=input2,
    temperature=0
)

# Extract the edited code from the response
edited_code = response.choices[0].text.strip()

print("Original code:")
print(original_code)
print("Edited code:")
print(edited_code)

NameError: name 'input2' is not defined

In [84]:
##Prompts

prompt_no_examples = "Substantiviere die folgenden durch Tags markierte Entitäten und verschiebe dabei die Tags entsprechend, sodass die Zuweisung erhalten bleibt."

prompt_no_examples2 = "Substantiviere die folgenden durch Tags markierte Entitäten und verschiebe dabei die Tags entsprechend, sodass die Zuweisung erhalten bleibt. Verwende dabei keine weiteren Tags und erstelle keine neuen Tags."

prompt_examples1 = "Substantiviere die folgenden durch Tags markierte Entitäten und verschiebe dabei die Tags entsprechend, sodass die Zuweisung erhalten bleibt. Ein Beispiel: Die Patientin wurde zunächst <Diagnostic>kontrolliert</Diagnostic> und im weiteren Verlauf <Therapeutic>augmentiert maschinell beatmet</Therapeutic>. Die Patientin wurde zunächst  einer <Diagnostic>Kontrolle</Diagnostic> unterzogen und erhielt im weiteren Verlauf <Therapeutic>augmentierde, maschinelle Beatmung</Therapeutic>."

prompt_examples2 = "Substantiviere die folgenden durch Tags markierte Entitäten und verschiebe dabei die Tags entsprechend, sodass die Zuweisung erhalten bleibt. Verwende dabei keine weiteren Tags und erstelle keine neuen Tags. Ein Beispiel: 'Die Patientin wurde zunächst <Diagnostic>kontrolliert</Diagnostic> und im weiteren Verlauf <Therapeutic>augmentiert maschinell beatmet</Therapeutic>.' wird zu 'Die Patientin wurde zunächst  einer <Diagnostic>Kontrolle</Diagnostic> unterzogen und erhielt im weiteren Verlauf <Therapeutic>augmentierde, maschinelle Beatmung</Therapeutic>.'"


prompt_examples_therapeutic_old = "Substantiviere die folgenden durch Tags markierte Entitäten und verschiebe dabei die Tags entsprechend, sodass die Zuordnung erhalten bleibt. Wende dabei keine weiteren Tags auf andere Wörter an und erstelle keine neuen Tags. Schreibe keine Wörter um, wenn es nicht erforderlich ist. \n Ein Beispiel: \n\n Input: 'Die Patientin wurde zunächst <Diagnostic>kontrolliert</Diagnostic> und im weiteren Verlauf <Therapeutic>augmentiert maschinell beatmet</Therapeutic>.' \n Output: 'Die Patientin wurde zunächst  einer <Diagnostic>Kontrolle</Diagnostic> unterzogen und erhielt im weiteren Verlauf <Therapeutic>augmentierde, maschinelle Beatmung</Therapeutic>.'"

prompt_examples_procedure = "Substantiviere die folgenden durch Tags markierten Entitäten und verschiebe dabei die Tags entsprechend, sodass die Zuordnung erhalten bleibt. Verwende keine weiteren Tags für andere Wörter und erstelle keine neuen Tags. Behalte die Wörter unverändert, sofern es nicht erforderlich ist. \n Ein Beispiel: \n\n Input: 'Die Patientin wurde zunächst <Diagnostic>kontrolliert</Diagnostic> und im weiteren Verlauf <Therapeutic>augmentiert maschinell beatmet</Therapeutic>.' \n Output: 'Die Patientin wurde zunächst einer <Diagnostic>Kontrolle</Diagnostic> unterzogen und erhielt im weiteren Verlauf <Therapeutic>augmentierte maschinelle Beatmung</Therapeutic>.'"

prompt_examples_finding = "Substantiviere die folgenden durch Tags markierten Entitäten und verschiebe dabei die Tags entsprechend, sodass die Zuordnung erhalten bleibt. Verwende keine weiteren Tags für andere Wörter und erstelle keine neuen Tags. Behalte die Wörter unverändert, sofern es nicht erforderlich ist. \n Beispiele: \n\n Input: 'Durch verschiedene Arbeitsgruppen wurde das Konzept der Barriere propagiert, d.h. die anatomische Grenzschicht von <Diagnosis_or_Pathology>nicht-tumorbefallenem</Diagnosis_or_Pathology> Gewebe (Faszie, Periost etc.), ist ebenfalls von großer Bedeutung.' \n Output: 'Durch verschiedene Arbeitsgruppen wurde das Konzept der Barriere propagiert, d.h. die anatomische Grenzschicht von Gewebe mit <Diagnosis_or_Pathology>Nicht-Tumorbefallenheit</Diagnosis_or_Pathology> (Faszie, Periost etc.), ist ebenfalls von großer Bedeutung.' \n\n Input: 'Der Patient war <Other_Finding>kardiopulmonal stabil</Other_Finding>, <Diagnosis_or_Pathology>fieberhafte Infekte</Diagnosis_or_Pathology> traten nicht auf.' \n Output: Der Patient war <Other_Finding> in einem kardiopulmonalen Stabilitätszustand</Other_Finding>, <Diagnosis_or_Pathology>fieberhafte Infekte</Diagnosis_or_Pathology> traten nicht auf."

prompt_examples_substance = "Substantiviere die folgenden durch Tags markierten Entitäten und verschiebe dabei die Tags entsprechend, sodass die Zuordnung erhalten bleibt. Verwende keine weiteren Tags für andere Wörter und erstelle keine neuen Tags. Behalte die Wörter unverändert, sofern es nicht erforderlich ist. \n Ein Beispiel: \n\n Input: 'Das Problem war in erster Linie <Clinical_Drug>kortisonbedingt</Clinical_Drug>.' \n Output: 'Das Problem war in erster Linie bedingt durch <Clinical_Drug>Kortison</Clinical_Drug>.'"



#Korrgiere keine Zeichensätzung oder Rechtschreibung in den Sätzen. --> hat keine Auswirkung gezeigt


In [8]:
## Inputs

#Input sentences with verbal constructions that should be substantiated
input1 = "Die <Therapeutic>Transplantat</Therapeutic>· und <Therapeutic>Entnahmeareale</Therapeutic> sollten <Therapeutic>täglich mehrfach mit <Clinical_Drug>fettenden Salben</Clinical_Drug> (z.B. <Clinical_Drug>Panthenol</Clinical_Drug>) gepflegt</Therapeutic> werden."

input2 = "Die Patientin wurde vom  20.3.2029 bis zum 27.3.2029  zunächst  <Diagnostic>kontrolliert</Diagnostic> und  im weiteren Verlauf unter <Therapeutic>reduzierter Analgosedierung</Therapeutic> <Therapeutic>augmentiert maschinell beatmet</Therapeutic>."

input3 = "Lediglich am rechten retroauriculären Hals und an kleineren Arealen am Kopf musste im Verlauf am 23.04 2029 nochmals <Therapeutic>nachtransplantiert</Therapeutic> werden."

input4 = "Beide Ohren sowie insbesondere die re. Hals-/Nackenregion waren <Diagnosis_or_Pathology>ll bis Ill gradig verbrannt</Diagnosis_or_Pathology>."

input5 = "Darunter war sie im Verlauf <Diagnosis_or_Pathology>weitgehend distanziert</Diagnosis_or_Pathology> von <Diagnosis_or_Pathology>psychotischem Erleben</Diagnosis_or_Pathology>, <Other_Finding>ruhig</Other_Finding>, <Other_Finding>gut im Kontakt</Other_Finding> und <Other_Finding>bedauerte</Other_Finding>, was sie sich da angetan hatte und nachdem sie zunächst eine stationäre psychiatrische Verlegung ablehnte, willigte sie nach einem <Therapeutic>motivierenden Gespräch</Therapeutic> schließlich ein, sich freiwillig in lhre Abteilung verlegen zu lassen."

input6 = "Anschließend wurde Frau Albers <Therapeutic>intuobiert</Therapeutic> und <Therapeutic>beatmet</Therapeutic> auf die lntensivstation des <Diagnosis_or_Pathology>Brandverletztenzentrums</Diagnosis_or_Pathology> aufgenommen."

input7 = "<Diagnostic>Fremdanamnestisch</Diagnostic> war von der Freundin und der <Therapeutic>ambulant behandelnden</Therapeutic> Ärztin für Psychiatrie und <Therapeutic>Psychotherapie</Therapeutic> zu erfahren, dass die Patientin bereits seit Anfang des Jahres <Diagnosis_or_Pathology>akut psychotisch</Diagnosis_or_Pathology> gewesen sei."

#test sentences without verbal constructions (testing outputs' completeness)
input_no_verb_1 = "Auf dem Weg dorthin sei Frau Albers aus dem Taxi gesprungen und hatte sich zuhause o.g. <Diagnosis_or_Pathology>Verletzungen</Diagnosis_or_Pathology> zugefügt."

input_no_verb_2 = "Nach <Therapeutic>Extubation</Therapeutic> gelang der <Therapeutic>orale Kostaufbau</Therapeutic> problemlos Die Patientin nimmt Wunschkost zu sich."

input_no_verb_3 = "Die <Therapeutic>vorbestehende antiepileptische Medikation mit <Clinical_Drug>Levothiracetam</Therapeutic></Clinical_Drug> (<Clinical_Drug>Keppra</Clinical_Drug>®) führten wir fort." #nicht ideales Ergebnisbeispiel

input_no_verb_4 = "Hierzu zählen beispielsweise <Therapeutic>operative Sehnen- und Gelenklösungen</Therapeutic> zur <Therapeutic>Verbesserung der <Other_Finding>Beweglichkeit</Therapeutic> der Hand</Other_Finding> sowie <Therapeutic>Korrektureingriffe im Bereich der <Diagnosis_or_Pathology>Narben</Therapeutic> insbesondere im Kopf-/Halsbereich und der haarlosen Kopfareale</Diagnosis_or_Pathology>."#verändert und verschiebt Entities

input_no_verb_5 = "Darüber hinaus ist eine sozialpsychiatrische Einbettung in ein ambulantes Setting nach dieser <Diagnosis_or_Pathology>schweren Selbstverletzung</Diagnosis_or_Pathology> angestrebt sowie die <Therapeutic>therapeutische Begleitung</Therapeutic> bei der <Therapeutic>Bewältigung der <Diagnosis_or_Pathology>Folgeschaden</Therapeutic> im Alltag</Diagnosis_or_Pathology>."
#gutes result für "Begleitung in der Therapie", vertauscht aber Closing Tags am Satzende

input_no_verb_6 = "Claudia Dupuytren"


In [35]:
import os
import openai



def gpt_processing(input=input_no_verb_6, prompt=prompt_examples_finding):

  completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[
      {"role": "assistant", "content": prompt + '\n\n' + 'Input:' + input + '\nOutput:'}
    ],
    temperature = 0 #führt zu konstanteren Ergebnissen als ohne temp

  )

  return str(completion.choices[0].message.content)

In [37]:
input = "Die <Therapeutic>laufende Medik. ist <Clinical_Drug>schmerzmittelbedingt</Therapeutic></Clinical_Drug>."#input_no_verb_6
print(input)

print(gpt_processing(input,prompt_examples_substance))

Die <Therapeutic>laufende Medik. ist <Clinical_Drug>schmerzmittelbedingt</Therapeutic></Clinical_Drug>.
Die <Therapeutic>laufende Medikation ist bedingt durch <Clinical_Drug>Schmerzmittel</Clinical_Drug>.</Therapeutic>


## Convert Data Back

In [97]:
'''#convert back to BigBIO and save

import re

# Dictionary of entity tags and their types
entity_tags = {
    '<Diagnosis_or_Pathology>': 'Diagnosis_or_Pathology',
    '<Other_Finding>': 'Other_Finding',
    '<Therapeutic>': 'Therapeutic',
    '<Diagnostic>': 'Diagnostic',
    '<Clinical_Drug>': 'Clinical_Drug',
    '<External_Substance>': 'External_Substance',
    '<Nutrient_or_Body_Substance>': 'Nutrient_or_Body_Substance'
}

# Load the tagged text from file
with open('tagged_text.txt', 'r') as f:
    tagged_text = f.read()

# Find all entities in the tagged text using regular expressions
entities = []
new_tagged_text = tagged_text  # Updated tagged text without entities
tags_length = 0
for tag, entity_type in entity_tags.items():
    pattern = re.compile(f'{tag}(.*?){tag.replace("<", "</")}')
    matches = re.findall(pattern, new_tagged_text)
    for match in matches:
        start = new_tagged_text.find(tag)  # Find the start position of the match
        end = new_tagged_text.find(tag.replace("<", "</"), start) + len(tag)  # Find the end position of the match
        offsets = [(start, end)]
        entitiy_text = new_tagged_text[start+len(tag):end-len(tag.replace("<", "</"))+1]
        print(entitiy_text)
        for sub_tag, _ in entity_tags.items():
            sub_pattern = re.compile(f'{sub_tag}(.*?){sub_tag.replace("<", "</")}')
            print(sub_tag)
            print(sub_pattern)
            submatches = re.findall(sub_pattern, entitiy_text)
            print(submatches)
            for submach in submatches:
                    print(submach)
                    start_sub = entitiy_text.find(sub_tag)
                    end_sub = entitiy_text.find(sub_tag.replace("<", "</"), start) + len(sub_tag)
                    print(entitiy_text)
                    entitiy_text = entitiy_text[:start_sub] + entitiy_text[start_sub+len(sub_tag):end_sub-len(sub_tag.replace("<", "</"))+1] + entitiy_text[end_sub+1:]
                    print(entitiy_text)
                    end = end - len(sub_tag)
                    if end_sub:
                        end = end - len(sub_tag)+1
                        print('hallo')
        entities.append({'text':entitiy_text, 'type': entity_type, 'offsets': offsets})
        new_tagged_text = new_tagged_text[:start] + new_tagged_text[start+len(tag):end-len(tag.replace("<", "</"))+1] + new_tagged_text[end:]  # Remove the entity tags from the tagged text
        print(new_tagged_text)
        tags_length += 2*len(tag)+1


# Sort the entities by their starting position in the text
entities = sorted(entities, key=lambda e: e['offsets'][0][0])

# Construct the JSON output
output = {'passages': [{'id':1,'type': 'sentence','text': tagged_text,'offsets':[[0,len(tagged_text)-tags_length]]}], 'entities': entities}
#id and offsets are incorrect placeholders



# Write the JSON output to a file
with open('output_back.json', 'w') as f:
    json.dump(output, f, indent=4)
'''

Transplantat
<Diagnosis_or_Pathology>
re.compile('<Diagnosis_or_Pathology>(.*?)</Diagnosis_or_Pathology>')
[]
<Other_Finding>
re.compile('<Other_Finding>(.*?)</Other_Finding>')
[]
<Therapeutic>
re.compile('<Therapeutic>(.*?)</Therapeutic>')
[]
<Diagnostic>
re.compile('<Diagnostic>(.*?)</Diagnostic>')
[]
<Clinical_Drug>
re.compile('<Clinical_Drug>(.*?)</Clinical_Drug>')
[]
<External_Substance>
re.compile('<External_Substance>(.*?)</External_Substance>')
[]
<Nutrient_or_Body_Substance>
re.compile('<Nutrient_or_Body_Substance>(.*?)</Nutrient_or_Body_Substance>')
[]
Die Transplantat>· und <Therapeutic>Entnahmeareale</Therapeutic> sollten <Therapeutic>täglich mehrfach mit <Clinical_Drug>fettenden Salben</Clinical_Drug> (z.B. <Clinical_Drug>Panthenol</Clinical_Drug>) gepflegt</Therapeutic> werden.
Entnahmeareale
<Diagnosis_or_Pathology>
re.compile('<Diagnosis_or_Pathology>(.*?)</Diagnosis_or_Pathology>')
[]
<Other_Finding>
re.compile('<Other_Finding>(.*?)</Other_Finding>')
[]
<Therapeutic>
r

In [124]:
import re
import json

# Dictionary of entity tags and their types
entity_tags = {
    '<Diagnosis_or_Pathology>': 'Diagnosis_or_Pathology',
    '<Other_Finding>': 'Other_Finding',
    '<Therapeutic>': 'Therapeutic',
    '<Diagnostic>': 'Diagnostic',
    '<Clinical_Drug>': 'Clinical_Drug',
    '<External_Substance>': 'External_Substance',
    '<Nutrient_or_Body_Substance>': 'Nutrient_or_Body_Substance'
}

# Load the tagged text from file
with open('tagged_text.txt', 'r') as f:
    tagged_text = f.read()

# Find all entities in the tagged text using regular expressions
entities = []
new_tagged_text = tagged_text  # Updated tagged text without entities
tags_length = 0
for tag, entity_type in entity_tags.items():
    pattern = re.compile(f'{tag}(.*?){tag.replace("<", "</")}')
    matches = re.findall(pattern, new_tagged_text)
    for match in matches:
        end_diff = 0
        start = new_tagged_text.find(tag)  # Find the start position of the match
        end = new_tagged_text.find(tag.replace("<", "</"), start) + len(tag)  # Find the end position of the match
        entity_text = new_tagged_text[start + len(tag):end - len(tag.replace("<", "</")) + 1]
        for sub_tag, _ in entity_tags.items():
            submatches = re.findall(pattern, entity_text)
            for submatch in submatches:
                # start, end = re.search(tag, new_tagged_text).span()  # Find the start and end positions of the match
                start_sub = entity_text.find(sub_tag)
                end_sub = entity_text.find(sub_tag.replace("<", "</"), start_sub) + len(sub_tag)
                entity_text = entity_text[:start_sub] + entity_text[start_sub + len(sub_tag):end_sub - len(sub_tag.replace("<", "</")) + 1] + entity_text[end_sub:]
                end_diff += len(sub_tag)
                if end_sub:
                    end_diff +=  len(tag)+1

        offsets = [(start, end-2*len(tag)-1-end_diff)]#check if end offset is correct, or +1 needed
        entities.append({'text': entity_text, 'type': entity_type, 'offsets': offsets})
        new_tagged_text = new_tagged_text[:start] + new_tagged_text[start + len(tag):end - len(tag.replace("<", "</")) + 1] + new_tagged_text[end:]  # Remove the entity tags from the tagged text
        tags_length += 2 * len(tag) + 1

# Sort the entities by their starting position in the text
entities = sorted(entities, key=lambda e: e['offsets'][0][0])

# Construct the JSON output
output = {'passages': [{'id': 1, 'type': 'sentence', 'text': tagged_text, 'offsets': [[0, len(tagged_text) - tags_length]]}], 'entities': entities}
# id and offsets are incorrect placeholders


# Write the JSON output to a file
with open('output_back.json', 'w') as f:
    json.dump(output, f, indent=4)


In [123]:
#GPT's take with search()

In [None]:
import re
import json

# Dictionary of entity tags and their types
entity_tags = {
    '<Diagnosis_or_Pathology>': 'Diagnosis_or_Pathology',
    '<Other_Finding>': 'Other_Finding',
    '<Therapeutic>': 'Therapeutic',
    '<Diagnostic>': 'Diagnostic',
    '<Clinical_Drug>': 'Clinical_Drug',
    '<External_Substance>': 'External_Substance',
    '<Nutrient_or_Body_Substance>': 'Nutrient_or_Body_Substance'
}

# Load the tagged text from file
with open('tagged_text.txt', 'r') as f:
    tagged_text = f.read()

# Find all entities in the tagged text using regular expressions
entities = []
new_tagged_text = tagged_text  # Updated tagged text without entities
tags_length = 0

for tag, entity_type in entity_tags.items():
    pattern = re.compile(f'{tag}(.*?){tag.replace("<", "</")}')
    matches = re.findall(pattern, new_tagged_text)

    for match in matches:
        start, end = re.search(tag, new_tagged_text).span()  # Find the start and end positions of the match
        entity_text = new_tagged_text[start + len(tag):end - len(tag.replace("<", "</")) + 1]

        for sub_tag in entity_tags.keys():
            if sub_tag in entity_text:
                sub_pattern = re.compile(f'{sub_tag}(.*?){sub_tag.replace("<", "</")}')
                submatches = re.findall(sub_pattern, entity_text)
                for submatch in submatches:
                    start_sub, end_sub = re.search(sub_tag, entity_text).span()
                    entity_text = entity_text[:start_sub] + entity_text[start_sub + len(sub_tag):end_sub - len(sub_tag.replace("<", "</")) + 1] + entity_text[end_sub:]

        offsets = [(start, end - 2 * len(tag) - 1 - len(entity_text))]
        entities.append({'text': entity_text, 'type': entity_type, 'offsets': offsets})
        new_tagged_text = new_tagged_text[:start] + new_tagged_text[start + len(tag):end - len(tag.replace("<", "</")) + 1] + new_tagged_text[end:]
        tags_length += 2 * len(tag) + 1

# Sort the entities by their starting position in the text
entities = sorted(entities, key=lambda e: e['offsets'][0][0])

# Construct the JSON output
output = {'passages': [{'id': 1, 'type': 'sentence', 'text': tagged_text, 'offsets': [[0, len(tagged_text) - tags_length]]}], 'entities': entities}
# id and offsets are incorrect placeholders

# Write the JSON output to a file
with open('output_back.json', 'w') as f:
    json.dump(output, f, indent=4)
