In [None]:
import pandas as pd
import numpy as np
import os
import xml.etree.ElementTree as ET

In [None]:
labels = ['Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity', 'Lung Lesion',
                 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion',
                 'Pleural Other', 'Fracture', 'Support Devices']


In [None]:
file_path = '/content/drive/MyDrive/NLMCXR_reports/ecgen-radiology'
files = os.listdir('/content/drive/MyDrive/NLMCXR_reports/ecgen-radiology')

In [None]:
data = []
for file in files:
  if file.endswith('.xml'):
    tree = ET.parse(os.path.join(file_path, file))
    root = tree.getroot()

    findings = root.find(".//AbstractText[@Label='FINDINGS']")
    impressions = root.find(".//AbstractText[@Label='IMPRESSION']")

    captions = ""
    if findings is not None and findings.text is not None: # Check if findings.text is not None
      captions += findings.text + " "
    if impressions is not None and impressions.text is not None: # Check if impressions.text is not None
      captions += impressions.text
      data.append(captions)


In [None]:
data

['Both lungs are clear and expanded. Heart and mediastinum normal. No active disease.',
 'Normal heart. Clear lungs. No pneumothorax. No pleural effusion. Normal chest exam.',
 'Stable cardiomediastinal silhouette. Mild patchy right upper lobe opacities, similar to slightly improved from XXXX. Left lung clear. No pleural effusion or pneumothorax. 1. Persistent mild right upper lobe infiltrate, similar to slightly improved from XXXX. 2. Left lung grossly clear.',
 'Heart size and mediastinal contours are within normal limits given AP projection. The right lung appears clear. There is minimal patchy atelectasis or early infiltrate in left lung base. No visible pleural effusion or pneumothorax. There is a partially visualized IVC XXXX on the lateral view. There are partially visualized surgical changes the cervical spine compatible with prior fusion procedure. Minimal patchy left basilar atelectasis or infiltrate.',
 'Stable normal cardiomediastinal silhouette. Bilateral calcified hilar/p

In [None]:
#keyword parsing
keywords = ['clear lungs', 'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity', 'Lung Lesion',
                 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion',
                 'Pleural Other', 'Fracture', 'Support Devices']
for i in range(len(keywords)):
  keywords[i] = keywords[i].lower()

In [None]:
for i in range(len(data)):
  data[i] = data[i].lower().strip('XXXX xxxx')

In [None]:
data

['both lungs are clear and expanded. heart and mediastinum normal. no active disease.',
 'normal heart. clear lungs. no pneumothorax. no pleural effusion. normal chest exam.',
 'stable cardiomediastinal silhouette. mild patchy right upper lobe opacities, similar to slightly improved from xxxx. left lung clear. no pleural effusion or pneumothorax. 1. persistent mild right upper lobe infiltrate, similar to slightly improved from xxxx. 2. left lung grossly clear.',
 'heart size and mediastinal contours are within normal limits given ap projection. the right lung appears clear. there is minimal patchy atelectasis or early infiltrate in left lung base. no visible pleural effusion or pneumothorax. there is a partially visualized ivc xxxx on the lateral view. there are partially visualized surgical changes the cervical spine compatible with prior fusion procedure. minimal patchy left basilar atelectasis or infiltrate.',
 'stable normal cardiomediastinal silhouette. bilateral calcified hilar/p

In [None]:
prompt_template = """
### Instruction:
{instruction}

### Response"""

In [None]:
import re

def keywords_matching(caption, keywords):
  matched_keywords = []
  # for caption in data:
  for word in keywords:
    if word in caption:
      if re.search(rf'\bno\s+{word}\b', caption):
        matched_keywords.append('no ' + word)
      else:
        matched_keywords.append(word)
    if not matched_keywords:
      continue
  return matched_keywords

The problem is in keyword_str

In [None]:
instruction_response = []
from pprint import pprint
import re

intruction_response = []
for caption in data:
  keywords_matched = keywords_matching(caption, keywords)
  #keyword_str = []


  keyword_str = ', '.join(keywords_matched)
  if keywords_matched == []:
      # If no keywords are matched, use the "no problems found" instruction
      instruction_text = "Please make a caption for no problems found"
  else:
      # Otherwise, include the matched keywords in the instruction
      instruction_text = f"Please make a caption for {keyword_str}"

  prompt = prompt_template.format(instruction= instruction_text)
  processed_entry = {
        "instruction": prompt,
        "response": caption.replace('xxxx', ' ').strip()
    }
  instruction_response.append(processed_entry)




In [None]:
data = [
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Describe the chest X-ray findings for clear lungs\n'
                       '\n'
                       '### Response',
        'response': 'Clear lungs. No pneumothorax. No pleural effusion. No abnormalities noted.'
    },
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Summarize the X-ray report for pneumonia\n'
                       '\n'
                       '### Response',
        'response': 'Increased opacities in the right lower lobe consistent with pneumonia. No pleural effusion or pneumothorax.'
    },
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'What does the X-ray reveal about a suspected pleural effusion?\n'
                       '\n'
                       '### Response',
        'response': 'Large left-sided pleural effusion. Right lung appears clear. No pneumothorax.'
    },
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Provide a caption for the chest X-ray showing cardiomegaly\n'
                       '\n'
                       '### Response',
        'response': 'Enlarged cardiac silhouette consistent with cardiomegaly. Clear lungs. No pleural effusion or pneumothorax.'
    },
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Write a summary of findings for clear lungs and no abnormalities\n'
                       '\n'
                       '### Response',
        'response': 'Lungs are clear bilaterally. Heart size is normal. No acute cardiopulmonary findings.'
    },
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Generate a report caption for pneumothorax on the left side\n'
                       '\n'
                       '### Response',
        'response': 'Left-sided pneumothorax. Mild shift of mediastinum to the right. Right lung appears clear.'
    },
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Please analyze the X-ray for findings related to consolidation\n'
                       '\n'
                       '### Response',
        'response': 'Dense consolidation in the right upper lobe. No pleural effusion or pneumothorax. Heart size normal.'
    },
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'What are the findings in this X-ray with no pneumothorax?\n'
                       '\n'
                       '### Response',
        'response': 'Clear lungs. No pneumothorax. No pleural effusion. Normal mediastinal contours.'
    },
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Summarize the findings of the X-ray for pulmonary edema\n'
                       '\n'
                       '### Response',
        'response': 'Bilateral diffuse haziness consistent with pulmonary edema. Cardiomegaly noted. No pneumothorax.'
    },
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Can you describe an X-ray showing a normal heart size and clear lungs?\n'
                       '\n'
                       '### Response',
        'response': 'Heart size is within normal limits. Lungs are clear. No effusion or pneumothorax detected.'
    },
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Write a report caption for a patient with both pneumothorax and pleural effusion\n'
                       '\n'
                       '### Response',
        'response': 'Right-sided pneumothorax with associated pleural effusion. Left lung appears clear. Mediastinal structures are shifted to the left.'
    },
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Provide a report caption for X-ray showing pleural thickening\n'
                       '\n'
                       '### Response',
        'response': 'Bilateral pleural thickening noted. No pneumothorax or pleural effusion. Lungs are otherwise clear.'
    },
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Describe findings in an X-ray with evidence of fibrosis\n'
                       '\n'
                       '### Response',
        'response': 'Bilateral linear opacities at lung bases consistent with fibrosis. No pleural effusion or pneumothorax. Heart size is normal.'
    },
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Analyze and summarize the findings for this X-ray showing lung opacities\n'
                       '\n'
                       '### Response',
        'response': 'Diffuse bilateral lung opacities. Heart size normal. No pleural effusion or pneumothorax.'
    },
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'What are the findings for a chest X-ray with interstitial lung disease?\n'
                       '\n'
                       '### Response',
        'response': 'Reticular interstitial markings in bilateral lung fields. No pneumothorax. Heart size is normal.'
    }
]


In [None]:
data.extend([
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Generate a report for a chest X-ray with lung hyperinflation\n'
                       '\n'
                       '### Response',
        'response': 'Hyperinflated lungs with flattened diaphragms. No focal consolidation. Heart size normal.'
    },
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Describe the findings for a suspected pulmonary embolism on the X-ray\n'
                       '\n'
                       '### Response',
        'response': 'No clear evidence of pulmonary embolism. Lungs are clear. No pleural effusion or pneumothorax detected.'
    },
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Write a caption for a chest X-ray showing a suspicious mass\n'
                       '\n'
                       '### Response',
        'response': 'Solitary mass in the left lower lobe, likely a neoplasm. No pleural effusion or pneumothorax. Mediastinal structures are normal.'
    },
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Summarize the findings in a chest X-ray showing evidence of tuberculosis\n'
                       '\n'
                       '### Response',
        'response': 'Cavitary lesion in the right upper lobe. Bilateral infiltrates consistent with tuberculosis. No pneumothorax or pleural effusion.'
    },
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Provide a report for an X-ray with findings of lung collapse\n'
                       '\n'
                       '### Response',
        'response': 'Right lung collapse with associated shift of mediastinum. No pleural effusion noted. Left lung appears clear.'
    },
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Describe an X-ray showing early signs of congestive heart failure\n'
                       '\n'
                       '### Response',
        'response': 'Cardiomegaly and pulmonary vascular congestion suggestive of congestive heart failure. No pneumothorax or pleural effusion.'
    },
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Summarize the findings for a chest X-ray with pneumonia and atelectasis\n'
                       '\n'
                       '### Response',
        'response': 'Consolidation in the right lower lobe with associated atelectasis. No pneumothorax or pleural effusion.'
    },
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'What are the findings in an X-ray showing chronic obstructive pulmonary disease (COPD)?\n'
                       '\n'
                       '### Response',
        'response': 'Hyperinflated lungs with flattening of the diaphragms. No acute findings of pneumothorax or pleural effusion.'
    },
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Provide a caption for X-ray findings showing fibrosis with honeycombing\n'
                       '\n'
                       '### Response',
        'response': 'Honeycombing and reticular changes at lung bases consistent with interstitial fibrosis. No pleural effusion or pneumothorax.'
    },
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Write a summary for an X-ray showing pulmonary fibrosis\n'
                       '\n'
                       '### Response',
        'response': 'Pulmonary fibrosis with increased interstitial markings. No evidence of pneumothorax or pleural effusion.'
    }
])


In [None]:
data.extend([
    # Clear Lungs
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Generate a report for clear lungs\n'
                       '\n'
                       '### Response',
        'response': 'The lungs are clear with no evidence of pneumonia or pneumothorax. The heart size and mediastinal contour are normal.'
    },

    # Enlarged Cardiomediastinum
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Write a caption for enlarged cardiomediastinum\n'
                       '\n'
                       '### Response',
        'response': 'The cardiomediastinum appears enlarged, possibly indicating aortic enlargement or lymphadenopathy. No significant lung abnormalities noted.'
    },

    # Cardiomegaly
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Describe findings for cardiomegaly\n'
                       '\n'
                       '### Response',
        'response': 'Cardiomegaly is observed with a heart-to-thoracic ratio greater than 50%. No signs of pulmonary congestion or pleural effusion.'
    },

    # Lung Opacity
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Generate a report for lung opacity\n'
                       '\n'
                       '### Response',
        'response': 'There is a focal lung opacity in the left upper lobe, potentially indicating a mass or infection. No pleural effusion or pneumothorax.'
    },

    # Lung Lesion
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Describe findings for a lung lesion\n'
                       '\n'
                       '### Response',
        'response': 'A solitary lung lesion is noted in the right middle lobe, with irregular margins. Further evaluation with CT is recommended.'
    },

    # Edema
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Summarize the findings for pulmonary edema\n'
                       '\n'
                       '### Response',
        'response': 'Bilateral pulmonary edema is evident with diffuse alveolar opacities. Cardiomegaly is also present, suggesting congestive heart failure.'
    },

    # Consolidation
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Provide a report for consolidation in the right lung\n'
                       '\n'
                       '### Response',
        'response': 'Consolidation is observed in the right lower lobe, consistent with pneumonia. No evidence of pneumothorax or pleural effusion.'
    },

    # Pneumonia
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Write a caption for pneumonia in the left lung\n'
                       '\n'
                       '### Response',
        'response': 'Patchy infiltrates in the left lower lobe suggest pneumonia. No pleural effusion or pneumothorax detected.'
    },

    # Atelectasis
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Describe findings for atelectasis\n'
                       '\n'
                       '### Response',
        'response': 'Atelectasis is observed in the right lower lobe with associated volume loss. No significant pleural effusion or pneumothorax.'
    },

    # Pneumothorax
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Generate a report for pneumothorax\n'
                       '\n'
                       '### Response',
        'response': 'A small pneumothorax is noted in the left lung apex. No pleural effusion or other lung abnormalities detected.'
    },

    # Pleural Effusion
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Describe findings for pleural effusion\n'
                       '\n'
                       '### Response',
        'response': 'A moderate-sized pleural effusion is observed on the right side. The lungs appear clear with no consolidation or pneumothorax.'
    },

    # Pleural Other
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Summarize findings for other pleural abnormalities\n'
                       '\n'
                       '### Response',
        'response': 'Pleural thickening is noted in the left lung, likely post-inflammatory. No effusion or pneumothorax observed.'
    },

    # Fracture
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Generate a report for a rib fracture\n'
                       '\n'
                       '### Response',
        'response': 'There is a non-displaced fracture of the 4th rib on the left. No associated pneumothorax or pleural effusion.'
    },

    # Support Devices
    {
        'instruction': '\n'
                       '### Instruction:\n'
                       'Write a caption for chest X-ray with support devices\n'
                       '\n'
                       '### Response',
        'response': 'Support devices, including an endotracheal tube and central venous catheter, are visible. Lungs are clear, with no evidence of pneumothorax or pleural effusion.'
    }
])


In [None]:
instruction_response.extend(data)

In [None]:
pprint(instruction_response)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
              'mediastinal widening. the lungs are hyperexpanded. scattered '
              'granuloma. no focal airspace disease. no large pleural effusion '
              'or pneumothorax. the   are intact. no acute cardiopulmonary '
              'abnormalities.'},
 {'instruction': '\n'
                 '### Instruction:\n'
                 'Please make a caption for no consolidation, pneumothorax\n'
                 '\n'
                 '### Response',
  'response': 'the cardiac and mediastinal contours are within normal limits. '
              'there are calcifications of the aortic  . the lungs are '
              'hyperinflated with increased retrosternal airspace and '
              'flattening of hemidiaphragms. there is haziness in the right '
              'lung apex. there is a 1.7 cm nodular density in the medial '
              'right lung base seen on the frontal view, not identified on the '
             

In [None]:
for entry in instruction_response:
    if not isinstance(entry, dict):
        print("Invalid entry:", entry)


In [None]:
pip install jsonlines

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Installing collected packages: jsonlines
Successfully installed jsonlines-4.0.0


In [None]:
import jsonlines
with jsonlines.open('/content/drive/My Drive/dataset_llm_finetuning_final.jsonl', 'w') as f:
  f.write_all(instruction_response)

In [None]:
print("Length of instruction_response:", len(instruction_response))


Length of instruction_response: 3970


In [None]:
import json
filename = "/content/drive/My Drive/dataset_llm_finetuning_final.jsonl"
with open(filename, "r") as f:
    for line in f:
        print(json.loads(line))  # Ensure it parses back correctly


{'instruction': '\n### Instruction:\nPlease make a caption for no problems found\n\n### Response', 'response': 'both lungs are clear and expanded. heart and mediastinum normal. no active disease.'}
{'instruction': '\n### Instruction:\nPlease make a caption for clear lungs, no pneumothorax, no pleural effusion\n\n### Response', 'response': 'normal heart. clear lungs. no pneumothorax. no pleural effusion. normal chest exam.'}
{'instruction': '\n### Instruction:\nPlease make a caption for pneumothorax, no pleural effusion\n\n### Response', 'response': 'stable cardiomediastinal silhouette. mild patchy right upper lobe opacities, similar to slightly improved from  . left lung clear. no pleural effusion or pneumothorax. 1. persistent mild right upper lobe infiltrate, similar to slightly improved from  . 2. left lung grossly clear.'}
{'instruction': '\n### Instruction:\nPlease make a caption for atelectasis, pneumothorax, pleural effusion\n\n### Response', 'response': 'heart size and mediasti