In [2]:
import pandas as pd
import re
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import json


In [3]:
!pip install transformers accelerate einops
!pip install -U modelscope
!pip install bitsandbytes --prefer-binary --upgrade



In [4]:
# Upload the dataset into the Colab runtime. The recorded cell output shows
# "open_ave_data.csv" being saved under the same name, which is the path the
# loading cell reads from.
from google.colab import files

# The return value is kept in `uploaded`; none of the visible cells read it.
uploaded = files.upload()


Saving open_ave_data.csv to open_ave_data.csv


In [4]:


# Load the report dataset saved by the upload cell.
# Rows with any missing value are dropped, then the index is rebuilt as
# 0..n-1. Without the reset, the dropped rows leave gaps in the index, and the
# later cells that mix label access (df.loc[i]) with positional access
# (df.iloc[a:b]) would silently address different rows.
df = (
    pd.read_csv('open_ave_data.csv')
    .dropna()
    .reset_index(drop=True)
)

# Preview
df.head()


Unnamed: 0.1,Unnamed: 0,ReportText,findings,clinicaldata,ExamName,impression
0,0,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: Cough. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 ...,IMPRESSION: Normal 2-view chest radiography.
1,1,EXAM: CHEST RADIOGRAPHY EXAM DATE: 05/23/2020 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: CHEST PAIN. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 05/23/2020 ...,IMPRESSION: No acute cardiopulmonary abnormali...
2,2,EXAM: CHEST RADIOGRAPHY EXAM DATE: 12/13/2019 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,CLINICAL HISTORY: CHEST PAIN. \n\n,EXAM: CHEST RADIOGRAPHY EXAM DATE: 12/13/2019 ...,IMPRESSION: No acute cardiopulmonary process.
3,3,Exam: - CHEST-PORTABLE History: Chest pain Com...,Findings: Heart size appears normal. Lungs cle...,History: Chest pain \n\n,Exam: - CHEST-PORTABLE\n\nComparison: None,Impression: Lungs clear
4,4,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/17/2021 ...,FINDINGS: Lungs/Pleura: No focal opacities evi...,"CLINICAL HISTORY: CHEST PAIN, SHORTNESS OF BRE...",EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/17/2021 ...,IMPRESSION: Normal single view chest.


In [7]:
def extract_metadata_with_labels(text):
    """Split a free-text radiology report into four labelled sections.

    Each section is located with a case-insensitive regular expression that
    captures everything from the section header up to the next known header
    (or the end of the text). The captured text is stripped and re-prefixed
    with a canonical upper-case label.

    Args:
        text: The report text. Anything that is not a ``str`` (for example
            ``None`` or a NaN from a DataFrame) is treated as an empty report.

    Returns:
        dict with the keys ``'ExamName'``, ``'clinicaldata'``, ``'findings'``
        and ``'impression'``. A section that is not found maps to ``""``.
    """
    if not isinstance(text, str):
        text = ""

    # DOTALL lets `.` cross line breaks; without it a section that is followed
    # by a newline before the next header cannot match at all and is lost.
    flags = re.IGNORECASE | re.DOTALL

    # field name -> (label written into the result, pattern capturing the body)
    sections = {
        'ExamName': (
            "EXAM",
            r'EXAM:\s*(.*?)(?:\s*CLINICAL|FINDINGS:|IMPRESSION:|$)',
        ),
        'clinicaldata': (
            "CLINICAL HISTORY",
            r'CLINICAL HISTORY:\s*(.*?)(?:COMPARISON:|TECHNIQUE:|FINDINGS:|IMPRESSION:|$)',
        ),
        'findings': (
            "FINDINGS",
            r'FINDINGS:\s*(.*?)(?:IMPRESSION:|$)',
        ),
        'impression': (
            "IMPRESSION",
            r'IMPRESSION:\s*(.*?)(?:$)',
        ),
    }

    extracted = {}
    for field, (label, pattern) in sections.items():
        match = re.search(pattern, text, flags)
        extracted[field] = f"{label}: {match.group(1).strip()}" if match else ""
    return extracted



In [6]:
def format_few_shot(row):
    """Render one labelled example as a ``Report: ... Output: {json}`` block.

    The JSON part is produced with ``json.dumps`` so that quotes, backslashes
    and line breaks inside a field are escaped and the example stays valid
    JSON. For values without such characters the text is identical to a
    hand-written two-space-indented object.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            ``'ReportText'``, ``'ExamName'``, ``'clinicaldata'``,
            ``'findings'`` and ``'impression'``.

    Returns:
        The formatted example, terminated by a newline.
    """
    field_names = ('ExamName', 'clinicaldata', 'findings', 'impression')
    # str() mirrors what f-string interpolation did for non-string values.
    payload = json.dumps(
        {name: str(row[name]) for name in field_names},
        indent=2,
        ensure_ascii=False,  # keep the report's characters verbatim
    )
    return f"Report:\n{row['ReportText']}\n\nOutput:\n{payload}\n"

# Format every row of `few_shot_df` and join the examples with a "---" line.
# NOTE: `few_shot_df` is assigned in a later cell of this notebook (the one
# that takes the first 30 rows of `df`); that cell must run before this line.
few_shots_prompt = "\n---\n".join(few_shot_df.apply(format_few_shot, axis=1))


In [26]:
def build_prompt(report_text, few_shots_prompt):
    """Assemble the single-string extraction prompt.

    The prompt consists of a fixed task description, then the few-shot
    examples, then the report to process, and ends with an open ``Output:``
    line.

    Args:
        report_text: The report to extract from; inserted verbatim.
        few_shots_prompt: Pre-formatted examples; inserted verbatim.

    Returns:
        The complete prompt text.
    """
    # Fixed part: role, known report sections, target fields, steps, JSON shape.
    task_description = """You are a clinical language processing expert working in English. All your output must be in English. You will return the extracted results in valid JSON using English field labels and English content only. Your task is to extract structured information from unstructured radiology reports.

Each report typically contains the following sections:
- EXAM or Exam
- EXAM DATE
- CLINICAL HISTORY or INDICATION
- COMPARISON
- TECHNIQUE
- FINDINGS
- IMPRESSION

You will extract the following fields:
- "ExamName": contains the exam name and optionally technique or comparison.
- "clinicaldata": the clinical history, indication, or reported symptoms.
- "findings": the radiologist's findings from the scan.
- "impression": the final interpretation or conclusion.

Step-by-step approach:
1. Find and group information starting from **EXAM**. Include **EXAM DATE**, **TECHNIQUE**, and **COMPARISON** if found in the same section.
2. Extract **CLINICAL HISTORY**, **INDICATION**, or **History** for "clinicaldata".
3. Extract the **FINDINGS** section exactly.
4. Extract the **IMPRESSION** section exactly.

Return the results in strict JSON format like this:
{
  "ExamName": "...",
  "clinicaldata": "...",
  "findings": "...",
  "impression": "..."
}

If any section is not available, return an empty string for that field.

Here are a few examples:
"""

    # Variable part: the examples followed by the report to process.
    request = f"""
{few_shots_prompt}
---
Now extract the following report:

Report:
{report_text}

Output:
"""

    return task_description + request


In [46]:
# Few-shot pool: the report text plus its four reference fields, taken from
# the first 30 rows of the cleaned dataset.
few_shot_df = df.loc[
    :, ['ReportText', 'ExamName', 'clinicaldata', 'findings', 'impression']
].iloc[:30]
few_shot_df.head()

Unnamed: 0,ReportText,ExamName,clinicaldata,findings,impression
0,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 ...,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/01/2019 ...,CLINICAL HISTORY: Cough. \n\n,FINDINGS: Lungs/Pleura: No focal opacities evi...,IMPRESSION: Normal 2-view chest radiography.
1,EXAM: CHEST RADIOGRAPHY EXAM DATE: 05/23/2020 ...,EXAM: CHEST RADIOGRAPHY EXAM DATE: 05/23/2020 ...,CLINICAL HISTORY: CHEST PAIN. \n\n,FINDINGS: Lungs/Pleura: No focal opacities evi...,IMPRESSION: No acute cardiopulmonary abnormali...
2,EXAM: CHEST RADIOGRAPHY EXAM DATE: 12/13/2019 ...,EXAM: CHEST RADIOGRAPHY EXAM DATE: 12/13/2019 ...,CLINICAL HISTORY: CHEST PAIN. \n\n,FINDINGS: Lungs/Pleura: No focal opacities evi...,IMPRESSION: No acute cardiopulmonary process.
3,Exam: - CHEST-PORTABLE History: Chest pain Com...,Exam: - CHEST-PORTABLE\n\nComparison: None,History: Chest pain \n\n,Findings: Heart size appears normal. Lungs cle...,Impression: Lungs clear
4,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/17/2021 ...,EXAM: CHEST RADIOGRAPHY EXAM DATE: 06/17/2021 ...,"CLINICAL HISTORY: CHEST PAIN, SHORTNESS OF BRE...",FINDINGS: Lungs/Pleura: No focal opacities evi...,IMPRESSION: Normal single view chest.


In [48]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import json
import re

# Checkpoint used for extraction (Qwen chat model).
model_id = "Qwen/Qwen1.5-0.5B-Chat"

# Tokenizer and causal-LM weights come from the same checkpoint id.
# NOTE(review): trust_remote_code=True is passed on both loads — confirm it is
# actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True)

# Move the weights onto the GPU; the generation code sends its inputs there too.
model = model.to("cuda")

# Format few-shot examples as Qwen messages
import json

def build_chat_messages(few_shot_df, test_report):
    """Build the chat transcript used for few-shot extraction.

    The transcript starts with one system message describing the task. Every
    row of ``few_shot_df`` then contributes a user turn (the report) and an
    assistant turn (the expected JSON answer). The final user turn carries
    ``test_report`` and is left unanswered.

    Args:
        few_shot_df: DataFrame with the columns ``ReportText``, ``ExamName``,
            ``clinicaldata``, ``findings`` and ``impression``.
        test_report: Text of the report to extract from.

    Returns:
        list of ``{"role": ..., "content": ...}`` dicts in conversation order.
    """
    answer_fields = ("ExamName", "clinicaldata", "findings", "impression")

    def user_turn(report):
        # Every report, example or test, is presented with the same framing.
        return {"role": "user", "content": f"Report:\n{report}\n\nOutput (JSON):"}

    system_text = "".join([
        "You are a clinical language processing expert. Your task is to extract structured fields from radiology reports. ",
        "You will be given several examples, followed by a new report. ",
        "Your response must match the format of the previous examples exactly, using valid JSON with the following fields only: ",
        '"ExamName", "clinicaldata", "findings", "impression".\n',
        "• Use exact phrasing from the report.\n",
        "• Include section labels if they appear in the text.\n",
        "• If a section is missing, return an empty string for it.\n",
        "• All output must be in English. Do not generate Chinese or non-English characters.",
    ])

    conversation = [{"role": "system", "content": system_text}]

    for _, example in few_shot_df.iterrows():
        expected = {field: example[field] for field in answer_fields}
        conversation.append(user_turn(example['ReportText']))
        conversation.append({
            "role": "assistant",
            "content": json.dumps(expected, indent=2),
        })

    conversation.append(user_turn(test_report))
    return conversation



# Apply Qwen chat template
def run_extraction_qwen(test_index, df, few_shot_df, max_new_tokens=512):
    """Run few-shot extraction on one report and return the model's text reply.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        test_index: Index *label* of the report in ``df`` (looked up with
            ``df.loc``).
        df: DataFrame holding the reports in a ``ReportText`` column.
        few_shot_df: Examples passed on to ``build_chat_messages``.
        max_new_tokens: Upper bound on generated tokens. A reply that reaches
            this limit is cut off and may therefore be incomplete JSON.

    Returns:
        The decoded continuation (prompt removed) with every run of non-ASCII
        characters deleted.
    """
    messages = build_chat_messages(few_shot_df, df.loc[test_index, 'ReportText'])
    prompt_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Send the inputs to wherever the model's weights live instead of assuming
    # a fixed device string.
    inputs = tokenizer(prompt_text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        # NOTE(review): `temperature` presumably only matters when sampling is
        # enabled in the model's generation config — confirm.
        output_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            temperature=0.1
        )

    # The output starts with the prompt tokens; keep only what was generated.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = output_ids[:, prompt_length:]
    decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Remove any non-ASCII characters from the reply.
    return re.sub(r'[^\x00-\x7F]+', '', decoded)




# Example usage: run extraction for the report stored under index label 31
# and print the text returned by the model.
response = run_extraction_qwen(test_index=31, df=df, few_shot_df=few_shot_df)
print(response)


{
  "ExamName": "EXAM:XR CHEST 1vw\n\nTECHNIQUE: Single AP view of the chest was submitted. \n\n",
  "clinicaldata": "CLINICAL: Shortness of breath \n\n",
  "findings": "FINDINGS: The lungs are clear. No pleural effusion is evident. The cardiomediastinal silhouette is within normal limits. The bones are unremarkable. \n\n",
  "impression": "Impression: No radiographic evidence for acute cardiopulmonary disease. Dictated by: [[PERSONALNAME], MD Dictated date/time: 05/17/2018 6:12 PM CDT Electronically Signed by: [[PERSONALNAME], MD Electronically Signed date/time: 05/17/2018 6:13 PM CDT"
}


In [54]:
from tqdm import tqdm
import pandas as pd
import json
import re
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Settings
start_index = 31
end_index = 50 # exclusive
results = []

# Fields every prediction row must carry, in output-column order.
target_fields = ['ExamName', 'clinicaldata', 'findings', 'impression']

# start_index/end_index are row *positions* (the ground truth below is sliced
# with iloc), while run_extraction_qwen looks reports up by index *label*.
# Translating positions to labels keeps each prediction aligned with its
# reference row even when the index is not a gap-free 0..n-1 range.
test_labels = df.index[start_index:end_index]

# Inference loop with progress bar
for i in tqdm(test_labels, desc="Running extractions"):
    output_str = run_extraction_qwen(test_index=i, df=df, few_shot_df=few_shot_df)

    try:
        parsed = json.loads(output_str)
    except json.JSONDecodeError:
        print(f"[Warning] Failed to parse JSON at index {i}: {output_str}")
        parsed = {}

    # Valid JSON is not necessarily an object (it could be a list or a string).
    if not isinstance(parsed, dict):
        print(f"[Warning] JSON at index {i} is not an object: {output_str}")
        parsed = {}

    # Keep exactly the expected fields; absent or null values become "" so
    # every row has the same schema and scores as an empty prediction.
    row = {}
    for field in target_fields:
        value = parsed.get(field, "")
        row[field] = "" if value is None else value
    results.append(row)

# Convert to DataFrame (explicit columns so they exist even with no rows)
predicted_df = pd.DataFrame(results, columns=target_fields)

# Ground truth
true_df = df.iloc[start_index:end_index][target_fields].reset_index(drop=True)

# Combine predicted and true
combined_df = pd.concat([true_df, predicted_df.add_suffix("_pred")], axis=1)

# BLEU scoring: one shared smoothing function is used for every comparison.
smooth = SmoothingFunction().method1

def compute_bleu(reference, hypothesis):
    """Return the smoothed sentence-level BLEU of ``hypothesis`` against ``reference``.

    Both arguments are strings; each is tokenised by whitespace splitting and
    the reference is passed as the only reference sentence.
    """
    reference_tokens = reference.split()
    hypothesis_tokens = hypothesis.split()
    return sentence_bleu(
        [reference_tokens],
        hypothesis_tokens,
        smoothing_function=smooth,
    )

# Report the mean BLEU per field over all evaluated rows.
print("\n--- BLEU Scores ---")
for col in ('ExamName', 'clinicaldata', 'findings', 'impression'):
    # Pair each reference value with the prediction in the matching "_pred" column.
    pairs = zip(combined_df[col], combined_df[col + "_pred"])
    scores = []
    for ref, pred in pairs:
        # str() guards against non-string cells (e.g. missing values).
        scores.append(compute_bleu(str(ref), str(pred)))
    avg_bleu = sum(scores) / len(scores)
    print(f"Average BLEU for '{col}': {avg_bleu:.4f}")


Running extractions:  11%|█         | 2/19 [00:33<04:39, 16.44s/it]

  "ExamName": "CheST, ONE VIEWXR ~~CLINICAL: Dyspnea~~COMPARaison: [DATE]: AP Chest.~~FINDINGS: ~~Heart is normal in size. Mild aortic atherosclerosis. Normal pulmonary vascularity. Lungs and pleural spaces are clear. Unremarkable soft tissues and bones.~~~IMpression:~~1. No acute chest disease.~*****Final Report******~~Dictated Date/Time: 09/20/00 06:41 am [[PERSONALNAME]] Interpreted By: [[PERSONALNAME]] Signature Date Time: 09/20/00 06:41 am [[PERSONALNAME]] :[[PERSONALNAME]] Signed By: [[PERSONALNAME]] Electronically Signified
}


Running extractions:  95%|█████████▍| 18/19 [04:31<00:20, 20.87s/it]

  "ExamName": "EXAM: CHEST radiosography EXAM DATE: 04/26/2021 05:43 AM.\n\nTECHNIQUE: 1 view.\n\nCOMPARaison: X-RAY CHEST 1 VIEW PORTABLE 04/25/2021 5:25 AM. TECHNIQUE: 1 view. Findings: Lungs/Pleura: Right basilar airspace disease and left basilar airspace disease, similar to prior. No pneumothorax. Mediastinum: Within exam limitations, the cardiomediastinal contour is normal. Other: Endotracheal tube tip located 2.7 cm above the carina. Enteric tube courses below the diaphragm into the region of the stomach with tip off the confines of the film. Right upper extremity PICC tip located in the region of the junction of the SVC and right atrium. Imppression: Right basilar airspace disease and left basilar airspace disease, similar to prior. Support hardware are located in standard positions, as described above. \n\n",
  "clinicaldata": "Acute respiratory failure. \n\n",
  "findings": "FINDINGS: Lungs/Pleura: Right basilar airspace disease and left basilar airspace disease, similar to pr

Running extractions: 100%|██████████| 19/19 [04:45<00:00, 15.01s/it]


--- BLEU Scores ---
Average BLEU for 'ExamName': 0.4665
Average BLEU for 'clinicaldata': 0.1652
Average BLEU for 'findings': 0.5783
Average BLEU for 'impression': 0.3646



