In [3]:
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import base64
import os
import torch
from sklearn.metrics.pairwise import cosine_similarity
import sys
import wandb
sys.path.append("/mnt/data2/datasets_lfay/MedImageInsights")
from medimageinsightmodel import MedImageInsight

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# --- Configuration -----------------------------------------------------------
# Everything a reader may want to tune lives here instead of being repeated
# inline further down.
MEDIMAGEINSIGHTS_ROOT = "/mnt/data2/datasets_lfay/MedImageInsights"
MIMIC_DIR = os.path.join(MEDIMAGEINSIGHTS_ROOT, "data", "MIMIC-v1.0-512")
N_REPORTS_PER_CLASS = 10  # reports sampled per condition
SAMPLE_SEED = 42          # fixed seed so the same reports are drawn on re-run

# Load training and test datasets
df_train = pd.read_csv(os.path.join(MIMIC_DIR, "train.csv"))
df_test = pd.read_csv(os.path.join(MIMIC_DIR, "test.csv"))

# Initialize model
classifier = MedImageInsight(
    model_dir=os.path.join(MEDIMAGEINSIGHTS_ROOT, "2024.09.27"),
    # The spelling "medimageinsigt" is kept exactly as is: it is a file name,
    # and the recorded run loaded the model successfully with it.
    vision_model_name="medimageinsigt-v1.0.0.pt",
    language_model_name="language_model.pth"
)
classifier.load_model()

# Label columns used to isolate rows that carry exactly one positive label
diseases = ['No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity',
            'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis',
            'Pneumothorax', 'Pleural Effusion', 'Pleural Other', 'Fracture',
            'Support Devices']


def select_single_label_rows(df, label, all_labels):
    """Return the rows of `df` where `label` is 1 and every other label is 0.

    Args:
        df: DataFrame containing one column per entry of `all_labels`.
        label: Name of the column that must equal 1.
        all_labels: All label column names; every column other than `label`
            must equal 0 for a row to be kept.

    Returns:
        The filtered DataFrame. Rows whose other labels are anything but an
        explicit 0 (e.g. NaN) are dropped, because the `== 0` comparison is
        False for them.
    """
    other_labels = [name for name in all_labels if name != label]
    return df[(df[label] == 1) & (df[other_labels] == 0).all(axis=1)]


# Step 1: Filter training reports for single disease occurrence.
# Both filters derive their "other labels" from `diseases`, so the two
# selections cannot drift apart when the label list changes.
no_finding_samples_train = select_single_label_rows(df_train, 'No Finding', diseases)
pneumonia_samples_train = select_single_label_rows(df_train, 'Pneumonia', diseases)

print(f"Number of 'No Finding' reports in training: {len(no_finding_samples_train)}")
print(f"Number of 'Pneumonia' reports in training: {len(pneumonia_samples_train)}")

# Randomly sample reports for each condition.
# `sample` raises ValueError if a condition has fewer rows than requested.
no_finding_reports = no_finding_samples_train.Path.sample(N_REPORTS_PER_CLASS, random_state=SAMPLE_SEED)
pneumonia_reports = pneumonia_samples_train.Path.sample(N_REPORTS_PER_CLASS, random_state=SAMPLE_SEED)

# Step 2: Read the report text that belongs to each sampled image path.
# Order matters downstream: all 'No Finding' reports first, then 'Pneumonia'.
report_texts_paths = list(no_finding_reports) + list(pneumonia_reports)
report_texts = []
for image_path in report_texts_paths:
    # The report file is named after the image's parent directory:
    # drop the last path component and append ".txt".
    # NOTE(review): the path is re-rooted at "/", which assumes the `Path`
    # values are absolute (or root-relative) — confirm against the CSV.
    study_dir = os.path.join("/", *image_path.split('/')[:-1])
    # Explicit encoding so the result does not depend on the machine's locale.
    with open(study_dir + '.txt', 'r', encoding="utf-8") as f:
        report_texts.append(f.read())


Model loaded successfully on device: cuda
Number of 'No Finding' reports in training: 12835
Number of 'Pneumonia' reports in training: 614


In [7]:
import re

# Section patterns, compiled once and applied to whitespace-normalized text.
# The whitespace after the header colon is deliberately NOT consumed by the
# pattern: if it were, an empty section ("FINDINGS: IMPRESSION: ...") would
# leave no whitespace for the lookahead, and the lazy group would run to the
# end of the report, swallowing the following section.
_FINDINGS_RE = re.compile(
    r"FINDINGS:(.*?)(?=\sIMPRESSION:|\sCONCLUSION:|$)", re.IGNORECASE
)
_IMPRESSION_RE = re.compile(
    r"IMPRESSION:(.*?)(?=\sFINDINGS:|$)", re.IGNORECASE
)


def _extract_section(pattern, text):
    """Return the stripped text captured by `pattern`, or None if absent/empty."""
    match = pattern.search(text)
    if match is None:
        return None
    # An empty section is reported the same way as a missing one.
    return match.group(1).strip() or None


def extract_findings_and_impressions(reports):
    """Extract the FINDINGS and IMPRESSION sections from report texts.

    Each report is first collapsed to single-space-separated text, so section
    content is returned without line breaks. Headers are matched
    case-insensitively, with or without whitespace after the colon.

    Args:
        reports: Iterable of report strings.

    Returns:
        A list with one dict per report, in input order, with keys:
            "findings": text after the first "FINDINGS:" up to the next
                "IMPRESSION:" / "CONCLUSION:" header or the end of the report.
            "impression": text after the first "IMPRESSION:" up to the next
                "FINDINGS:" header or the end of the report.
        A value is None when the section is missing or empty.
    """
    extracted_data = []

    for report in reports:
        # Normalize spaces and line breaks; after this the text has no
        # newlines, so `.` spans the whole report and `$` is its end.
        normalized = " ".join(report.split())

        extracted_data.append({
            "findings": _extract_section(_FINDINGS_RE, normalized),
            "impression": _extract_section(_IMPRESSION_RE, normalized),
        })

    return extracted_data


# Parse every loaded report into its findings / impression sections.
extracted_sections = extract_findings_and_impressions(report_texts)

# Show each report (numbered from 1) followed by an 80-character rule.
# One print call per report; sep="\n" puts each field on its own line.
for i, report in enumerate(extracted_sections, start=1):
    print(
        f"Report {i}:",
        f"Findings: {report['findings']}",
        f"Impression: {report['impression']}",
        "-" * 80,
        sep="\n",
    )


Report 1:
Findings: Left subclavian central venous catheter terminates in the lower superior vena cava. A nasogastric tube terminates in the stomach. The lung volumes are low. The cardiac, mediastinal and hilar contours appear stable. Minimal opacities at each lung base suggest minor atelectasis. Otherwise, the lungs appear clear. High density material in the visualized transverse and splenic flexure portions of the colon suggests a recent prior contrast administration for a radiologic study. Small quantity of hyperdense material at the base of the right lung may represent trace prior aspiration of barium.
Impression: No evidence of acute cardiopulmonary disease.
--------------------------------------------------------------------------------
Report 2:
Findings: No significant interval change. Left-sided pacemaker defibrillator device is unchanged. No focal consolidation, effusion, edema, or pneumothorax. Mild left basilar atelectasis. Moderate cardiomegaly is unchanged. Aortic knob mi