In [1]:
import pandas as pd
import json
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

In [2]:
def clean_dataframe_for_arrow(df):
    """
    Clean DataFrame to make it compatible with PyArrow conversion.

    Only the training-relevant columns that are actually present in ``df`` are
    kept. Every retained column except ``report_id`` is converted to strings,
    and missing entries (real NaN/None as well as the literal texts 'nan',
    'None' and '') are normalised to the empty string.

    Args:
        df: pandas DataFrame holding one processed report per row.

    Returns:
        A new DataFrame (the input is left untouched) restricted to the
        essential columns, in the order they are listed below.
    """
    print("Cleaning DataFrame for PyArrow compatibility...")

    # Keep only the essential columns for training
    essential_columns = [
        'report_id', 'mapped_clinic_id', 'grouped_modality', 'clinic_modality',
        'training_input', 'training_output', 'clean_findings', 'clean_impression',
        'clean_history', 'clean_technique'
    ]

    # Textual spellings of "no value" that astype(str) produces or that may
    # already be present in the CSV
    missing_tokens = ['nan', 'None', '']

    # Filter to only columns that exist in the dataframe
    available_columns = [col for col in essential_columns if col in df.columns]
    df_clean = df[available_columns].copy()

    # Convert all columns to string type to avoid Arrow conversion issues
    for col in df_clean.columns:
        if col == 'report_id':  # Keep report_id as is if it's numeric
            continue

        as_text = df_clean[col].astype(str)
        # Blank out missing values directly instead of going through
        # replace([...], None): older pandas releases treat an explicit
        # value=None as "not passed" and forward-fill from the previous row,
        # which would copy another report's text into the empty field.
        is_blank = df_clean[col].isna() | as_text.isin(missing_tokens)
        df_clean[col] = as_text.mask(is_blank, '')

    # Fill any remaining NaN values (only report_id can still contain them)
    df_clean = df_clean.fillna('')

    print(f"Cleaned DataFrame shape: {df_clean.shape}")
    print(f"Columns retained: {list(df_clean.columns)}")

    return df_clean

def load_processed_data_to_huggingface():
    """
    Convert processed radiology data to HuggingFace Dataset format.

    Reads ``train_data.csv``, ``val_data.csv`` and ``test_data.csv`` plus
    ``reference_banks.json`` from ``./processed_data``, cleans each frame with
    ``clean_dataframe_for_arrow``, converts it to a ``Dataset`` and adds a
    chat-style ``messages`` column (system / user / assistant) together with
    ``clinic_id``, ``modality``, ``findings``, ``impression`` and
    ``clinic_modality`` columns. Rows whose training input or training output
    is empty are dropped.

    Returns:
        tuple: ``(dataset_dict, reference_banks)`` where ``dataset_dict`` is a
        ``DatasetDict`` with the splits "train", "validation" and "test", and
        ``reference_banks`` is the parsed content of the JSON file.
    """

    # Define paths
    data_dir = "./processed_data"
    split_files = {
        "train": "train_data.csv",
        "validation": "val_data.csv",
        "test": "test_data.csv",
    }

    # Load the processed datasets
    print("Loading processed data...")
    frames = {
        split: pd.read_csv(f"{data_dir}/{filename}", low_memory=False)
        for split, filename in split_files.items()
    }

    # Clean DataFrames for Arrow compatibility
    frames = {split: clean_dataframe_for_arrow(frame) for split, frame in frames.items()}

    # Load reference banks for style learning.
    # JSON is UTF-8 by specification; do not depend on the platform default encoding.
    with open(f"{data_dir}/reference_banks.json", 'r', encoding='utf-8') as f:
        reference_banks = json.load(f)

    print(f"Loaded datasets:")
    print(f"  Train: {len(frames['train'])} samples")
    print(f"  Validation: {len(frames['validation'])} samples")
    print(f"  Test: {len(frames['test'])} samples")
    print(f"  Reference banks: {len(reference_banks)} clinics")

    # Radiology-specific system message
    system_message = """You are an expert radiologist assistant specializing in generating accurate and concise medical impressions from radiology
findings.

Your task is to:
1. **Analyze the findings**: Carefully review all clinical findings, history, and technique information
2. **Generate focused impressions**: Create clear, prioritized conclusions that directly address the clinical question
3. **Maintain clinical accuracy**: Ensure all significant findings are appropriately characterized
4. **Use appropriate medical terminology**: Follow standard radiological reporting conventions
5. **Adapt communication style**: Match the institutional reporting style and level of detail expected

Generate only the IMPRESSION section based on the provided clinical information."""

    def create_radiology_chat_format(sample):
        """Convert radiology data to chat format with clinic/modality context"""

        # Extract fields. The cleaning step stores missing values as '' (the
        # key is present but empty), so a plain .get(key, default) would never
        # reach its default; `or` makes the fallback apply to empty values too.
        clinic = sample.get('mapped_clinic_id') or 'unknown'
        modality = sample.get('grouped_modality') or 'unknown'
        findings_input = sample.get('training_input') or ''
        impression_output = sample.get('training_output') or ''
        clinic_modality = sample.get('clinic_modality') or f"{clinic}_{modality}"

        # Samples with empty essential fields are flagged and filtered out later
        is_valid = bool(findings_input) and bool(impression_output)

        if is_valid:
            # Create user message with clinical context
            user_content = f"Please generate an appropriate radiology impression for this {modality} study from {clinic}:\n\n{findings_input}"

            # Create chat messages
            messages = [
                {"role": "system", "content": system_message},
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": impression_output}
            ]
        else:
            messages = []

        return {
            "messages": messages,
            "clinic_id": clinic,
            "modality": modality,
            "findings": findings_input,
            "impression": impression_output,
            "clinic_modality": clinic_modality,
            "is_valid": is_valid
        }

    # Convert DataFrames to HuggingFace Datasets
    print("Converting to HuggingFace Dataset format...")

    try:
        datasets_by_split = {
            split: Dataset.from_pandas(frame) for split, frame in frames.items()
        }
        print("✅ DataFrame to Dataset conversion successful!")

    except Exception as e:
        print(f"❌ Error in DataFrame conversion: {e}")
        print("Attempting alternative conversion method...")

        # Alternative: Convert to dict first
        datasets_by_split = {
            split: Dataset.from_dict(frame.to_dict('list')) for split, frame in frames.items()
        }
        print("✅ Alternative conversion successful!")

    # Apply chat formatting
    print("Applying chat formatting...")
    datasets_by_split = {
        split: dataset.map(create_radiology_chat_format, batched=False)
        for split, dataset in datasets_by_split.items()
    }

    # Filter out invalid samples
    datasets_by_split = {
        split: dataset.filter(lambda x: x['is_valid'])
        for split, dataset in datasets_by_split.items()
    }

    # Remove the is_valid column (only needed for the filter above)
    datasets_by_split = {
        split: dataset.remove_columns(['is_valid'])
        for split, dataset in datasets_by_split.items()
    }

    # Create DatasetDict (insertion order: train, validation, test)
    dataset_dict = DatasetDict(datasets_by_split)

    print("Dataset conversion complete!")
    print(f"Final dataset structure:")
    print(dataset_dict)

    return dataset_dict, reference_banks

def apply_chat_template_to_dataset(dataset_dict, tokenizer, max_length=2048):
    """
    Apply tokenizer chat template to convert messages to model input format.

    Args:
        dataset_dict: object exposing ``map(fn, batched=..., remove_columns=...,
            desc=...)`` whose batches contain a "messages" column.
        tokenizer: tokenizer providing ``apply_chat_template`` and ``__call__``.
        max_length: maximum number of tokens per example; longer sequences are
            truncated.

    Returns:
        The result of ``dataset_dict.map``: the "messages" column is removed
        and "input_ids", "attention_mask" and "text" (the rendered chat
        string) are added. No padding is applied here.
    """

    def tokenize_function(examples):
        """Convert a batch of chat messages to tokenized format"""

        # Apply chat template to convert messages to text
        texts = [
            tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=False
            )
            for messages in examples["messages"]
        ]

        # Tokenize the formatted text
        tokenized = tokenizer(
            texts,
            truncation=True,
            padding=False,  # We'll pad during training
            max_length=max_length,
            return_overflowing_tokens=False,
            # The chat template already emits the special tokens it needs;
            # letting the tokenizer add its own again can duplicate BOS/EOS.
            add_special_tokens=False,
        )

        return {
            "input_ids": tokenized["input_ids"],
            "attention_mask": tokenized["attention_mask"],
            "text": texts
        }

    print("Applying chat template and tokenization...")

    # Apply tokenization to all splits
    tokenized_datasets = dataset_dict.map(
        tokenize_function,
        batched=True,
        remove_columns=["messages"],  # Remove original messages, keep metadata
        desc="Tokenizing"
    )

    print("Tokenization complete!")
    return tokenized_datasets

def prepare_style_reference_data(reference_banks, tokenizer):
    """
    Build the per-clinic style reference lookup used during training.

    Args:
        reference_banks: mapping of clinic id -> collection of reference
            impression texts.
        tokenizer: callable tokenizer applied to each clinic's impressions.

    Returns:
        dict keyed by clinic id; each value holds the original texts under
        "impressions" and the tokenizer output under "tokenized".
    """

    print("Preparing style reference data...")

    # Same tokenizer settings for every clinic: truncate to 512 tokens
    # (impressions are short), pad, and request "pt" tensors.
    encode_options = {
        "truncation": True,
        "padding": True,
        "max_length": 512,
        "return_tensors": "pt",
    }

    style_references = {
        clinic_id: {
            "impressions": impressions,
            "tokenized": tokenizer(impressions, **encode_options),
        }
        for clinic_id, impressions in reference_banks.items()
    }

    print(f"Style references prepared for {len(style_references)} clinics")
    return style_references

# Usage example for your notebook:
def load_radiology_datasets():
    """
    End-to-end entry point: load the tokenizer, build and tokenize the
    datasets, prepare the style references and persist everything to disk.

    Side effects:
        Writes the tokenized datasets to ``./radiology_datasets`` and the
        reference impression texts to ``./style_references.json``.

    Returns:
        tuple: ``(tokenized_datasets, style_references, tokenizer)``.
    """

    # Checkpoint whose tokenizer is used (same as in original notebook)
    model_name = "microsoft/MediPhi-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Reuse EOS for padding when the tokenizer ships without a pad token
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # CSV/JSON -> DatasetDict -> tokenized DatasetDict, plus style references
    dataset_dict, reference_banks = load_processed_data_to_huggingface()
    tokenized_datasets = apply_chat_template_to_dataset(dataset_dict, tokenizer)
    style_references = prepare_style_reference_data(reference_banks, tokenizer)

    # Persist the processed datasets for later use
    tokenized_datasets.save_to_disk("./radiology_datasets")

    with open("./style_references.json", 'w') as out_file:
        # Only the raw texts are written; the tokenized tensors stay in memory
        raw_impressions = {}
        for clinic_id, reference in style_references.items():
            raw_impressions[clinic_id] = reference["impressions"]
        json.dump(raw_impressions, out_file, indent=2)

    print("All datasets saved successfully!")

    return tokenized_datasets, style_references, tokenizer

# Sample the data to verify format
def sample_dataset(dataset_dict, n_samples=2):
    """
    Print a short preview of the first training examples.

    Args:
        dataset_dict: mapping with a "train" entry that supports ``len()`` and
            integer indexing; each record must provide "clinic_id",
            "modality" and "text".
        n_samples: maximum number of examples to show.
    """
    print("=== SAMPLE TRAINING DATA ===")

    train_split = dataset_dict["train"]
    shown = min(n_samples, len(train_split))
    divider = "-" * 80

    for position in range(shown):
        record = train_split[position]
        preview = record['text'][:200]  # first 200 characters only
        print(
            f"\nSample {position + 1}:",
            f"Clinic: {record['clinic_id']}",
            f"Modality: {record['modality']}",
            f"Text preview: {preview}...",
            divider,
            sep="\n",
        )

In [3]:
# Run the CSV -> DatasetDict conversion; the call also returns the reference banks loaded from JSON.
dataset_dict, reference_banks = load_processed_data_to_huggingface()

Loading processed data...
Cleaning DataFrame for PyArrow compatibility...
Cleaned DataFrame shape: (18742, 10)
Columns retained: ['report_id', 'mapped_clinic_id', 'grouped_modality', 'clinic_modality', 'training_input', 'training_output', 'clean_findings', 'clean_impression', 'clean_history', 'clean_technique']
Cleaning DataFrame for PyArrow compatibility...
Cleaned DataFrame shape: (4019, 10)
Columns retained: ['report_id', 'mapped_clinic_id', 'grouped_modality', 'clinic_modality', 'training_input', 'training_output', 'clean_findings', 'clean_impression', 'clean_history', 'clean_technique']
Cleaning DataFrame for PyArrow compatibility...
Cleaned DataFrame shape: (4032, 10)
Columns retained: ['report_id', 'mapped_clinic_id', 'grouped_modality', 'clinic_modality', 'training_input', 'training_output', 'clean_findings', 'clean_impression', 'clean_history', 'clean_technique']
Loaded datasets:
  Train: 18742 samples
  Validation: 4019 samples
  Test: 4032 samples
  Reference banks: 6 clinic

Map:   0%|          | 0/18742 [00:00<?, ? examples/s]

Map:   0%|          | 0/4019 [00:00<?, ? examples/s]

Map:   0%|          | 0/4032 [00:00<?, ? examples/s]

Filter:   0%|          | 0/18742 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4019 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4032 [00:00<?, ? examples/s]

Dataset conversion complete!
Final dataset structure:
DatasetDict({
    train: Dataset({
        features: ['report_id', 'mapped_clinic_id', 'grouped_modality', 'clinic_modality', 'training_input', 'training_output', 'clean_findings', 'clean_impression', 'clean_history', 'clean_technique', 'messages', 'clinic_id', 'modality', 'findings', 'impression'],
        num_rows: 18742
    })
    validation: Dataset({
        features: ['report_id', 'mapped_clinic_id', 'grouped_modality', 'clinic_modality', 'training_input', 'training_output', 'clean_findings', 'clean_impression', 'clean_history', 'clean_technique', 'messages', 'clinic_id', 'modality', 'findings', 'impression'],
        num_rows: 4019
    })
    test: Dataset({
        features: ['report_id', 'mapped_clinic_id', 'grouped_modality', 'clinic_modality', 'training_input', 'training_output', 'clean_findings', 'clean_impression', 'clean_history', 'clean_technique', 'messages', 'clinic_id', 'modality', 'findings', 'impression'],
      

In [4]:
# Display the DatasetDict produced by the previous cell (splits, feature names, row counts).
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['report_id', 'mapped_clinic_id', 'grouped_modality', 'clinic_modality', 'training_input', 'training_output', 'clean_findings', 'clean_impression', 'clean_history', 'clean_technique', 'messages', 'clinic_id', 'modality', 'findings', 'impression'],
        num_rows: 18742
    })
    validation: Dataset({
        features: ['report_id', 'mapped_clinic_id', 'grouped_modality', 'clinic_modality', 'training_input', 'training_output', 'clean_findings', 'clean_impression', 'clean_history', 'clean_technique', 'messages', 'clinic_id', 'modality', 'findings', 'impression'],
        num_rows: 4019
    })
    test: Dataset({
        features: ['report_id', 'mapped_clinic_id', 'grouped_modality', 'clinic_modality', 'training_input', 'training_output', 'clean_findings', 'clean_impression', 'clean_history', 'clean_technique', 'messages', 'clinic_id', 'modality', 'findings', 'impression'],
        num_rows: 4032
    })
})

In [6]:
# reference_banks