In [1]:
from kaggle_secrets import UserSecretsClient
# Fetch the notebook's Google Cloud credential from Kaggle secrets and hand it
# to the TensorFlow credential hook.
# NOTE(review): purpose inferred from the kaggle_secrets method names; none of
# the visible cells use TensorFlow or GCS afterwards — confirm this is needed.
user_secrets = UserSecretsClient()
user_credential = user_secrets.get_gcloud_credential()
user_secrets.set_tensorflow_credential(user_credential)

In [2]:
!pip install -q datasets
!pip install -q evaluate
!pip install -q jiwer

In [3]:
# Root folder of the dataset input; later cells read overview-of-recordings.csv
# and the recordings/ sub-folder from here.
path_to_dataset = '/kaggle/input/medical-speech-transcription-and-intent/Medical Speech, Transcription, and Intent'

In [4]:
import pandas as pd

# Read the recordings overview table and display three random rows.
df = pd.read_csv(f"{path_to_dataset}/overview-of-recordings.csv")
df.sample(3)

Unnamed: 0,audio_clipping,audio_clipping:confidence,background_noise_audible,background_noise_audible:confidence,overall_quality_of_the_audio,quiet_speaker,quiet_speaker:confidence,speaker_id,file_download,file_name,phrase,prompt,writer_id
3826,no_clipping,1.0,no_noise,0.6826,3.33,audible_speaker,1.0,43856216,https://ml.sandbox.cf3.us/cgi-bin/index.cgi?do...,1249120_43856216_87443859.wav,It itches inside my ears.,Ear ache,1883056
4701,no_clipping,1.0,no_noise,1.0,3.67,audible_speaker,1.0,43620482,https://ml.sandbox.cf3.us/cgi-bin/index.cgi?do...,1249120_43620482_36193265.wav,Chronic disease of hair follicles and sebaceou...,Acne,44218005
2714,no_clipping,1.0,light_noise,1.0,4.0,audible_speaker,1.0,38202325,https://ml.sandbox.cf3.us/cgi-bin/index.cgi?do...,1249120_38202325_40303876.wav,I feel pain inside I do not know what it is,Internal pain,43730599


In [5]:
import os
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
import numpy as np

# Define paths
base_dir = path_to_dataset
csv_path = os.path.join(base_dir, "overview-of-recordings.csv")
recordings_dir = os.path.join(base_dir, "recordings")

# Debug: Print paths and verify existence
print("CSV path:", csv_path)
print("CSV exists:", os.path.exists(csv_path))
print("Recordings directory:", recordings_dir)
print("Recordings dir exists:", os.path.exists(recordings_dir))


def get_file_path(filename):
    """Return the full path of `filename` inside the first split folder that contains it.

    The split folders of `recordings_dir` are searched in the order
    train, test, validate. Returns None when the file is in none of them.
    """
    for split in ['train', 'test', 'validate']:
        path = os.path.join(recordings_dir, split, filename)
        if os.path.exists(path):
            return path
    return None


def get_split(file_path):
    """Return the split ('train', 'test' or 'validate') a recording path belongs to.

    The split is read from the name of the file's parent directory rather than
    by substring search over the whole path, so a dataset root or file name
    that happens to contain "train" or "test" cannot cause a mislabel.
    Anything that is not 'train' or 'test' is reported as 'validate'.
    """
    parent = os.path.basename(os.path.dirname(file_path))
    return parent if parent in ('train', 'test') else 'validate'


def show_random_elements(dataset, num_examples=5):
    """Print phrase, prompt and file name for up to `num_examples` random rows of `dataset`."""
    if len(dataset) == 0:
        print("Dataset is empty")
        return

    max_examples = min(num_examples, len(dataset))
    if max_examples > 0:
        # Convert to list to avoid numpy int64 issues
        indices = list(np.random.choice(len(dataset), max_examples, replace=False))
        print(f"\nShowing {max_examples} random examples:")
        for i, idx in enumerate(indices):
            print(f"\nExample {i + 1}:")
            example = dataset[int(idx)]  # Convert index to int
            print(f"Phrase: {example['phrase']}")
            print(f"Prompt: {example['prompt']}")
            print(f"File: {os.path.basename(example['file_path'])}")


# Load and clean CSV data
try:
    # Read CSV with all columns as strings initially
    df = pd.read_csv(csv_path, dtype=str)

    # Normalise column names: strip surrounding whitespace and lower-case them
    df.columns = df.columns.str.strip().str.lower()

    # Extract relevant columns
    required_columns = ['file_name', 'phrase', 'prompt', 'writer_id']
    data = df[required_columns].copy()

    print("\nInitial data shape:", data.shape)
    print("\nSample of initial data:")
    print(data.head())

    # Add file paths and keep only rows whose audio file exists on disk
    data['file_path'] = data['file_name'].apply(get_file_path)
    data = data.dropna(subset=['file_path'])

    print("\nData after filtering for existing files:", len(data))

    if not data.empty:
        # Label each row with the split folder its file was found in
        data['split'] = data['file_path'].apply(get_split)

        # Create datasets for each split
        train_data = data[data['split'] == 'train']
        test_data = data[data['split'] == 'test']
        validate_data = data[data['split'] == 'validate']

        print("\nSplit sizes:")
        print(f"Train: {len(train_data)}")
        print(f"Test: {len(test_data)}")
        print(f"Validate: {len(validate_data)}")

        # Convert to Hugging Face Datasets
        train_dataset = Dataset.from_pandas(train_data)
        test_dataset = Dataset.from_pandas(test_data)
        validate_dataset = Dataset.from_pandas(validate_data)

        # Show samples from each split
        print("\nSamples from train dataset:")
        show_random_elements(train_dataset)

        print("\nSamples from validation dataset:")
        show_random_elements(validate_dataset)

        print("\nSamples from test dataset:")
        show_random_elements(test_dataset)

    else:
        print("No valid files found after filtering.")

except Exception as e:
    print(f"An error occurred: {str(e)}")
    print("Please check if the CSV format matches the expected structure.")
    # Re-raise: later cells depend on `data`, so continuing after a failure
    # would only surface as a confusing NameError (or stale data) downstream.
    raise

CSV path: /kaggle/input/medical-speech-transcription-and-intent/Medical Speech, Transcription, and Intent/overview-of-recordings.csv
CSV exists: True
Recordings directory: /kaggle/input/medical-speech-transcription-and-intent/Medical Speech, Transcription, and Intent/recordings
Recordings dir exists: True

Initial data shape: (6661, 4)

Sample of initial data:
                       file_name  \
0  1249120_43453425_58166571.wav   
1  1249120_43719934_43347848.wav   
2  1249120_43719934_53187202.wav   
3  1249120_31349958_55816195.wav   
4  1249120_43719934_82524191.wav   

                                              phrase            prompt  \
0                    When I remember her I feel down    Emotional pain   
1  When I carry heavy things I feel like breaking...  Hair falling out   
2          there is too much pain when i move my arm       Heart hurts   
3  My son had his lip pierced and it is swollen a...    Infected wound   
4             My muscles in my lower back are achi

In [6]:
# Spot-check five random rows of the filtered table, which now also carries
# the file_path and split columns added in the previous cell.
data.sample(5)

Unnamed: 0,file_name,phrase,prompt,writer_id,file_path,split
897,1249120_41087148_106385661.wav,"I've tried reading books, but nothing can chee...",Emotional pain,44292353,/kaggle/input/medical-speech-transcription-and...,test
5125,1249120_18172663_76557062.wav,I have this strange rash on my arm.,Skin issue,44140394,/kaggle/input/medical-speech-transcription-and...,test
2127,1249120_43604449_33808272.wav,After an hard working day I have foot ache,Foot ache,38687371,/kaggle/input/medical-speech-transcription-and...,test
6307,1249120_40419625_54798514.wav,I have a throbbing in my joints,Joint pain,44164300,/kaggle/input/medical-speech-transcription-and...,test
3752,1249120_43612961_108585632.wav,i can't breath because of Cough,Cough,44124309,/kaggle/input/medical-speech-transcription-and...,test


In [7]:
# Drop metadata columns that the fine-tuning cells do not read.
# errors="ignore" keeps the cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
data = data.drop(columns=['prompt', 'writer_id', 'file_name'], errors="ignore")

In [8]:
# Work on a small subset to keep the run short. The fixed seed makes the
# subset reproducible on a fresh run, and min(...) avoids a ValueError when
# fewer than 200 rows survived the file-existence filter.
data = data.sample(n=min(200, len(data)), random_state=42)

In [None]:
import torch
import torchaudio
import numpy as np
import pandas as pd
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset
import evaluate
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import wandb
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Initialize wandb for experiment tracking (the plots and Trainer logs below
# are sent to this run)
wandb.init(project="medical-speech-recognition", name="whisper-medical-finetuning")

# Load the word-error-rate and character-error-rate metrics used by compute_metrics
wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate speech examples into a batch, padding audio features and labels separately.

    `processor` must expose `feature_extractor.pad` and `tokenizer.pad`.
    Each input feature dict needs an "input_features" entry and a "labels"
    entry. The returned batch is whatever `feature_extractor.pad` produced,
    plus a "labels" tensor in which padded positions are set to -100.
    """
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Audio and text have different lengths and padding rules, so each
        # goes through its own padder.
        audio_items = []
        label_items = []
        for item in features:
            audio_items.append({"input_features": item["input_features"]})
            label_items.append({"input_ids": item["labels"]})

        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")
        padded_labels = self.processor.tokenizer.pad(label_items, return_tensors="pt")

        # Positions where the attention mask is not 1 are padding; mark them
        # with -100 for the loss calculation.
        is_padding = padded_labels.attention_mask.ne(1)
        batch["labels"] = padded_labels["input_ids"].masked_fill(is_padding, -100)
        return batch

def prepare_dataset(data):
    """Turn a DataFrame of recordings into a dataset of model inputs and labels.

    Args:
        data: DataFrame with a "file_path" column (audio file to load) and a
            "phrase" column (reference transcript).

    Returns:
        A `datasets.Dataset` holding only "input_features" (output of the
        global `processor` on the 16 kHz mono waveform) and "labels"
        (token ids of the phrase); all original columns are removed.

    Relies on the module-level `processor` being defined before it is called.
    """
    target_rate = 16000
    # One Resample transform per distinct source rate: building it is
    # loop-invariant work, so it is cached instead of rebuilt for every file.
    resamplers = {}

    def load_audio(example):
        audio_input, sample_rate = torchaudio.load(example["file_path"])

        # Convert to mono if needed
        if audio_input.shape[0] > 1:
            audio_input = audio_input.mean(dim=0, keepdim=True)

        # Resample if needed
        if sample_rate != target_rate:
            if sample_rate not in resamplers:
                resamplers[sample_rate] = torchaudio.transforms.Resample(
                    orig_freq=sample_rate, new_freq=target_rate
                )
            audio_input = resamplers[sample_rate](audio_input)

        # Hand the feature extractor a plain 1-D numpy waveform rather than
        # relying on implicit tensor-to-array conversion.
        example["input_features"] = processor(
            audio_input.squeeze(0).numpy(),
            sampling_rate=target_rate,
            return_tensors="pt"
        )["input_features"].squeeze(0)

        example["labels"] = processor.tokenizer(example["phrase"])["input_ids"]
        return example

    dataset = Dataset.from_pandas(data)
    processed_dataset = dataset.map(load_audio, remove_columns=dataset.column_names)
    return processed_dataset

def compute_metrics(pred):
    """Compute WER and CER for one evaluation pass.

    Args:
        pred: object with `.predictions` (token ids, logits, or a tuple whose
            first element is one of those) and `.label_ids` (label token ids
            with -100 at ignored positions).

    Returns:
        dict with keys "wer" and "cer".

    Relies on the module-level `processor`, `wer_metric` and `cer_metric`.
    """
    pad_id = processor.tokenizer.pad_token_id

    pred_ids = pred.predictions
    # Handle the case where predictions come as a tuple (logits first)
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]
    pred_ids = np.asarray(pred_ids)
    label_ids = np.asarray(pred.label_ids)

    # Get the most likely token ids if we have logits
    from_logits = pred_ids.ndim == 3
    if from_logits:
        pred_ids = np.argmax(pred_ids, axis=-1)

    ignored = label_ids == -100

    # Logits are position-aligned with the labels, so anything predicted at an
    # ignored label position is not part of the transcript. Without this mask
    # those positions are decoded into the prediction text; in particular a
    # logits row padded entirely with -100 has argmax 0, which is then decoded
    # as token id 0.
    # NOTE(review): assumes the Trainer pads gathered batches with -100 — confirm.
    if from_logits and pred_ids.shape == label_ids.shape:
        pred_ids = np.where(ignored, pad_id, pred_ids)

    # Negative ids (e.g. -100 padding of generated sequences) cannot be decoded.
    pred_ids = np.where(pred_ids < 0, pad_id, pred_ids)

    # Replace -100 with pad token id in labels
    label_ids = np.where(ignored, pad_id, label_ids)

    # Decode to texts
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    # Compute metrics
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer, "cer": cer}

# Load model and processor
model_name = "openai/whisper-tiny"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)

# Prepare datasets: 80/20 split of the sampled rows, fixed seed for reproducibility
train_data, eval_data = train_test_split(data, test_size=0.2, random_state=42)
train_dataset = prepare_dataset(train_data)
eval_dataset = prepare_dataset(eval_data)

# Initialize data collator
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-medical-finetuned",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    warmup_steps=10,
    max_steps=400,
    gradient_checkpointing=True,
    # Mixed precision needs a CUDA device; fall back to fp32 on CPU instead of failing.
    fp16=torch.cuda.is_available(),
    evaluation_strategy="steps",
    eval_steps=10,
    save_steps=40,
    logging_steps=10,
    report_to="wandb",
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    # Score WER/CER on text actually produced by generate(). Without this the
    # metrics come from the argmax of teacher-forced logits, which sees the
    # reference tokens and does not reflect real transcription quality.
    predict_with_generate=True,
    generation_max_length=225,
    # Keep only the most recent and the best checkpoints on disk.
    save_total_limit=2,
)
# NOTE(review): the processor is loaded without language/task, so the label
# prefix may differ from the prompt generate() uses — confirm and consider
# pinning language="english", task="transcribe".

# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
print("Starting training...")
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msajidala09[0m ([33msajidala09-jamia-millia-islamia[0m). Use [1m`wandb login --relogin`[0m to force relogin


Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Starting training...


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...


In [None]:
# Save the final model
# trainer.save_model("./whisper-medical-finetuned-final")

# Create visualizations
def plot_training_metrics(trainer):
    """Plot training loss and eval WER/CER from the trainer's log history.

    Each figure is saved as a PNG in the working directory
    (training_loss.png, error_rates.png) and logged to wandb as an image.
    """
    history = pd.DataFrame(trainer.state.log_history)

    def _save_and_log(png_name, wandb_key):
        # Write the current figure to disk, then attach that file to the wandb run.
        plt.savefig(png_name)
        wandb.log({wandb_key: wandb.Image(png_name)})

    # Training loss plot: only log rows that actually carry a loss value
    plt.figure(figsize=(12, 6))
    loss_rows = history[history['loss'].notna()]
    plt.plot(loss_rows['step'], loss_rows['loss'])
    plt.title('Training Loss Over Time')
    plt.xlabel('Step')
    plt.ylabel('Loss')
    _save_and_log('training_loss.png', "training_loss_plot")

    # WER and CER plot: only log rows produced by an evaluation pass
    plt.figure(figsize=(12, 6))
    eval_rows = history[history['eval_wer'].notna()]
    for column, legend_name in (('eval_wer', 'WER'), ('eval_cer', 'CER')):
        plt.plot(eval_rows['step'], eval_rows[column], label=legend_name)
    plt.title('WER and CER Over Time')
    plt.xlabel('Step')
    plt.ylabel('Error Rate')
    plt.legend()
    _save_and_log('error_rates.png', "error_rates_plot")

# Plot metrics after training
plot_training_metrics(trainer)

# Evaluate on the held-out evaluation split. trainer.evaluate() with no
# arguments scores the eval_dataset given to the Trainer, not the dataset's
# separate test recordings, so the message says so.
print("Evaluating model on evaluation split...")
eval_results = trainer.evaluate()
print(f"Final evaluation results: {eval_results}")

# Function to analyze error patterns
def _count_error_types(predictions, references):
    """Count substitution/deletion/insertion edit spans over prediction/reference pairs.

    Each non-matching opcode from difflib counts once, however many words it
    covers. The reference is the first sequence, so 'delete' means reference
    words missing from the prediction and 'insert' means extra predicted words.
    """
    from difflib import SequenceMatcher

    # difflib's opcode tags are 'replace' / 'delete' / 'insert' / 'equal'.
    tag_to_error = {'replace': 'substitution', 'delete': 'deletion', 'insert': 'insertion'}
    counts = {'substitution': 0, 'deletion': 0, 'insertion': 0}

    for pred, ref in zip(predictions, references):
        matcher = SequenceMatcher(None, ref.split(), pred.split())
        for opcode in matcher.get_opcodes():
            tag = opcode[0]
            if tag != 'equal':
                counts[tag_to_error[tag]] += 1
    return counts


def analyze_error_patterns(trainer, eval_dataset, num_samples=100):
    """Transcribe evaluation examples and tally the kinds of word-level errors.

    Args:
        trainer: trainer whose `.model` is used for generation.
        eval_dataset: indexable dataset whose items have "input_features" and
            "labels" entries.
        num_samples: maximum number of examples to transcribe.

    Returns:
        dict with keys 'substitution', 'deletion' and 'insertion' mapping to
        counts. A bar chart of the counts is saved to error_distribution.png
        and logged to wandb.

    Relies on the module-level `processor`.
    """
    asr_model = trainer.model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    asr_model.to(device)
    asr_model.eval()

    predictions = []
    references = []

    for i in tqdm(range(min(num_samples, len(eval_dataset)))):
        example = eval_dataset[i]  # fetch once; each access re-materialises the features
        # Items may come back as nested Python lists, which have no .unsqueeze,
        # so convert to a tensor first (as_tensor also accepts tensors).
        input_features = (
            torch.as_tensor(example["input_features"], dtype=asr_model.dtype)
            .unsqueeze(0)
            .to(device)
        )
        with torch.no_grad():
            pred_ids = asr_model.generate(input_features)

        pred_text = processor.batch_decode(pred_ids, skip_special_tokens=True)[0]
        ref_text = processor.decode(example["labels"], skip_special_tokens=True)

        predictions.append(pred_text)
        references.append(ref_text)

    # Calculate word-level differences
    error_types = _count_error_types(predictions, references)

    # Plot error type distribution
    plt.figure(figsize=(10, 6))
    plt.bar(list(error_types.keys()), list(error_types.values()))
    plt.title('Distribution of Error Types')
    plt.ylabel('Count')
    plt.savefig('error_distribution.png')
    wandb.log({"error_distribution": wandb.Image('error_distribution.png')})

    return error_types

# Analyze error patterns on the evaluation split and print the per-type counts
error_patterns = analyze_error_patterns(trainer, eval_dataset)
print(f"Error pattern analysis: {error_patterns}")

# Finish wandb run (must come after the last wandb.log call above)
wandb.finish()