In [1]:
!pip install datasets



In [4]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from datasets import load_dataset
import kagglehub
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import accuracy_score
import os

## Importing Data

In [5]:
from datasets import load_dataset

# Fetch the 911 call transcripts dataset by its hub identifier
ds = load_dataset(path="spikecodes/911-call-transcripts")

In [6]:
# Show the DatasetDict summary (splits, features, row counts)
ds

DatasetDict({
    train: Dataset({
        features: ['messages'],
        num_rows: 518
    })
})

In [7]:
# Pull the 'train' split out of the DatasetDict as a pandas DataFrame
df = ds["train"].to_pandas()

# Plain-text dump of the frame (pandas elides the middle rows)
print(df)

                                              messages
0    [{'role': 'assistant', 'content': '9-1-1, what...
1    [{'role': 'assistant', 'content': '9-1-1, what...
2    [{'role': 'assistant', 'content': '9-1-1, what...
3    [{'role': 'assistant', 'content': '9-1-1, what...
4    [{'role': 'assistant', 'content': '9-1-1, what...
..                                                 ...
513  [{'role': 'assistant', 'content': '9-1-1, what...
514  [{'role': 'assistant', 'content': '9-1-1, what...
515  [{'role': 'assistant', 'content': '9-1-1, what...
516  [{'role': 'assistant', 'content': '9-1-1, what...
517  [{'role': 'assistant', 'content': '9-1-1, what...

[518 rows x 1 columns]


## Data Preprocessing

In [9]:
# Flatten each call into one string: every non-null message content, space-separated
df['text'] = df['messages'].apply(
    lambda turns: ' '.join(turn['content'] for turn in turns if turn['content'] is not None)
)

In [10]:
# One record per message, tagged with the row label of the call it belongs to
expanded_data = [
    {
        'original_index': call_idx,
        'role': turn['role'],
        'content': turn['content']
    }
    for call_idx, turns in df['messages'].items()
    for turn in turns
]

# Build the long-format frame in a single shot from the collected records
expanded_df = pd.DataFrame(expanded_data)

# Peek at the first ten turns
print(expanded_df.head(10))

   original_index       role  \
0               0  assistant   
1               0       user   
2               0  assistant   
3               0       user   
4               0  assistant   
5               0       user   
6               0  assistant   
7               0       user   
8               0  assistant   
9               0       user   

                                             content  
0                      9-1-1, what's your emergency?  
1  I'm at West High School. There's a guy with a ...  
2                                 Which high school?  
3                                         West High.  
4  Okay, we have the police dispatched. Can you g...  
5  I don't know. The guy is just running through ...  
6   Can someone give me a description of the person?  
7      I don't know. Can anybody give a description?  
8  Do we know where in the building? Is he white,...  
9                                      I don't know.  


In [11]:
# Binary target: 1 for 'assistant' turns, 0 for every other role
expanded_df['label'] = expanded_df['role'].eq('assistant').astype('int64')

In [13]:
# Discard every turn that has a missing value in any column
expanded_df = expanded_df.dropna()

In [15]:
# Index the turns by the call they came from. Guarded so the cell can be
# re-run: once 'original_index' has become the index it is no longer a column,
# and an unguarded set_index would raise KeyError.
if 'original_index' in expanded_df.columns:
    expanded_df = expanded_df.set_index('original_index')

expanded_df

Unnamed: 0_level_0,role,content,label
original_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,assistant,"9-1-1, what's your emergency?",1
0,user,I'm at West High School. There's a guy with a ...,0
0,assistant,Which high school?,1
0,user,West High.,0
0,assistant,"Okay, we have the police dispatched. Can you g...",1
...,...,...,...
517,assistant,Are you on a cordless phone?,1
517,user,"I have a cordless phone, but I use a walker.",0
517,assistant,You can go ahead and hang up with me and go ah...,1
517,user,All right. Bye.,0


In [19]:
# Drop turns whose text is missing. Calling dropna(inplace=True) on the
# selected column only affects a temporary Series and leaves the frame
# untouched, so filter the frame itself and rebind it.
expanded_df = expanded_df.dropna(subset=['content'])

In [20]:
# Calls 78 and 451 are excluded from the data; the notebook does not record why.
excluded_calls = [78, 451]
is_excluded = expanded_df.index.get_level_values('original_index').isin(excluded_calls)

# Preview the rows that are about to be dropped
print("Entries being removed:")
print(expanded_df[is_excluded])

# Keep every turn that does not belong to an excluded call
expanded_df = expanded_df[~is_excluded]

Entries being removed:
                     role                                            content  \
original_index                                                                 
78              assistant                      9-1-1, what's your emergency?   
78                   user                                  1620 Green Place.   
78              assistant                                         1620 what?   
78                   user                                       Green Place.   
78              assistant                 Green Place. Green like the color?   
...                   ...                                                ...   
451                  user                                   It's the police.   
451             assistant  Okay. Step outside and do what they say. Just ...   
451                  user                                      They're here.   
451             assistant  Okay. Just put the phone down and do what they...   
451              

In [22]:
# 80/20 hold-out split with a fixed seed. Rows are individual turns, so turns
# from the same call can end up on both sides of the split.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    expanded_df['content'].to_list(),
    expanded_df['label'].to_list(),
    test_size=0.2,
    random_state=42,
)

In [23]:
# Report the size of the full set and of each side of the split, one per line
print(
    f"Total samples: {len(expanded_df)}",
    f"Training samples: {len(train_texts)}",
    f"Validation samples: {len(val_texts)}",
    sep="\n",
)

Total samples: 25799
Training samples: 20639
Validation samples: 5160


In [None]:
# Inspect the turn-level frame as it stands after the filtering cells above
expanded_df

In [None]:
class EmergencyCallDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Sequence of raw strings, indexable by integer position.
        labels: Sequence of integer class ids aligned with ``texts``.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            accepts the keyword arguments used in ``__getitem__`` and returns
            a mapping with ``input_ids`` and ``attention_mask`` tensors.
        max_length: Length every item is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # A silent mismatch would pair texts with the wrong labels (or fail
        # much later inside a DataLoader worker), so reject it up front.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize item ``idx``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors (the
            tokenizer output flattened) and a scalar ``labels`` tensor of
            dtype ``torch.long``.
        """
        text = self.texts[idx]
        label = self.labels[idx]

        # Call the tokenizer directly: `encode_plus` is deprecated in favour
        # of `__call__`, which takes the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' yields a leading batch axis; flatten it away
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

# NOTE(review): `transformers.AdamW` is deprecated upstream — confirm the
# installed release still exports it.

# Tokenizer and two-class sequence classifier, both loaded from the same checkpoint
pretrained_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_checkpoint)
model = BertForSequenceClassification.from_pretrained(pretrained_checkpoint, num_labels=2)

In [None]:
# Wrap both splits; every example is padded or truncated to 512 tokens
train_dataset = EmergencyCallDataset(
    texts=train_texts, labels=train_labels, tokenizer=tokenizer, max_length=512
)
val_dataset = EmergencyCallDataset(
    texts=val_texts, labels=val_labels, tokenizer=tokenizer, max_length=512
)

In [None]:
# Batch both datasets; only the training loader shuffles its samples
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=8, shuffle=False)

In [None]:
# Set up optimizer and scheduler.
# torch.optim.AdamW replaces the deprecated transformers.AdamW, matching the
# optimizer used by the hyperparameter-tuning loop. eps and weight_decay are
# pinned to the values transformers.AdamW used by default so that only the
# (now always-on) bias correction differs from the previous setup.
optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
total_steps = len(train_loader) * 10  # 10 epochs
# Linear decay from the initial learning rate to zero, with no warm-up phase
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

## Model Training

In [None]:
# Prefer CUDA when it is available, then move the model to the chosen device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Fine-tune for ten passes over the training data, validating after each pass
for epoch in range(10):
    # ---- training pass ----
    model.train()
    for batch in train_loader:
        # Start every step from zeroed gradients
        optimizer.zero_grad()

        # Ship the three tensors the model consumes to the compute device
        inputs = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }

        # Labels are part of the inputs, so the model output carries the loss
        outputs = model(**inputs)
        loss = outputs.loss

        # Backpropagate, update the weights, then advance the LR schedule
        loss.backward()
        optimizer.step()
        scheduler.step()

    # ---- validation pass ----
    model.eval()
    val_loss, correct, total = 0, 0, 0

    # Gradients are not needed while scoring
    with torch.no_grad():
        for batch in val_loader:
            inputs = {
                key: batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'labels')
            }

            outputs = model(**inputs)

            # Accumulate the per-batch loss
            val_loss += outputs.loss.item()

            # Predicted class = position of the largest logit in each row
            predictions = torch.max(outputs.logits, 1).indices
            total += inputs['labels'].size(0)
            correct += (predictions == inputs['labels']).sum().item()

    # Summarise the epoch: mean loss per batch and accuracy as a percentage
    avg_val_loss = val_loss / len(val_loader)
    accuracy = (correct / total) * 100
    print(f'Epoch {epoch+1}:')
    print(f'  Validation Loss: {avg_val_loss:.4f}')
    print(f'  Accuracy: {accuracy:.2f}%')
    print('-' * 50)

In [None]:
# Write the fine-tuned model and its tokenizer into the same directory
output_dir = 'fine_tuned_bert_emergency_calls'
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

## Hyperparameter Tuning

In [None]:
# Grid search over learning rate, dropout and batch size.
# Every configuration fine-tunes a fresh BERT classifier for three epochs on
# `train_dataset` and is ranked by the validation accuracy measured after its
# final epoch. The datasets tokenize on access, so no up-front tokenization of
# the raw text lists is needed here.

# Hyperparameter grid
learning_rates = [1e-5, 5e-5, 1e-4]
dropout_rates = [0.1, 0.3, 0.5]
batch_sizes = [16, 32]

best_accuracy = 0
best_params = {}
best_model = None  # CPU copy of the best configuration's state dict

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Iterate over hyperparameters
for lr in learning_rates:
    for dropout in dropout_rates:
        for batch_size in batch_sizes:
            print(f"Training with lr={lr}, dropout={dropout}, batch_size={batch_size}")

            # Re-seed per configuration so that weight initialisation and
            # shuffling do not add run-to-run noise to the comparison.
            torch.manual_seed(42)

            # Create model with specific dropout
            model = BertForSequenceClassification.from_pretrained(
                "bert-base-uncased",
                num_labels=2,
                hidden_dropout_prob=dropout
            )
            model.to(device)

            # DataLoader with specific batch size
            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=batch_size)

            # Define optimizer and criterion
            optimizer = optim.AdamW(model.parameters(), lr=lr)
            criterion = nn.CrossEntropyLoss()

            # Train and evaluate
            for epoch in range(3):  # Fixed number of epochs for tuning
                model.train()
                total_loss = 0

                for batch in train_loader:
                    optimizer.zero_grad()
                    inputs = {key: val.to(device) for key, val in batch.items()}
                    outputs = model(**inputs)
                    loss = criterion(outputs.logits, inputs['labels'])
                    loss.backward()
                    optimizer.step()
                    total_loss += loss.item()

                # Evaluate on validation set
                model.eval()
                all_preds = []
                all_labels = []

                with torch.no_grad():
                    for batch in val_loader:
                        inputs = {key: val.to(device) for key, val in batch.items()}
                        outputs = model(**inputs)
                        preds = torch.argmax(outputs.logits, dim=1)
                        all_preds.extend(preds.cpu().numpy())
                        all_labels.extend(inputs['labels'].cpu().numpy())

                val_accuracy = accuracy_score(all_labels, all_preds)
                print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}, Val Accuracy: {val_accuracy}")

            # Rank the configuration by its accuracy after the last epoch
            if val_accuracy > best_accuracy:
                best_accuracy = val_accuracy
                best_params = {
                    "learning_rate": lr,
                    "dropout": dropout,
                    "batch_size": batch_size,
                }
                # state_dict() hands back references to the live parameter
                # tensors. Take a detached CPU copy so the snapshot is
                # independent of the model and does not keep device memory
                # allocated while the remaining configurations train.
                best_model = {
                    name: tensor.detach().cpu().clone()
                    for name, tensor in model.state_dict().items()
                }

print(f"Best Accuracy: {best_accuracy}")
print(f"Best Parameters: {best_params}")

## Save the model with the best hyperparameters

In [None]:
# Write the winning state dict to disk, provided the search produced one
if best_model is not None:
    save_path = 'best_model.pth'
    torch.save(obj=best_model, f=save_path)
    print(f"Best model saved to {save_path}")