In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.0 MB/s[0m eta [36m0:00:0

In [None]:
!pip install gdown  # Install gdown if it's not installed
import gdown

# The ID from the Google Drive file link
file_id = '19QoMJGGB1M_vYMp5QRgvysqZ-DSbvLyb'

# Construct the download URL
download_url = f'https://drive.google.com/uc?id={file_id}'

# Download the file to your current directory
output = '1_1_1_small.json'
gdown.download(download_url, output, quiet=False)




Downloading...
From: https://drive.google.com/uc?id=19QoMJGGB1M_vYMp5QRgvysqZ-DSbvLyb
To: /content/1_1_1_small.json
100%|██████████| 904k/904k [00:00<00:00, 7.51MB/s]


'1_1_1_small.json'

In [None]:
# Import required libraries
import os
import json
import pandas as pd
import numpy as np
import requests
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, f1_score
import time

In [None]:
# Load raw data

with open('1_1_1_small.json', 'r') as f:
    raw_data = json.load(f)

# Preprocess data
def preprocess_data(raw_data):
    processed_data = []
    for key in raw_data.keys():
        for idx, conv in tqdm(enumerate(raw_data[key]), desc=f"Processing {key} conversations"):
            try:
                if isinstance(conv, dict):
                    processed = {
                        'convo_id': conv.get('convo_id', 'unknown'),
                        'scenario': conv.get('scenario', {}),
                        'conversation': ' '.join([f"{turn[0]}: {turn[1]}" for turn in conv.get('original', [])]),
                        'flow': conv.get('scenario', {}).get('flow', 'unknown'),
                        'subflow': conv.get('scenario', {}).get('subflow', 'unknown'),
                        'name': conv.get('name', 'unknown'),  # Extract name
                        'email': conv.get('email', 'unknown'),  # Extract email
                        'phone': conv.get('phone', 'unknown')   # Extract phone number
                    }
                    processed_data.append(processed)
                else:
                    print(f"Unexpected data structure at index {idx}: {conv}")
            except Exception as e:
                print(f"Error processing conversation at index {idx}: {e}")

    return processed_data

# Convert processed data to a list of dictionaries
processed_data = preprocess_data(raw_data)

# Save processed data to JSON
output_path = '/content/drive/MyDrive/ABCD/processed_data.json'
try:
    with open(output_path, 'w') as json_file:
        json.dump(processed_data, json_file, indent=4)
    print(f"Processed data saved to {output_path}")
except Exception as e:
    print(f"Error saving processed data: {e}")




Processing train conversations: 100it [00:00, 89202.55it/s]
Processing test conversations: 100it [00:00, 40804.59it/s]
Processing dev conversations: 100it [00:00, 79937.18it/s]

Error saving processed data: [Errno 2] No such file or directory: '/content/drive/MyDrive/ABCD/processed_data.json'





In [None]:

# Define a Dataset for fine-tuning
def create_dataset(df):
    def preprocess_function(examples):
        return tokenizer(examples['conversation'], truncation=True, padding=True)

    # Convert processed DataFrame to a Dataset
    dataset = Dataset.from_pandas(df[['conversation', 'flow', 'email', 'name', 'phone']])


    # Convert 'flow' to numerical labels
    label_list = df['flow'].unique().tolist()  # Use DataFrame's 'flow' column
    label_to_id = {label: i for i, label in enumerate(label_list)}
    dataset = dataset.map(lambda x: {'labels': label_to_id[x['flow']]})

    # Tokenize the dataset
    tokenized_dataset = dataset.map(preprocess_function, batched=True)
    return tokenized_dataset, label_list

#new functions added
# Function to calculate accuracy for a specific field
def field_accuracy(eval_dataset, predictions, field='email'):
    """Calculates accuracy for a specific field like 'email', 'name', or 'phone'."""
    labels = eval_dataset[field]
    preds = np.argmax(predictions, axis=-1)

    # Assuming that `preds` contains predictions for emails, name, or phone
    correct_preds = [1 if label == pred else 0 for label, pred in zip(labels, preds)]

    # Field-level accuracy
    accuracy = sum(correct_preds) / len(correct_preds)
    return accuracy



# Evaluate the model for email, name, and phone accuracies
def evaluate_model_with_fields(trainer, eval_dataset):
    predictions = trainer.predict(eval_dataset)
    preds = np.argmax(predictions.predictions, axis=-1)  # Assuming predictions output labels

    # Check if fields exist in the eval_dataset
    if 'email' in eval_dataset.column_names:
        email_accuracy = field_accuracy(eval_dataset, predictions.predictions, field='email')
    else:
        email_accuracy = None  # Handle if field doesn't exist

    if 'name' in eval_dataset.column_names:
        name_accuracy = field_accuracy(eval_dataset, predictions.predictions, field='name')
    else:
        name_accuracy = None  # Handle if field doesn't exist

    if 'phone' in eval_dataset.column_names:
        phone_accuracy = field_accuracy(eval_dataset, predictions.predictions, field='phone')
    else:
        phone_accuracy = None  # Handle if field doesn't exist

    overall_accuracy = accuracy_score(predictions.label_ids, preds)
    return {
        'overall_accuracy': overall_accuracy,
        'email_accuracy': email_accuracy,
        'name_accuracy': name_accuracy,
        'phone_accuracy': phone_accuracy
    }


def evaluate_model(trainer, eval_dataset):
    predictions = trainer.predict(eval_dataset)
    preds = np.argmax(predictions.predictions, axis=-1)
    labels = predictions.label_ids
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    return {'accuracy': accuracy, 'f1': f1}

def compare_models(results):
    print("\nModel Comparison:")
    print("=" * 80)
    print(f"{'Model':<15} {'Accuracy':<10} {'F1 Score':<10} {'Train Time':<15} {'Inference Time':<15}")
    print("-" * 80)
    for result in results:
        print(f"{result['model']:<15} {result['accuracy']:.4f}     {result['f1']:.4f}      {result['train_time']:.2f}s         {result['inference_time']:.2f}s")

# List of models to evaluate
model_names = [
    'bert-base-uncased',  # BERT model
    'distilbert-base-uncased',  # DistilBERT model
    'roberta-base',  # RoBERTa model
    'xlm-roberta-base'  # XLM-RoBERTa model
]

# Split data into train and validation sets
df = pd.DataFrame(processed_data)
train_df = df.sample(frac=0.8, random_state=42)  # 80% for training
val_df = df.drop(train_df.index)  # 20% for validation

# Evaluate models
results = []

for model_name in model_names:
    print(f"Evaluating model: {model_name}")

    # Initialize tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenized_train, label_list = create_dataset(train_df)
    tokenized_val, _ = create_dataset(val_df)  # Create validation dataset
    num_labels = len(label_list)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=f'./results/{model_name}',
        evaluation_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
    )

    # Create Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
    )

    # Train the model
    trainer.train()

    # new implementation to show field accuracy separately for name, email id and phone number
    # Evaluate the model on validation set and get field-level accuracies
    metrics = evaluate_model_with_fields(trainer, tokenized_val)
    results.append({
        'model': model_name,
        'overall_accuracy': metrics['overall_accuracy'],
        'email_accuracy': metrics['email_accuracy'],
        'name_accuracy': metrics['name_accuracy'],
        'phone_accuracy': metrics['phone_accuracy'],
    })

    # Compare models with field-specific accuracies
    print("\nModel Comparison (Field-Specific Accuracies):")
    print("=" * 80)
    print(f"{'Model':<15} {'Overall Accuracy':<18} {'Email Accuracy':<15} {'Name Accuracy':<15} {'Phone Accuracy':<15}")
    print("-" * 80)
    for result in results:
        print(f"{result['model']:<15} {result['overall_accuracy']:.4f}     {result['email_accuracy']:.4f}     {result['name_accuracy']:.4f}     {result['phone_accuracy']:.4f}")


Evaluating model: bert-base-uncased


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.294426
2,No log,2.251012
3,No log,2.272046



Model Comparison (Field-Specific Accuracies):
Model           Overall Accuracy   Email Accuracy  Name Accuracy   Phone Accuracy 
--------------------------------------------------------------------------------
bert-base-uncased 0.1667     0.0000     0.0000     0.0000
Evaluating model: distilbert-base-uncased


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.327229
2,No log,2.331289
3,No log,2.310186



Model Comparison (Field-Specific Accuracies):
Model           Overall Accuracy   Email Accuracy  Name Accuracy   Phone Accuracy 
--------------------------------------------------------------------------------
bert-base-uncased 0.1667     0.0000     0.0000     0.0000
distilbert-base-uncased 0.0333     0.0000     0.0000     0.0000
Evaluating model: roberta-base


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.31914
2,No log,2.387768
3,No log,2.415371



Model Comparison (Field-Specific Accuracies):
Model           Overall Accuracy   Email Accuracy  Name Accuracy   Phone Accuracy 
--------------------------------------------------------------------------------
bert-base-uncased 0.1667     0.0000     0.0000     0.0000
distilbert-base-uncased 0.0333     0.0000     0.0000     0.0000
roberta-base    0.0167     0.0000     0.0000     0.0000
Evaluating model: xlm-roberta-base


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]



Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,2.316398
2,No log,2.303854
3,No log,2.311295



Model Comparison (Field-Specific Accuracies):
Model           Overall Accuracy   Email Accuracy  Name Accuracy   Phone Accuracy 
--------------------------------------------------------------------------------
bert-base-uncased 0.1667     0.0000     0.0000     0.0000
distilbert-base-uncased 0.0333     0.0000     0.0000     0.0000
roberta-base    0.0167     0.0000     0.0000     0.0000
xlm-roberta-base 0.0833     0.0000     0.0000     0.0000
