In [8]:
#import statements
import json
import re
import time
from typing import Dict, List, Tuple
from transformers import pipeline

# Initialize NER pipeline
# Built once at import time and shared by every extract_name() call.
# aggregation_strategy="simple" is presumably what yields the grouped
# results (`entity_group` / `word` keys) that extract_name() reads -- confirm
# against the transformers pipeline documentation.
# NOTE(review): no `device` argument is passed; the captured run output warns
# that the model is therefore placed on CPU even though an accelerator was
# available in that environment.
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", aggregation_strategy="simple")

#functions for each extraction
def extract_email(text: str) -> List[str]:
    """Extract email addresses from text using regex.

    Args:
        text: Free-form text to scan.

    Returns:
        Every non-overlapping email-like substring, in order of appearance.
        Duplicates are kept; an empty list is returned when nothing matches.
    """
    # The top-level-domain class is [A-Za-z]. Inside a character class "|" is
    # a literal pipe, not alternation, so it must not be listed there:
    # otherwise "user@site.com|next" would be captured as one address.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    return re.findall(email_pattern, text)

def extract_phone(text: str) -> List[str]:
    """Extract phone numbers from text using regex.

    Recognises ten-digit numbers written as 3-3-4 groups, with an optional
    pair of parentheses around the first group and an optional single
    separator (hyphen, dot or whitespace) between groups.

    Args:
        text: Free-form text to scan.

    Returns:
        Every non-overlapping match, in order of appearance. Duplicates are
        kept; an empty list is returned when nothing matches.
    """
    # The lookarounds stop the pattern from carving ten digits out of a longer
    # token (e.g. a 12-digit order number or an alphanumeric ID). "\b" cannot
    # be used at the start because the match may begin with "(".
    phone_pattern = r'(?<!\w)\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}(?!\w)'
    return re.findall(phone_pattern, text)

def extract_name(text: str) -> List[str]:
    """Return the person names the NER pipeline finds in ``text``.

    The module-level ``ner_pipeline`` is run on the text and the ``word``
    of every result whose ``entity_group`` is ``'PER'`` is collected, in
    the order the pipeline returned them.
    """
    person_names = []
    for span in ner_pipeline(text):
        # Skip anything that is not tagged as a person.
        if span['entity_group'] != 'PER':
            continue
        person_names.append(span['word'])
    return person_names

def extract_order_id(text: str) -> List[str]:
    """Return every order-ID-like token found in ``text``.

    A token qualifies when it is a standalone run (word boundary on both
    sides) of at least eight characters drawn only from digits and
    upper-case ASCII letters. Matches are returned in order of appearance.
    """
    # Adjust the pattern if the real order ID format differs.
    matcher = re.compile(r'\b[0-9A-Z]{8,}\b')
    return [hit.group(0) for hit in matcher.finditer(text)]

def get_ground_truth(scenario: Dict) -> Dict[str, str]:
    """Collect the reference values for one scenario.

    Reads ``email``, ``phone`` and ``customer_name`` from the scenario's
    ``personal`` section and ``order_id`` from its ``order`` section. Any
    missing section or key yields an empty string. The customer name is
    title-cased before being returned under the ``name`` key.
    """
    personal_info = scenario.get('personal', {})
    order_info = scenario.get('order', {})

    truth = {}
    truth['email'] = personal_info.get('email', '')
    truth['phone'] = personal_info.get('phone', '')
    # Title-case the stored name before it is compared with extracted names.
    truth['name'] = personal_info.get('customer_name', '').title()
    truth['order_id'] = order_info.get('order_id', '')
    return truth

def extract_entities_from_conversation(conversation: List[List[str]]) -> Dict[str, List[str]]:
    """Run every extractor over a whole conversation.

    Each item of ``conversation`` is unpacked into two values; the first is
    ignored and the second is treated as the message text. All messages are
    joined with single spaces and each extractor is applied once to the
    combined string.

    Returns a dict with the keys ``email``, ``phone``, ``name`` and
    ``order_id``; each value is a de-duplicated list (order not guaranteed,
    since it comes from a set).
    """
    # Combine all messages into a single string.
    text = ' '.join(utterance for _ignored, utterance in conversation)

    # Field name -> extractor, applied in this order.
    extractors = (
        ('email', extract_email),
        ('phone', extract_phone),
        ('name', extract_name),
        ('order_id', extract_order_id),
    )
    return {field: list(set(extract(text))) for field, extract in extractors}

# Calculate precision, recall, and F1 for each entity type
def evaluate_extraction(predicted: Dict[str, List[str]], actual: Dict[str, str]) -> Dict[str, Dict[str, float]]:
    """Score the extracted values against the reference for each entity type.

    For each of ``email``, ``phone``, ``name`` and ``order_id`` the single
    reference value (if non-empty) is compared, by exact string equality,
    with the set of predicted values.

    Returns a dict keyed by entity type; each entry holds ``precision``,
    ``recall``, ``f1``, ``support`` (number of reference values, 0 or 1) and
    ``accuracy`` (hits divided by support). Any ratio whose denominator is
    zero is reported as ``0``.
    """
    report = {}
    for field in ('email', 'phone', 'name', 'order_id'):
        # An empty reference value means there is nothing to find.
        expected = actual[field]
        gold = {expected} if expected else set()
        guessed = set(predicted[field])

        hits = len(gold & guessed)
        spurious = len(guessed - gold)
        missed = len(gold - guessed)
        support = len(gold)

        if hits + spurious > 0:
            precision = hits / (hits + spurious)
        else:
            precision = 0

        if hits + missed > 0:
            recall = hits / (hits + missed)
        else:
            recall = 0

        if precision + recall > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0

        if support > 0:
            accuracy = hits / support
        else:
            accuracy = 0

        report[field] = {
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'support': support,
            'accuracy': accuracy
        }

    return report

#go through each convo and calculate
def process_conversations(data: Dict) -> Tuple[Dict[str, Dict[str, float]], List[Dict], float, float]:
    """Process all conversations, calculate overall metrics, measure parsing time, and overall accuracy.

    Args:
        data: Mapping with a ``'train'`` list. Each item must provide the keys
            ``'scenario'`` (ground-truth source), ``'original'`` (the
            conversation messages) and ``'convo_id'``.

    Returns:
        A 4-tuple ``(overall_metrics, all_metrics, avg_time, overall_accuracy)``:

        * ``overall_metrics`` -- per entity type, the support-weighted average
          of precision / recall / f1 / accuracy plus the total ``support``.
        * ``all_metrics`` -- one dict per conversation with ``convo_id``,
          ``extracted``, ``ground_truth`` and ``metrics``.
        * ``avg_time`` -- mean seconds spent per conversation (0 when there
          are no conversations).
        * ``overall_accuracy`` -- fraction of non-empty ground-truth fields
          whose exact value was among the extracted values (0 when there are
          no such fields).

    Raises:
        KeyError: If ``data`` has no ``'train'`` key or an item lacks one of
            the required keys.
    """
    entity_types = ('email', 'phone', 'name', 'order_id')
    averaged_metrics = ('precision', 'recall', 'f1', 'accuracy')

    all_metrics = []
    overall_metrics = {
        entity_type: {'precision': 0, 'recall': 0, 'f1': 0, 'support': 0, 'accuracy': 0}
        for entity_type in entity_types
    }
    total_time = 0  # total time for processing all conversations
    correct_fields = 0  # total correct fields across all conversations
    total_fields = 0  # total fields that needed to be extracted

    conversations = data['train']
    for conversation_data in conversations:
        # perf_counter() is monotonic and high-resolution; time.time() is
        # wall-clock and can jump if the system clock is adjusted mid-run.
        start_time = time.perf_counter()

        ground_truth = get_ground_truth(conversation_data['scenario'])
        extracted_entities = extract_entities_from_conversation(conversation_data['original'])
        metrics = evaluate_extraction(extracted_entities, ground_truth)

        # Count correctly extracted fields for overall accuracy; fields with
        # an empty ground-truth value are not counted at all.
        for entity_type in entity_types:
            expected = ground_truth[entity_type]
            if not expected:
                continue
            total_fields += 1
            if expected in extracted_entities[entity_type]:
                correct_fields += 1

        total_time += time.perf_counter() - start_time

        all_metrics.append({
            'convo_id': conversation_data['convo_id'],
            'extracted': extracted_entities,
            'ground_truth': ground_truth,
            'metrics': metrics
        })

        # Accumulate support-weighted sums; they are divided by the total
        # support after the loop to obtain weighted averages.
        for entity_type, totals in overall_metrics.items():
            support = metrics[entity_type]['support']
            for metric in averaged_metrics:
                totals[metric] += metrics[entity_type][metric] * support
            totals['support'] += support

    # Turn the weighted sums into weighted averages.
    for totals in overall_metrics.values():
        if totals['support'] > 0:
            for metric in averaged_metrics:
                totals[metric] /= totals['support']

    avg_time_per_conversation = total_time / len(conversations) if len(conversations) > 0 else 0
    overall_accuracy = correct_fields / total_fields if total_fields > 0 else 0

    return overall_metrics, all_metrics, avg_time_per_conversation, overall_accuracy

#example usage
if __name__ == "__main__":
    # Load the conversation dataset from disk.
    with open('/content/8_1_1_small.json.json', 'r') as f:
        data = json.load(f)

    # Run extraction and evaluation over every conversation.
    overall_metrics, detailed_results, avg_time_per_conversation, overall_accuracy = process_conversations(data)

    # Aggregate metrics per entity type ('support' is a count, not a score,
    # so it is left out of the report).
    print("\nOverall Metrics:")
    for entity_type, metrics in overall_metrics.items():
        print(f"\n{entity_type.title()} Extraction:")
        for metric_name, value in metrics.items():
            if metric_name == 'support':
                continue
            print(f"{metric_name}: {value:.3f}")

    # Timing and field-level accuracy summary.
    print(f"\nTime per Conversation Parsing: {avg_time_per_conversation:.3f} seconds")
    print(f"\nOverall Accuracy: {overall_accuracy:.3f}")

    # Per-conversation detail for a small sample.
    print("\nDetailed Results (first 5 conversations):")
    for result in detailed_results[:5]:
        print(f"\nConversation ID: {result['convo_id']}")
        print("Extracted:", result['extracted'])
        print("Ground Truth:", result['ground_truth'])
        print("Metrics:", result['metrics'])


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.



Overall Metrics:

Email Extraction:
precision: 0.440
recall: 0.452
f1: 0.444
accuracy: 0.452

Phone Extraction:
precision: 0.083
recall: 0.103
f1: 0.090
accuracy: 0.103

Name Extraction:
precision: 0.354
recall: 0.487
f1: 0.396
accuracy: 0.487

Order_Id Extraction:
precision: 0.585
recall: 0.851
f1: 0.674
accuracy: 0.851

Time per Conversation Parsing: 1.765 seconds

Overall Accuracy: 0.429

Detailed Results (first 5 conversations):

Conversation ID: 9784
Extracted: {'email': [], 'phone': ['4905628193'], 'name': ['Sanya Afzal'], 'order_id': ['BCMZTYKLKU', '4905628193']}
Ground Truth: {'email': '', 'phone': '(320) 090-2639', 'name': 'Sanya Afzal', 'order_id': '4905628193'}
Metrics: {'email': {'precision': 0, 'recall': 0, 'f1': 0, 'support': 0, 'accuracy': 0}, 'phone': {'precision': 0.0, 'recall': 0.0, 'f1': 0, 'support': 1, 'accuracy': 0.0}, 'name': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'support': 1, 'accuracy': 1.0}, 'order_id': {'precision': 0.5, 'recall': 1.0, 'f1': 0.6666666