# HMM-based NER Model Analysis

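This notebook analyzes the HMM-based Named Entity Recognition (NER) models for Twitter data. It covers:
1. Data loading and exploration
2. Model performance comparison
3. Error analysis (confusion matrix of the best model)
4. Visualization of HMM parameters
5. Case studies of individual errors
6. Performance on specific named entity types
7. Conclusions and recommendations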

In [None]:
import json
import os
import pickle
import sys
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Put the parent directory on the import path so the `src` package resolves.
sys.path.append('..')

from src.data_utils import load_data, preprocess_data
from src.hmm_tagger import HMMTagger
from src.evaluation import evaluate_model, compute_metrics
from src.visualization import (
    plot_confusion_matrix, 
    plot_tag_distribution, 
    plot_metrics_comparison,
    plot_transition_heatmap
)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's 'ggplot' style sheet to the figures drawn by this notebook.
plt.style.use('ggplot')

## 1. Data Loading and Exploration

In [None]:
# Read the three dataset splits with the project loader, in train/valid/test order.
split_paths = ('../data/train.txt', '../data/valid.txt', '../data/test.txt')
train_data, valid_data, test_data = map(load_data, split_paths)

# Report how many sentences each split holds.
print(f"Training sentences: {len(train_data)}")
print(f"Validation sentences: {len(valid_data)}")
print(f"Test sentences: {len(test_data)}")

In [None]:
# Collect the tag of every token in the training split; each sentence is
# iterated as (word, tag) pairs.
all_tags = [tag for sentence in train_data for _, tag in sentence]

# Sort the tag inventory so its order is the same on every kernel run.
# Iterating a bare set of strings follows hash order, which Python randomizes
# per process, so list(set(...)) would reshuffle the rows/columns of every
# matrix, heatmap and table that is labelled with `unique_tags`.
# NOTE(review): assumes all tags are mutually comparable (e.g. all strings).
unique_tags = sorted(set(all_tags))
print(f"Number of unique tags: {len(unique_tags)}")
print(f"Tags: {unique_tags}")

# Plot tag distribution
plot_tag_distribution(all_tags, title='Training Data Tag Distribution')

## 2. Model Performance Analysis

In [None]:
# Read the stored evaluation results for all models.
with open('../configs/model_results.json', 'r') as results_file:
    results = json.load(results_file)

# Draw one comparison chart per metric, in the order listed here.
metric_titles = (
    ('accuracy', 'Model Accuracy Comparison'),
    ('precision', 'Model Precision Comparison'),
    ('recall', 'Model Recall Comparison'),
    ('f1_score', 'Model F1 Score Comparison'),
)
for metric_name, chart_title in metric_titles:
    plot_metrics_comparison(results, metric=metric_name, title=chart_title)

## 3. Error Analysis

In [None]:
from sklearn.metrics import confusion_matrix

# Load the best model.
# NOTE(review): unpickling can execute arbitrary code stored in the file, so
# only load a model file that comes from a trusted training run.
with open('../models/best_model.pkl', 'rb') as f:
    best_model = pickle.load(f)

# Split every test sentence into its words and its gold tags, then decode the
# words with the model.
test_words = [[word for word, _ in sentence] for sentence in test_data]
test_true = [[tag for _, tag in sentence] for sentence in test_data]
test_pred = [best_model.viterbi_decode(sentence) for sentence in test_words]

# Gold and predicted tags are compared position by position after flattening,
# so every decoded sentence must be as long as its gold sequence. Without this
# check a single short or long prediction would shift all later tokens and
# silently corrupt the confusion matrix.
misaligned = [
    idx
    for idx, (sent_pred, sent_true) in enumerate(zip(test_pred, test_true))
    if len(sent_pred) != len(sent_true)
]
assert not misaligned, (
    f"Predicted and gold tag sequences differ in length for "
    f"{len(misaligned)} sentence(s); first offending index: {misaligned[0]}"
)

# Flatten to token level and build the confusion matrix. Rows and columns
# follow the order of `unique_tags`; labels that are not in `unique_tags` are
# not represented in the matrix.
true_flat = [tag for sent in test_true for tag in sent]
pred_flat = [tag for sent in test_pred for tag in sent]
cm = confusion_matrix(true_flat, pred_flat, labels=unique_tags)

# Plot confusion matrix
plot_confusion_matrix(cm, unique_tags, title='Best Model Confusion Matrix')

## 4. HMM Parameter Visualization

In [None]:
# Bar chart of the model's start probabilities, one bar per entry of
# `best_model.start_prob`.
start_probs = pd.Series(best_model.start_prob)
plt.figure(figsize=(12, 6))
start_ax = sns.barplot(x=start_probs.index, y=start_probs.values)
start_ax.set(title='Start Probabilities', xlabel='Tag', ylabel='Probability')
plt.xticks(rotation=45)  # slant the tick labels so they do not overlap
plt.tight_layout()
plt.show()

In [None]:
# Heatmap of the model's transition probabilities; skipped entirely when the
# model has no `trans_prob` attribute.
if hasattr(best_model, 'trans_prob'):
    n_tags = len(unique_tags)
    trans_matrix = np.zeros((n_tags, n_tags))
    transitions = best_model.trans_prob
    for row, row_tag in enumerate(unique_tags):
        for col, col_tag in enumerate(unique_tags):
            tag_pair = (row_tag, col_tag)
            # Pairs missing from the table keep the initial value of zero.
            if tag_pair not in transitions:
                continue
            trans_matrix[row, col] = transitions[tag_pair]

    plot_transition_heatmap(trans_matrix, unique_tags, title='Transition Probabilities')

## 5. Case Studies and Error Analysis

In [None]:
# Gather the first few mislabelled tokens, in test-set order, as examples.
MAX_ERROR_EXAMPLES = 20  # Limit to 20 examples

error_cases = []
for sent_true, sent_pred, words in zip(test_true, test_pred, test_words):
    if len(error_cases) >= MAX_ERROR_EXAMPLES:
        break
    for true_tag, pred_tag, word in zip(sent_true, sent_pred, words):
        if true_tag == pred_tag:
            continue
        error_cases.append((word, true_tag, pred_tag))
        if len(error_cases) >= MAX_ERROR_EXAMPLES:
            break

# Show the collected examples as a table.
error_df = pd.DataFrame(error_cases, columns=['Word', 'True Tag', 'Predicted Tag'])
error_df

In [None]:
# Analyze most confused tag pairs.
# Count every mislabelled token in the test set rather than the entries of
# `error_cases`: that list is cut off after the first 20 errors, so ranking it
# would only describe the beginning of the test data.
confused_pairs = Counter(
    (true_tag, pred_tag)
    for sent_true, sent_pred in zip(test_true, test_pred)
    for true_tag, pred_tag in zip(sent_true, sent_pred)
    if true_tag != pred_tag
)

# Sort by frequency, most frequent (true, predicted) pair first.
sorted_pairs = confused_pairs.most_common()
for (true_tag, pred_tag), count in sorted_pairs[:10]:  # Top 10 confused pairs
    print(f"True: {true_tag}, Predicted: {pred_tag}, Count: {count}")

## 6. Performance on Specific Named Entity Types

In [None]:
# Token-level precision, recall, F1 and support for each tag.
from sklearn.metrics import precision_recall_fscore_support

# The scorer returns four arrays; they are paired with `unique_tags` below.
per_tag_scores = precision_recall_fscore_support(true_flat, pred_flat, labels=unique_tags)
precision, recall, f1, support = per_tag_scores

# Assemble one row per tag and rank the rows from highest to lowest F1 score.
score_columns = ('Precision', 'Recall', 'F1 Score', 'Support')
tag_metrics = pd.DataFrame(
    {'Tag': unique_tags, **dict(zip(score_columns, per_tag_scores))}
).sort_values('F1 Score', ascending=False)
tag_metrics

In [None]:
# Bar chart of the F1 score of each tag, in the row order of `tag_metrics`.
plt.figure(figsize=(12, 6))
f1_ax = sns.barplot(x='Tag', y='F1 Score', data=tag_metrics)
f1_ax.set_title('F1 Score by Tag')
f1_ax.set_ylim(0, 1.0)  # fixed full-range scale for the score axis
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 7. Conclusions and Recommendations

Based on the analysis above, we can draw the following conclusions:

1. **Model Performance**: The trigram model that uses context when computing emission probabilities performs best overall, with an F1 score of X.XX. (TODO: replace X.XX with the value from the model comparison in Section 2.)

2. **Entity Type Performance**:
   - Best performing entity types: [List top 3 from the analysis]
   - Worst performing entity types: [List bottom 3 from the analysis]

3. **Common Errors**:
   - [Summarize the most common error patterns observed]
   - [Analyze why these errors might be occurring]

4. **Recommendations**:
   - [Suggest improvements to the model]
   - [Suggest additional features that could help]
   - [Discuss alternative approaches that might work better]