In [1]:
import stanza
import spacy
import requests
import conllu
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from typing import List, Dict, Tuple
from collections import defaultdict
from difflib import SequenceMatcher

class POSTaggerEvaluator:
    """Compare the XPOS tags predicted by Stanza and spaCy against UD gold tags.

    The workflow (see ``run_evaluation``) downloads the UD English-EWT training
    file, takes the first ``num_tokens`` gold tokens, re-tags the
    space-joined text with both libraries, aligns each tagger's tokens with
    the gold tokens and reports per-tag accuracy, confusion matrices and plots.
    """

    def __init__(self):
        """Load the Stanza tokenize+POS pipeline and spaCy's ``en_core_web_sm``."""
        self.stanza_nlp = stanza.Pipeline('en', processors='tokenize,pos')
        self.spacy_nlp = spacy.load("en_core_web_sm")

    def align_tokens(self, source: List[str], target: List[str]) -> List[Tuple[int, int]]:
        """Align two token sequences, ignoring case.

        Args:
            source: Tokens of the first sequence.
            target: Tokens of the second sequence.

        Returns:
            ``(source_index, target_index)`` pairs, in order, for every token
            that lies in a matching block found by ``SequenceMatcher``.
            Tokens without a counterpart do not appear in the result.
        """
        # autojunk=False: the default heuristic discards "popular" items as
        # match anchors once a sequence has 200+ elements, which would drop
        # frequent tokens (articles, punctuation) from the alignment.
        matcher = SequenceMatcher(
            None,
            [tok.lower() for tok in source],
            [tok.lower() for tok in target],
            autojunk=False,
        )
        return [
            (src_start + offset, tgt_start + offset)
            for src_start, tgt_start, size in matcher.get_matching_blocks()
            for offset in range(size)
        ]

    def download_ud_data(self) -> str:
        """Download the UD English-EWT training file and return it as text.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.RequestException: On network failures or timeout.
        """
        url = "https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-train.conllu"
        response = requests.get(url, timeout=60)
        # Fail loudly instead of handing an HTML error page to the parser.
        response.raise_for_status()
        # CoNLL-U files are UTF-8; do not depend on header-based guessing.
        response.encoding = 'utf-8'
        return response.text

    def extract_sentence_tokens(self, conllu_data: str, num_tokens: int = 300) -> List[Dict]:
        """Collect the first ``num_tokens`` gold tokens from CoNLL-U text.

        Whole sentences are consumed until at least ``num_tokens`` tokens have
        been gathered; the result is then truncated to ``num_tokens``.

        Args:
            conllu_data: Treebank contents in CoNLL-U format.
            num_tokens: Maximum number of tokens to return.

        Returns:
            A list of dicts with keys ``'text'`` (word form) and
            ``'gold_tag'`` (the token's ``xpos`` value).
        """
        from io import StringIO  # local import: only needed for lazy parsing

        tokens = []
        # parse_incr yields sentences lazily, so only the first few sentences
        # of the treebank are parsed instead of the whole file.
        for sentence in conllu.parse_incr(StringIO(conllu_data)):
            for token in sentence:
                # Multiword-token ranges ("1-2") and empty nodes ("8.1") have
                # non-integer ids; keeping them would duplicate surface text
                # and add rows without a gold tag.
                if not isinstance(token['id'], int):
                    continue
                form = token['form']
                if not form or not form.strip():
                    continue
                tokens.append({'text': form, 'gold_tag': token['xpos']})
            if len(tokens) >= num_tokens:
                break

        return tokens[:num_tokens]

    def evaluate_tagger(self, tagger_output: List[Tuple[str, str]],
                       gold_data: List[Dict]) -> Dict:
        """Score a tagger's output against gold tags using token alignment.

        Args:
            tagger_output: ``(token_text, predicted_tag)`` pairs.
            gold_data: Dicts with ``'text'`` and ``'gold_tag'`` keys.

        Returns:
            A dict with:
              * ``'accuracy'``: fraction of aligned tokens tagged correctly
                (NaN when nothing could be aligned),
              * ``'tag_metrics'``: DataFrame indexed by gold tag with columns
                ``Total``, ``Correct`` and ``Accuracy``,
              * ``'confusion_matrix'``: gold-vs-predicted crosstab with margins,
              * ``'detailed_results'``: one row per aligned token,
              * ``'total_tokens'``: number of aligned tokens.
        """
        pred_tokens = [pair[0] for pair in tagger_output]
        pred_tags = [pair[1] for pair in tagger_output]
        gold_tokens = [item['text'] for item in gold_data]
        gold_tags = [item['gold_tag'] for item in gold_data]

        # Only tokens both sides agree on are scored.
        alignments = self.align_tokens(pred_tokens, gold_tokens)

        rows = [
            {
                'Token': gold_tokens[gold_idx],
                'Gold': gold_tags[gold_idx],
                'Predicted': pred_tags[pred_idx],
                'Correct': gold_tags[gold_idx] == pred_tags[pred_idx],
            }
            for pred_idx, gold_idx in alignments
        ]
        # Explicit columns keep the frame well-formed even with zero rows.
        eval_df = pd.DataFrame(rows, columns=['Token', 'Gold', 'Predicted', 'Correct'])

        if eval_df.empty:
            # Nothing aligned: return empty, correctly shaped results instead
            # of failing inside groupby/crosstab.
            return {
                'accuracy': float('nan'),
                'tag_metrics': pd.DataFrame(columns=['Total', 'Correct', 'Accuracy']),
                'confusion_matrix': pd.DataFrame(),
                'detailed_results': eval_df,
                'total_tokens': 0,
            }

        # Per-gold-tag counts and accuracy.
        tag_metrics = eval_df.groupby('Gold').agg({
            'Correct': ['count', 'sum']
        })
        tag_metrics.columns = ['Total', 'Correct']
        tag_metrics['Accuracy'] = tag_metrics['Correct'] / tag_metrics['Total']

        confusion_matrix = pd.crosstab(
            eval_df['Gold'],
            eval_df['Predicted'],
            margins=True
        )

        return {
            'accuracy': eval_df['Correct'].mean(),
            'tag_metrics': tag_metrics,
            'confusion_matrix': confusion_matrix,
            'detailed_results': eval_df,
            'total_tokens': len(eval_df)
        }

    def save_results(self, text: str, gold_data: List[Dict],
                    stanza_output: List[Tuple[str, str]],
                    spacy_output: List[Tuple[str, str]],
                    evaluation_results: Dict):
        """Write the texts, tagged outputs and metrics to files.

        Files written to the working directory: ``texto_ud.txt``,
        ``texto_ud_gold.txt``, ``texto_stanza.txt``, ``texto_spacy.txt``,
        ``resultados.csv``, ``stanza_metrics.csv`` and ``spacy_metrics.csv``.

        Args:
            text: The raw text that was given to the taggers.
            gold_data: Dicts with ``'text'`` and ``'gold_tag'`` keys.
            stanza_output: ``(token, tag)`` pairs produced by Stanza.
            spacy_output: ``(token, tag)`` pairs produced by spaCy.
            evaluation_results: Dict with ``'stanza'`` and ``'spacy'`` entries
                as returned by ``evaluate_tagger``.

        Returns:
            The combined per-token results of both taggers, with an extra
            ``Tagger`` column, in the form ``plot_results`` expects.
        """
        # Explicit UTF-8 so non-ASCII tokens are written identically on every
        # platform instead of depending on the locale's default encoding.
        with open('texto_ud.txt', 'w', encoding='utf-8') as f:
            f.write(text)

        with open('texto_ud_gold.txt', 'w', encoding='utf-8') as f:
            f.write(' '.join(f"{t['text']}/{t['gold_tag']}" for t in gold_data))

        with open('texto_stanza.txt', 'w', encoding='utf-8') as f:
            f.write(' '.join(f"{t[0]}/{t[1]}" for t in stanza_output))

        with open('texto_spacy.txt', 'w', encoding='utf-8') as f:
            f.write(' '.join(f"{t[0]}/{t[1]}" for t in spacy_output))

        # Detailed results, labelled per tagger, for plot_results.
        stanza_results = evaluation_results['stanza']['detailed_results'].assign(Tagger='Stanza')
        spacy_results = evaluation_results['spacy']['detailed_results'].assign(Tagger='spaCy')
        results_df = pd.concat([stanza_results, spacy_results])
        results_df.to_csv('resultados.csv', index=False)

        # Per-tagger metrics.
        evaluation_results['stanza']['tag_metrics'].to_csv('stanza_metrics.csv')
        evaluation_results['spacy']['tag_metrics'].to_csv('spacy_metrics.csv')

        return results_df

    def plot_results(self, results_df):
        """Render and save the evaluation charts as PNG files.

        Charts written (each only when there is data to show):
        ``accuracy_comparison.png``, ``error_rates.png``,
        ``confusion_matrix_stanza.png``, ``confusion_matrix_spacy.png`` and
        ``tag_distribution.png``.

        Args:
            results_df: Combined per-token results with columns ``Gold``,
                ``Predicted``, ``Correct`` and ``Tagger`` (as returned by
                ``save_results``).
        """
        plt.style.use('seaborn-v0_8-whitegrid')
        sns.set_context("paper")

        # 1. Accuracy per tag and tagger, restricted to tags with errors.
        accuracy_by_tag = results_df.groupby(['Tagger', 'Gold'])['Correct'].mean()
        accuracy_by_tag = accuracy_by_tag[accuracy_by_tag < 1.0].unstack()

        if not accuracy_by_tag.empty:
            # Pass the axes explicitly: DataFrame.plot() otherwise opens its
            # own figure and leaves an empty one behind.
            fig, ax = plt.subplots(figsize=(15, 6))
            accuracy_by_tag.plot(kind='bar', width=0.8, ax=ax)
            plt.xlabel('Etiquetador', labelpad=10)
            plt.ylabel('Accuracy', labelpad=10)
            plt.xticks(rotation=45)
            plt.grid(axis='y', linestyle='--', alpha=0.7)

            # Values are fractions in [0, 1], so label them without a "%".
            for container in ax.containers:
                ax.bar_label(container, fmt='%.2f', padding=3)

            plt.legend(title='POS Tag', bbox_to_anchor=(1.05, 1), loc='upper left')
            plt.tight_layout()
            plt.savefig('accuracy_comparison.png', dpi=300, bbox_inches='tight')
            plt.close(fig)

        # 2. Error rate per tag and tagger (only tags with at least one error).
        error_rates = (
            (1 - results_df.groupby(['Tagger', 'Gold'])['Correct'].mean())
            .rename('ErrorRate')
            .reset_index()
        )
        error_rates = error_rates[error_rates['ErrorRate'] > 0]

        if not error_rates.empty:
            plt.figure(figsize=(12, max(6, len(error_rates) * 0.4)))
            sns.barplot(data=error_rates, x='ErrorRate', y='Gold',
                       hue='Tagger', palette='Set2', orient='h')
            plt.xlabel('Error Rate', labelpad=10)
            plt.ylabel('POS Tag', labelpad=10)
            plt.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
            plt.tight_layout()
            plt.savefig('error_rates.png', dpi=300, bbox_inches='tight')
            plt.close()

        # 3. One confusion-matrix heatmap per tagger.
        for tagger in ['Stanza', 'spaCy']:
            tagger_data = results_df[results_df['Tagger'] == tagger]
            conf_matrix = pd.crosstab(tagger_data['Gold'], tagger_data['Predicted'])
            if conf_matrix.empty:
                continue  # nothing to draw for this tagger

            plt.figure(figsize=(12, 10))
            # Hide zero cells so only observed confusions are annotated.
            mask = conf_matrix == 0

            sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='YlOrRd',
                       square=True, cbar_kws={'label': 'Frequency'},
                       mask=mask)

            plt.xlabel('Predicted Tag', labelpad=10)
            plt.ylabel('True Tag', labelpad=10)
            plt.tight_layout()
            plt.savefig(f'confusion_matrix_{tagger.lower()}.png', dpi=300, bbox_inches='tight')
            plt.close()

        # 4. Gold tag distribution, taken from the rows aligned with Stanza.
        tag_dist = results_df[results_df['Tagger'] == 'Stanza']['Gold'].value_counts()

        if not tag_dist.empty:
            plt.figure(figsize=(15, 6))
            sns.barplot(x=tag_dist.index, y=tag_dist.values,
                       color=sns.color_palette("Blues_d")[0])
            plt.xlabel('POS Tag', labelpad=10)
            plt.ylabel('Frequency', labelpad=10)
            plt.xticks(rotation=45, ha='right')
            plt.grid(axis='y', linestyle='--', alpha=0.7)
            plt.tight_layout()
            plt.savefig('tag_distribution.png', dpi=300, bbox_inches='tight')
            plt.close()

    def run_evaluation(self, num_tokens: int = 300) -> None:
        """Run the full evaluation, save the results and print a summary.

        Args:
            num_tokens: Number of gold tokens to evaluate on.
        """
        # `display` is only a builtin inside IPython/Jupyter; import it
        # explicitly and fall back to print so the script entry point works.
        try:
            from IPython.display import display as show
        except ImportError:
            show = print

        print("Descargando y procesando datos de Universal Dependencies...")
        ud_data = self.download_ud_data()
        gold_data = self.extract_sentence_tokens(ud_data, num_tokens)

        # The taggers receive the gold tokens joined by single spaces and
        # re-tokenize on their own; evaluate_tagger re-aligns afterwards.
        text = ' '.join(t['text'] for t in gold_data)

        print("Evaluando Stanza...")
        stanza_output = [(w.text, w.xpos)
                        for s in self.stanza_nlp(text).sentences
                        for w in s.words]

        print("Evaluando spaCy...")
        spacy_output = [(t.text, t.tag_)
                       for t in self.spacy_nlp(text)]

        evaluation_results = {
            'stanza': self.evaluate_tagger(stanza_output, gold_data),
            'spacy': self.evaluate_tagger(spacy_output, gold_data)
        }

        results_df = self.save_results(text, gold_data, stanza_output,
                                     spacy_output, evaluation_results)
        self.plot_results(results_df)

        for tagger, metrics in evaluation_results.items():
            print(f"\n{tagger.upper()} RESULTS:")
            print(f"Overall accuracy: {metrics['accuracy']:.2%}")
            print(f"\nAccuracy by tag:")
            show(metrics['tag_metrics'].sort_values('Total', ascending=False))
            print("\nConfusion Matrix:")
            show(metrics['confusion_matrix'])

def main():
    """Build an evaluator and run the evaluation on the first 298 gold tokens."""
    POSTaggerEvaluator().run_evaluation(298)

# Run the evaluation only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

2025-02-10 19:57:39 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-02-10 19:57:39 INFO: Downloaded file to /Users/telemaco/stanza_resources/resources.json
2025-02-10 19:57:39 INFO: Loading these models for language: en (English):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| mwt       | combined        |
| pos       | combined_charlm |

2025-02-10 19:57:39 INFO: Using device: cpu
2025-02-10 19:57:39 INFO: Loading: tokenize
2025-02-10 19:57:40 INFO: Loading: mwt
2025-02-10 19:57:40 INFO: Loading: pos
2025-02-10 19:57:42 INFO: Done loading processors!


Descargando y procesando datos de Universal Dependencies...
Evaluando Stanza...
Evaluando spaCy...

STANZA RESULTS:
Overall accuracy: 98.99%

Accuracy by tag:


Unnamed: 0_level_0,Total,Correct,Accuracy
Gold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
IN,44,44,1.0
NNP,36,36,1.0
DT,33,33,1.0
NN,30,30,1.0
NNS,18,18,1.0
VBD,16,16,1.0
.,14,14,1.0
JJ,13,12,0.923077
VBG,11,11,1.0
PRP,11,11,1.0



Confusion Matrix:


Predicted,",",-LRB-,-RRB-,.,:,CC,CD,DT,EX,HYPH,...,RP,TO,VB,VBD,VBG,VBN,VBP,VBZ,WP,All
Gold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
",",7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7
-LRB-,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
-RRB-,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
.,0,0,0,14,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,14
:,0,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
CC,0,0,0,0,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
CD,0,0,0,0,0,0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,5
DT,0,0,0,0,0,0,0,33,0,0,...,0,0,0,0,0,0,0,0,0,33
EX,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
HYPH,0,0,0,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,4



SPACY RESULTS:
Overall accuracy: 97.31%

Accuracy by tag:


Unnamed: 0_level_0,Total,Correct,Accuracy
Gold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
IN,44,44,1.0
NNP,36,34,0.944444
DT,33,33,1.0
NN,30,30,1.0
NNS,18,17,0.944444
VBD,16,16,1.0
.,14,14,1.0
JJ,13,13,1.0
VBG,11,11,1.0
PRP,11,11,1.0



Confusion Matrix:


Predicted,",",-LRB-,-RRB-,.,:,CC,CD,DT,EX,HYPH,...,TO,VB,VBD,VBG,VBN,VBP,VBZ,WP,XX,All
Gold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
",",7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7
-LRB-,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
-RRB-,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
.,0,0,0,14,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,14
:,0,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
CC,0,0,0,0,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
CD,0,0,0,0,0,0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,5
DT,0,0,0,0,0,0,0,33,0,0,...,0,0,0,0,0,0,0,0,0,33
EX,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
HYPH,0,0,0,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,4


<Figure size 1500x600 with 0 Axes>