In [2]:
import pandas as pd

# Load the corpus and split the texts by label.
# The label column is read as text on purpose: left to type inference, a column
# holding only 0/1 is parsed as integers, and the string comparisons below
# would then match no rows and silently produce two empty lists.
data = pd.read_csv('data/MetaHate<lang>.tsv', sep='\t', dtype={'label': str})
text_hate = data.loc[data['label'] == '1', 'text'].tolist()
text_no_hate = data.loc[data['label'] == '0', 'text'].tolist()

In [None]:
import spacy
from collections import Counter
from tqdm import tqdm

# Load the spaCy pipeline once at module level; analyze_named_entities reads
# it as a global. "<pt_es_en>" appears to be a template placeholder that must
# be replaced with a concrete language code before this cell can run.
# NOTE(review): spaCy publishes its small English pipeline as "en_core_web_sm",
# so the "_core_news_sm" suffix presumably only resolves for pt/es — confirm.
nlp = spacy.load("<pt_es_en>_core_news_sm")

def analyze_named_entities(text):
    """Summarise the named entities spaCy finds in a single text.

    Args:
        text: The document to analyse. Any value that is not a ``str``
            (e.g. a missing value) is treated as a text without entities.

    Returns:
        A ``(entity_type_percentages, most_referenced_entity)`` tuple.
        ``entity_type_percentages`` maps each entity label to the percentage
        of this text's entity mentions that carry that label.
        ``most_referenced_entity`` is the surface text of the most frequently
        mentioned ``(text, label)`` pair. ``({}, None)`` is returned when the
        input is not a string or contains no entities.
    """
    # Always hand back a 2-tuple so callers can unpack the result
    # unconditionally, even for inputs that cannot be parsed.
    if not isinstance(text, str):
        return {}, None

    doc = nlp(text)
    named_entities = [(ent.text, ent.label_) for ent in doc.ents]
    if not named_entities:
        return {}, None

    entity_type_counts = Counter(ent_type for _, ent_type in named_entities)
    total_entities = len(named_entities)
    entity_type_percentages = {
        ent_type: count / total_entities * 100
        for ent_type, count in entity_type_counts.items()
    }

    # Rank the (text, label) pairs by how often they occur; on a tie the pair
    # encountered first wins. Only the entity text is reported.
    (most_referenced_entity, _), _ = Counter(named_entities).most_common(1)[0]

    return entity_type_percentages, most_referenced_entity

# Corpus to analyse in this run.
text_data = text_hate

# One (entity_type_percentages, most_referenced_entity) result per input text.
results = [analyze_named_entities(text) for text in tqdm(text_data)]

total_entity_type_percentages = Counter()
most_referenced_entities = {}

# Walk texts and results in lockstep. Looking a result up again with
# results.index() is quadratic and always returns the first equal result, so
# texts that share an identical result would be attributed to the wrong text.
for text, result in tqdm(zip(text_data, results), total=len(results)):
    # Defensive: skip entries for which no result tuple was produced.
    if result is None:
        continue

    entity_type_percentages, most_referenced_entity = result
    total_entity_type_percentages.update(entity_type_percentages)

    if most_referenced_entity:
        most_referenced_entities[text] = most_referenced_entity

# The counter holds summed per-text percentages (not raw mention counts), so
# this total is the normaliser that rescales the sums back to 100%.
total_entities_across_texts = sum(total_entity_type_percentages.values())
overall_entity_type_percentages = {
    ent_type: count / total_entities_across_texts * 100
    for ent_type, count in total_entity_type_percentages.items()
}

print("Named Entity Type Percentages:")
for ent_type, percentage in overall_entity_type_percentages.items():
    print(f"{ent_type}: {percentage:.2f}%")