# NER

In [None]:
!pip install spacy

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
import pandas as pd

import spacy
from tqdm import tqdm
from collections import Counter

# Load the spaCy "en_core_web_sm" pipeline once at module level;
# analyze_named_entities() calls this shared object for every text.
nlp = spacy.load("en_core_web_sm")

## Reading the data

In [None]:
# Read the tab-separated file. Because `names` is supplied without `header`,
# pandas treats every line of the file (including the first) as data.
data = pd.read_csv(
    '/data/metahate.csv',
    sep='\t',
    names=['label', 'text'],
)

# Split the texts by label: 1 -> hate, 0 -> no hate.
text_hate = data['text'][data['label'] == 1].tolist()
text_no_hate = data['text'][data['label'] == 0].tolist()

In [None]:
def analyze_named_entities(text):
    """
    Analyze named entities in the given text.

    Parameters:
    - text (str): The input text to analyze.

    Returns:
    Tuple[dict, Union[tuple, None]]: A tuple containing:
        - A dictionary mapping each entity label found in the text to the
          percentage (0-100) of the text's entity mentions carrying that label.
          Empty if no entities are found.
        - The most frequently mentioned entity as an ``(entity_text, entity_label)``
          tuple, or None if no entities are found. When several entities are
          tied, the one that appears first in the text is returned.
    """
    doc = nlp(text)

    named_entities = [(ent.text, ent.label_) for ent in doc.ents]

    if not named_entities:
        return {}, None

    total_entities = len(named_entities)
    entity_type_counts = Counter(ent_type for _, ent_type in named_entities)
    entity_type_percentages = {
        ent_type: count / total_entities * 100
        for ent_type, count in entity_type_counts.items()
    }

    # Identify the most referenced named entity by its occurrence count.
    # most_common(1) yields [((entity_text, entity_label), count)]. Iterating
    # the Counter directly would yield only the keys, so a key function of
    # x[1] would compare label strings instead of counts.
    most_referenced_entity, _ = Counter(named_entities).most_common(1)[0]

    return entity_type_percentages, most_referenced_entity

## Analyzing named entities

In [None]:
# Corpus under analysis; swap in text_no_hate to analyze the other class.
text_data = text_hate # Later for text_no_hate

# One (entity_type_percentages, most_referenced_entity) pair per text, in the
# same order as text_data; tqdm displays progress while the texts are processed.
results = list(map(analyze_named_entities, tqdm(text_data)))

## Analyzing and summarizing named entities across the texts

In [None]:
total_entity_type_percentages = Counter()
most_referenced_entities = {}

# results[i] was computed from text_data[i], so pair them positionally.
# Recovering the text with results.index(...) rescans the list on every
# iteration (quadratic overall) and always returns the FIRST equal result, so
# texts whose result duplicated an earlier one were never recorded.
for text, (entity_type_percentages, most_referenced_entity) in tqdm(
    zip(text_data, results), total=len(results)
):
    # Counter.update adds values key by key, so this accumulates the per-text
    # percentages for each entity label.
    total_entity_type_percentages.update(entity_type_percentages)

    if most_referenced_entity:
        most_referenced_entities[text] = most_referenced_entity


# Note: this is the sum of the accumulated per-text percentages (not a raw
# entity count); dividing by it rescales the label shares to add up to 100.
total_entities_across_texts = sum(total_entity_type_percentages.values())
overall_entity_type_percentages = {
    ent_type: count / total_entities_across_texts * 100
    for ent_type, count in total_entity_type_percentages.items()
}

## Printing the results

In [None]:
# Report each entity label's overall share, formatted to two decimal places.
print("Named Entity Type Percentages:")
for ent_type in overall_entity_type_percentages:
    percentage = overall_entity_type_percentages[ent_type]
    print(f"{ent_type}: {percentage:.2f}%")