In [None]:
import pandas as pd

# Read the tab-separated dataset, then gather the 'text' column into two plain
# Python lists: rows whose 'label' is 1 go to text_hate, rows whose 'label'
# is 0 go to text_no_hate.
data = pd.read_csv('data/MetaHate<lang>.tsv', sep='\t')
text_hate = data['text'][data['label'] == 1].tolist()
text_no_hate = data['text'][data['label'] == 0].tolist()

In [None]:
import spacy
from tqdm import tqdm
from collections import Counter

# spaCy pipeline shared by get_pronoun_counts() for tagging and morphology.
# NOTE(review): "<es_pt_en>" looks like a placeholder for the language prefix
# of the model name -- replace it with a real model name before running.
nlp = spacy.load("<es_pt_en>_core_news_sm")

# Lower-cased personal-pronoun forms per language. Every entry holds six
# groups in the order the counts are returned: first person singular, first
# person plural, second person singular, second person plural, third person
# singular, third person plural.
_PRONOUN_FORMS = {
    "es": (
        ("yo",),
        ("nosotros", "nosotras"),
        ("tu", "usted", "vos", "tú"),
        ("vosotros", "vosotras", "ustedes"),
        ("él", "el", "ella"),
        ("ellos", "ellas"),
    ),
    "pt": (
        ("eu",),
        ("nos", "nós"),
        ("tu", "você"),
        ("vos", "vós", "voces", "vocês"),
        ("ele", "ela"),
        ("eles", "elas"),
    ),
    "gl": (
        ("eu",),
        ("nos", "nós", "nosoutros", "nosoutras"),
        ("ti", "tu"),
        ("vos", "vós", "vosoutros", "vosoutras"),
        ("el", "ela", "vostede"),
        ("eles", "elas", "vostedes"),
    ),
}

# Values of the "Tense" morphological feature that are tallied, listed in the
# order they are checked and returned.
_COUNTED_TENSES = ("Past", "Pres", "Fut")


def get_pronoun_counts(text, language="gl"):
    """Count personal pronouns by person/number and verbs by tense in *text*.

    The text is analysed with the module-level ``nlp`` pipeline. Only tokens
    tagged ``PRON`` contribute to the pronoun counts, and they are matched
    case-insensitively against the forms listed for *language*. Only tokens
    tagged ``VERB`` or ``AUX`` contribute to the tense counts.

    Args:
        text: The string to analyse.
        language: Key selecting the pronoun inventory: ``"es"``, ``"pt"`` or
            ``"gl"``. Defaults to ``"gl"``, the inventory that was previously
            hard-coded.

    Returns:
        A 9-tuple of ints: first person singular, first person plural,
        second person singular, second person plural, third person singular,
        third person plural, past, present, future.

    Raises:
        ValueError: If *language* has no pronoun inventory.
    """
    try:
        pronoun_groups = _PRONOUN_FORMS[language]
    except KeyError:
        supported = ", ".join(sorted(_PRONOUN_FORMS))
        raise ValueError(
            f"Unsupported language {language!r}; expected one of: {supported}"
        ) from None

    doc = nlp(text)

    pronoun_counter = Counter(
        token.text.lower() for token in doc if token.pos_ == "PRON"
    )
    # A Counter yields 0 for forms that never occurred, so absent pronouns
    # simply add nothing to their group total.
    person_counts = tuple(
        sum(pronoun_counter[form] for form in forms) for forms in pronoun_groups
    )

    tense_counts = dict.fromkeys(_COUNTED_TENSES, 0)
    for token in doc:
        if token.pos_ not in ("VERB", "AUX"):
            continue
        tense = token.morph.get("Tense")
        # Each verb is counted at most once; when several tense values are
        # present the first match in _COUNTED_TENSES wins. Verbs carrying any
        # other tense value (or none) are not tallied.
        for tense_name in _COUNTED_TENSES:
            if tense_name in tense:
                tense_counts[tense_name] += 1
                break

    return person_counts + tuple(
        tense_counts[tense_name] for tense_name in _COUNTED_TENSES
    )

# Sum the nine per-text counts returned by get_pronoun_counts() over every
# text in text_hate. Positions 0-5 are the pronoun categories, 6-8 the tenses.
_totals = [0] * 9
for text in tqdm(text_hate):
    for _slot, _count in enumerate(get_pronoun_counts(text)):
        _totals[_slot] += _count

(
    total_first_person_singular_count,
    total_first_person_plural_count,
    total_second_person_singular_count,
    total_second_person_plural_count,
    total_third_person_singular_count,
    total_third_person_plural_count,
    total_past_count,
    total_present_count,
    total_future_count,
) = _totals

total_pronouns = sum(_totals[:6])
total_verbs = sum(_totals[6:])


def _percentage(part, whole):
    """Return *part* as a percentage of *whole*, or 0 when *whole* is not positive."""
    return (part / whole) * 100 if whole > 0 else 0


# Share of each pronoun category among all counted pronouns.
first_person_singular_percentage = _percentage(total_first_person_singular_count, total_pronouns)
first_person_plural_percentage = _percentage(total_first_person_plural_count, total_pronouns)
second_person_singular_percentage = _percentage(total_second_person_singular_count, total_pronouns)
second_person_plural_percentage = _percentage(total_second_person_plural_count, total_pronouns)
third_person_singular_percentage = _percentage(total_third_person_singular_count, total_pronouns)
third_person_plural_percentage = _percentage(total_third_person_plural_count, total_pronouns)

# Share of each tense among the verbs that were tallied under some tense.
past_percentage = _percentage(total_past_count, total_verbs)
present_percentage = _percentage(total_present_count, total_verbs)
future_percentage = _percentage(total_future_count, total_verbs)

print(f"Total Pronouns: {total_pronouns}")
print(f"First Person Singular Percentage: {first_person_singular_percentage:.2f}%")
print(f"First Person Plural Percentage: {first_person_plural_percentage:.2f}%")
print(f"Second Person Singular Percentage: {second_person_singular_percentage:.2f}%")
print(f"Second Person Plural Percentage: {second_person_plural_percentage:.2f}%")
print(f"Third Person Singular Percentage: {third_person_singular_percentage:.2f}%")
print(f"Third Person Plural Percentage: {third_person_plural_percentage:.2f}%")

print(f"Total Verbs: {total_verbs}")
print(f"Past Tense Percentage: {past_percentage:.2f}%")
print(f"Present Tense Percentage: {present_percentage:.2f}%")
print(f"Future Tense Percentage: {future_percentage:.2f}%")