In [None]:
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from utils import contingency_tables

# Configure the global seaborn theme in a single call: dark-grid background with
# fonts scaled by 1.8. Every theme call resets all theme parameters, so a
# separate style-only call followed by a font-scale-only call ends in exactly
# this state; `sns.set` is merely an alias of `sns.set_theme`.
sns.set_theme(style="darkgrid", font_scale=1.8)
# Seaborn's "colorblind" palette, stored for reuse.
colors = sns.color_palette("colorblind")

# Load groundtruth

In [None]:
# Read the ground-truth table from disk, using the "stimulus_id" column as the row index.
groundtruth_df = pd.read_csv(
    "groundtruth.csv",
    index_col="stimulus_id",
)

# Upload date

In [None]:
# Order the raw upload dates, then keep the first four characters of their
# string form as the "year".
# NOTE(review): assumes each upload_date starts with a four-digit year — confirm the stored format.
sorted_upload_dates = groundtruth_df["upload_date"].rename("Upload year").sort_values()
upload_years = sorted_upload_dates.astype("str").str[:4]

# One bar per year.
fig, ax = plt.subplots(figsize=(12, 8))
sns.histplot(x=upload_years, discrete=True, ax=ax)

# Write each bar's height just below its top edge; bars of height 10 or less
# receive an empty label.
for bar in ax.patches:
    bar_height = bar.get_height()
    label = bar_height if bar_height > 10 else ""
    ax.annotate(
        f'\n{label}',
        (bar.get_x() + 0.3, bar_height),
        ha='center',
        va='top',
        color='white',
        size=16,
    )

fig.savefig("upload_years.pdf", bbox_inches="tight")
plt.show()

# From here on the years are held as integers instead of strings.
upload_years = upload_years.astype(int)

# Gender of the actors

In [None]:
# Independent copy of the "all_genders" column, so later edits to it do not
# touch groundtruth_df.
target_groundtruth = groundtruth_df['all_genders'].copy()

# Index labels of the rows falling in each gender category.
boys_commercials = target_groundtruth.loc[target_groundtruth.eq('Boys/men')].index
girls_commercials = target_groundtruth.loc[target_groundtruth.eq('Girls/women')].index
mixed_commercials = target_groundtruth.loc[target_groundtruth.eq('Mixed')].index

# The "no actors" category is identified by its full answer text.
no_actors_label = 'There are no actors/presenters or you can never see their faces'
no_actors_commercials = target_groundtruth.loc[target_groundtruth.eq(no_actors_label)].index

In [None]:
# Replace the long "no actors" answer text with a short label (in place, on the
# copied series) before plotting.
is_no_actors = target_groundtruth == 'There are no actors/presenters or you can never see their faces'
target_groundtruth.loc[is_no_actors] = 'No actors'
# The series name becomes the x-axis label of the plot.
temp = target_groundtruth.rename("Predominant gender of the actors/presenters")

fig, ax = plt.subplots(figsize=(12, 8))
sns.histplot(x=temp, ax=ax)

# Write each bar's height just below its top edge.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(
        f'\n{bar_height}',
        (bar.get_x() + 0.2, bar_height),
        ha='center',
        va='top',
        color='white',
        size=20,
    )
plt.show()

In [None]:
# Number of rows per "all_genders" category, read straight from the dataframe.
groundtruth_df["all_genders"].value_counts()

# Voice type

In [None]:
# Call the project helper `contingency_tables` (imported from utils) for the
# "voice_type" column; it returns two objects, unpacked here.
# NOTE(review): `expected` presumably holds the expected counts used by a
# chi-square test — confirm against utils.contingency_tables.
cont_tab, expected = contingency_tables(groundtruth_df,"voice_type")
# Show the first returned object as the cell output.
cont_tab

# Voice age

In [None]:
# Call the project helper `contingency_tables` (imported from utils) for the
# "voice_age" column; this rebinds `cont_tab` and `expected`.
# NOTE(review): `expected` presumably holds the expected counts used by a
# chi-square test — confirm against utils.contingency_tables.
cont_tab, expected = contingency_tables(groundtruth_df,"voice_age")
# Show the first returned object as the cell output.
cont_tab

# Voice gender

### Merge cases to improve trustworthiness of the chi-square test
Merge "Unclear" into "BOTH feminine and masculine voices"

In [None]:
# Merge rules for the "voice_gender" column: each rule lists the existing
# categories to fold together ("cases_to_merge") and the name of the resulting
# category ("merged_name").
merge_cases = [
    dict(
        merged_name="BOTH feminine and masculine voices",
        cases_to_merge=["BOTH feminine and masculine voices", "Unclear"],
    ),
]

# Pass the rules on to the project helper and show the first returned object.
voice_gender_tables = contingency_tables(groundtruth_df, "voice_gender", merge_cases=merge_cases)
cont_tab, expected = voice_gender_tables
cont_tab

In [None]:
# Apply the merge rules to the dataframe itself: every row whose voice_gender is
# one of a rule's "cases_to_merge" is relabelled with that rule's "merged_name".
# This modifies groundtruth_df in place.
for merge_rule in merge_cases:
    needs_relabel = groundtruth_df.voice_gender.isin(merge_rule["cases_to_merge"])
    groundtruth_df.loc[needs_relabel, "voice_gender"] = merge_rule["merged_name"]

# Voice gender exaggeration

In [None]:
# Call the project helper `contingency_tables` (imported from utils) for the
# "voice_exagg" column; this rebinds `cont_tab` and `expected`.
# NOTE(review): `expected` presumably holds the expected counts used by a
# chi-square test — confirm against utils.contingency_tables.
cont_tab, expected = contingency_tables(groundtruth_df,"voice_exagg")
# Show the first returned object as the cell output.
cont_tab