In [14]:
import pandas as pd

In [15]:
df = pd.read_csv('../../shared_data/processed/merged_data.csv', low_memory=False)

In [16]:
TASKS_WITH_TEXT = [
    'intimacy', 
    'politeness', 
    'offensiveness',
    'diaz',
    'dices-350'
]

df_text = df[(df['bad_users'] == False) & (df['task'].isin(TASKS_WITH_TEXT))].copy()

In [17]:
SOCDEM = ['gender', 'age', 'race', 'education']

In [18]:
df_text['socdem'] = df_text.apply(lambda row: '|'.join([row[attr] for attr in SOCDEM]), axis=1)

In [19]:
df_text.columns

Index(['instance_id', 'text', 'user_id', 'gender', 'age', 'race', 'education',
       'label', 'labels', 'task', 'instance_split', 'user_split', 'bad_users',
       'socdem'],
      dtype='object')

Number of unique users for each sociodemographic profile

In [20]:
socdem_counts = df_text.groupby(['task', 'socdem'])[['user_id']].nunique().reset_index()
socdem_counts.sort_values('user_id', ascending=False)

Unnamed: 0,task,socdem,user_id
81,diaz,Woman|50-59|White|Some college or associate's ...,86
35,diaz,Man|60-69|White|Some college or associate's de...,84
100,diaz,Woman|60-69|White|Some college or associate's ...,83
31,diaz,Man|60-69|White|College degree,77
17,diaz,Man|50-59|White|Some college or associate's de...,62
...,...,...,...
41,diaz,Man|70-79|Native American|Some college or asso...,1
42,diaz,Man|70-79|Other|Graduate degree,1
493,politeness,Woman|40-44|Black|Unknown,1
494,politeness,Woman|40-44|Hispanic/Latino|College degree,1


Number of users for each sociodemographic profile per task

In [21]:
socdem_counts.groupby('task').describe()

Unnamed: 0_level_0,user_id,user_id,user_id,user_id,user_id,user_id,user_id,user_id
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
task,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
diaz,122.0,12.139344,19.346478,1.0,1.0,3.5,11.0,86.0
dices-350,45.0,2.311111,1.458864,1.0,1.0,2.0,3.0,6.0
intimacy,105.0,2.485714,2.45386,1.0,1.0,1.0,3.0,16.0
offensiveness,104.0,2.519231,2.421509,1.0,1.0,1.0,4.0,17.0
politeness,154.0,3.285714,4.12186,1.0,1.0,2.0,4.0,24.0


Number of annotators that have a unique profile

In [22]:
unique_socdem_users = socdem_counts[socdem_counts['user_id'] == 1].groupby('task')[['user_id']].count()
unique_socdem_users

Unnamed: 0_level_0,user_id
task,Unnamed: 1_level_1
diaz,37
dices-350,18
intimacy,55
offensiveness,54
politeness,74


In [23]:
all_users = socdem_counts.groupby('task')[['user_id']].sum()
unique_socdem_users / all_users

Unnamed: 0_level_0,user_id
task,Unnamed: 1_level_1
diaz,0.024983
dices-350,0.173077
intimacy,0.210728
offensiveness,0.206107
politeness,0.146245


### Un/seen profiles in *user* test splits
Instance-based split does not have unseen users/raters

In [24]:
train_profiles = df_text[df_text['user_split'] == 'train'].groupby('task')[['socdem']].agg(lambda x: set(x))

In [25]:
train_profiles

Unnamed: 0_level_0,socdem
task,Unnamed: 1_level_1
diaz,"{Man|60-69|Asian|High school or below, Woman|8..."
dices-350,"{Man|gen z|White|Unknown, Man|gen z|Black|High..."
intimacy,"{Woman|25-29|Unknown|College degree, Man|25-29..."
offensiveness,"{Man|50-59|White|College degree, Woman|50-59|W..."
politeness,"{Man|25-29|Black|High school or below, Woman|5..."


In [26]:
df_text['socdem_in_user_train'] = df_text[df_text['user_split'] == 'test'] \
.apply(lambda row: row['socdem'] in train_profiles['socdem'].loc[row['task']], axis = 1)

In [27]:
all_ratings_profiles = df_text[df_text['user_split'] == 'test'].groupby('task')['instance_id'].count()

In [28]:
seen_ratings_profiles = df_text.groupby('task')['socdem_in_user_train'].sum()
seen_ratings_profiles

task
diaz             11954
dices-350         5950
intimacy          1913
offensiveness     1796
politeness        3762
Name: socdem_in_user_train, dtype: object

In [29]:
seen_ratings_profiles / all_ratings_profiles

task
diaz             0.982898
dices-350         0.73913
intimacy          0.75315
offensiveness    0.681335
politeness       0.746132
dtype: object