In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../../shared_data/processed/merged_data.csv', low_memory=False)

In [3]:
TASKS_WITH_TEXT = [
    'intimacy', 
    'politeness', 
    'offensiveness',
    'diaz'
]

## Attributes

In [4]:
df['instance_id'].nunique()

23122

In [5]:
df['gender'].value_counts(dropna=False)

gender
Woman         134634
Man           117967
Non-binary      2245
Other            364
Unknown          149
Name: count, dtype: int64

In [6]:
df['age'].value_counts(dropna=False)

age
millenial    54142
gen z        34499
60-69        33668
50-59        32397
gen x+       26512
18-24        21097
70-79        12118
30-34        10268
25-29         9978
35-39         6936
40-44         6347
45-49         5293
80-89         1643
Unknown        285
90-99          132
100+            44
Name: count, dtype: int64

In [7]:
df['race'].value_counts(dropna=False)

race
White                              125532
Asian                               39029
Black                               32921
Other                               30025
Hispanic/Latino                     17195
Multiracial                          8732
Native American                      1085
Pacific Islander                      317
Unknown                               290
Arab                                   99
Middle Eastern                         84
American India or Alaska Native        50
Name: count, dtype: int64

In [8]:
df['education'].value_counts(dropna=False)

education
College degree or higher              90304
High school or below                  49648
College degree                        39713
NaN                                   28958
Some college or associate's degree    23044
Graduate degree                       18477
Unknown                                3595
Less than high school                  1620
Name: count, dtype: int64

### Attributes for Tasks with Text

In [9]:
df_text = df[df['task'].isin(TASKS_WITH_TEXT)]

In [10]:
df_text['age'].value_counts(dropna=False)

age
60-69      33223
50-59      30697
70-79      12058
18-24       7925
30-34       6287
25-29       5481
35-39       4991
40-44       4400
45-49       4082
80-89       1643
Unknown      285
90-99        132
100+          44
Name: count, dtype: int64

In [11]:
df_text['gender'].value_counts(dropna=False)

gender
Woman         55360
Man           54155
Non-binary     1584
Unknown         149
Name: count, dtype: int64

In [12]:
df_text['race'].value_counts(dropna=False)

race
White                              83215
Black                              14508
Asian                               7265
Hispanic/Latino                     2325
Other                               1325
Native American                     1085
Multiracial                          685
Pacific Islander                     317
Unknown                              290
Arab                                  99
Middle Eastern                        84
American India or Alaska Native       50
Name: count, dtype: int64

In [13]:
df_text['education'].value_counts(dropna=False)

education
College degree                        39713
High school or below                  27249
Some college or associate's degree    23044
Graduate degree                       18477
Less than high school                  1620
Unknown                                1145
Name: count, dtype: int64

In [14]:
for attribute in ['age', 'race', 'gender', 'education']:
    values = df[attribute].unique()
    values = [v for v in values if type(v) == str]
    tokens = [value.upper().replace(" ", "_").replace("'S", "") for value in values]
    tokens = [f'___{token}___' for token in tokens]
    token_values_map = {token: value for token, value in zip(tokens, values)}
    print(token_values_map)

{'___45-49___': '45-49', '___25-29___': '25-29', '___18-24___': '18-24', '___30-34___': '30-34', '___50-59___': '50-59', '___60-69___': '60-69', '___UNKNOWN___': 'Unknown', '___35-39___': '35-39', '___40-44___': '40-44', '___70-79___': '70-79', '___90-99___': '90-99', '___80-89___': '80-89', '___MILLENIAL___': 'millenial', '___GEN_X+___': 'gen x+', '___GEN_Z___': 'gen z', '___100+___': '100+'}
{'___WHITE___': 'White', '___ASIAN___': 'Asian', '___MULTIRACIAL___': 'Multiracial', '___BLACK___': 'Black', '___HISPANIC/LATINO___': 'Hispanic/Latino', '___UNKNOWN___': 'Unknown', '___PACIFIC_ISLANDER___': 'Pacific Islander', '___NATIVE_AMERICAN___': 'Native American', '___ARAB___': 'Arab', '___AMERICAN_INDIA_OR_ALASKA_NATIVE___': 'American India or Alaska Native', '___OTHER___': 'Other', '___MIDDLE_EASTERN___': 'Middle Eastern'}
{'___MAN___': 'Man', '___WOMAN___': 'Woman', '___NON-BINARY___': 'Non-binary', '___UNKNOWN___': 'Unknown', '___OTHER___': 'Other'}
{'___HIGH_SCHOOL_OR_BELOW___': 'High 

### Attribute values description length

In [15]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(
    'meta-llama/Meta-Llama-3-8B'
)

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
records = []
for attribute in ['age', 'race', 'gender', 'education']:
    values = df[attribute].unique()
    values = [v for v in values if type(v) == str]
    lens = [len(tokenizer(v).input_ids[1:]) for v in values]
    records.extend([{
        'attribute': attribute, 
        'description': v,
        'length': l
    } for v, l in zip(values, lens)])

In [17]:
df_desc = pd.DataFrame.from_records(records)

In [18]:
# highest number of annotators for any task? This number minus one is the highest normalized id
df.groupby('task')['user_id'].nunique()

task
diaz                 1481
dices-350             123
dices-990             172
intimacy              261
offensiveness         262
politeness            506
yi-ding-cscw-2022     886
Name: user_id, dtype: int64

In [19]:
# max len preprocessed user_ids
len(tokenizer('unique identifier 1480')['input_ids'][1:])

5

In [20]:
df_desc.groupby('attribute')['length'].max()

attribute
age          3
education    6
gender       2
race         5
Name: length, dtype: int64

## Tasks, Splits, Labels

In [21]:
df['task'].value_counts()

task
dices-990            72103
diaz                 60654
dices-350            43050
yi-ding-cscw-2022    28958
politeness           25042
offensiveness        13036
intimacy             12516
Name: count, dtype: int64

### Number of ratings for each task per split 

Here, a rating is an annotation by a specific annotator on an instance. That is, instances here a not unique but the combination of instance and annotator is 

#### Instance Split

In [22]:
df.groupby(['task','instance_split']).count()[['instance_id']]

Unnamed: 0_level_0,Unnamed: 1_level_0,instance_id
task,instance_split,Unnamed: 2_level_1
diaz,test,12133
diaz,train,42519
diaz,val,6002
dices-350,test,8610
dices-350,train,30012
dices-350,val,4428
dices-990,test,14491
dices-990,train,50477
dices-990,val,7135
intimacy,test,2490


##### Datasets by Training Set Size (Ratings)

In [23]:
df[(df['instance_split'] == 'train') & (df['task'] != 'yi-ding-cscw-2022')].groupby(['task']).count()[['instance_id']].rename(columns={'instance_id': 'ratings in train set'}).sort_values(by='ratings in train set', ascending=False)

Unnamed: 0_level_0,ratings in train set
task,Unnamed: 1_level_1
dices-990,50477
diaz,42519
dices-350,30012
politeness,17524
offensiveness,9144
intimacy,8784


#### User Split

In [24]:
df.groupby(['task','user_split']).count()[['instance_id']]

Unnamed: 0_level_0,Unnamed: 1_level_0,instance_id
task,user_split,Unnamed: 2_level_1
diaz,test,12162
diaz,train,42413
diaz,val,6079
dices-350,test,8750
dices-350,train,30100
dices-350,val,4200
dices-990,test,12379
dices-990,train,51803
dices-990,val,7921
intimacy,test,2540


### Number of unique instances for each task

Here we count unique instances, not ratings

#### Instance Split

In [25]:
df.drop_duplicates(subset='instance_id').groupby(['task','instance_split']).count()[['instance_id']]

Unnamed: 0_level_0,Unnamed: 1_level_0,instance_id
task,instance_split,Unnamed: 2_level_1
diaz,test,2815
diaz,train,9849
diaz,val,1407
dices-350,test,70
dices-350,train,244
dices-350,val,36
dices-990,test,199
dices-990,train,693
dices-990,val,98
intimacy,test,399


#### User Split

In [26]:
df.drop_duplicates(subset='instance_id').groupby(['task','instance_split']).count()[['instance_id']]

Unnamed: 0_level_0,Unnamed: 1_level_0,instance_id
task,instance_split,Unnamed: 2_level_1
diaz,test,2815
diaz,train,9849
diaz,val,1407
dices-350,test,70
dices-350,train,244
dices-350,val,36
dices-990,test,199
dices-990,train,693
dices-990,val,98
intimacy,test,399


### Number of unique users/annotators for each task

#### Instance Split

In [27]:
df.groupby(['task','instance_split'])[['user_id']].nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id
task,instance_split,Unnamed: 2_level_1
diaz,test,1481
diaz,train,1481
diaz,val,1447
dices-350,test,123
dices-350,train,123
dices-350,val,123
dices-990,test,172
dices-990,train,171
dices-990,val,169
intimacy,test,261


#### User Split

In [28]:
df.groupby(['task','user_split'])[['user_id']].nunique()

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id
task,user_split,Unnamed: 2_level_1
diaz,test,297
diaz,train,1036
diaz,val,148
dices-350,test,25
dices-350,train,86
dices-350,val,12
dices-990,test,35
dices-990,train,120
dices-990,val,17
intimacy,test,53


### Label distributions for each task

In [29]:
df.groupby(['task', 'label']).count()[['instance_id']]

Unnamed: 0_level_0,Unnamed: 1_level_0,instance_id
task,label,Unnamed: 2_level_1
diaz,0.0,3082
diaz,0.25,11997
diaz,0.5,25235
diaz,0.75,14359
diaz,1.0,5981
dices-350,0.0,26292
dices-350,0.5,2694
dices-350,1.0,14064
dices-990,0.0,48995
dices-990,0.5,3751


### Distribution of number of annotations per instance (instance split)

In [30]:
df.groupby(['task', 'instance_split', 'instance_id']).count()['user_id'].groupby(['task', 'instance_split']).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
task,instance_split,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
diaz,test,2815.0,4.310124,0.78911,4.0,4.0,4.0,4.0,10.0
diaz,train,9849.0,4.317088,0.826472,4.0,4.0,4.0,4.0,12.0
diaz,val,1407.0,4.265814,0.755532,4.0,4.0,4.0,4.0,10.0
dices-350,test,70.0,123.0,0.0,123.0,123.0,123.0,123.0,123.0
dices-350,train,244.0,123.0,0.0,123.0,123.0,123.0,123.0,123.0
dices-350,val,36.0,123.0,0.0,123.0,123.0,123.0,123.0,123.0
dices-990,test,199.0,72.819095,1.149193,69.0,72.0,73.0,74.0,75.0
dices-990,train,693.0,72.838384,1.189974,69.0,72.0,73.0,74.0,76.0
dices-990,val,98.0,72.806122,1.022007,71.0,72.0,73.0,74.0,75.0
intimacy,test,399.0,6.240602,0.983408,1.0,6.0,6.0,7.0,7.0


### Distribution of number of annotations per instance (user split)

In [31]:
df.groupby(['task', 'user_split', 'instance_id']).count()['user_id'].groupby(['task', 'user_split']).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
task,user_split,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
diaz,test,9017.0,1.348786,0.58921,1.0,1.0,1.0,2.0,6.0
diaz,train,13991.0,3.031449,1.012155,1.0,2.0,3.0,4.0,10.0
diaz,val,5116.0,1.188233,0.453019,1.0,1.0,1.0,1.0,6.0
dices-350,test,350.0,25.0,0.0,25.0,25.0,25.0,25.0,25.0
dices-350,train,350.0,86.0,0.0,86.0,86.0,86.0,86.0,86.0
dices-350,val,350.0,12.0,0.0,12.0,12.0,12.0,12.0,12.0
dices-990,test,990.0,12.50404,2.021993,7.0,11.0,13.0,14.0,18.0
dices-990,train,990.0,52.326263,2.089895,46.0,51.0,52.0,54.0,58.0
dices-990,val,990.0,8.00101,0.986767,5.0,7.0,8.0,9.0,11.0
intimacy,test,1508.0,1.68435,0.826627,1.0,1.0,1.0,2.0,5.0


### Distribution of number of annotations per user (instance split)

In [32]:
df.groupby(['task', 'instance_split', 'user_id']).count()['instance_id'].groupby(['task', 'instance_split']).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
task,instance_split,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
diaz,test,1481.0,8.192438,2.592108,2.0,7.0,8.0,10.0,17.0
diaz,train,1481.0,28.709656,3.250691,18.0,27.0,29.0,31.0,38.0
diaz,val,1447.0,4.147892,1.840521,1.0,3.0,4.0,5.0,11.0
dices-350,test,123.0,70.0,0.0,70.0,70.0,70.0,70.0,70.0
dices-350,train,123.0,244.0,0.0,244.0,244.0,244.0,244.0,244.0
dices-350,val,123.0,36.0,0.0,36.0,36.0,36.0,36.0,36.0
dices-990,test,172.0,84.25,59.323587,1.0,51.75,71.5,95.5,199.0
dices-990,train,171.0,295.187135,204.554811,1.0,189.5,234.0,333.5,693.0
dices-990,val,169.0,42.218935,28.576016,1.0,26.0,33.0,47.0,98.0
intimacy,test,261.0,9.54023,2.923925,1.0,8.0,9.0,11.0,19.0


### Distribution of number of annotations per user (user split)

In [33]:
df.groupby(['task', 'user_split', 'user_id']).count()['instance_id'].groupby(['task', 'user_split']).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
task,user_split,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
diaz,test,297.0,40.949495,1.01049,40.0,40.0,41.0,42.0,44.0
diaz,train,1036.0,40.939189,0.979583,40.0,40.0,41.0,42.0,44.0
diaz,val,148.0,41.074324,1.004014,40.0,40.0,41.0,42.0,44.0
dices-350,test,25.0,350.0,0.0,350.0,350.0,350.0,350.0,350.0
dices-350,train,86.0,350.0,0.0,350.0,350.0,350.0,350.0,350.0
dices-350,val,12.0,350.0,0.0,350.0,350.0,350.0,350.0,350.0
dices-990,test,35.0,353.685714,235.047151,2.0,270.0,330.0,364.0,990.0
dices-990,train,120.0,431.691667,303.876351,1.0,266.0,331.0,517.0,990.0
dices-990,val,17.0,465.941176,316.799122,47.0,330.0,345.0,720.0,990.0
intimacy,test,53.0,47.924528,2.234769,42.0,47.0,48.0,50.0,50.0


In [34]:
df[df['task'] == 'dices-990'].groupby(['user_id']).count()['instance_id'].sort_values()[:20]

user_id
dices-990_296740250043827     1
dices-990_297212357650581     2
dices-990_296835612486158     2
dices-990_297211239397858     7
dices-990_297212254019548    12
dices-990_297212383066956    16
dices-990_296767110136704    19
dices-990_297211944549766    21
dices-990_296767022094381    24
dices-990_297211746573149    27
dices-990_297211960409584    31
dices-990_297212554813874    38
dices-990_296767492941200    40
dices-990_296708638173257    47
dices-990_296709554586074    48
dices-990_296708617501272    62
dices-990_297211428021249    63
dices-990_296738897147130    73
dices-990_297212444066256    83
dices-990_296740085481271    95
Name: instance_id, dtype: int64

In [35]:
df.columns

Index(['instance_id', 'text', 'user_id', 'gender', 'age', 'race', 'education',
       'label', 'labels', 'task', 'instance_split', 'user_split', 'bad_users'],
      dtype='object')

In [36]:
df[df['task'] == 'diaz']['labels'].value_counts()

labels
{'sentiment': 'Neutral'}              25235
{'sentiment': 'Somewhat positive'}    14359
{'sentiment': 'Somewhat negative'}    11997
{'sentiment': 'Very positive'}         5981
{'sentiment': 'Very negative'}         3082
Name: count, dtype: int64

**Your Profile**: In terms of race or ethnicity, you are {RACE}. You are {AGE} years old. In terms of gender, you are a {GENDER}. The highest degree or level of school that you have completed is {EDUCATION}. **Question**: Consider you read this email from a colleague, how polite do you think it is? **Email:**: "% (A) not polite at all (B) barely polite (C) somewhat polite (D) moderately polite (E) very polite **Answer**: (

**Your Profile**: {GENDER} {AGE} {RACE} {EDUCATION}\n**Question**: What is the sentiment of the following text?\n**Text**: {TEXT}\n(A) Very negative\n(B) Somewhat negative\n(C) Neutral\n(D) Somewhat positive\n(E) Very positive\n**Answer**: ( 

**Your Profile**: In terms of race or ethnicity, you are % **Question**: Consider you read this email from a colleague, how polite do you think it is? **Email:**: "% (A) not polite at all (B) barely polite (C) somewhat polite (D) moderately polite (E) very polite **Answer**: (

In [37]:
df[df['task'] == 'dices-990']['labels'].value_counts()

labels
{'Q_overall': 'No'}        48995
{'Q_overall': 'Yes'}       19357
{'Q_overall': 'Unsure'}     3751
Name: count, dtype: int64

## Text

In [38]:
df['text'].str.len().describe()

count    226401.000000
mean        259.087871
std         244.660142
min           5.000000
25%          92.000000
50%         142.000000
75%         364.000000
max        1789.000000
Name: text, dtype: float64

In [39]:
df['text_len'] = df['text'].apply(lambda x: len(x.split(' ')) if type(x) == str else 0)
df['char_len'] = df['text'].str.len()

In [40]:
df.groupby('task')['text_len'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
diaz,60654.0,18.645135,6.599359,1.0,13.0,19.0,24.0,49.0
dices-350,43050.0,66.254286,53.958504,9.0,26.0,48.0,92.0,255.0
dices-990,72103.0,72.949184,46.554639,2.0,38.0,66.0,99.0,351.0
intimacy,12516.0,11.49209,5.965798,1.0,7.0,11.0,15.0,34.0
offensiveness,13036.0,30.710417,23.062201,4.0,13.0,24.0,43.0,99.0
politeness,25042.0,34.200423,24.115036,7.0,17.0,27.0,45.0,302.0
yi-ding-cscw-2022,28958.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
df.groupby('task')['char_len'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
diaz,60654.0,96.571471,32.743496,6.0,71.0,100.0,126.0,237.0
dices-350,43050.0,365.314286,295.04837,54.0,141.0,260.5,491.0,1421.0
dices-990,72103.0,407.063576,254.532768,11.0,220.0,369.0,553.0,1789.0
intimacy,12516.0,65.122084,33.145787,5.0,36.0,61.0,91.0,144.0
offensiveness,13036.0,169.370666,129.034811,15.0,68.0,128.0,236.0,604.0
politeness,25042.0,187.686207,136.604098,24.0,88.0,148.0,244.0,952.0
yi-ding-cscw-2022,0.0,,,,,,,


In [42]:
df.groupby('task')['char_len'].quantile(0.95)

task
diaz                  137.0
dices-350            1018.0
dices-990             892.0
intimacy              124.0
offensiveness         445.0
politeness            470.0
yi-ding-cscw-2022       NaN
Name: char_len, dtype: float64

In [43]:
df['num_tokens'] = df[(df['task'] == 'dices-990') | (df['task'] == 'politeness')]['text'].apply(lambda txt: len(tokenizer(txt).input_ids[1:]))

In [44]:
df.groupby('task')['num_tokens'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
diaz,0.0,,,,,,,
dices-350,0.0,,,,,,,
dices-990,72103.0,110.05929,62.149895,6.0,64.0,104.0,150.0,423.0
intimacy,0.0,,,,,,,
offensiveness,0.0,,,,,,,
politeness,25042.0,44.123592,30.327538,7.0,22.0,35.0,57.0,205.0
yi-ding-cscw-2022,0.0,,,,,,,


In [45]:
df.groupby('task')['num_tokens'].quantile(q=0.95)

task
diaz                   NaN
dices-350              NaN
dices-990            224.0
intimacy               NaN
offensiveness          NaN
politeness           109.0
yi-ding-cscw-2022      NaN
Name: num_tokens, dtype: float64

In [46]:
sum(df['text'].isna())

28958

In [47]:
df[df['text'].isna()]['task'].value_counts()

task
yi-ding-cscw-2022    28958
Name: count, dtype: int64

Dices has context and response instead of text

In [48]:
df['context'].dropna().iloc[0]

KeyError: 'context'

In [None]:
df['context'].dropna().str.split(' ').apply(lambda x: len(x)).describe()

count    43050.000000
mean        49.094286
std         51.576061
min          2.000000
25%         10.000000
50%         25.500000
75%         73.000000
max        231.000000
Name: context, dtype: float64

In [None]:
df['response'].dropna().iloc[0]

"I'm not picking up on your vibe, human."

In [None]:
df['response'].dropna().str.split(' ').apply(lambda x: len(x)).describe()

count    43050.000000
mean        18.160000
std         10.971915
min          2.000000
25%         11.000000
50%         14.500000
75%         24.000000
max         86.000000
Name: response, dtype: float64

In [None]:
# merge context and reponse for dices datasets
df.apply(lambda row: f'{row["context"]} \nLAMDA: {row["response"]}' if 'dices' in row['task'] else row['text'], axis=1)

0                                   everyone loves seungwoo
1                                              What a catch
2         F&amp;O: Nifty50 breaches key support, VIX spi...
3         @user Thank you e3c76605-fd74-477c-b4fa-c5b301...
4         dunno who needs to hear this but if they wante...
                                ...                        
205557                              I don't mind being old.
205558    That patronizing look of pity makes me want to...
205559                              I don't mind being old.
205560           Life and learning does not end in old age.
205561           Life and learning does not end in old age.
Length: 205562, dtype: object

## Attributes per Task and Split

In [None]:
for name, group in df.groupby(['task','instance_split']):
    if name[0] == 'offensiveness':
        print(name[1])
        print(group['race'].value_counts(dropna=False))

test
race
White              1981
Black               332
Asian               180
Native American      63
Hispanic/Latino      46
Arab                  8
Name: count, dtype: int64
train
race
White              6867
Black              1184
Asian               674
Native American     210
Hispanic/Latino     175
Arab                 34
Name: count, dtype: int64
val
race
White              956
Black              169
Asian               95
Hispanic/Latino     29
Native American     26
Arab                 7
Name: count, dtype: int64


In [None]:
for name, group in df.groupby(['task','user_split']):
    if name[0] == 'offensiveness':
        print(name[1])
        print(group['race'].value_counts(dropna=False))

test
race
White              1939
Black               398
Asian               149
Native American     100
Hispanic/Latino      50
Name: count, dtype: int64
train
race
White              6916
Black              1041
Asian               750
Native American     199
Hispanic/Latino     150
Arab                 49
Name: count, dtype: int64
val
race
White              949
Black              246
Asian               50
Hispanic/Latino     50
Name: count, dtype: int64


## Instance

In [None]:
df['sociodemographics'] = df.apply(lambda row: f'{row["age"]} {row["gender"]} {row["race"]} {row["education"]}', axis=1)

In [None]:
profiles = df[(df['task'] == 'offensiveness') & (df['instance_split'] == 'test')].groupby(['sociodemographics'])['user_id'].unique()
profile_counts = df[(df['task'] == 'offensiveness') & (df['instance_split'] == 'test')].groupby(['sociodemographics'])['user_id'].nunique().sort_values()

In [None]:
profile_counts.shape

(104,)

In [None]:
sum(profile_counts)

262

In [None]:
test_unique_background_users = []
for p in profiles[profiles.apply(lambda x: len(x)) == 1]:
    test_unique_background_users.extend(p)

In [None]:
train_all_users = df[(df['task'] == 'offensiveness') & (df['instance_split'] == 'train')]['user_id'].unique()

In [None]:
len(train_all_users)

262

In [None]:
len([u for u in test_unique_background_users if u in train_all_users])

54

## User

### Offensiveness

In [None]:
profiles = df[(df['task'] == 'offensiveness') & (df['user_split'] == 'test')].groupby(['sociodemographics'])['user_id'].unique()
profile_counts = df[(df['task'] == 'offensiveness') & (df['user_split'] == 'test')].groupby(['sociodemographics'])['user_id'].nunique().sort_values(ascending=False)

In [None]:
profile_counts

sociodemographics
18-24 Man White College degree                    2
30-34 Man White College degree                    2
50-59 Woman White College degree                  2
50-59 Woman White Graduate degree                 2
30-34 Woman White High school or below            2
60-69 Man White High school or below              2
60-69 Man White Graduate degree                   2
50-59 Man White College degree                    2
25-29 Man White Graduate degree                   1
25-29 Man Black College degree                    1
18-24 Man Native American High school or below    1
18-24 Woman White College degree                  1
18-24 Man White High school or below              1
30-34 Woman White College degree                  1
30-34 Woman Asian High school or below            1
30-34 Man White Graduate degree                   1
30-34 Man Black Graduate degree                   1
25-29 Non-binary White College degree             1
25-29 Woman Asian College degree              

In [None]:
profile_counts[profile_counts == 1].shape

(37,)

In [None]:
train_profiles = df[(df['task'] == 'offensiveness') & (df['user_split'] == 'train')].groupby(['sociodemographics'])['user_id'].unique()
train_profile_counts = df[(df['task'] == 'offensiveness') & (df['user_split'] == 'train')].groupby(['sociodemographics'])['user_id'].nunique().sort_values(ascending=False)

In [None]:
len(profile_counts.index)

45

In [None]:
profiles_not_in_train = set(profile_counts.index) - set(train_profile_counts.index)
len(profiles_not_in_train)

17

In [None]:
len(profiles_not_in_train) / len(profile_counts.index)

0.37777777777777777

In [None]:
profile_counts[profile_counts == 1].shape[0] 

37

In [None]:
unique_profiles = profile_counts[profile_counts == 1].index

In [None]:
len([p for p in unique_profiles if p in train_profile_counts.index])

20

### Politeness

In [None]:
profiles = df[(df['task'] == 'politeness') & (df['user_split'] == 'test')].groupby(['sociodemographics'])['user_id'].unique()
profile_counts = df[(df['task'] == 'politeness') & (df['user_split'] == 'test')].groupby(['sociodemographics'])['user_id'].nunique().sort_values(ascending=False)

In [None]:
profile_counts

sociodemographics
60-69 Woman White College degree          8
40-44 Woman White College degree          3
18-24 Man White College degree            3
35-39 Man White College degree            3
50-59 Man White College degree            3
                                         ..
60-69 Woman White Unknown                 1
40-44 Woman Black College degree          1
40-44 Woman Black Unknown                 1
40-44 Woman White High school or below    1
30-34 Woman White Graduate degree         1
Name: user_id, Length: 67, dtype: int64

In [None]:
profile_counts[profile_counts == 1].shape

(48,)

In [None]:
train_profiles = df[(df['task'] == 'politeness') & (df['user_split'] == 'train')].groupby(['sociodemographics'])['user_id'].unique()
train_profile_counts = df[(df['task'] == 'politeness') & (df['user_split'] == 'train')].groupby(['sociodemographics'])['user_id'].nunique().sort_values(ascending=False)

In [None]:
train_profile_counts

sociodemographics
60-69 Man White College degree            19
60-69 Woman White High school or below    15
60-69 Woman White College degree          14
50-59 Man White College degree            14
60-69 Man White Graduate degree           13
                                          ..
25-29 Woman Black Graduate degree          1
40-44 Man Black College degree             1
40-44 Man Asian Graduate degree            1
40-44 Man Asian College degree             1
40-44 Woman White Graduate degree          1
Name: user_id, Length: 126, dtype: int64

In [None]:
len(profile_counts.index)

67

In [None]:
profiles_not_in_train = set(profile_counts.index) - set(train_profile_counts.index)
len(profiles_not_in_train)

23

In [None]:
len(profiles_not_in_train) / len(profile_counts.index)

0.34328358208955223

In [None]:
len([p for p in unique_profiles if p in train_profile_counts.index])

26

## Annotator IDs per Task and Split

In [None]:
for name, group in df.groupby(['task','instance_split']):
    if name[0] == 'intimacy':
        print(name[1])
        print(group['user_id'].value_counts(dropna=False))

test
user_id
intimacy_1611    19
intimacy_800     18
intimacy_238     17
intimacy_1189    16
intimacy_790     16
                 ..
intimacy_1196     4
intimacy_1123     3
intimacy_1351     3
intimacy_309      3
intimacy_1044     1
Name: count, Length: 261, dtype: int64
train
user_id
intimacy_1611    72
intimacy_238     65
intimacy_1189    63
intimacy_1123    43
intimacy_1328    42
                 ..
intimacy_1117    24
intimacy_1387    22
intimacy_979     14
intimacy_1044    11
intimacy_309     10
Name: count, Length: 261, dtype: int64
val
user_id
intimacy_1189    19
intimacy_83      13
intimacy_628     12
intimacy_30      12
intimacy_537     11
                 ..
intimacy_1500     1
intimacy_547      1
intimacy_1337     1
intimacy_1113     1
intimacy_1009     1
Name: count, Length: 260, dtype: int64


In [None]:
# Dataset is sorted by user_id with different values for the split columns
# This means that the first user in train will also be the first user in the other splits if not shuffling
df[df['task'] == 'intimacy'][['user_id', 'text', 'instance_split']].head(20)

Unnamed: 0,user_id,text,instance_split
0,intimacy_106,everyone loves seungwoo,val
1,intimacy_106,What a catch,train
2,intimacy_106,"F&amp;O: Nifty50 breaches key support, VIX spi...",val
3,intimacy_106,@user Thank you e3c76605-fd74-477c-b4fa-c5b301...,train
4,intimacy_106,dunno who needs to hear this but if they wante...,train
5,intimacy_106,@NickAdamsinUSA They actually lured people to ...,train
6,intimacy_106,"Give a compliment where it’s due , I promise i...",train
7,intimacy_106,This is the team some say would win Champions ...,train
8,intimacy_106,if you treat someone differently just because ...,train
9,intimacy_106,Considering a bowling league,train
