In [1]:
from typing import Union
import json
import pandas as pd
import numpy as np

# Input file: read line by line by `read_dataset`, one JSON object per line.
DATASET_PATH = 'kumar_dataset.json'
# Output file: the exploded annotator sample is written here as CSV.
SAMPLE_PATH = '../../data/processed/kumar/sample_5000_annotators_v3.csv'

## Create sample from Kumar "Final" Dataset

Uses the "final" version of the disaggregated Kumar dataset, which 1) includes all participants and 2) confirms the existence of annotators under 18, who should be removed for individual-level analysis.

In [2]:
def read_dataset(path):
    """Load a JSON-lines annotation file into a long-format DataFrame.

    Every non-blank line of the file is parsed as one JSON object that must
    provide the keys ``comment``, ``comment_id``, ``source`` and ``ratings``
    (an iterable of dicts). The result has one row per entry of ``ratings``:
    the comment-level fields are repeated on each row and the keys of the
    rating dict become additional columns.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON-lines file. It is always decoded as UTF-8 so the
        result does not depend on the platform's default encoding.

    Returns
    -------
    pandas.DataFrame
        Columns ``comment``, ``id``, ``comment_id``, ``source`` followed by
        the rating keys. ``id`` is the zero-based index of the line in the
        file (blank lines are skipped but still counted).

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a parsed line lacks one of the required keys.
    """
    records = []
    with open(path, encoding='utf-8') as f:
        for index, line in enumerate(f):
            # Tolerate blank lines (e.g. a stray empty line at the end of the
            # file) instead of failing inside json.loads.
            if not line.strip():
                continue
            line_dict = json.loads(line)
            # Comment-level fields are identical for all ratings of this line,
            # so build them once.
            comment_fields = {
                'comment': line_dict['comment'],
                'id': index,
                'comment_id': line_dict['comment_id'],
                'source': line_dict['source']
            }
            for annotation in line_dict['ratings']:
                # Rating keys are merged last, so they win on a name clash.
                records.append({**comment_fields, **annotation})
    return pd.DataFrame(records)

In [3]:
df = read_dataset(DATASET_PATH)

In [4]:
df.shape

(538100, 31)

In [5]:
df.columns

Index(['comment', 'id', 'comment_id', 'source', 'toxic_score', 'is_profane',
       'is_threat', 'is_identity_attack', 'is_insult', 'is_sexual_harassment',
       'gender', 'gender_other', 'race', 'technology_impact',
       'uses_media_social', 'uses_media_news', 'uses_media_video',
       'uses_media_forums', 'personally_seen_toxic_content',
       'personally_been_target', 'identify_as_transgender',
       'toxic_comments_problem', 'education', 'age_range', 'lgbtq_status',
       'political_affilation', 'is_parent', 'religion_important',
       'fine_to_see_online', 'remove_from_online', 'worker_id'],
      dtype='object')

In [6]:
df[[c for c in df.columns if c not in ['comment', 'worker_id']]].head()

Unnamed: 0,id,comment_id,source,toxic_score,is_profane,is_threat,is_identity_attack,is_insult,is_sexual_harassment,gender,...,identify_as_transgender,toxic_comments_problem,education,age_range,lgbtq_status,political_affilation,is_parent,religion_important,fine_to_see_online,remove_from_online
0,0,1135_1,twitter,0,False,False,False,False,False,Male,...,No,Rarely a problem,Bachelor's degree in college (4-year),45 - 54,Heterosexual,Conservative,Yes,Very important,This is fine for me to see,This comment should be allowed
1,0,1135_1,twitter,0,False,False,False,False,False,Female,...,No,Frequently a problem,Bachelor's degree in college (4-year),25 - 34,Heterosexual,Liberal,No,Not important,This is fine for me to see,This comment should be allowed
2,0,1135_1,twitter,0,False,False,False,False,False,Female,...,No,Frequently a problem,Some college but no degree,25 - 34,Heterosexual,Prefer not to say,No,Not important,This is fine for me to see,This comment should be allowed
3,0,1135_1,twitter,0,False,False,False,False,False,Female,...,No,Occasionally a problem,Bachelor's degree in college (4-year),55 - 64,Heterosexual,Liberal,No,Not important,This is fine for me to see,This comment should be allowed
4,0,1135_1,twitter,0,False,False,False,False,False,Male,...,No,Very frequently a problem,Bachelor's degree in college (4-year),25 - 34,Heterosexual,Liberal,No,Not important,It depends on the context,This comment should be allowed


In [7]:
df['toxic_score'].value_counts()

0    281511
1     99973
2     71347
3     50300
4     34969
Name: toxic_score, dtype: int64

In [8]:
df['worker_id'].value_counts().describe()

count    17280.000000
mean        31.140046
std         20.158440
min         20.000000
25%         20.000000
50%         20.000000
75%         40.000000
max        240.000000
Name: worker_id, dtype: float64

In [9]:
df['comment_id'].value_counts().describe()

count    107620.0
mean          5.0
std           0.0
min           5.0
25%           5.0
50%           5.0
75%           5.0
max           5.0
Name: comment_id, dtype: float64

In [10]:
df['comment'].str.len().describe()

count    538100.000000
mean        112.083906
std         138.709632
min           3.000000
25%          48.000000
50%          80.000000
75%         139.000000
max        7754.000000
Name: comment, dtype: float64

### Binarizing labels

Kumar et al. use different aggregation strategies, one binarized and one keeping the original 5-point scale.

To aggregate the 5 ratings of each comment, they use the median score across ratings.

For binarization, they “convert every comment’s rating distribution into a binary verdict. We treat every comment with a median Likert score of “Moderately toxic” or higher as toxic and all other comments as benign.” (Kumar et al., 2021, p. 10)

When predicting decisions for individual annotators, we would not need this type of aggregation or binarization. However, binarizing makes the prediction task simpler.

In [11]:
# Binary label per rating: 1 when the 0-4 score is above 1, otherwise 0.
df['toxic'] = (df['toxic_score'] > 1).astype('int64')

In [12]:
df['toxic'].value_counts()

0    381484
1    156616
Name: toxic, dtype: int64

## Annotation statistics after preprocessing

In [13]:
# Keep only ratings whose annotator did not report the 'Under 18' age range.
df_processed = df.loc[df['age_range'].ne('Under 18')]

In [14]:
df_processed.groupby('worker_id').size().describe()

count    17275.000000
mean        31.142113
std         20.160532
min         20.000000
25%         20.000000
50%         20.000000
75%         40.000000
max        240.000000
dtype: float64

In [15]:
# How many comments have fewer than 5 annotations left in `df_processed`?
# (distribution of the number of ratings per comment)
df_processed.groupby('comment_id').size().value_counts()

5    107500
4       120
dtype: int64

This affects only a small fraction of comments (120 of 107,620), so we accept it. Also, no annotator has fewer than 20 annotations.

## Random Sample

In [16]:
def create_sample(
        df,
        n_participants,
        do_extend=True,
        random_state=3206589348
    ):
    """Sample comments until their annotators number at least ``n_participants``.

    The ratings in ``df`` are first collapsed to one row per ``comment_id``
    (first ``comment`` text, list of ``worker_id`` values, list of ``toxic``
    values). Comments are then drawn one at a time, without replacement,
    until the union of their annotators contains at least ``n_participants``
    distinct workers.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format ratings with the columns ``comment_id``, ``comment``,
        ``worker_id`` and ``toxic``.
    n_participants : int
        Minimum number of distinct annotators the drawn comments must cover.
        Because whole comments are drawn, the final number can be larger.
    do_extend : bool, default True
        If true, additionally include every comment that was not drawn but
        whose annotators are all already covered by the drawn comments.
    random_state : int, numpy.random.RandomState or numpy.random.Generator
        Passed unchanged to ``DataFrame.sample`` on every draw. With an int
        (the default) the generator is therefore re-seeded for each draw;
        this is the original behaviour and is kept so existing samples stay
        reproducible. Pass a ``RandomState``/``Generator`` instance to use a
        single random stream that advances between draws.

    Returns
    -------
    pandas.DataFrame
        One row per selected comment with the columns ``comment_id``,
        ``comment``, ``worker_id`` (list) and ``toxic`` (list). Drawn
        comments come first in draw order, followed by the extension rows.
        Empty (same columns) when ``n_participants`` is not positive.

    Raises
    ------
    ValueError
        If all comments have been drawn and fewer than ``n_participants``
        distinct annotators exist.
    """
    examples = (
        df[['comment_id', 'comment', 'worker_id', 'toxic']]
        .groupby(['comment_id'])
        .agg({
            'comment': 'first',
            'worker_id': list,
            'toxic': list
        })
        .reset_index()
    )

    # Nothing to cover: return an empty frame instead of failing in pd.concat.
    if n_participants <= 0:
        return examples.iloc[0:0]

    drawn_rows = []
    drawn_index = []
    participants = set()

    while len(participants) < n_participants:
        pool = examples[~examples.index.isin(drawn_index)]
        if pool.empty:
            # Without this guard `.sample()` fails with an opaque numpy error.
            raise ValueError(
                f"Cannot cover {n_participants} participants: all "
                f"{len(examples)} comments were drawn and they only contain "
                f"{len(participants)} distinct annotators."
            )
        example = pool.sample(n=1, random_state=random_state)
        drawn_index.append(example.index[0])
        participants.update(example['worker_id'].iloc[0])
        drawn_rows.append(example)

    sample = pd.concat(drawn_rows)

    if do_extend:
        # Extend the sample with comments that are annotated only by
        # annotators who are already part of the sample.
        not_yet_sample = examples[~examples.index.isin(sample.index)]
        fully_covered = not_yet_sample['worker_id'] \
            .apply(lambda workers: participants.issuperset(workers)) \
            .astype(bool)  # keep a boolean mask even when nothing is left
        sample = pd.concat([sample, not_yet_sample[fully_covered]])

    return sample

In [17]:
sample = create_sample(df_processed, n_participants=5000)

In [19]:
# Unpack the per-comment lists so each row holds a single (worker_id, toxic)
# pair, then expose `comment_id` under the name `id` with a fresh 0..n-1 index.
exploded_sample = (
    sample
    .explode(['worker_id', 'toxic'])
    .rename(columns={'comment_id': 'id'})
    .reset_index(drop=True)
)

In [25]:
# Persist the exploded sample as CSV; index=False leaves the row index out of the file.
exploded_sample.to_csv(SAMPLE_PATH, index=False)