In [3]:
from typing import Union
import json
import pandas as pd
import numpy as np
# Dataset file consumed by read_dataset, which parses it one JSON object per line.
DATASET_PATH = 'kumar_dataset.json'

## Explore Kumar "Final" Dataset

Uses the "final" version of the disaggregated Kumar dataset, which (1) includes all participants and (2) confirms the presence of annotators under 18, who should be removed for individual-level analysis.

In [4]:
def read_dataset(path) -> pd.DataFrame:
    """Load the line-delimited JSON dataset as one DataFrame row per rating.

    Every non-blank line of the file is parsed as a JSON object describing a
    comment together with its list of ``ratings``. Each rating yields one row
    that merges the comment-level fields with the rating's own key/value
    pairs (a rating key that collides with a comment-level field wins).

    Args:
        path: Location of the dataset file (str or path-like), containing one
            JSON object per line.

    Returns:
        A DataFrame with the columns ``comment``, ``id``, ``comment_id`` and
        ``source`` followed by one column per key found in the rating dicts.
        ``id`` is the zero-based index of the comment's line in the file
        (blank lines still advance the index).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed line lacks ``comment``, ``comment_id``,
            ``source`` or ``ratings``.
    """
    records = []
    # Be explicit about the encoding so the result does not depend on the
    # platform's default locale when comments contain non-ASCII text.
    with open(path, encoding='utf-8') as f:
        for index, line in enumerate(f):
            # Tolerate blank lines (for example a trailing newline at the end
            # of the file) instead of failing inside json.loads.
            if not line.strip():
                continue
            line_dict = json.loads(line)
            # Fields shared by every rating of this comment; built once per line.
            comment_fields = {
                'comment': line_dict['comment'],
                'id': index,
                'comment_id': line_dict['comment_id'],
                'source': line_dict['source']
            }
            for annotation in line_dict['ratings']:
                records.append({**comment_fields, **annotation})
    return pd.DataFrame(records)

In [5]:
# Load all ratings into a single long-format frame: one row per individual rating.
df = read_dataset(DATASET_PATH)

In [6]:
# Binary label derived from the Likert score: 1 when toxic_score is above 1, else 0.
df['toxic'] = df['toxic_score'].gt(1).astype('int64')

In [7]:
# (rows, columns); rows are individual ratings, not unique comments.
df.shape

(538100, 32)

**Columns**

> Comments are annotated for their perceived toxicity on a Likert-scale from "not at all toxic" (0) to "extremely toxic" (4). For comments that are "Slightly Toxic" (1) or higher, annotators provide additional labels for the type of toxicity: Insult, Profanity, Identity attack, Threat, Sexual harassment.
> In addition, metadata about the annotator is included with each annotation.

In [8]:
# Comment-level fields set in read_dataset, the keys carried by each rating, and the derived `toxic` flag.
df.columns

Index(['comment', 'id', 'comment_id', 'source', 'toxic_score', 'is_profane',
       'is_threat', 'is_identity_attack', 'is_insult', 'is_sexual_harassment',
       'gender', 'gender_other', 'race', 'technology_impact',
       'uses_media_social', 'uses_media_news', 'uses_media_video',
       'uses_media_forums', 'personally_seen_toxic_content',
       'personally_been_target', 'identify_as_transgender',
       'toxic_comments_problem', 'education', 'age_range', 'lgbtq_status',
       'political_affilation', 'is_parent', 'religion_important',
       'fine_to_see_online', 'remove_from_online', 'worker_id', 'toxic'],
      dtype='object')

In [9]:
# Number of ratings at each toxic_score value.
df['toxic_score'].value_counts()

0    281511
1     99973
2     71347
3     50300
4     34969
Name: toxic_score, dtype: int64

In [10]:
# Number of distinct worker_id values (presumably one per annotator).
df['worker_id'].nunique()

17280

In [11]:
# value_counts gives the number of rating rows per worker_id; describe summarizes that distribution.
df['worker_id'].value_counts().describe()

count    17280.000000
mean        31.140046
std         20.158440
min         20.000000
25%         20.000000
50%         20.000000
75%         40.000000
max        240.000000
Name: worker_id, dtype: float64

In [12]:
# `id` is the line index assigned in read_dataset, so this summarizes the number of ratings per comment line.
df['id'].value_counts().describe()

count    107620.0
mean          5.0
std           0.0
min           5.0
25%           5.0
50%           5.0
75%           5.0
max           5.0
Name: id, dtype: float64

In [13]:
# Comment length in characters, computed over rating rows: a comment is counted once per rating it received.
df['comment'].str.len().describe()

count    538100.000000
mean        112.083906
std         138.709632
min           3.000000
25%          48.000000
50%          80.000000
75%         139.000000
max        7754.000000
Name: comment, dtype: float64

### Age

In [14]:
# Counts rating rows per age_range, not distinct worker_ids.
df['age_range'].value_counts()

25 - 34              214600
35 - 44              133800
45 - 54               69800
18 - 24               62480
55 - 64               39580
65 or older           16580
Prefer not to say      1140
Under 18                120
Name: age_range, dtype: int64

In [15]:
# Most frequent age_range per worker_id. When the per-worker mode comes back as
# an ndarray rather than a single value (e.g. several values tied for most
# frequent), the worker is counted under 'Prefer not to say'.
(
    df.groupby('worker_id')['age_range']
    .agg(pd.Series.mode)
    .apply(lambda mode: mode if type(mode) is not np.ndarray else 'Prefer not to say')
    .value_counts()
)


25 - 34              6590
35 - 44              3816
18 - 24              1962
45 - 54              1933
Prefer not to say    1532
55 - 64              1038
65 or older           404
Under 18                5
Name: age_range, dtype: int64

### Gender

In [16]:
# Counts rating rows per gender, not distinct worker_ids.
df['gender'].value_counts()

Female               281020
Male                 250220
Prefer not to say      3160
Nonbinary              2940
Other                   160
Name: gender, dtype: int64

In [17]:
# Most frequent gender per worker_id. When the per-worker mode comes back as an
# ndarray rather than a single value (e.g. several values tied for most
# frequent), the worker is counted under 'Prefer not to say'.
(
    df.groupby('worker_id')['gender']
    .agg(pd.Series.mode)
    .apply(lambda mode: mode if type(mode) is not np.ndarray else 'Prefer not to say')
    .value_counts()
)

Female               8709
Male                 7394
Prefer not to say    1092
Nonbinary              80
Other                   5
Name: gender, dtype: int64

### Sexual orientation

In [18]:
# Counts rating rows per lgbtq_status, not distinct worker_ids.
df['lgbtq_status'].value_counts()

Heterosexual         442280
Bisexual              59580
Homosexual            19060
Prefer not to say     10580
Other                  4560
Name: lgbtq_status, dtype: int64

In [19]:
# Most frequent lgbtq_status per worker_id. When the per-worker mode comes back
# as an ndarray rather than a single value (e.g. several values tied for most
# frequent), the worker is counted under 'Prefer not to say'.
(
    df.groupby('worker_id')['lgbtq_status']
    .agg(pd.Series.mode)
    .apply(lambda mode: mode if type(mode) is not np.ndarray else 'Prefer not to say')
    .value_counts()
)

Heterosexual         13770
Bisexual              1797
Prefer not to say     1075
Homosexual             513
Other                  125
Name: lgbtq_status, dtype: int64

### Education

In [20]:
# Counts rating rows per education level, not distinct worker_ids.
df['education'].value_counts()

Bachelor's degree in college (4-year)                                     219420
Some college but no degree                                                108860
Master's degree                                                            81340
Associate degree in college (2-year)                                       58520
High school graduate (high school diploma or equivalent including GED)     48420
Professional degree (JD, MD)                                                8740
Doctoral degree                                                             6500
Less than high school degree                                                3120
Prefer not to say                                                           2080
Other                                                                        960
Name: education, dtype: int64

In [21]:
# Most frequent education level per worker_id. When the per-worker mode comes
# back as an ndarray rather than a single value (e.g. several values tied for
# most frequent), the worker is counted under 'Prefer not to say'.
(
    df.groupby('worker_id')['education']
    .agg(pd.Series.mode)
    .apply(lambda mode: mode if type(mode) is not np.ndarray else 'Prefer not to say')
    .value_counts()
)

Bachelor's degree in college (4-year)                                     6540
Some college but no degree                                                3222
Master's degree                                                           2383
Prefer not to say                                                         1658
Associate degree in college (2-year)                                      1609
High school graduate (high school diploma or equivalent including GED)    1360
Professional degree (JD, MD)                                               233
Doctoral degree                                                            161
Less than high school degree                                                87
Other                                                                       27
Name: education, dtype: int64