### <strong>POPQUORN</strong>
<strong>Po</strong>tato-<strong>P</strong>rolific dataset for <strong>Qu</strong>estion-Answering, <strong>O</strong>ffensiveness, text <strong>R</strong>ewriting and politeness rating with demographic <strong>N</strong>uance

[Source](https://github.com/Jiaxin-Pei/potato-prolific-dataset)
___

In [1]:
from IPython.display import display, HTML
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import krippendorff
import pandas as pd
import numpy as np
import copy
import json
import os

%matplotlib inline

In [2]:
# Location of the input data. The loading loop reads
# f'{path_to_dirs}/{task}/{file_name}' for every task, so `path_to_dirs` is the
# folder holding one sub-directory per task (path is relative to the notebook's
# working directory) and `file_name` is the CSV expected inside each of them.
path_to_dirs = '../../../data/Potato-Prolific-Dataset/dataset'
file_name = 'raw_data.csv'

In [3]:
# Collapse the age brackets into three stages of adulthood, following
# https://pubmed.ncbi.nlm.nih.gov/7203662/
# NOTE(review): '54-59' overlaps '50-54'; presumably it mirrors the label used
# in the raw data rather than being a typo for '55-59' -- confirm against the CSVs.
_brackets_by_stage = {
    'Early adulthood': ('18-24', '25-29', '30-34'),
    'Middle adulthood': ('35-39', '40-44', '45-49', '50-54', '54-59', '60-64'),
    'Late adulthood': ('>65',),
}

# Invert the table into the bracket -> stage lookup used with Series.replace.
stages_of_adulthood = {
    bracket: stage
    for stage, brackets in _brackets_by_stage.items()
    for bracket in brackets
}

In [4]:
# For every task, count how many annotators share each demographic profile
# (gender, race, age stage, education) and save the most frequent profiles
# per race to '<output_dir>/<task>_profiles.csv'.
tasks = ['offensiveness', 'question_answering', 'email_rewriting', 'politeness_rating']

# Where the per-task CSVs are written and how many profiles are kept per race.
output_dir = 'extracted_profiles'
top_n_per_race = 5

# DataFrame.to_csv does not create missing directories; without this the first
# write fails on a fresh checkout where the folder does not exist yet.
os.makedirs(output_dir, exist_ok=True)

for task in tasks:

    path_to_data = f'{path_to_dirs}/{task}/{file_name}'
    df = pd.read_csv(path_to_data)

    # Keep one row per annotator, then coarsen two of the demographic columns.
    # `assign` builds a new frame, so nothing is written into an object derived
    # from `df` (the `.loc[:, col] = ...` form on the drop_duplicates result is
    # the pattern that raises SettingWithCopyWarning).
    df_profiles = df.drop_duplicates('user_id').assign(
        # Treat 'Graduate degree' the same as 'College degree'
        education=lambda profiles: profiles['education'].replace({'Graduate degree': 'College degree'}),
        # Group age ranges into three stages of adulthood 'Early', 'Middle' and 'Late'
        age=lambda profiles: profiles['age'].replace(stages_of_adulthood),
    )

    # dropna=False keeps profiles that have a missing value in any of the keys.
    profile_counts = (
        df_profiles
        .groupby(['gender', 'race', 'age', 'education'], dropna=False)
        .size()
        .reset_index(name='count')
    )

    # Within each race, the most common profiles come first ...
    profile_counts = profile_counts.sort_values(by=['race', 'count'], ascending=[True, False])

    # ... so taking the head of each race group yields its top profiles.
    # NOTE(review): this groupby uses the default dropna=True, unlike the count
    # above, so profiles with a missing race may be left out here -- confirm
    # that this is intended.
    top_profiles_per_race = profile_counts.groupby('race').head(top_n_per_race)

    top_profiles_per_race.to_csv(f'{output_dir}/{task}_profiles.csv', index=False)