In [1]:
import numpy as np
import pandas as pd
from utils import (
    calculate_qscore,
    calculate_qscore_by_demographic,
)

# Load data

In [2]:
dataset_file = "data/labels/processed/global_mapped_cleaned.csv"
raw_df = pd.read_csv(dataset_file, index_col=0)
countries = raw_df["Country"].unique()

# Calculate Q-scores

In [PP2](https://arxiv.org/abs/1608.01769), page 6, the max comparison across all indicators are:
- min 2.40
- avg 4.05, however on page 7 they state 3.35
- max 7.67 

# Gender

In [4]:
demographic = "gender"
q_scores_demographic_df = pd.DataFrame(columns=['country', demographic, 'Image', 'Question', 'Score', 'Num_comparisons'])
raw_df[demographic].unique()

array(['female', 'male'], dtype=object)

In [5]:
# all
all_q_scores_dict = calculate_qscore_by_demographic(
        raw_df, demographic, min_filter=4
    )
country = 'All'
for d in all_q_scores_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_q_scores_dict[d]
    aux_df['country'] = country
    aux_df[demographic] = d
    # update main df
    q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])
q_scores_demographic_df

  q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])


Unnamed: 0,country,gender,Image,Question,Score,Num_comparisons
0,All,female,203,safe,4.806999,14
1,All,female,305,safe,4.801644,17
2,All,female,276,safe,4.049453,16
3,All,female,45,safe,5.497940,17
4,All,female,380,safe,6.625199,13
...,...,...,...,...,...,...
3995,All,male,386,green,4.047456,11
3996,All,male,257,green,4.145201,18
3997,All,male,107,green,5.358767,11
3998,All,male,25,green,5.051297,11


In [6]:
# for each country
countries_q_scores_dict = {}
for c in countries:
    country_df = raw_df[raw_df["Country"] == c]
    countries_q_scores_dict[c] = calculate_qscore_by_demographic(
        country_df, demographic, min_filter=4
    )

    for d in countries_q_scores_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_q_scores_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic] = d
        # update main df
        q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])
q_scores_demographic_df

Unnamed: 0,country,gender,Image,Question,Score,Num_comparisons
0,All,female,203,safe,4.806999,14
1,All,female,305,safe,4.801644,17
2,All,female,276,safe,4.049453,16
3,All,female,45,safe,5.497940,17
4,All,female,380,safe,6.625199,13
...,...,...,...,...,...,...
3976,Netherlands,female,185,green,7.500000,4
3982,Netherlands,female,9,green,8.412698,4
3984,Netherlands,female,364,green,3.166667,4
3986,Netherlands,female,380,green,3.166667,4


In [7]:
# save dataframe to csv
q_scores_demographic_df.to_csv(f"data/labels/processed/{demographic}_qscores.csv", index=False)


# Age

In [8]:
demographic_1 = "age_group"
demographic_2 = "age_group_2"
q_scores_demographic_df = pd.DataFrame(columns=['country', demographic_1, 'Image', 'Question', 'Score', 'Num_comparisons'])
print(raw_df[demographic_1].unique())
print(raw_df[demographic_2].unique())

['40-49' '30-39' 'Above 50' '21-29']
['40+' '21-39']


## > 2 groups

In [9]:
# all
all_q_scores_dict = calculate_qscore_by_demographic(
        raw_df, demographic_1, min_filter=4
    )
country = 'All'

for d in all_q_scores_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_q_scores_dict[d]
    aux_df['country'] = country
    aux_df[demographic_1] = d
    # update main df
    q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])
q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()

  q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])


Unnamed: 0,country,Question,age_group,Image
0,All,beautiful,21-29,300
1,All,beautiful,30-39,296
2,All,beautiful,40-49,268
3,All,beautiful,Above 50,400
4,All,boring,21-29,300
5,All,boring,30-39,297
6,All,boring,40-49,274
7,All,boring,Above 50,398
8,All,cycle,21-29,273
9,All,cycle,30-39,286


In [10]:
# for each country
countries_q_scores_dict = {}
for c in countries:
    country_df = raw_df[raw_df["Country"] == c]
    countries_q_scores_dict[c] = calculate_qscore_by_demographic(
        country_df, demographic_1, min_filter=4 
    )

    for d in countries_q_scores_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_q_scores_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic_1] = d
        # update main df
        q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])

q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()

Unnamed: 0,country,Question,age_group,Image
0,All,beautiful,21-29,300
1,All,beautiful,30-39,296
2,All,beautiful,40-49,268
3,All,beautiful,Above 50,400
4,All,boring,21-29,300
...,...,...,...,...
233,USA,walk,Above 50,83
234,USA,wealthy,21-29,2
235,USA,wealthy,30-39,3
236,USA,wealthy,40-49,5


In [11]:
# save dataframe to csv
q_scores_demographic_df.to_csv(f"data/labels/processed/{demographic_1}_qscores.csv", index=False)


## 2 groups

In [12]:
q_scores_demographic_df = pd.DataFrame(columns=['country', demographic_2, 'Image', 'Question', 'Score', 'Num_comparisons'])

In [13]:
# all
all_q_scores_dict = calculate_qscore_by_demographic(
        raw_df, demographic_2, min_filter=4
    )
country = 'All'

for d in all_q_scores_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_q_scores_dict[d]
    aux_df['country'] = country
    aux_df[demographic_2] = d
    # update main df
    q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])
q_scores_demographic_df.groupby(["country", "Question", demographic_2])["Image"].nunique().reset_index()

  q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])


Unnamed: 0,country,Question,age_group_2,Image
0,All,beautiful,21-39,395
1,All,beautiful,40+,400
2,All,boring,21-39,396
3,All,boring,40+,400
4,All,cycle,21-39,392
5,All,cycle,40+,400
6,All,depressing,21-39,395
7,All,depressing,40+,400
8,All,green,21-39,398
9,All,green,40+,400


In [14]:
# for each country
countries_q_scores_dict = {}
for c in countries:
    country_df = raw_df[raw_df["Country"] == c]
    countries_q_scores_dict[c] = calculate_qscore_by_demographic(
        country_df, demographic_2, min_filter=4 
    )

    for d in countries_q_scores_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_q_scores_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic_2] = d
        # update main df
        q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])

q_scores_demographic_df.groupby(["country", "Question", demographic_2])["Image"].nunique().reset_index()

Unnamed: 0,country,Question,age_group_2,Image
0,All,beautiful,21-39,395
1,All,beautiful,40+,400
2,All,boring,21-39,396
3,All,boring,40+,400
4,All,cycle,21-39,392
...,...,...,...,...
115,USA,safe,40+,165
116,USA,walk,21-39,42
117,USA,walk,40+,170
118,USA,wealthy,21-39,38


In [15]:
# save dataframe to csv
q_scores_demographic_df.to_csv(f"data/labels/processed/{demographic_2}_qscores.csv", index=False)


# AHI

In [16]:
demographic_1 = "ahi_2_remapped"
demographic_2 = "ahi_3_remapped"
print(raw_df["ahi_2"].unique())
print(raw_df["ahi_3"].unique())

['SGD$0 - SGD$99,999' 'SGD$100,000 - SGD$999,999' 'SGD$0 - SGD$9,999'
 '4,000,000 NGN & below' 'Prefer not to answer' '4,000,001 NGN & above'
 'USD$0 - USD$99,999' 'USD$100,000 - USD$999,999' '40,400,001 CLP & above'
 '48,000 euros & above' '40,400,000 CLP & below' '47,999 euros & below']
['SGD$45,000 - SGD$99,999' 'SGD$0 - SGD$44,999'
 'SGD$100,000 - SGD$999,999' '2,000,001 NGN-4,000,000 NGN'
 'Prefer not to answer' '2,000,000 NGN & below' '4,000,001 NGN & above'
 'USD$50,000 - USD$99,999' 'USD$0 - USD$49,999'
 'USD$100,000 - USD$999,999' '60,400,001 CLP & above'
 '30,000 euros-81,999 euros' '20,400,001 CLP-60,400,000 CLP'
 '20,400,000 CLP & below' '82,000 euros & above' '29,999 euros & below']


## Remapping

In [17]:
two_df = raw_df.copy()
two_df["ahi_2_remapped"] = two_df["ahi_2"].map(
    {
        "40,400,000 CLP & below": "below",
        "40,400,001 CLP & above": "above",
        
        "47,999 euros & below": "below",
        "48,000 euros & above": "above",
        
        "4,000,000 NGN & below": "below",
        "4,000,001 NGN & above": "above",
        
        "SGD$0 - SGD$99,999": "below",
        "SGD$100,000 - SGD$999,999": "above",        
        
        "USD$0 - USD$99,999": "below",
        "USD$100,000 - USD$999,999": "above",
        
        "Prefer not to answer": "Prefer not to answer",
    }
)
two_df = two_df[two_df['ahi_2_remapped'].isin(['above', 'below'])]
two_df.groupby(["Country", "ahi_2_remapped"])["Respondent"].nunique().reset_index()

Unnamed: 0,Country,ahi_2_remapped,Respondent
0,Chile,above,37
1,Chile,below,151
2,Netherlands,above,95
3,Netherlands,below,88
4,Nigeria,above,60
5,Nigeria,below,134
6,Singapore,above,85
7,Singapore,below,93
8,USA,above,92
9,USA,below,105


In [20]:
three_df = raw_df.copy()
three_df["ahi_3_remapped"] = raw_df["ahi_3"].map(
    {
        "20,400,000 CLP & below": "below",
        "20,400,001 CLP-60,400,000 CLP": "middle",
        "60,400,001 CLP & above": "above",
        
        "29,999 euros & below": "below",
        "30,000 euros-81,999 euros": "middle",
        "82,000 euros & above": "above",
        
        "2,000,000 NGN & below": "below",
        "2,000,001 NGN-4,000,000 NGN": "middle",
        "4,000,001 NGN & above": "above",
        
        "SGD$0 - SGD$44,999": "below",
        "SGD$45,000 - SGD$99,999": "middle",
        "SGD$100,000 - SGD$999,999": "above",
        
        "USD$0 - USD$49,999": "below",
        "USD$50,000 - USD$99,999": "middle",
        "USD$100,000 - USD$999,999": "above",
        
        "Prefer not to answer": "Prefer not to answer",
    }
)
aux_three_df = three_df.copy()
three_df = three_df[three_df['ahi_3_remapped'].isin(["above", "middle", "below"])]
three_df.groupby(["Country","ahi_3_remapped"])["Respondent"].nunique().reset_index()

Unnamed: 0,Country,ahi_3_remapped,Respondent
0,Chile,above,19
1,Chile,below,119
2,Chile,middle,50
3,Netherlands,above,39
4,Netherlands,below,43
5,Netherlands,middle,101
6,Nigeria,above,60
7,Nigeria,below,66
8,Nigeria,middle,68
9,Singapore,above,85


In [49]:
# count of removed ratings
removed_three_df = aux_three_df[aux_three_df['ahi_3_remapped'].isin(["Prefer not to answer"])]
print(removed_three_df.groupby(["Country", "Question"])["Left_image"].nunique().reset_index())#name="unique_count")#["unique_count"].nunique())
average_per_country = (
    removed_three_df
    .groupby(["Country", "Question"])["Left_image"]
    .nunique()
    .reset_index(name="unique_count")
    .groupby(["Country"])["unique_count"]
    .mean()
    .reset_index(name="average_unique_left_images_per_question")
)
average_per_country

        Country     Question  Left_image
0         Chile    beautiful          54
1         Chile       boring          56
2         Chile        cycle          54
3         Chile   depressing          57
4         Chile        green          55
5         Chile  live nearby          52
6         Chile       lively          58
7         Chile         safe          51
8         Chile         walk          56
9         Chile      wealthy          54
10  Netherlands    beautiful          81
11  Netherlands       boring          74
12  Netherlands        cycle          77
13  Netherlands   depressing          72
14  Netherlands        green          75
15  Netherlands  live nearby          79
16  Netherlands       lively          80
17  Netherlands         safe          76
18  Netherlands         walk          72
19  Netherlands      wealthy          78
20      Nigeria    beautiful          30
21      Nigeria       boring          30
22      Nigeria        cycle          30
23      Nigeria 

Unnamed: 0,Country,average_unique_left_images_per_question
0,Chile,54.7
1,Netherlands,76.4
2,Nigeria,29.0
3,Singapore,5.0
4,USA,14.7


## 2 groups

In [29]:
q_scores_demographic_df = pd.DataFrame(columns=['country', demographic_1, 'Image', 'Question', 'Score', 'Num_comparisons'])

In [30]:
# all
all_q_scores_dict = calculate_qscore_by_demographic(
        two_df, demographic_1, min_filter=4
    )
country = 'All'

for d in all_q_scores_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_q_scores_dict[d]
    aux_df['country'] = country
    aux_df[demographic_1] = d
    # update main df
    q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])
    
q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()

  q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])


Unnamed: 0,country,Question,ahi_2_remapped,Image
0,All,beautiful,above,397
1,All,beautiful,below,400
2,All,boring,above,394
3,All,boring,below,400
4,All,cycle,above,387
5,All,cycle,below,399
6,All,depressing,above,395
7,All,depressing,below,398
8,All,green,above,391
9,All,green,below,400


In [31]:
# for each country
countries_q_scores_dict = {}
for c in countries:
    country_df = two_df[two_df["Country"] == c]
    countries_q_scores_dict[c] = calculate_qscore_by_demographic(
        country_df, demographic_1, min_filter=4 
    )

    for d in countries_q_scores_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_q_scores_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic_1] = d
        # update main df
        q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])

q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index().sort_values(by="Image", ascending=False)

Unnamed: 0,country,Question,ahi_2_remapped,Image
15,All,safe,below,400
11,All,live nearby,below,400
1,All,beautiful,below,400
3,All,boring,below,400
19,All,wealthy,below,400
...,...,...,...,...
36,Chile,walk,above,6
32,Chile,lively,above,5
28,Chile,green,above,4
20,Chile,beautiful,above,4


In [32]:
# save dataframe to csv
q_scores_demographic_df.to_csv(f"data/labels/processed/{demographic_1}_qscores.csv", index=False)


## 3 groups

In [33]:
q_scores_demographic_df = pd.DataFrame(columns=['country', demographic_2, 'Image', 'Question', 'Score', 'Num_comparisons'])

In [34]:
# all
all_q_scores_dict = calculate_qscore_by_demographic(
        three_df, demographic_2, min_filter=4
    )
country = 'All'

for d in all_q_scores_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_q_scores_dict[d]
    aux_df['country'] = country
    aux_df[demographic_2] = d
    # update main df
    q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])
q_scores_demographic_df.groupby(["country", "Question", demographic_2])["Image"].nunique().reset_index()

  q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])


Unnamed: 0,country,Question,ahi_3_remapped,Image
0,All,beautiful,above,376
1,All,beautiful,below,392
2,All,beautiful,middle,381
3,All,boring,above,379
4,All,boring,below,387
5,All,boring,middle,387
6,All,cycle,above,362
7,All,cycle,below,377
8,All,cycle,middle,378
9,All,depressing,above,376


In [35]:
# for each country
countries_q_scores_dict = {}
for c in countries:
    country_df = three_df[three_df["Country"] == c]
    countries_q_scores_dict[c] = calculate_qscore_by_demographic(
        country_df, demographic_2, min_filter=4 
    )

    for d in countries_q_scores_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_q_scores_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic_2] = d
        # update main df
        q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])

q_scores_demographic_df.groupby(["country", "Question", demographic_2])["Image"].nunique().reset_index()

Unnamed: 0,country,Question,ahi_3_remapped,Image
0,All,beautiful,above,376
1,All,beautiful,below,392
2,All,beautiful,middle,381
3,All,boring,above,379
4,All,boring,below,387
...,...,...,...,...
171,USA,walk,below,17
172,USA,walk,middle,17
173,USA,wealthy,above,84
174,USA,wealthy,below,19


In [36]:
# save dataframe to csv
q_scores_demographic_df.to_csv(f"data/labels/processed/{demographic_2}_qscores.csv", index=False)


# Education level

In [54]:
demographic_1 = "education_level"
q_scores_demographic_df = pd.DataFrame(columns=['country', demographic_1, 'Image', 'Question', 'Score', 'Num_comparisons'])
raw_df.groupby("education_level")["Respondent"].nunique().reset_index()

Unnamed: 0,education_level,Respondent
0,Bachelors degree,380
1,College or Vocational training or Diploma,229
2,Doctorate or professional degree,38
3,Less than Secondary/High school,11
4,Masters degree,122
5,Other,11
6,Secondary/High school graduate or equivalent,209


In [55]:
raw_df.groupby(["Country", "education_level"])["Respondent"].nunique().reset_index()

Unnamed: 0,Country,education_level,Respondent
0,Chile,Bachelors degree,42
1,Chile,College or Vocational training or Diploma,71
2,Chile,Doctorate or professional degree,9
3,Chile,Less than Secondary/High school,3
4,Chile,Masters degree,4
5,Chile,Other,7
6,Chile,Secondary/High school graduate or equivalent,64
7,Netherlands,Bachelors degree,52
8,Netherlands,College or Vocational training or Diploma,43
9,Netherlands,Doctorate or professional degree,9


## filtering

In [56]:
# remove Other and Less than Secondary/High school
filtered_df = raw_df.copy()
filtered_df = filtered_df[~filtered_df[demographic_1].isin(['Other', 'Less than Secondary/High school'])]
filtered_df.groupby([demographic_1])["Respondent"].nunique().reset_index()

Unnamed: 0,education_level,Respondent
0,Bachelors degree,380
1,College or Vocational training or Diploma,229
2,Doctorate or professional degree,38
3,Masters degree,122
4,Secondary/High school graduate or equivalent,209


In [57]:
filtered_df.groupby(["Country", demographic_1])["Respondent"].nunique().reset_index()

Unnamed: 0,Country,education_level,Respondent
0,Chile,Bachelors degree,42
1,Chile,College or Vocational training or Diploma,71
2,Chile,Doctorate or professional degree,9
3,Chile,Masters degree,4
4,Chile,Secondary/High school graduate or equivalent,64
5,Netherlands,Bachelors degree,52
6,Netherlands,College or Vocational training or Diploma,43
7,Netherlands,Doctorate or professional degree,9
8,Netherlands,Masters degree,24
9,Netherlands,Secondary/High school graduate or equivalent,68


## remapping

In [58]:
remapped_df = filtered_df.copy()
remapped_df["education_level"] = remapped_df["education_level"].replace({
    "Masters degree": "Postgraduate degree", 
    "Doctorate or professional degree": "Postgraduate degree"
})

remapped_df.groupby(["Country", demographic_1])["Respondent"].nunique().reset_index()

Unnamed: 0,Country,education_level,Respondent
0,Chile,Bachelors degree,42
1,Chile,College or Vocational training or Diploma,71
2,Chile,Postgraduate degree,13
3,Chile,Secondary/High school graduate or equivalent,64
4,Netherlands,Bachelors degree,52
5,Netherlands,College or Vocational training or Diploma,43
6,Netherlands,Postgraduate degree,33
7,Netherlands,Secondary/High school graduate or equivalent,68
8,Nigeria,Bachelors degree,108
9,Nigeria,College or Vocational training or Diploma,31


In [59]:
# all
all_q_scores_dict = calculate_qscore_by_demographic(
        remapped_df, demographic_1, min_filter=4
    )
country = 'All'

for d in all_q_scores_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_q_scores_dict[d]
    aux_df['country'] = country
    aux_df[demographic_1] = d
    # update main df
    q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])
q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()

  q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])


Unnamed: 0,country,Question,education_level,Image
0,All,beautiful,Bachelors degree,393
1,All,beautiful,College or Vocational training or Diploma,338
2,All,beautiful,Postgraduate degree,227
3,All,beautiful,Secondary/High school graduate or equivalent,299
4,All,boring,Bachelors degree,393
5,All,boring,College or Vocational training or Diploma,333
6,All,boring,Postgraduate degree,225
7,All,boring,Secondary/High school graduate or equivalent,308
8,All,cycle,Bachelors degree,387
9,All,cycle,College or Vocational training or Diploma,313


In [60]:
# for each country
countries_q_scores_dict = {}
for c in countries:
    country_df = remapped_df[remapped_df["Country"] == c]
    countries_q_scores_dict[c] = calculate_qscore_by_demographic(
        country_df, demographic_1, min_filter=4 
    )

    for d in countries_q_scores_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_q_scores_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic_1] = d
        # update main df
        q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])

q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()

Unnamed: 0,country,Question,education_level,Image
0,All,beautiful,Bachelors degree,393
1,All,beautiful,College or Vocational training or Diploma,338
2,All,beautiful,Postgraduate degree,227
3,All,beautiful,Secondary/High school graduate or equivalent,299
4,All,boring,Bachelors degree,393
...,...,...,...,...
217,USA,walk,Secondary/High school graduate or equivalent,2
218,USA,wealthy,Bachelors degree,60
219,USA,wealthy,College or Vocational training or Diploma,11
220,USA,wealthy,Postgraduate degree,14


In [61]:
# save dataframe to csv
q_scores_demographic_df.to_csv(f"data/labels/processed/{demographic_1}_qscores.csv", index=False)

# Race & etnicity

In [37]:
demographic_1 = "race_ethnicity"
q_scores_demographic_df = pd.DataFrame(columns=['country', demographic_1, 'Image', 'Question', 'Score', 'Num_comparisons'])
print(raw_df[demographic_1].unique())

['Asian or Pacific Islander' 'White or Caucasian'
 'A race/ethnicity not listed here' 'Black or African American'
 'Middle Eastern or  North African' 'Hispanic or Latino'
 'Native American or Alaska Native']


In [38]:
raw_df.groupby(demographic_1)["Respondent"].nunique().reset_index()

Unnamed: 0,race_ethnicity,Respondent
0,A race/ethnicity not listed here,69
1,Asian or Pacific Islander,231
2,Black or African American,184
3,Hispanic or Latino,166
4,Middle Eastern or North African,32
5,Native American or Alaska Native,5
6,White or Caucasian,313


In [40]:
# remove categories with low numbers
filtered_df = raw_df.copy()
filtered_df = filtered_df[~filtered_df[demographic_1].isin([
    "A race/ethnicity not listed here"])]
#     "Native American or Alaska Native",
#     "Middle Eastern or  North African"])] # has an extra whitespace
filtered_df.groupby(demographic_1)["Respondent"].nunique().reset_index()

Unnamed: 0,race_ethnicity,Respondent
0,Asian or Pacific Islander,231
1,Black or African American,184
2,Hispanic or Latino,166
3,Middle Eastern or North African,32
4,Native American or Alaska Native,5
5,White or Caucasian,313


In [41]:
# all
all_q_scores_dict = calculate_qscore_by_demographic(
        filtered_df, demographic_1, min_filter=4
    )
country = 'All'

for d in all_q_scores_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_q_scores_dict[d]
    aux_df['country'] = country
    aux_df[demographic_1] = d
    # update main df
    q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])
q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()

  q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])


Unnamed: 0,country,Question,race_ethnicity,Image
0,All,beautiful,Asian or Pacific Islander,331
1,All,beautiful,Black or African American,281
2,All,beautiful,Hispanic or Latino,245
3,All,beautiful,Middle Eastern or North African,5
4,All,beautiful,White or Caucasian,385
5,All,boring,Asian or Pacific Islander,333
6,All,boring,Black or African American,274
7,All,boring,Hispanic or Latino,246
8,All,boring,Middle Eastern or North African,3
9,All,boring,White or Caucasian,383


In [42]:
# for each country
countries_q_scores_dict = {}
for c in countries:
    country_df = filtered_df[filtered_df["Country"] == c]
    countries_q_scores_dict[c] = calculate_qscore_by_demographic(
        country_df, demographic_1, min_filter=4 
    )

    for d in countries_q_scores_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_q_scores_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic_1] = d
        # update main df
        q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])

q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()

Unnamed: 0,country,Question,race_ethnicity,Image
0,All,beautiful,Asian or Pacific Islander,331
1,All,beautiful,Black or African American,281
2,All,beautiful,Hispanic or Latino,245
3,All,beautiful,Middle Eastern or North African,5
4,All,beautiful,White or Caucasian,385
...,...,...,...,...
129,USA,safe,White or Caucasian,133
130,USA,walk,Asian or Pacific Islander,16
131,USA,walk,White or Caucasian,124
132,USA,wealthy,Asian or Pacific Islander,9


In [43]:
# save dataframe to csv
q_scores_demographic_df.to_csv(f"data/labels/processed/{demographic_1}_qscores.csv", index=False)

# Demographic interaction

## Gender, Age, AHI

In [2]:
# custom cleaned dataset
raw_df = pd.read_csv("data/labels/processed/global_mapped_cleaned_with_ahiremapped.csv", index_col=0)
countries = raw_df["Country"].unique()

In [3]:
demographic = 'gender_age_group_ahi_3_remapped'
mixed_demo_df = raw_df.copy()
mixed_demo_df =  mixed_demo_df[mixed_demo_df['ahi_3_remapped'] != "Prefer not to answer"]
mixed_demo_df[demographic] = mixed_demo_df['gender'] + " x " + mixed_demo_df['age_group'] + " x " + mixed_demo_df["ahi_3_remapped"]
print(len(mixed_demo_df[demographic].unique()))
mixed_demo_df[demographic].unique()

24


array(['female x 40-49 x middle', 'female x 40-49 x bottom',
       'female x 30-39 x bottom', 'male x Above 50 x upper',
       'male x Above 50 x middle', 'male x 40-49 x middle',
       'female x 21-29 x bottom', 'female x 40-49 x upper',
       'male x 30-39 x bottom', 'female x 21-29 x middle',
       'male x 40-49 x bottom', 'male x 30-39 x middle',
       'male x 30-39 x upper', 'male x 21-29 x upper',
       'female x Above 50 x upper', 'male x 40-49 x upper',
       'female x 30-39 x middle', 'female x 30-39 x upper',
       'male x 21-29 x bottom', 'female x Above 50 x middle',
       'female x 21-29 x upper', 'male x 21-29 x middle',
       'male x Above 50 x bottom', 'female x Above 50 x bottom'],
      dtype=object)

In [4]:
q_scores_gender_age_group_ahi_df = pd.DataFrame(columns=['country', demographic, 'Image', 'Question', 'Score', 'Num_comparisons'])

In [5]:
# all
all_q_scores_dict = calculate_qscore_by_demographic(
        mixed_demo_df, demographic, min_filter=4
    )
country = 'All'
for d in all_q_scores_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_q_scores_dict[d]
    aux_df['country'] = country
    aux_df[demographic] = d
    # update main df
    q_scores_gender_age_group_ahi_df = pd.concat([q_scores_gender_age_group_ahi_df, aux_df])
q_scores_gender_age_group_ahi_df

  q_scores_gender_age_group_ahi_df = pd.concat([q_scores_gender_age_group_ahi_df, aux_df])


Unnamed: 0,country,gender_age_group_ahi_3_remapped,Image,Question,Score,Num_comparisons
1,All,female x 40-49 x middle,305,safe,5.833333,4
3,All,female x 40-49 x middle,45,safe,7.916667,4
48,All,female x 40-49 x middle,129,safe,2.916667,4
53,All,female x 40-49 x middle,310,safe,2.500000,4
120,All,female x 40-49 x middle,41,safe,5.740741,6
...,...,...,...,...,...,...
3962,All,female x Above 50 x bottom,55,green,4.583333,4
3965,All,female x Above 50 x bottom,306,green,6.555556,6
3972,All,female x Above 50 x bottom,2,green,2.013889,4
3977,All,female x Above 50 x bottom,42,green,2.500000,4


In [6]:
# for each country
countries_q_scores_dict = {}
for c in countries:
    country_df = mixed_demo_df[mixed_demo_df["Country"] == c]
    countries_q_scores_dict[c] = calculate_qscore_by_demographic(
        country_df, demographic, min_filter=4
    )

    for d in countries_q_scores_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_q_scores_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic] = d
        # update main df
        q_scores_gender_age_group_ahi_df = pd.concat([q_scores_gender_age_group_ahi_df, aux_df])
q_scores_gender_age_group_ahi_df

Unnamed: 0,country,gender_age_group_ahi_3_remapped,Image,Question,Score,Num_comparisons
1,All,female x 40-49 x middle,305,safe,5.833333,4
3,All,female x 40-49 x middle,45,safe,7.916667,4
48,All,female x 40-49 x middle,129,safe,2.916667,4
53,All,female x 40-49 x middle,310,safe,2.500000,4
120,All,female x 40-49 x middle,41,safe,5.740741,6
...,...,...,...,...,...,...
1902,Netherlands,female x Above 50 x bottom,317,live nearby,6.888889,5
1498,Netherlands,male x Above 50 x bottom,62,cycle,5.000000,4
1752,Netherlands,male x Above 50 x upper,326,cycle,7.777778,4
1571,Netherlands,female x 30-39 x upper,366,cycle,2.916667,4


In [7]:
# save dataframe to csv
q_scores_gender_age_group_ahi_df.to_csv(f"data/labels/processed/{demographic}_qscores.csv", index=False)


## Gender, Age group

In [8]:
demographic = 'gender_age_group'
mixed_demo_df = raw_df.copy()
mixed_demo_df[demographic] = mixed_demo_df['gender'] + " x " + mixed_demo_df['age_group']
print(len(mixed_demo_df[demographic].unique()))
mixed_demo_df[demographic].unique()

8


array(['female x 40-49', 'female x 30-39', 'male x Above 50',
       'male x 40-49', 'female x 21-29', 'male x 30-39', 'male x 21-29',
       'female x Above 50'], dtype=object)

In [9]:
q_scores_gender_age_group_df = pd.DataFrame(columns=['country', demographic, 'Image', 'Question', 'Score', 'Num_comparisons'])

In [10]:
# all
all_q_scores_dict = calculate_qscore_by_demographic(
        mixed_demo_df, demographic, min_filter=4
    )
country = 'All'
for d in all_q_scores_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_q_scores_dict[d]
    aux_df['country'] = country
    aux_df[demographic] = d
    # update main df
    q_scores_gender_age_group_df = pd.concat([q_scores_gender_age_group_df, aux_df])
q_scores_gender_age_group_df

  q_scores_gender_age_group_df = pd.concat([q_scores_gender_age_group_df, aux_df])


Unnamed: 0,country,gender_age_group,Image,Question,Score,Num_comparisons
1,All,female x 40-49,305,safe,5.277778,4
3,All,female x 40-49,45,safe,7.916667,5
15,All,female x 40-49,120,safe,5.416667,4
18,All,female x 40-49,42,safe,3.333333,4
29,All,female x 40-49,35,safe,3.333333,5
...,...,...,...,...,...,...
3993,All,female x Above 50,205,green,3.333333,5
3994,All,female x Above 50,65,green,1.269841,4
3995,All,female x Above 50,126,green,5.904762,7
3996,All,female x Above 50,19,green,5.000000,4


In [11]:
# for each country
countries_q_scores_dict = {}
for c in countries:
    country_df = mixed_demo_df[mixed_demo_df["Country"] == c]
    countries_q_scores_dict[c] = calculate_qscore_by_demographic(
        country_df, demographic, min_filter=4
    )

    for d in countries_q_scores_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_q_scores_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic] = d
        # update main df
        q_scores_gender_age_group_df = pd.concat([q_scores_gender_age_group_df, aux_df])
q_scores_gender_age_group_df

Unnamed: 0,country,gender_age_group,Image,Question,Score,Num_comparisons
1,All,female x 40-49,305,safe,5.277778,4
3,All,female x 40-49,45,safe,7.916667,5
15,All,female x 40-49,120,safe,5.416667,4
18,All,female x 40-49,42,safe,3.333333,4
29,All,female x 40-49,35,safe,3.333333,5
...,...,...,...,...,...,...
2599,Netherlands,male x 21-29,386,cycle,7.500000,4
2624,Netherlands,male x 21-29,260,cycle,5.833333,4
2649,Netherlands,male x 21-29,275,cycle,4.166667,4
2364,Netherlands,male x 30-39,99,cycle,3.333333,4


In [12]:
# save dataframe to csv
q_scores_gender_age_group_df.to_csv(f"data/labels/processed/{demographic}_qscores.csv", index=False)


## Gender, AHI

In [13]:
# custom cleaned dataset
raw_df = pd.read_csv("data/labels/processed/global_mapped_cleaned_with_ahiremapped.csv", index_col=0)
countries = raw_df["Country"].unique()

In [14]:
demographic = 'gender_ahi_3_remapped'
mixed_demo_df = raw_df.copy()
mixed_demo_df =  mixed_demo_df[mixed_demo_df['ahi_3_remapped'] != "Prefer not to answer"]
mixed_demo_df[demographic] = mixed_demo_df['gender'] + " x " + mixed_demo_df["ahi_3_remapped"]
print(len(mixed_demo_df[demographic].unique()))
mixed_demo_df[demographic].unique()

6


array(['female x middle', 'female x bottom', 'male x upper',
       'male x middle', 'female x upper', 'male x bottom'], dtype=object)

In [15]:
q_scores_gender_ahi_df = pd.DataFrame(columns=['country', demographic, 'Image', 'Question', 'Score', 'Num_comparisons'])

In [16]:
# all
all_q_scores_dict = calculate_qscore_by_demographic(
        mixed_demo_df, demographic, min_filter=4
    )
country = 'All'
for d in all_q_scores_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_q_scores_dict[d]
    aux_df['country'] = country
    aux_df[demographic] = d
    # update main df
    q_scores_gender_ahi_df = pd.concat([q_scores_gender_ahi_df, aux_df])
q_scores_gender_ahi_df

  q_scores_gender_ahi_df = pd.concat([q_scores_gender_ahi_df, aux_df])


Unnamed: 0,country,gender_ahi_3_remapped,Image,Question,Score,Num_comparisons
0,All,female x middle,203,safe,6.574074,6
1,All,female x middle,305,safe,5.618056,10
2,All,female x middle,276,safe,4.000000,5
3,All,female x middle,45,safe,7.126840,7
4,All,female x middle,380,safe,6.095238,7
...,...,...,...,...,...,...
3982,All,male x bottom,239,green,6.541667,5
3983,All,male x bottom,23,green,2.166667,5
3988,All,male x bottom,165,green,6.666667,4
3989,All,male x bottom,237,green,7.777778,4


In [17]:
# for each country
countries_q_scores_dict = {}
for c in countries:
    country_df = mixed_demo_df[mixed_demo_df["Country"] == c]
    countries_q_scores_dict[c] = calculate_qscore_by_demographic(
        country_df, demographic, min_filter=4
    )

    for d in countries_q_scores_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_q_scores_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic] = d
        # update main df
        q_scores_gender_ahi_df = pd.concat([q_scores_gender_ahi_df, aux_df])
q_scores_gender_ahi_df

Unnamed: 0,country,gender_ahi_3_remapped,Image,Question,Score,Num_comparisons
0,All,female x middle,203,safe,6.574074,6
1,All,female x middle,305,safe,5.618056,10
2,All,female x middle,276,safe,4.000000,5
3,All,female x middle,45,safe,7.126840,7
4,All,female x middle,380,safe,6.095238,7
...,...,...,...,...,...,...
1020,Netherlands,female x upper,341,wealthy,8.333333,4
1729,Netherlands,female x upper,37,beautiful,1.666667,4
2269,Netherlands,female x upper,11,cycle,4.722222,4
2301,Netherlands,female x upper,132,cycle,5.833333,4


In [18]:
# save dataframe to csv
q_scores_gender_ahi_df.to_csv(f"data/labels/processed/{demographic}_qscores.csv", index=False)


## Age group, AHI

In [19]:
# custom cleaned dataset
raw_df = pd.read_csv("data/labels/processed/global_mapped_cleaned_with_ahiremapped.csv", index_col=0)
countries = raw_df["Country"].unique()

In [20]:
demographic = 'age_group_ahi_3_remapped'
mixed_demo_df = raw_df.copy()
mixed_demo_df =  mixed_demo_df[mixed_demo_df['ahi_3_remapped'] != "Prefer not to answer"]
mixed_demo_df[demographic] = mixed_demo_df['age_group'] + " x " + mixed_demo_df["ahi_3_remapped"]
print(len(mixed_demo_df[demographic].unique()))
mixed_demo_df[demographic].unique()

12


array(['40-49 x middle', '40-49 x bottom', '30-39 x bottom',
       'Above 50 x upper', 'Above 50 x middle', '21-29 x bottom',
       '40-49 x upper', '21-29 x middle', '30-39 x middle',
       '30-39 x upper', '21-29 x upper', 'Above 50 x bottom'],
      dtype=object)

In [21]:
q_scores_age_group_ahi_df = pd.DataFrame(columns=['country', demographic, 'Image', 'Question', 'Score', 'Num_comparisons'])

In [22]:
# all
all_q_scores_dict = calculate_qscore_by_demographic(
        mixed_demo_df, demographic, min_filter=4
    )
country = 'All'
for d in all_q_scores_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_q_scores_dict[d]
    aux_df['country'] = country
    aux_df[demographic] = d
    # update main df
    q_scores_age_group_ahi_df = pd.concat([q_scores_age_group_ahi_df, aux_df])
q_scores_age_group_ahi_df

  q_scores_age_group_ahi_df = pd.concat([q_scores_age_group_ahi_df, aux_df])


Unnamed: 0,country,age_group_ahi_3_remapped,Image,Question,Score,Num_comparisons
1,All,40-49 x middle,305,safe,5.833333,6
3,All,40-49 x middle,45,safe,7.916667,4
6,All,40-49 x middle,262,safe,2.916667,4
22,All,40-49 x middle,80,safe,7.083333,4
29,All,40-49 x middle,35,safe,4.722222,4
...,...,...,...,...,...,...
3988,All,Above 50 x bottom,393,green,6.666667,4
3992,All,Above 50 x bottom,121,green,7.083333,4
3994,All,Above 50 x bottom,306,green,6.733333,6
3997,All,Above 50 x bottom,42,green,3.000000,5


In [23]:
# for each country
countries_q_scores_dict = {}
for c in countries:
    country_df = mixed_demo_df[mixed_demo_df["Country"] == c]
    countries_q_scores_dict[c] = calculate_qscore_by_demographic(
        country_df, demographic, min_filter=4
    )

    for d in countries_q_scores_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_q_scores_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic] = d
        # update main df
        q_scores_age_group_ahi_df = pd.concat([q_scores_age_group_ahi_df, aux_df])
q_scores_age_group_ahi_df

Unnamed: 0,country,age_group_ahi_3_remapped,Image,Question,Score,Num_comparisons
1,All,40-49 x middle,305,safe,5.833333,6
3,All,40-49 x middle,45,safe,7.916667,4
6,All,40-49 x middle,262,safe,2.916667,4
22,All,40-49 x middle,80,safe,7.083333,4
29,All,40-49 x middle,35,safe,4.722222,4
...,...,...,...,...,...,...
2521,Netherlands,Above 50 x bottom,34,cycle,2.500000,4
2558,Netherlands,Above 50 x bottom,62,cycle,5.555556,4
2720,Netherlands,Above 50 x bottom,52,cycle,2.500000,4
2886,Netherlands,Above 50 x bottom,356,walk,5.833333,4


In [24]:
# save dataframe to csv
q_scores_age_group_ahi_df.to_csv(f"data/labels/processed/{demographic}_qscores.csv", index=False)
