In [1]:
import pandas as pd
from utils import (
    calculate_qscore,
    calculate_trueskill,
    calculate_trueskill_by_demographic,
    calculate_qscore_by_demographic,
)


# Load dataset

In [2]:
dataset_file = "data/labels/processed/global_mapped_cleaned.csv" # full dataset
metadata_file = "data/svi/metadata.csv"
gvi_file = "data/labels/processed/all_imgs_gvi.csv"


In [3]:
raw_df = pd.read_csv(dataset_file, index_col=0)
metadata_df = pd.read_csv(metadata_file, index_col=0)
gvi_df = pd.read_csv(gvi_file, index_col=0)
countries = raw_df["Country"].unique()


In [4]:
metadata_df.head()


Unnamed: 0,uuid,city,Relabelled Name,Image number
0,dc6eee81-9513-4edb-813b-05b83b5127be,Santiago,Image_1,1
1,50811b7a-5b0f-4581-92fa-017e6ff26ba0,Santiago,Image_2,2
2,35c8af9e-5e36-4810-9824-59b9b273fbcf,Santiago,Image_3,3
3,3a439403-305b-4be7-ba43-f8ff8e75c31c,Santiago,Image_4,4
4,3cc24150-c590-4d83-883c-44855010e338,Santiago,Image_5,5


In [5]:
raw_df.head()


Unnamed: 0,Respondent,Country,gender,age_group,age_group_2,nationality,city_living,city_living_length,ahi,ahi_2,...,conscientiousness_q3_q1,neuroticism_median,neuroticism_q3_q1,openness_median,openness_q3_q1,Question,Left_image,Right_image,education_level_remapped,Score
0,4,Singapore,female,40-49,40+,Singapore,Singapore,Above 5 years,"SGD$82,000 - SGD$99,999","SGD$0 - SGD$99,999",...,,1,1.0,0,0.0,safe,203,44,college_vocational_diploma,left
1,4,Singapore,female,40-49,40+,Singapore,Singapore,Above 5 years,"SGD$82,000 - SGD$99,999","SGD$0 - SGD$99,999",...,,1,1.0,0,0.0,safe,305,382,college_vocational_diploma,left
2,4,Singapore,female,40-49,40+,Singapore,Singapore,Above 5 years,"SGD$82,000 - SGD$99,999","SGD$0 - SGD$99,999",...,,1,1.0,0,0.0,safe,276,69,college_vocational_diploma,left
3,4,Singapore,female,40-49,40+,Singapore,Singapore,Above 5 years,"SGD$82,000 - SGD$99,999","SGD$0 - SGD$99,999",...,,1,1.0,0,0.0,safe,45,278,college_vocational_diploma,left
4,4,Singapore,female,40-49,40+,Singapore,Singapore,Above 5 years,"SGD$82,000 - SGD$99,999","SGD$0 - SGD$99,999",...,,1,1.0,0,0.0,safe,380,305,college_vocational_diploma,right


# Variables

In [6]:
min_threshold = 4 #minmum number of pairwise comparisons


# Metadata + segmentation

In [7]:
metadata_seg_df = pd.merge(
    metadata_df, gvi_df, on="uuid", how="left"
)
metadata_seg_df.head()


Unnamed: 0,uuid,city,Relabelled Name,Image number,green_view_index
0,dc6eee81-9513-4edb-813b-05b83b5127be,Santiago,Image_1,1,0.334511
1,50811b7a-5b0f-4581-92fa-017e6ff26ba0,Santiago,Image_2,2,0.163249
2,35c8af9e-5e36-4810-9824-59b9b273fbcf,Santiago,Image_3,3,0.02065
3,3a439403-305b-4be7-ba43-f8ff8e75c31c,Santiago,Image_4,4,0.122725
4,3cc24150-c590-4d83-883c-44855010e338,Santiago,Image_5,5,0.293745


# Q-score and TrueSkill calculation and merging with metadata and segmentation

## No demographic subdivision

In [8]:
# single-city SVI & multi-city participant
# q-score
scores_df = calculate_qscore(raw_df)
scores_df = scores_df[scores_df['Num_comparisons'] >= min_threshold]
# trueskill
trueskill_scores_df = calculate_trueskill(raw_df, scaling=True, normal_dist=False)
trueskill_scores_df = trueskill_scores_df[trueskill_scores_df['Num_comparisons'] >= min_threshold]

merged_df = pd.merge(
    scores_df, metadata_seg_df, left_on="Image", right_on="Image number", how="left"
)

# dataframe with both scores
both_scores_df = pd.merge(
    merged_df, trueskill_scores_df[["Image", "Question", "TrueSkill_score"]], on=["Image", "Question"], how='left'
)

merged_df.loc[:, 'SVI_from'] = merged_df.loc[:, 'city']
both_scores_df.loc[:, 'SVI_from'] = merged_df.loc[:, 'city']

aux_df = merged_df.copy()
aux_df.loc[:, 'SVI_from'] = 'All'
singleSVI_multiPar = pd.concat([merged_df, aux_df], ignore_index=True)

aux_df = both_scores_df.copy()
aux_df.loc[:, 'SVI_from'] = 'All'
singleSVI_multiPar_both_scores = pd.concat([both_scores_df, aux_df], ignore_index=True)

# save
singleSVI_multiPar.to_csv(f"data/labels/processed/singleSVI_multiPar_qscores.csv", index=False)
singleSVI_multiPar_both_scores.to_csv(f"data/labels/processed/singleSVI_multiPar_qscores_trueskill.csv", index=False)
# Single city is 80  imgs per city so 400 for all cities per indicator, 400 * 10 = 4000
# combining it and adding 'all' becomes 4000 more so total of 8000 rows
singleSVI_multiPar


Unnamed: 0,Image,Question,Score,Num_comparisons,uuid,city,Relabelled Name,Image number,green_view_index,SVI_from
0,203,safe,5.321440,28,76d51909-0a1e-4a6b-b0bc-ef9e5d1781ab,Singapore,Image_203,203,0.117226,Singapore
1,305,safe,5.240802,29,94ed8a77-577f-4064-ab7f-ae55b864b674,San Francisco,Image_305,305,0.147160,San Francisco
2,276,safe,4.175803,28,8cb95370-8b07-4e81-a903-977d060fd48a,San Francisco,Image_276,276,0.139876,San Francisco
3,45,safe,4.848895,32,11afd5fb-986b-4919-97d6-e13795916ad7,Santiago,Image_45,45,0.282413,Santiago
4,380,safe,6.101784,30,b33ead6f-e7e3-4d60-befe-def6a70c5468,Amsterdam,Image_380,380,0.192555,Amsterdam
...,...,...,...,...,...,...,...,...,...,...
7995,25,green,4.592230,22,44fb3166-93d7-4606-89d4-3fb8e2b50e8d,Santiago,Image_25,25,0.111992,All
7996,74,green,3.599508,18,eba4da21-0dcc-4e1c-9b7e-8e0686f1b25f,Santiago,Image_74,74,0.140035,All
7997,295,green,5.113697,27,3ad99f11-e7e6-4733-95f1-ce9942a444ee,San Francisco,Image_295,295,0.324259,All
7998,108,green,4.561685,29,70f06427-febf-473e-93f1-e8c2033be895,Abuja,Image_108,108,0.056028,All


In [9]:
# multi-city SVI & single-city participants
min_df = pd.DataFrame()
min_df_trueskills = pd.DataFrame()

for j, c in enumerate(countries):
    # calculate the Q-score for all SVI from ONLY participants of current country
    country_df = raw_df[raw_df['Country'] == c]

    # q score
    country_df_scores = calculate_qscore(country_df)
    country_df_scores_filtered = country_df_scores[country_df_scores['Num_comparisons'] >= min_threshold].copy()
    country_df_scores_filtered.loc[:, 'participants_from'] = c
    min_df = pd.concat([min_df, country_df_scores_filtered], ignore_index=True)

    # trueskill
    country_df_trueskills = calculate_trueskill(country_df, scaling=True, normal_dist=False)
    country_df_trueskills_filtered = country_df_trueskills[country_df_trueskills['Num_comparisons'] >= min_threshold].copy()
    country_df_trueskills_filtered.loc[:, 'participants_from'] = c
    min_df_trueskills = pd.concat([min_df_trueskills, country_df_trueskills_filtered], ignore_index=True)

    # end for loop

multiSVI_singlePar = pd.merge(
    min_df, metadata_seg_df, left_on="Image", right_on="Image number", how="left"
)

multiSVI_singlePar_both_scores = pd.merge(
    multiSVI_singlePar, min_df_trueskills[["Image", "Question", "participants_from", "TrueSkill_score"]], on=["Image", "Question", "participants_from"], how="left"
)

# Multiple city single par is 400 imgs for each city so 400 x 5 = 2,000 x 10 indicators <= 20,000 rows
# Then add a comparison of alL SVI rated by all participants, which is just scores on all dataset
# qscore
aux_df_qscore = calculate_qscore(raw_df)
aux_df_qscore = aux_df_qscore[aux_df_qscore['Num_comparisons'] >= min_threshold]
aux_df_qscore = pd.merge(
    aux_df_qscore, metadata_seg_df, left_on="Image", right_on="Image number", how="left"
)
aux_df_qscore.loc[:, 'participants_from'] = 'All'
multiSVI_singlePar = pd.concat([multiSVI_singlePar, aux_df_qscore], ignore_index=True)

# trueskill
aux_df_trueskill = calculate_trueskill(raw_df, scaling=True, normal_dist=False)
aux_df_trueskill = aux_df_trueskill[aux_df_trueskill['Num_comparisons'] >= min_threshold]
aux_df_trueskill = pd.merge(
    aux_df_trueskill, metadata_seg_df, left_on="Image", right_on="Image number", how="left"
)
aux_df_trueskill.loc[:, 'participants_from'] = 'All'

aux_df_both_scores = pd.merge(
    aux_df_qscore, aux_df_trueskill[["Image", "Question", "participants_from", "TrueSkill_score"]], on=["Image", "Question", "participants_from"], how="left"
)
multiSVI_singlePar_both_scores = pd.concat([multiSVI_singlePar_both_scores, aux_df_both_scores], ignore_index=True)

# save
multiSVI_singlePar.to_csv(f"data/labels/processed/multiSVI_singlePar_qscores.csv", index=False)
multiSVI_singlePar_both_scores.to_csv(f"data/labels/processed/multiSVI_singlePar_qscores_trueskill.csv", index=False)
# total sample size should be <=20,000 + 4000 (400 imgs 10 indicators), meaning <= 24,000
multiSVI_singlePar


Unnamed: 0,Image,Question,Score,Num_comparisons,participants_from,uuid,city,Relabelled Name,Image number,green_view_index
0,203,safe,6.527778,4,Singapore,76d51909-0a1e-4a6b-b0bc-ef9e5d1781ab,Singapore,Image_203,203,0.117226
1,305,safe,7.321429,4,Singapore,94ed8a77-577f-4064-ab7f-ae55b864b674,San Francisco,Image_305,305,0.147160
2,276,safe,4.027778,8,Singapore,8cb95370-8b07-4e81-a903-977d060fd48a,San Francisco,Image_276,276,0.139876
3,45,safe,6.722222,5,Singapore,11afd5fb-986b-4919-97d6-e13795916ad7,Santiago,Image_45,45,0.282413
4,380,safe,6.329365,12,Singapore,b33ead6f-e7e3-4d60-befe-def6a70c5468,Amsterdam,Image_380,380,0.192555
...,...,...,...,...,...,...,...,...,...,...
18631,25,green,4.592230,22,All,44fb3166-93d7-4606-89d4-3fb8e2b50e8d,Santiago,Image_25,25,0.111992
18632,74,green,3.599508,18,All,eba4da21-0dcc-4e1c-9b7e-8e0686f1b25f,Santiago,Image_74,74,0.140035
18633,295,green,5.113697,27,All,3ad99f11-e7e6-4733-95f1-ce9942a444ee,San Francisco,Image_295,295,0.324259
18634,108,green,4.561685,29,All,70f06427-febf-473e-93f1-e8c2033be895,Abuja,Image_108,108,0.056028


# Age

In [10]:
demographic_1 = "age_group"
q_scores_demographic_df = pd.DataFrame(columns=['country', demographic_1, 'Image', 'Question', 'Score', 'Num_comparisons'])
trueskill_demographic_df = pd.DataFrame(columns=['country', demographic_1, 'Image', 'Question', 'Score', 'Num_comparisons'])
print(raw_df[demographic_1].unique())


['40-49' '30-39' 'Above 50' '21-29']


In [11]:
# all

# q score
all_q_scores_dict = calculate_qscore_by_demographic(
        raw_df, demographic_1, min_filter=4
    )
country = 'All'

for d in all_q_scores_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_q_scores_dict[d]
    aux_df['country'] = country
    aux_df[demographic_1] = d
    # update main df
    q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])
q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()

# trueskill
all_trueskill_dict = calculate_trueskill_by_demographic(
        raw_df, demographic_1, min_filter=4, scaling=True, normal_dist=False
    )
country = 'All'

for d in all_trueskill_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_trueskill_dict[d]
    aux_df['country'] = country
    aux_df[demographic_1] = d
    # update main df
    trueskill_demographic_df = pd.concat([trueskill_demographic_df, aux_df])
trueskill_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()

print(q_scores_demographic_df.shape[0])
print(trueskill_demographic_df.shape[0])

  q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])


12428
12428


In [12]:
# for each country
countries_q_scores_dict = {}
countries_trueskill_dict = {}
for c in countries:
    country_df = raw_df[raw_df["Country"] == c]

    # q score
    countries_q_scores_dict[c] = calculate_qscore_by_demographic(
        country_df, demographic_1, min_filter=4 
    )
    for d in countries_q_scores_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_q_scores_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic_1] = d
        # update main df
        q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])

    # trueskill
    countries_trueskill_dict[c] = calculate_trueskill_by_demographic(
        country_df, demographic_1, min_filter=4, scaling=True, normal_dist=False
    )
    for d in countries_trueskill_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_trueskill_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic_1] = d
        # update main df
        trueskill_demographic_df = pd.concat([trueskill_demographic_df, aux_df])

q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()
trueskill_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()

print(q_scores_demographic_df.shape[0])
print(trueskill_demographic_df.shape[0])

17323
17323


In [13]:
# save dataframe to csv
merged_df = pd.merge(
    q_scores_demographic_df, metadata_seg_df, left_on="Image", right_on="Image number", how="left"
)
merged_df.to_csv(f"data/labels/processed/{demographic_1}_qscores.csv", index=False)

both_scores_df = pd.merge(
   merged_df, trueskill_demographic_df[["country", "Image", "Question", "TrueSkill_score", demographic_1]], on=["country", "Image", "Question", demographic_1], how="left"
)
both_scores_df.to_csv(f"data/labels/processed/{demographic_1}_qscores_trueskill.csv", index=False)

print(merged_df.shape[0])
print(both_scores_df.shape[0])
merged_df.head()


17323
17323


Unnamed: 0,country,age_group,Image,Question,Score,Num_comparisons,uuid,city,Relabelled Name,Image number,green_view_index
0,All,40-49,203,safe,4.166667,4,76d51909-0a1e-4a6b-b0bc-ef9e5d1781ab,Singapore,Image_203,203,0.117226
1,All,40-49,305,safe,5.357143,6,94ed8a77-577f-4064-ab7f-ae55b864b674,San Francisco,Image_305,305,0.14716
2,All,40-49,45,safe,5.486111,7,11afd5fb-986b-4919-97d6-e13795916ad7,Santiago,Image_45,45,0.282413
3,All,40-49,380,safe,1.898148,4,b33ead6f-e7e3-4d60-befe-def6a70c5468,Amsterdam,Image_380,380,0.192555
4,All,40-49,277,safe,1.666667,4,f3dc8967-fc2c-407f-b4e7-e350b62b9288,San Francisco,Image_277,277,0.261296


# AHI

In [14]:
demographic_1 = "ahi_3_remapped"
print(raw_df["ahi_3"].unique())


['SGD$45,000 - SGD$99,999' 'SGD$0 - SGD$44,999'
 'SGD$100,000 - SGD$999,999' '2,000,001 NGN-4,000,000 NGN'
 'Prefer not to answer' '2,000,000 NGN & below' '4,000,001 NGN & above'
 'USD$50,000 - USD$99,999' 'USD$0 - USD$49,999'
 'USD$100,000 - USD$999,999' '60,400,001 CLP & above'
 '30,000 euros-81,999 euros' '20,400,001 CLP-60,400,000 CLP'
 '20,400,000 CLP & below' '82,000 euros & above' '29,999 euros & below']


In [15]:
# same mapping as the SPECS dataset (TODO ref)
three_df = raw_df.copy()
three_df["ahi_3_remapped"] = raw_df["ahi_3"].map(
    {
        "20,400,000 CLP & below": "below",
        "20,400,001 CLP-60,400,000 CLP": "middle",
        "60,400,001 CLP & above": "above",
        
        "29,999 euros & below": "below",
        "30,000 euros-81,999 euros": "middle",
        "82,000 euros & above": "above",
        
        "2,000,000 NGN & below": "below",
        "2,000,001 NGN-4,000,000 NGN": "middle",
        "4,000,001 NGN & above": "above",
        
        "SGD$0 - SGD$44,999": "below",
        "SGD$45,000 - SGD$99,999": "middle",
        "SGD$100,000 - SGD$999,999": "above",
        
        "USD$0 - USD$49,999": "below",
        "USD$50,000 - USD$99,999": "middle",
        "USD$100,000 - USD$999,999": "above",
        
        "Prefer not to answer": "Prefer not to answer",
    }
)

three_df = three_df[three_df['ahi_3_remapped'].isin(["above", "middle", "below"])]
three_df.groupby(["Country","ahi_3_remapped"])["Respondent"].nunique().reset_index()


Unnamed: 0,Country,ahi_3_remapped,Respondent
0,Chile,above,19
1,Chile,below,119
2,Chile,middle,50
3,Netherlands,above,39
4,Netherlands,below,43
5,Netherlands,middle,101
6,Nigeria,above,60
7,Nigeria,below,66
8,Nigeria,middle,68
9,Singapore,above,85


In [16]:
q_scores_demographic_df = pd.DataFrame(columns=['country', demographic_1, 'Image', 'Question', 'Score', 'Num_comparisons'])
trueskill_demographic_df = pd.DataFrame(columns=['country', demographic_1, 'Image', 'Question', 'Score', 'Num_comparisons'])

# all
# qscore
all_q_scores_dict = calculate_qscore_by_demographic(
        three_df, demographic_1, min_filter=4
    )
country = 'All'

for d in all_q_scores_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_q_scores_dict[d]
    aux_df['country'] = country
    aux_df[demographic_1] = d
    # update main df
    q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])
q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()

# trueskill
all_trueskill_dict = calculate_trueskill_by_demographic(
        three_df, demographic_1, min_filter=4, scaling=True, normal_dist=False
    )
country = 'All'

for d in all_trueskill_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_trueskill_dict[d]
    aux_df['country'] = country
    aux_df[demographic_1] = d
    # update main df
    trueskill_demographic_df = pd.concat([trueskill_demographic_df, aux_df])
trueskill_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()


  q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])


Unnamed: 0,country,Question,ahi_3_remapped,Image
0,All,beautiful,above,376
1,All,beautiful,below,392
2,All,beautiful,middle,381
3,All,boring,above,379
4,All,boring,below,387
5,All,boring,middle,387
6,All,cycle,above,362
7,All,cycle,below,377
8,All,cycle,middle,378
9,All,depressing,above,376


In [17]:
# for each country
countries_q_scores_dict = {}
countries_trueskill_dict = {}

for c in countries:
    country_df = three_df[three_df["Country"] == c]
    
    # qscore
    countries_q_scores_dict[c] = calculate_qscore_by_demographic(
        country_df, demographic_1, min_filter=4 
    )
    for d in countries_q_scores_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_q_scores_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic_1] = d
        # update main df
        q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])

    # trueskill
    countries_trueskill_dict[c] = calculate_trueskill_by_demographic(
        country_df, demographic_1, min_filter=4, scaling=True, normal_dist=False
    )
    for d in countries_trueskill_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_trueskill_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic_1] = d
        # update main df
        trueskill_demographic_df = pd.concat([trueskill_demographic_df, aux_df])

q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()
trueskill_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()

print(q_scores_demographic_df.shape[0])
print(trueskill_demographic_df.shape[0])

17630
17630


In [18]:
# save dataframe to csv
merged_df = pd.merge(
    q_scores_demographic_df, metadata_seg_df, left_on="Image", right_on="Image number", how="left"
)
merged_df.to_csv(f"data/labels/processed/{demographic_1}_qscores.csv", index=False)

both_scores_df = pd.merge(
    merged_df, trueskill_demographic_df[["country", "Image", "Question", "TrueSkill_score", demographic_1]], on=["country","Image", "Question", demographic_1], how="left"
)
both_scores_df.to_csv(f"data/labels/processed/{demographic_1}_qscores_trueskill.csv", index=False)

print(merged_df.shape[0])
print(both_scores_df.shape[0])


17630
17630


# Education level

In [19]:
demographic_1 = "education_level"
q_scores_demographic_df = pd.DataFrame(columns=['country', demographic_1, 'Image', 'Question', 'Score', 'Num_comparisons'])
trueskill_demographic_df = pd.DataFrame(columns=['country', demographic_1, 'Image', 'Question', 'Score', 'Num_comparisons'])
raw_df.groupby("education_level")["Respondent"].nunique().reset_index()


Unnamed: 0,education_level,Respondent
0,Bachelors degree,380
1,College or Vocational training or Diploma,229
2,Doctorate or professional degree,38
3,Less than Secondary/High school,11
4,Masters degree,122
5,Other,11
6,Secondary/High school graduate or equivalent,209


In [20]:
raw_df.groupby(["Country", "education_level"])["Respondent"].nunique().reset_index()


Unnamed: 0,Country,education_level,Respondent
0,Chile,Bachelors degree,42
1,Chile,College or Vocational training or Diploma,71
2,Chile,Doctorate or professional degree,9
3,Chile,Less than Secondary/High school,3
4,Chile,Masters degree,4
5,Chile,Other,7
6,Chile,Secondary/High school graduate or equivalent,64
7,Netherlands,Bachelors degree,52
8,Netherlands,College or Vocational training or Diploma,43
9,Netherlands,Doctorate or professional degree,9


## filtering

In [21]:
# remove Other and Less than Secondary/High school
filtered_df = raw_df.copy()
filtered_df = filtered_df[~filtered_df[demographic_1].isin(['Other', 'Less than Secondary/High school'])]
filtered_df.groupby([demographic_1])["Respondent"].nunique().reset_index()


Unnamed: 0,education_level,Respondent
0,Bachelors degree,380
1,College or Vocational training or Diploma,229
2,Doctorate or professional degree,38
3,Masters degree,122
4,Secondary/High school graduate or equivalent,209


In [22]:
filtered_df.groupby(["Country", demographic_1])["Respondent"].nunique().reset_index()


Unnamed: 0,Country,education_level,Respondent
0,Chile,Bachelors degree,42
1,Chile,College or Vocational training or Diploma,71
2,Chile,Doctorate or professional degree,9
3,Chile,Masters degree,4
4,Chile,Secondary/High school graduate or equivalent,64
5,Netherlands,Bachelors degree,52
6,Netherlands,College or Vocational training or Diploma,43
7,Netherlands,Doctorate or professional degree,9
8,Netherlands,Masters degree,24
9,Netherlands,Secondary/High school graduate or equivalent,68


## remapping

In [23]:
remapped_df = filtered_df.copy()
remapped_df["education_level"] = remapped_df["education_level"].replace({
    "Masters degree": "Postgraduate degree", 
    "Doctorate or professional degree": "Postgraduate degree"
})

remapped_df.groupby(["Country", demographic_1])["Respondent"].nunique().reset_index()


Unnamed: 0,Country,education_level,Respondent
0,Chile,Bachelors degree,42
1,Chile,College or Vocational training or Diploma,71
2,Chile,Postgraduate degree,13
3,Chile,Secondary/High school graduate or equivalent,64
4,Netherlands,Bachelors degree,52
5,Netherlands,College or Vocational training or Diploma,43
6,Netherlands,Postgraduate degree,33
7,Netherlands,Secondary/High school graduate or equivalent,68
8,Nigeria,Bachelors degree,108
9,Nigeria,College or Vocational training or Diploma,31


In [24]:
# all
# qscore
all_q_scores_dict = calculate_qscore_by_demographic(
        remapped_df, demographic_1, min_filter=4
    )
country = 'All'

for d in all_q_scores_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_q_scores_dict[d]
    aux_df['country'] = country
    aux_df[demographic_1] = d
    # update main df
    q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])
q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()

# trueskill
all_trueskill_dict = calculate_trueskill_by_demographic(
        remapped_df, demographic_1, min_filter=4, scaling=True, normal_dist=False
    )
country = 'All'

for d in all_trueskill_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_trueskill_dict[d]
    aux_df['country'] = country
    aux_df[demographic_1] = d
    # update main df
    trueskill_demographic_df = pd.concat([trueskill_demographic_df, aux_df])
trueskill_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()


  q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])


Unnamed: 0,country,Question,education_level,Image
0,All,beautiful,Bachelors degree,393
1,All,beautiful,College or Vocational training or Diploma,338
2,All,beautiful,Postgraduate degree,227
3,All,beautiful,Secondary/High school graduate or equivalent,299
4,All,boring,Bachelors degree,393
5,All,boring,College or Vocational training or Diploma,333
6,All,boring,Postgraduate degree,225
7,All,boring,Secondary/High school graduate or equivalent,308
8,All,cycle,Bachelors degree,387
9,All,cycle,College or Vocational training or Diploma,313


In [25]:
# for each country
countries_q_scores_dict = {}
countries_trueskill_dict = {}

for c in countries:
    country_df = remapped_df[remapped_df["Country"] == c]
    
    # qscore
    countries_q_scores_dict[c] = calculate_qscore_by_demographic(
        country_df, demographic_1, min_filter=4 
    )
    for d in countries_q_scores_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_q_scores_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic_1] = d
        # update main df
        q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])

    # trueskill 
    countries_trueskill_dict[c] = calculate_trueskill_by_demographic(
        country_df, demographic_1, min_filter=4, scaling=True, normal_dist=False
    )
    for d in countries_trueskill_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_trueskill_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic_1] = d
        # update main df
        trueskill_demographic_df = pd.concat([trueskill_demographic_df, aux_df])

q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()
trueskill_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()


Unnamed: 0,country,Question,education_level,Image
0,All,beautiful,Bachelors degree,393
1,All,beautiful,College or Vocational training or Diploma,338
2,All,beautiful,Postgraduate degree,227
3,All,beautiful,Secondary/High school graduate or equivalent,299
4,All,boring,Bachelors degree,393
...,...,...,...,...
217,USA,walk,Secondary/High school graduate or equivalent,2
218,USA,wealthy,Bachelors degree,60
219,USA,wealthy,College or Vocational training or Diploma,11
220,USA,wealthy,Postgraduate degree,14


In [26]:
# save to csv
merged_df = pd.merge(
    q_scores_demographic_df, metadata_seg_df, left_on="Image", right_on="Image number", how="left"
)
merged_df.to_csv(f"data/labels/processed/{demographic_1}_qscores.csv", index=False)

both_scores_df = pd.merge(
   merged_df, trueskill_demographic_df[["country", "Image", "Question", "TrueSkill_score", demographic_1]], on=["country", "Image", "Question", demographic_1], how="left"
)
both_scores_df.to_csv(f"data/labels/processed/{demographic_1}_qscores_trueskill.csv", index=False)

print(merged_df.shape[0])
print(both_scores_df.shape[0])

17393
17393


# Race & ethnicity

In [27]:
demographic_1 = "race_ethnicity"
q_scores_demographic_df = pd.DataFrame(columns=['country', demographic_1, 'Image', 'Question', 'Score', 'Num_comparisons'])
trueskill_demographic_df = pd.DataFrame(columns=['country', demographic_1, 'Image', 'Question', 'Score', 'Num_comparisons'])
print(raw_df[demographic_1].unique())


['Asian or Pacific Islander' 'White or Caucasian'
 'A race/ethnicity not listed here' 'Black or African American'
 'Middle Eastern or  North African' 'Hispanic or Latino'
 'Native American or Alaska Native']


In [28]:
raw_df.groupby(demographic_1)["Respondent"].nunique().reset_index()


Unnamed: 0,race_ethnicity,Respondent
0,A race/ethnicity not listed here,69
1,Asian or Pacific Islander,231
2,Black or African American,184
3,Hispanic or Latino,166
4,Middle Eastern or North African,32
5,Native American or Alaska Native,5
6,White or Caucasian,313


In [29]:
# remove categories with low numbers
filtered_df = raw_df.copy()
filtered_df = filtered_df[~filtered_df[demographic_1].isin([
    "A race/ethnicity not listed here"])]
filtered_df.groupby(demographic_1)["Respondent"].nunique().reset_index()


Unnamed: 0,race_ethnicity,Respondent
0,Asian or Pacific Islander,231
1,Black or African American,184
2,Hispanic or Latino,166
3,Middle Eastern or North African,32
4,Native American or Alaska Native,5
5,White or Caucasian,313


In [30]:
# all
# qscore
all_q_scores_dict = calculate_qscore_by_demographic(
        filtered_df, demographic_1, min_filter=4
    )
country = 'All'

for d in all_q_scores_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_q_scores_dict[d]
    aux_df['country'] = country
    aux_df[demographic_1] = d
    # update main df
    q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])
q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()

# trueskill
all_trueskill_dict = calculate_trueskill_by_demographic(
        filtered_df, demographic_1, min_filter=4, scaling=True, normal_dist=False
    )
country = 'All'

for d in all_trueskill_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_trueskill_dict[d]
    aux_df['country'] = country
    aux_df[demographic_1] = d
    # update main df
    trueskill_demographic_df = pd.concat([trueskill_demographic_df, aux_df])
trueskill_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()


  q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])


Unnamed: 0,country,Question,race_ethnicity,Image
0,All,beautiful,Asian or Pacific Islander,331
1,All,beautiful,Black or African American,281
2,All,beautiful,Hispanic or Latino,245
3,All,beautiful,Middle Eastern or North African,5
4,All,beautiful,White or Caucasian,385
5,All,boring,Asian or Pacific Islander,333
6,All,boring,Black or African American,274
7,All,boring,Hispanic or Latino,246
8,All,boring,Middle Eastern or North African,3
9,All,boring,White or Caucasian,383


In [31]:
# for each country
countries_q_scores_dict = {}
countries_trueskill_dict = {}

for c in countries:
    country_df = filtered_df[filtered_df["Country"] == c]
    # qscore
    countries_q_scores_dict[c] = calculate_qscore_by_demographic(
        country_df, demographic_1, min_filter=4 
    )

    for d in countries_q_scores_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_q_scores_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic_1] = d
        # update main df
        q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])

    # trueskill 
    countries_trueskill_dict[c] = calculate_trueskill_by_demographic(
        country_df, demographic_1, min_filter=4, scaling=True, normal_dist=False 
    )

    for d in countries_trueskill_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_trueskill_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic_1] = d
        # update main df
        trueskill_demographic_df = pd.concat([trueskill_demographic_df, aux_df])

q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()
trueskill_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()


Unnamed: 0,country,Question,race_ethnicity,Image
0,All,beautiful,Asian or Pacific Islander,331
1,All,beautiful,Black or African American,281
2,All,beautiful,Hispanic or Latino,245
3,All,beautiful,Middle Eastern or North African,5
4,All,beautiful,White or Caucasian,385
...,...,...,...,...
129,USA,safe,White or Caucasian,133
130,USA,walk,Asian or Pacific Islander,16
131,USA,walk,White or Caucasian,124
132,USA,wealthy,Asian or Pacific Islander,9


In [32]:
# save dataframe to csv
merged_df = pd.merge(
    q_scores_demographic_df, metadata_seg_df, left_on="Image", right_on="Image number", how="left"
)
merged_df.to_csv(f"data/labels/processed/{demographic_1}_qscores.csv", index=False)

both_scores_df = pd.merge(
    merged_df, trueskill_demographic_df[["country", "Image", "Question", "TrueSkill_score", demographic_1]], on=["country", "Image", "Question", demographic_1], how="left"
)
both_scores_df.to_csv(f"data/labels/processed/{demographic_1}_qscores_trueskill.csv", index=False)

print(merged_df.shape[0])
print(both_scores_df.shape[0])

22685
22685
