In [1]:
import numpy as np
import pandas as pd
from utils import (
    calculate_qscore,
    calculate_qscore_by_demographic,
)



# Load data

In [2]:
dataset_file = "data/labels/processed/global_mapped_cleaned.csv"
raw_df = pd.read_csv(dataset_file, index_col=0)
countries = raw_df["Country"].unique()

# Calculate Q-scores

In [PP2](https://arxiv.org/abs/1608.01769), page 6, the max comparison across all indicators are:
- min 2.40
- avg 4.05, however on page 7 they state 3.35
- max 7.67 

# Extraversion

In [3]:
demographic_1 = "extraversion_q3_q1"
q_scores_demographic_df = pd.DataFrame(columns=['country', demographic_1, 'Image', 'Question', 'Score', 'Num_comparisons'])
print(raw_df[demographic_1].unique())
raw_df = raw_df.dropna(subset=[demographic_1])
print(raw_df[demographic_1].unique())

[ 1.  0. nan]
[1. 0.]


In [4]:
# all
all_q_scores_dict = calculate_qscore_by_demographic(
        raw_df, demographic_1, min_filter=4
    )
country = 'All'
for d in all_q_scores_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_q_scores_dict[d]
    aux_df['country'] = country
    aux_df[demographic_1] = d
    # update main df
    q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])
q_scores_demographic_df

  q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])


Unnamed: 0,country,extraversion_q3_q1,Image,Question,Score,Num_comparisons
0,All,1.0,203,safe,3.935185,6
1,All,1.0,305,safe,5.490530,11
2,All,1.0,276,safe,2.901235,9
3,All,1.0,45,safe,6.435185,11
4,All,1.0,380,safe,7.163033,12
...,...,...,...,...,...,...
3995,All,0.0,338,green,4.091970,7
3996,All,0.0,361,green,4.280055,9
3997,All,0.0,313,green,6.354497,12
3998,All,0.0,254,green,4.250000,5


In [5]:
# for each country
countries_q_scores_dict = {}
for c in countries:
    country_df = raw_df[raw_df["Country"] == c]
    countries_q_scores_dict[c] = calculate_qscore_by_demographic(
        country_df, demographic_1, min_filter=4
    )

    for d in countries_q_scores_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_q_scores_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic_1] = d
        # update main df
        q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])
q_scores_demographic_df

Unnamed: 0,country,extraversion_q3_q1,Image,Question,Score,Num_comparisons
0,All,1.0,203,safe,3.935185,6
1,All,1.0,305,safe,5.490530,11
2,All,1.0,276,safe,2.901235,9
3,All,1.0,45,safe,6.435185,11
4,All,1.0,380,safe,7.163033,12
...,...,...,...,...,...,...
3951,Netherlands,0.0,98,green,7.333333,5
3953,Netherlands,0.0,363,green,7.083333,4
3957,Netherlands,0.0,272,green,0.666667,5
3958,Netherlands,0.0,186,green,7.666667,5


In [6]:
# save dataframe to csv
q_scores_demographic_df.to_csv(f"data/labels/processed/{demographic_1}_qscores.csv", index=False)


# Agreeableness

In [7]:
demographic_1 = "agreeableness_q3_q1"
raw_df = pd.read_csv(dataset_file, index_col=0)
q_scores_demographic_df = pd.DataFrame(columns=['country', demographic_1, 'Image', 'Question', 'Score', 'Num_comparisons'])
print(raw_df[demographic_1].unique())
raw_df = raw_df.dropna(subset=[demographic_1])
print(raw_df[demographic_1].unique())

[nan  0.  1.]
[0. 1.]


In [8]:
# all
all_q_scores_dict = calculate_qscore_by_demographic(
        raw_df, demographic_1, min_filter=4
    )
country = 'All'

for d in all_q_scores_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_q_scores_dict[d]
    aux_df['country'] = country
    aux_df[demographic_1] = d
    # update main df
    q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])
q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()

  q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])


Unnamed: 0,country,Question,agreeableness_q3_q1,Image
0,All,beautiful,0.0,371
1,All,beautiful,1.0,378
2,All,boring,0.0,380
3,All,boring,1.0,376
4,All,cycle,0.0,365
5,All,cycle,1.0,361
6,All,depressing,0.0,378
7,All,depressing,1.0,364
8,All,green,0.0,379
9,All,green,1.0,378


In [9]:
# for each country
countries_q_scores_dict = {}
for c in countries:
    country_df = raw_df[raw_df["Country"] == c]
    countries_q_scores_dict[c] = calculate_qscore_by_demographic(
        country_df, demographic_1, min_filter=4 
    )

    for d in countries_q_scores_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_q_scores_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic_1] = d
        # update main df
        q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])

q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()

Unnamed: 0,country,Question,agreeableness_q3_q1,Image
0,All,beautiful,0.0,371
1,All,beautiful,1.0,378
2,All,boring,0.0,380
3,All,boring,1.0,376
4,All,cycle,0.0,365
...,...,...,...,...
115,USA,safe,1.0,28
116,USA,walk,0.0,27
117,USA,walk,1.0,20
118,USA,wealthy,0.0,24


In [10]:
# save dataframe to csv
q_scores_demographic_df.to_csv(f"data/labels/processed/{demographic_1}_qscores.csv", index=False)


# Conscientiousness

In [11]:
demographic_1 = "conscientiousness_q3_q1"
raw_df = pd.read_csv(dataset_file, index_col=0)
q_scores_demographic_df = pd.DataFrame(columns=['country', demographic_1, 'Image', 'Question', 'Score', 'Num_comparisons'])
print(raw_df[demographic_1].unique())
raw_df = raw_df.dropna(subset=[demographic_1])
print(raw_df[demographic_1].unique())

[nan  0.  1.]
[0. 1.]


In [12]:
# all
all_q_scores_dict = calculate_qscore_by_demographic(
        raw_df, demographic_1, min_filter=4
    )
country = 'All'

for d in all_q_scores_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_q_scores_dict[d]
    aux_df['country'] = country
    aux_df[demographic_1] = d
    # update main df
    q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])
    
q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()

  q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])


Unnamed: 0,country,Question,conscientiousness_q3_q1,Image
0,All,beautiful,0.0,353
1,All,beautiful,1.0,393
2,All,boring,0.0,360
3,All,boring,1.0,400
4,All,cycle,0.0,339
5,All,cycle,1.0,387
6,All,depressing,0.0,353
7,All,depressing,1.0,388
8,All,green,0.0,364
9,All,green,1.0,391


In [13]:
# for each country
countries_q_scores_dict = {}
for c in countries:
    country_df = raw_df[raw_df["Country"] == c]
    countries_q_scores_dict[c] = calculate_qscore_by_demographic(
        country_df, demographic_1, min_filter=4 
    )

    for d in countries_q_scores_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_q_scores_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic_1] = d
        # update main df
        q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])

q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()

Unnamed: 0,country,Question,conscientiousness_q3_q1,Image
0,All,beautiful,0.0,353
1,All,beautiful,1.0,393
2,All,boring,0.0,360
3,All,boring,1.0,400
4,All,cycle,0.0,339
...,...,...,...,...
109,USA,safe,1.0,51
110,USA,walk,0.0,23
111,USA,walk,1.0,48
112,USA,wealthy,0.0,18


In [14]:
# save dataframe to csv
q_scores_demographic_df.to_csv(f"data/labels/processed/{demographic_1}_qscores.csv", index=False)


# Neuroticism

In [15]:
demographic_1 = "neuroticism_q3_q1"
raw_df = pd.read_csv(dataset_file, index_col=0)
q_scores_demographic_df = pd.DataFrame(columns=['country', demographic_1, 'Image', 'Question', 'Score', 'Num_comparisons'])
print(raw_df[demographic_1].unique())
raw_df = raw_df.dropna(subset=[demographic_1])
print(raw_df[demographic_1].unique())

[ 1.  0. nan]
[1. 0.]


In [16]:
# all
all_q_scores_dict = calculate_qscore_by_demographic(
        raw_df, demographic_1, min_filter=4
    )
country = 'All'

for d in all_q_scores_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_q_scores_dict[d]
    aux_df['country'] = country
    aux_df[demographic_1] = d
    # update main df
    q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])
q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()

  q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])


Unnamed: 0,country,Question,neuroticism_q3_q1,Image
0,All,beautiful,0.0,389
1,All,beautiful,1.0,397
2,All,boring,0.0,392
3,All,boring,1.0,397
4,All,cycle,0.0,375
5,All,cycle,1.0,394
6,All,depressing,0.0,391
7,All,depressing,1.0,397
8,All,green,0.0,392
9,All,green,1.0,400


In [17]:
# for each country
countries_q_scores_dict = {}
for c in countries:
    country_df = raw_df[raw_df["Country"] == c]
    countries_q_scores_dict[c] = calculate_qscore_by_demographic(
        country_df, demographic_1, min_filter=4 
    )

    for d in countries_q_scores_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_q_scores_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic_1] = d
        # update main df
        q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])

q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()

Unnamed: 0,country,Question,neuroticism_q3_q1,Image
0,All,beautiful,0.0,389
1,All,beautiful,1.0,397
2,All,boring,0.0,392
3,All,boring,1.0,397
4,All,cycle,0.0,375
...,...,...,...,...
115,USA,safe,1.0,95
116,USA,walk,0.0,41
117,USA,walk,1.0,101
118,USA,wealthy,0.0,36


In [18]:
# save dataframe to csv
q_scores_demographic_df.to_csv(f"data/labels/processed/{demographic_1}_qscores.csv", index=False)

# Openness

In [19]:
demographic_1 = "openness_q3_q1"
raw_df = pd.read_csv(dataset_file, index_col=0)
q_scores_demographic_df = pd.DataFrame(columns=['country', demographic_1, 'Image', 'Question', 'Score', 'Num_comparisons'])
print(raw_df[demographic_1].unique())
raw_df = raw_df.dropna(subset=[demographic_1])
print(raw_df[demographic_1].unique())

[ 0. nan  1.]
[0. 1.]


In [20]:
# all
all_q_scores_dict = calculate_qscore_by_demographic(
        raw_df, demographic_1, min_filter=4
    )
country = 'All'

for d in all_q_scores_dict.keys():
    # transform dictionaries into a dataframe
    aux_df = all_q_scores_dict[d]
    aux_df['country'] = country
    aux_df[demographic_1] = d
    # update main df
    q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])
q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()

  q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])


Unnamed: 0,country,Question,openness_q3_q1,Image
0,All,beautiful,0.0,397
1,All,beautiful,1.0,376
2,All,boring,0.0,400
3,All,boring,1.0,380
4,All,cycle,0.0,397
5,All,cycle,1.0,364
6,All,depressing,0.0,399
7,All,depressing,1.0,368
8,All,green,0.0,399
9,All,green,1.0,380


In [21]:
# for each country
countries_q_scores_dict = {}
for c in countries:
    country_df = raw_df[raw_df["Country"] == c]
    countries_q_scores_dict[c] = calculate_qscore_by_demographic(
        country_df, demographic_1, min_filter=4 
    )

    for d in countries_q_scores_dict[c].keys():
        # transform dictionaries into a dataframe
        aux_df = countries_q_scores_dict[c][d]
        aux_df['country'] = c
        aux_df[demographic_1] = d
        # update main df
        q_scores_demographic_df = pd.concat([q_scores_demographic_df, aux_df])

q_scores_demographic_df.groupby(["country", "Question", demographic_1])["Image"].nunique().reset_index()

Unnamed: 0,country,Question,openness_q3_q1,Image
0,All,beautiful,0.0,397
1,All,beautiful,1.0,376
2,All,boring,0.0,400
3,All,boring,1.0,380
4,All,cycle,0.0,397
...,...,...,...,...
115,USA,safe,1.0,38
116,USA,walk,0.0,91
117,USA,walk,1.0,34
118,USA,wealthy,0.0,91


In [22]:
# save dataframe to csv
q_scores_demographic_df.to_csv(f"data/labels/processed/{demographic_1}_qscores.csv", index=False)