In [1]:
import pandas as pd
import altair as alt
import altair_saver
from scipy import stats
# Turn off Altair's max-rows check on chart data so the frames plotted below
# are not rejected for their size.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

# Descriptives forced-choice sentiment scores

In [2]:
# Load the long-format forced-choice sentiment data.
forced_choice_long_df = pd.read_csv(
    '../clean_data_mturk/forced_choice_emotion_mturk_long_format_lmer.csv'
)

In [3]:
# Preview the first five rows to check the columns that were loaded.
forced_choice_long_df.head(n=5)

Unnamed: 0,participantId,photoId,emotion,ethnicity,sex,age,label,url,sentimentScore,sexC,ageC,ethnicityC
0,0,Q2.1,Anger,bipoc,female,adult,anger,https://uwmadison.co1.qualtrics.com/ControlPan...,-0.5719,-0.5,0.5,-0.5
1,0,Q126.1,Neutral,bipoc,male,child,uncertain,https://uwmadison.co1.qualtrics.com/ControlPan...,0.0,0.5,-0.5,-0.5
2,0,Q127.1,Surprise,white,female,adult,uncertain,https://uwmadison.co1.qualtrics.com/ControlPan...,0.2732,-0.5,0.5,0.5
3,0,Q128.1,Fear,white,female,child,uncertain,https://uwmadison.co1.qualtrics.com/ControlPan...,-0.4939,-0.5,-0.5,0.5
4,0,Q129.1,Other,white,male,adult,uncertain,https://uwmadison.co1.qualtrics.com/ControlPan...,0.0,0.5,0.5,0.5


In [4]:
# Grouping for plotting: mean sentiment score for every
# (photoId, ethnicity, sex, age) combination, returned as a flat frame.
forced_grouped_score = (
    forced_choice_long_df
    .groupby(["photoId", "ethnicity", "sex", "age"])['sentimentScore']
    .mean()
    .reset_index()
)

## Sentiment score histogram overall

In [5]:
# Histogram of the grouped mean sentiment scores (forced-choice survey) with a
# vertical red rule at the mean of those scores.
source = forced_grouped_score

binned_score = alt.X(
    'sentimentScore:Q',
    bin=alt.Bin(maxbins=30),
    title="mean sentiment distribution",
)
bars = alt.Chart(source).mark_bar().encode(x=binned_score, y='count()')

rule = alt.Chart(source).mark_rule(color='red').encode(
    x=alt.X('mean(sentimentScore):Q')
)

# Overlay the rule on the bars and render the layered chart as SVG.
hist_forced_all_com = bars + rule
hist_forced_all_com.display(renderer='svg')

## Sentiment score by sex

In [6]:
# Overlaid step histograms of the grouped mean sentiment scores, coloured by
# sex, with a mean rule computed per sex.
source = forced_grouped_score

binned_score = alt.X(
    'sentimentScore:Q',
    bin=alt.Bin(maxbins=30),
    title='mean sentiment distribution',
)
areas = alt.Chart(source).mark_area(opacity=0.5, interpolate='step').encode(
    x=binned_score,
    y=alt.Y('count()', stack=None),  # stack=None: draw the groups on top of each other
    color=alt.Color('sex'),
)

# NOTE(review): the rule also carries a colour encoding, which presumably
# overrides the mark-level color='red' — confirm against the rendered chart.
rule = alt.Chart(source).mark_rule(color='red').encode(
    x=alt.X('mean(sentimentScore):Q'),
    color=alt.Color('sex'),
)

hist_forced_sex_com = areas + rule
hist_forced_sex_com.display(renderer='svg')

## Sentiment score by ethnicity

In [7]:
# Overlaid step histograms of the grouped mean sentiment scores, coloured by
# ethnicity, with a mean rule computed per ethnicity.
source = forced_grouped_score

binned_score = alt.X(
    'sentimentScore:Q',
    bin=alt.Bin(maxbins=30),
    title='mean sentiment distribution',
)
areas = alt.Chart(source).mark_area(opacity=0.5, interpolate='step').encode(
    x=binned_score,
    y=alt.Y('count()', stack=None),  # stack=None: draw the groups on top of each other
    color=alt.Color('ethnicity'),
)

# NOTE(review): the rule also carries a colour encoding, which presumably
# overrides the mark-level color='red' — confirm against the rendered chart.
rule = alt.Chart(source).mark_rule(color='red').encode(
    x=alt.X('mean(sentimentScore):Q'),
    color=alt.Color('ethnicity'),
)

hist_forced_ethnicity_com = areas + rule
hist_forced_ethnicity_com.display(renderer='svg')

## Sentiment score by age

In [8]:
# Overlaid step histograms of the grouped mean sentiment scores, coloured by
# age group, with a mean rule computed per age group.
source = forced_grouped_score

binned_score = alt.X(
    'sentimentScore:Q',
    bin=alt.Bin(maxbins=30),
    title='mean sentiment distribution',
)
areas = alt.Chart(source).mark_area(opacity=0.5, interpolate='step').encode(
    x=binned_score,
    y=alt.Y('count()', stack=None),  # stack=None: draw the groups on top of each other
    color=alt.Color('age'),
)

# NOTE(review): the rule also carries a colour encoding, which presumably
# overrides the mark-level color='red' — confirm against the rendered chart.
rule = alt.Chart(source).mark_rule(color='red').encode(
    x=alt.X('mean(sentimentScore):Q'),
    color=alt.Color('age'),
)

hist_forced_age_com = areas + rule
hist_forced_age_com.display(renderer='svg')

## Paired T-test

In [9]:
# Split the grouped mean scores into the two levels of each factor; these six
# series are the inputs to the t-tests.
v1 = forced_grouped_score.loc[forced_grouped_score['sex'] == 'female', 'sentimentScore']
v2 = forced_grouped_score.loc[forced_grouped_score['sex'] == 'male', 'sentimentScore']

v3 = forced_grouped_score.loc[forced_grouped_score['ethnicity'] == 'bipoc', 'sentimentScore']
v4 = forced_grouped_score.loc[forced_grouped_score['ethnicity'] == 'white', 'sentimentScore']

v5 = forced_grouped_score.loc[forced_grouped_score['age'] == 'adult', 'sentimentScore']
v6 = forced_grouped_score.loc[forced_grouped_score['age'] == 'child', 'sentimentScore']

In [10]:
# Report a paired t-test for each factor. Every test is evaluated once and its
# (statistic, p-value) pair is unpacked for printing.
# NOTE(review): stats.ttest_rel matches observations by position and needs both
# samples to have the same length. Confirm that the two groups of photos really
# form matched pairs; if they do not, an independent-samples test
# (stats.ttest_ind) would be the appropriate choice.
for factor, (first_group, second_group) in (
    ('sex', (v1, v2)),
    ('ethnicity', (v3, v4)),
    ('age', (v5, v6)),
):
    t_value, p_value = stats.ttest_rel(first_group, second_group)
    print(f'Paired t-test by {factor}: t-value = {t_value}, p-value = {p_value}')

Paired t-test by sex: t-value = 0.9117409701413358, p-value = 0.36421290094012526
Paired t-test by ethnicity: t-value = 0.20012684322342258, p-value = 0.8418091632569907
Paired t-test by age: t-value = -1.6630127558936814, p-value = 0.09960576449696461


# Descriptives free-choice sentiment scores

In [11]:
# Load the long-format free-labeling sentiment data.
free_choice_long_df = pd.read_csv(
    '../clean_data_mturk/free_labeling_emotion_mturk_long_format_lmer.csv'
)

In [12]:
# Keep only the part of photoId before the first '.', e.g. 'Q2.1_1' -> 'Q2',
# so that the several labels given to one photo share a single key.
free_choice_long_df['photoIdSplit'] = (
    free_choice_long_df['photoId'].str.split('.').str.get(0)
)

In [13]:
# Preview the first five rows, including the new photoIdSplit column.
free_choice_long_df.head(n=5)

Unnamed: 0,participantId,photoId,emotion,ethnicity,sex,age,label,url,sentimentScore,sexC,ageC,ethnicityC,photoIdSplit
0,0,Q2.1_1,boredom,bipoc,female,adult,anger,https://uwmadison.co1.qualtrics.com/ControlPan...,-0.3182,-0.5,0.5,-0.5,Q2
1,0,Q117.1_3,annoyance,bipoc,male,adult,surprise,https://uwmadison.co1.qualtrics.com/ControlPan...,-0.3182,0.5,0.5,-0.5,Q117
2,0,Q117.1_2,anger,bipoc,male,adult,surprise,https://uwmadison.co1.qualtrics.com/ControlPan...,-0.5719,0.5,0.5,-0.5,Q117
3,0,Q52.1_1,surprise,bipoc,male,adult,surprise,https://uwmadison.co1.qualtrics.com/ControlPan...,0.2732,0.5,0.5,-0.5,Q52
4,0,Q117.1_1,surprise,bipoc,male,adult,surprise,https://uwmadison.co1.qualtrics.com/ControlPan...,0.2732,0.5,0.5,-0.5,Q117


In [14]:
# Grouping by photo: mean sentiment score for every
# (photoIdSplit, ethnicity, sex, age) combination, returned as a flat frame.
free_grouped_score = (
    free_choice_long_df
    .groupby(["photoIdSplit", "ethnicity", "sex", "age"])['sentimentScore']
    .mean()
    .reset_index()
)

## Sentiment score overall

In [15]:
# Histogram of the grouped mean sentiment scores (free-choice survey) with a
# vertical red rule at the mean of those scores.
source = free_grouped_score

binned_score = alt.X(
    'sentimentScore:Q',
    bin=alt.Bin(maxbins=30),
    title="mean sentiment distribution",
)
bars = alt.Chart(source).mark_bar().encode(x=binned_score, y='count()')

rule = alt.Chart(source).mark_rule(color='red').encode(
    x=alt.X('mean(sentimentScore):Q')
)

# Overlay the rule on the bars and render the layered chart as SVG.
hist_free_all_com = bars + rule
hist_free_all_com.display(renderer='svg')

## Sentiment by sex

In [16]:
# Overlaid step histograms of the grouped mean sentiment scores, coloured by
# sex, with a mean rule computed per sex.
source = free_grouped_score

binned_score = alt.X(
    'sentimentScore:Q',
    bin=alt.Bin(maxbins=30),
    title='mean sentiment distribution',
)
areas = alt.Chart(source).mark_area(opacity=0.5, interpolate='step').encode(
    x=binned_score,
    y=alt.Y('count()', stack=None),  # stack=None: draw the groups on top of each other
    color=alt.Color('sex'),
)

# NOTE(review): the rule also carries a colour encoding, which presumably
# overrides the mark-level color='red' — confirm against the rendered chart.
rule = alt.Chart(source).mark_rule(color='red').encode(
    x=alt.X('mean(sentimentScore):Q'),
    color=alt.Color('sex'),
)

hist_free_sex_com = areas + rule
hist_free_sex_com.display(renderer='svg')

## Sentiment by ethnicity

In [17]:
# Overlaid step histograms of the grouped mean sentiment scores, coloured by
# ethnicity, with a mean rule computed per ethnicity.
source = free_grouped_score

binned_score = alt.X(
    'sentimentScore:Q',
    bin=alt.Bin(maxbins=30),
    title='mean sentiment distribution',
)
areas = alt.Chart(source).mark_area(opacity=0.5, interpolate='step').encode(
    x=binned_score,
    y=alt.Y('count()', stack=None),  # stack=None: draw the groups on top of each other
    color=alt.Color('ethnicity'),
)

# NOTE(review): the rule also carries a colour encoding, which presumably
# overrides the mark-level color='red' — confirm against the rendered chart.
rule = alt.Chart(source).mark_rule(color='red').encode(
    x=alt.X('mean(sentimentScore):Q'),
    color=alt.Color('ethnicity'),
)

hist_free_ethnicity_com = areas + rule
hist_free_ethnicity_com.display(renderer='svg')

## Sentiment by age

In [18]:
# Overlaid step histograms of the grouped mean sentiment scores, coloured by
# age group, with a mean rule computed per age group.
source = free_grouped_score

binned_score = alt.X(
    'sentimentScore:Q',
    bin=alt.Bin(maxbins=30),
    title='mean sentiment distribution',
)
areas = alt.Chart(source).mark_area(opacity=0.5, interpolate='step').encode(
    x=binned_score,
    y=alt.Y('count()', stack=None),  # stack=None: draw the groups on top of each other
    color=alt.Color('age'),
)

# NOTE(review): the rule also carries a colour encoding, which presumably
# overrides the mark-level color='red' — confirm against the rendered chart.
rule = alt.Chart(source).mark_rule(color='red').encode(
    x=alt.X('mean(sentimentScore):Q'),
    color=alt.Color('age'),
)

hist_free_age_com = areas + rule
hist_free_age_com.display(renderer='svg')

## Paired t-test 

In [19]:
# Split the grouped mean scores into the two levels of each factor; these six
# series are the inputs to the t-tests.
v1 = free_grouped_score.loc[free_grouped_score['sex'] == 'female', 'sentimentScore']
v2 = free_grouped_score.loc[free_grouped_score['sex'] == 'male', 'sentimentScore']

v3 = free_grouped_score.loc[free_grouped_score['ethnicity'] == 'bipoc', 'sentimentScore']
v4 = free_grouped_score.loc[free_grouped_score['ethnicity'] == 'white', 'sentimentScore']

v5 = free_grouped_score.loc[free_grouped_score['age'] == 'adult', 'sentimentScore']
v6 = free_grouped_score.loc[free_grouped_score['age'] == 'child', 'sentimentScore']

In [20]:
# Report a paired t-test for each factor. Every test is evaluated once and its
# (statistic, p-value) pair is unpacked for printing.
# NOTE(review): stats.ttest_rel matches observations by position and needs both
# samples to have the same length. Confirm that the two groups of photos really
# form matched pairs; if they do not, an independent-samples test
# (stats.ttest_ind) would be the appropriate choice.
for factor, (first_group, second_group) in (
    ('sex', (v1, v2)),
    ('ethnicity', (v3, v4)),
    ('age', (v5, v6)),
):
    t_value, p_value = stats.ttest_rel(first_group, second_group)
    print(f'Paired t-test by {factor}: t-value = {t_value}, p-value = {p_value}')

Paired t-test by sex: t-value = 1.2529219897659898, p-value = 0.21330931408750511
Paired t-test by ethnicity: t-value = 0.24599033689749722, p-value = 0.8062204074114241
Paired t-test by age: t-value = -2.796184554082821, p-value = 0.006258395811391273


# Mean and error bar showing confidence interval

## Forced choice survey

In [21]:
# Label each row with its demographic cell as 'ethnicity_sex_age'; the label
# is used as the y-axis category of the error-bar chart.
forced_grouped_score['groups'] = forced_grouped_score['ethnicity'].str.cat(
    [forced_grouped_score['sex'], forced_grouped_score['age']],
    sep='_',
)

In [22]:
# Build the three layers of the forced-choice summary chart: a confidence
# interval bar and a mean point per demographic group, plus a rule at the
# mean over all rows.
source = forced_grouped_score

# Both the bars and the points are coloured by an 'ethnicity-sex' label that
# is computed inside the chart.
with_categories = alt.Chart(source).transform_calculate(
    categories="datum.ethnicity + '-' + datum.sex"
)

error_bars = with_categories.mark_errorbar(extent='ci').encode(
    x=alt.X(
        'sentimentScore:Q',
        scale=alt.Scale(zero=False),
        title='sentiment score - mean and 95% CI',
    ),
    y=alt.Y('groups:N'),
    color=alt.Color('categories:N'),
)

points = with_categories.mark_point(filled=True, color='black').encode(
    x=alt.X('mean(sentimentScore):Q'),
    y=alt.Y('groups:N'),
    color=alt.Color('categories:N'),
)

rule = alt.Chart(source).mark_rule(color='red').encode(
    x=alt.X('mean(sentimentScore):Q')
)

In [23]:
# Stack the interval bars, the mean points and the overall-mean rule into one
# layered chart and render it as SVG.
forced_error_bar_com = alt.layer(error_bars, points, rule)
forced_error_bar_com.display(renderer='svg')

## Free choice survey

In [24]:
# Label each row with its demographic cell as 'ethnicity_sex_age'; the label
# is used as the y-axis category of the error-bar chart.
free_grouped_score['groups'] = free_grouped_score['ethnicity'].str.cat(
    [free_grouped_score['sex'], free_grouped_score['age']],
    sep='_',
)

In [25]:
# Build the three layers of the free-choice summary chart: a confidence
# interval bar and a mean point per demographic group, plus a rule at the
# mean over all rows.
source = free_grouped_score

# Both the bars and the points are coloured by an 'ethnicity-sex' label that
# is computed inside the chart.
with_categories = alt.Chart(source).transform_calculate(
    categories="datum.ethnicity + '-' + datum.sex"
)

error_bars = with_categories.mark_errorbar(extent='ci').encode(
    x=alt.X(
        'sentimentScore:Q',
        scale=alt.Scale(zero=False),
        title='sentiment score - mean and 95% CI',
    ),
    y=alt.Y('groups:N'),
    color=alt.Color('categories:N'),
)

points = with_categories.mark_point(filled=True, color='black').encode(
    x=alt.X('mean(sentimentScore):Q'),
    y=alt.Y('groups:N'),
    color=alt.Color('categories:N'),
)

rule = alt.Chart(source).mark_rule(color='red').encode(
    x=alt.X('mean(sentimentScore):Q')
)

In [26]:
# Stack the interval bars, the mean points and the overall-mean rule into one
# layered chart and render it as SVG.
free_error_bar_com = alt.layer(error_bars, points, rule)
free_error_bar_com.display(renderer='svg')

# Save charts to a dataframe for the dashboard

In [27]:
# Accumulators for the rendered charts: svg_str[i] holds the rendering whose
# dashboard title is image_title[i].
svg_str = []
image_title = []

def svg_string(chart, title):
    """Render ``chart`` as SVG and record it under ``title``.

    The value returned by ``altair_saver.save(chart, fmt='svg')`` is stored in
    the module-level ``svg_str`` list, with ``title`` stored at the same
    position in ``image_title``. If ``title`` was already recorded, its entry
    is overwritten rather than appended, so re-running the calling cell does
    not create duplicate rows.

    Parameters
    ----------
    chart : Altair chart object to render.
    title : str
        Title under which the rendering is stored.

    Returns
    -------
    None. Prints 'Saved!' once the rendering has been recorded; nothing is
    written to disk here.
    """
    chart_str = altair_saver.save(chart, fmt='svg')
    if title in image_title:
        # Same title seen before: refresh the stored rendering in place.
        svg_str[image_title.index(title)] = chart_str
    else:
        image_title.append(title)
        svg_str.append(chart_str)
    print('Saved!')

In [28]:
# Place each forced-choice chart (left) next to its free-choice counterpart
# (right) so the two surveys can be compared side by side.
hist_all = alt.hconcat(hist_forced_all_com, hist_free_all_com)
hist_sex = alt.hconcat(hist_forced_sex_com, hist_free_sex_com)
hist_ethnicity = alt.hconcat(hist_forced_ethnicity_com, hist_free_ethnicity_com)
hist_age = alt.hconcat(hist_forced_age_com, hist_free_age_com)
error_bar_com = alt.hconcat(forced_error_bar_com, free_error_bar_com)

In [29]:
# Render every combined chart, in dashboard order, under its display title.
for chart, title in (
    (hist_all, 'Overall mean sentiment distribution'),
    (hist_sex, 'Mean sentiment distribution by sex'),
    (hist_ethnicity, 'Mean sentiment distribution by ethnicity'),
    (hist_age, 'Mean sentiment distribution by age group'),
    (error_bar_com, 'Compounded sentiment score'),
):
    svg_string(chart, title)

Saved!
Saved!
Saved!
Saved!
Saved!


In [30]:
# One row per rendered chart: its title and its SVG string.
images_strings = pd.DataFrame(
    data=dict(image_title=image_title, svg=svg_str)
)

In [31]:
# Write the same table to the cleaned-data folder and to the dashboard's data
# folder, without the index column.
for csv_path in (
    '../clean_data_mturk/sentiment_svg_strings_mturk.csv',
    '../../emotions_dashboard/data/sentiment_svg_strings_mturk.csv',
):
    images_strings.to_csv(csv_path, index=False)

In [32]:
# Read the exported file back in to check what was written.
df_svg = pd.read_csv(
    '../clean_data_mturk/sentiment_svg_strings_mturk.csv'
)

In [33]:
# List the chart titles stored in the exported file.
df_svg.loc[:, 'image_title']

0         Overall mean sentiment distribution
1          Mean sentiment distribution by sex
2    Mean sentiment distribution by ethnicity
3    Mean sentiment distribution by age group
4                  Compounded sentiment score
Name: image_title, dtype: object