In [1]:
import pandas as pd
import numpy as np
import altair as alt

## Data ingestion and formatting

In [2]:
# Raw survey export, one row per response (test rows are removed in a later cell).
df = pd.read_csv('../data/emotion_forced_choice_uw_students.csv')
# Per-photo metadata; later cells use its 'label', 'ethnicity', 'sex', 'age' and 'url' columns.
df_labels = pd.read_csv('../data/emotion_labels.csv')

In [3]:
# Discard the first five rows of the export (test rows).
# NOTE: not idempotent -- re-running this cell alone drops five more rows.
df = df.iloc[5:]

In [4]:
# Inspect row count, columns and dtypes once the test rows are gone.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 5 to 85
Columns: 220 entries, StartDate to Q195.1
dtypes: object(220)
memory usage: 139.3+ KB


In [5]:
# Keep completed surveys only. Every column is of object dtype, so the flag
# is compared against the string 'True' rather than a boolean.
finished_mask = df['Finished'] == 'True'
df = df[finished_mask]

In [6]:
# Inspect the frame again after removing incomplete surveys.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 5 to 85
Columns: 220 entries, StartDate to Q195.1
dtypes: object(220)
memory usage: 88.1+ KB


In [7]:
# Give the demographic question columns readable names.
# `df` is rebound to the renamed frame instead of using inplace=True, so the
# frame produced by the boolean filter above is never mutated in place.
df = df.rename(columns={
    'Q1.2': 'sex',
    'Q1.3_1': 'age',
    'Q1.4': 'ethnicity',
    'Q1.5': 'formal education',
    'Q1.6_1': 'income'})

In [8]:
# Persist the cleaned responses (row index not written).
df.to_csv('../clean_data/forced_choice_emotion_uw_students.csv', index=False)

### Words by frequency for all images (ranking)

In [9]:
def count_freq_labels(df, X="all"):
    """Tabulate how often each value occurs, as counts and as shares.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the answers.
    X : str, default "all"
        "all" pools every cell of ``df`` (stacked into one series); any other
        value is taken as the name of the single column to tabulate.

    Returns
    -------
    pandas.DataFrame
        Columns ``counts``, the value column (named ``emotion`` when
        ``X == "all"``, otherwise named ``X``) and ``percent`` (a fraction of
        the total count), ordered as returned by ``value_counts``.
    """
    if X == "all":
        values, value_col = df.stack(), 'emotion'  # pool all columns
    else:
        values, value_col = df[X], X  # single column

    tally = values.reset_index(drop=True).value_counts()
    df_counts = tally.to_frame('counts')
    df_counts[value_col] = df_counts.index  # expose the counted value as a column
    df_counts = df_counts.reset_index(drop=True)

    total = df_counts['counts'].sum()
    df_counts['percent'] = df_counts['counts'] / total
    return df_counts

In [10]:
def simple_per_bar(
    df, title='Title', X='percent:Q', Y='emotion:N',
    width=450, height=250, sort='-x',
    text_size=12, label_size=11, title_size=12,
    emotion='Some', color1='#0570b0', color2='orange'):
    """Render a horizontal bar chart of percentages with value labels.

    Parameters
    ----------
    df : data passed to ``alt.Chart``.
    title : chart title.
    X, Y : Altair encoding shorthands for the x (percentage) and y channels.
    width, height : chart size.
    sort : sort order applied to the y channel.
    text_size, label_size, title_size : font sizes for the bar labels, the
        axis labels and the axis titles.
    emotion : bars whose ``emotion`` field equals this value are drawn in
        ``color2``; all other bars use ``color1``.

    Returns
    -------
    Whatever ``.display(renderer='svg')`` returns.
    NOTE(review): ``display`` presumably returns None, in which case callers
    receive None and the chart is shown only as a side effect -- confirm.
    """
    bar_color = alt.condition(
        alt.datum.emotion == emotion,
        alt.value(color2),
        alt.value(color1))
    x_channel = alt.X(X, axis=alt.Axis(format='.0%'))
    y_channel = alt.Y(Y, sort=sort)

    bars = alt.Chart(df, title=title).mark_bar().encode(
        x_channel, y=y_channel, color=bar_color)

    # dx nudges the label to the right so it does not sit on top of the bar
    labels = bars.mark_text(
        align='left', baseline='middle', dx=3, fontSize=text_size)
    text = labels.encode(alt.Text(X, format='.1%'))

    layered = bars + text
    styled = layered.configure_axis(
        labelFontSize=label_size, titleFontSize=title_size)
    sized = styled.properties(width=width, height=height)

    chart = sized.display(renderer='svg')
    return chart

In [11]:
def simple_count_bar(
    df, title='Title', X='counts:Q', Y='emotion:N',
    width=450, height=250, sort='-x',
    text_size=12, label_size=11, title_size=12,
    emotion='Some', color1='#0570b0', color2='orange'):
    """Render a horizontal bar chart of raw counts with value labels.

    Parameters
    ----------
    df : data passed to ``alt.Chart``.
    title : chart title.
    X, Y : Altair encoding shorthands for the x (count) and y channels.
    width, height : chart size.
    sort : sort order applied to the y channel.
    text_size, label_size, title_size : font sizes for the bar labels, the
        axis labels and the axis titles.
    emotion : bars whose ``emotion`` field equals this value are drawn in
        ``color2``; all other bars use ``color1``.
    color2 : highlight colour. The default is the named colour 'orange', the
        same as in ``simple_per_bar`` (it used to be the invalid '#orange').

    Returns
    -------
    Whatever ``.display(renderer='svg')`` returns.
    NOTE(review): ``display`` presumably returns None, in which case callers
    receive None and the chart is shown only as a side effect -- confirm.
    """
    bars = alt.Chart(df, title=title).mark_bar().encode(
        alt.X(X),
        y=alt.Y(Y, sort=sort),
        color=alt.condition(
            alt.datum.emotion == emotion,
            alt.value(color2),
            alt.value(color1)
        ))

    text = bars.mark_text(
        align='left',
        baseline='middle',
        dx=3,  # nudge the label right so it does not sit on top of the bar
        fontSize=text_size
    ).encode(
        alt.Text(X)
    )

    chart = (bars + text).configure_axis(
        labelFontSize=label_size,
        titleFontSize=title_size).properties(
            width=width,
            height=height).display(renderer='svg')

    return chart

## Demographics

In [12]:
# Share of respondents per reported sex.
source = count_freq_labels(df, X="sex")
title = 'Sex | n = ' + str(source['counts'].sum())
X = 'percent:Q'
Y = 'sex:N'
w = 450
h = 100
txs = ls = ts = 12

chart_sex = simple_per_bar(
    source,
    title=title,
    X=X,
    Y=Y,
    width=w,
    height=h,
    text_size=txs,
    label_size=ls,
    title_size=ts,
)

In [13]:
# Share of respondents per reported age.
source = count_freq_labels(df, X="age")
title = 'Age | n = ' + str(source['counts'].sum())
X = 'percent:Q'
Y = 'age:N'
w = 450
h = 150
txs = 12
ls = 11
ts = 12

chart_age = simple_per_bar(
    source,
    title=title,
    X=X,
    Y=Y,
    width=w,
    height=h,
    text_size=txs,
    label_size=ls,
    title_size=ts,
)
chart_age

In [14]:
# Share of respondents per reported ethnicity.
source = count_freq_labels(df, X="ethnicity")
title = 'Ethnicity | n = ' + str(source['counts'].sum())
X = 'percent:Q'
Y = 'ethnicity:N'
w = 450
h = 150
txs = 12
ls = 11
ts = 12

chart_ethnicity = simple_per_bar(
    source,
    title=title,
    X=X,
    Y=Y,
    width=w,
    height=h,
    text_size=txs,
    label_size=ls,
    title_size=ts,
)
chart_ethnicity

In [15]:
# Share of respondents per level of formal education.
source = count_freq_labels(df, X="formal education")
title = 'Formal education | n = ' + str(source['counts'].sum())
X = 'percent:Q'
Y = 'formal education:N'
w = 450
h = 150
txs = 12
ls = 11
ts = 12

chart_formal_education = simple_per_bar(
    source,
    title=title,
    X=X,
    Y=Y,
    width=w,
    height=h,
    text_size=txs,
    label_size=ls,
    title_size=ts,
)
chart_formal_education

## Overall results

In [16]:
df_emo_answers = df.loc[:, 'Q2.1':'Q195.1'] # keep only the per-photo answer columns (label slice, both ends included)

In [17]:
source = count_freq_labels(df_emo_answers, X="all")

# Save the lower-cased emotion words to avoid repeating this computation later.
# 'other' is excluded. Filtering (instead of list.remove) does not raise when
# no respondent picked 'other'.
emotion_words_list = [
    word for word in source['emotion'].str.lower().tolist() if word != 'other'
]

title = 'Labels frequency | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 200
txs, ls, ts = 12, 11, 12

chart_overall_per = simple_per_bar(
    source, title=title, X=X, Y=Y, width=w, height=h, text_size = txs, label_size = ls, title_size=ts)
chart_overall_per

In [18]:
# Same tabulation as the percentage chart, plotted as raw counts.
source = count_freq_labels(df_emo_answers, X="all")
title = 'Labels frequency | n = ' + str(source['counts'].sum())
X = 'counts:Q'
Y = 'emotion:N'
w = 450
h = 200
txs = 12
ls = 11
ts = 12

chart_overall_count = simple_count_bar(
    source,
    title=title,
    X=X,
    Y=Y,
    width=w,
    height=h,
    text_size=txs,
    label_size=ls,
    title_size=ts,
)
chart_overall_count

## Most frequently used word for each emotion category (grouping pictures by expected-emotion)

In [19]:
def emotion_df_formated(df_emo_answers, emotion_label):
    """Return the answers for the photos whose expected label is ``emotion_label``.

    The answers frame is transposed so that every row is one photo (its
    original column name is kept in ``photo_id``), the columns of the
    module-level ``df_labels`` are appended side by side, and only the rows
    whose ``label`` equals ``emotion_label`` are returned.

    Rows are paired with ``df_labels`` by position after the index reset, so
    ``df_labels`` is assumed to list the photos in the same order as the
    answer columns -- TODO confirm.
    """
    per_photo = df_emo_answers.copy().T  # one row per photo
    per_photo['photo_id'] = per_photo.index
    per_photo = per_photo.reset_index(drop=True)

    with_metadata = pd.concat([per_photo, df_labels], axis=1)
    matches_label = with_metadata['label'] == emotion_label
    return with_metadata[matches_label]

In [20]:
emotion = 'anger'
df_formated = emotion_df_formated(df_emo_answers, emotion) # photos whose expected label is `emotion`
df_formated_ans = df_formated.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1) # answers only
source = count_freq_labels(df_formated_ans, X="all")
title = 'Expected label: '+ emotion + ' | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 200
txs, ls, ts = 12, 11, 12


# emotion=... highlights the bar of the expected label
chart_anger = simple_per_bar(
    source, title=title, X=X, Y=Y,
    width=w, height=h, emotion=emotion.capitalize(),
    text_size = txs, label_size = ls, title_size=ts)

### Disgust

In [21]:
emotion = 'disgust'
df_formated = emotion_df_formated(df_emo_answers, emotion) # photos whose expected label is `emotion`
df_formated_ans = df_formated.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1) # answers only
source = count_freq_labels(df_formated_ans, X="all")
title = 'Expected label: '+ emotion + ' | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 200
txs, ls, ts = 12, 11, 12


# emotion=... highlights the bar of the expected label
chart_disgust = simple_per_bar(
    source, title=title, X=X, Y=Y,
    width=w, height=h, emotion=emotion.capitalize(),
    text_size = txs, label_size = ls, title_size=ts)

### Fear

In [22]:
emotion = 'fear'
df_formated = emotion_df_formated(df_emo_answers, emotion) # photos whose expected label is `emotion`
df_formated_ans = df_formated.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1) # answers only
source = count_freq_labels(df_formated_ans, X="all")
title = 'Expected label: '+ emotion + ' | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 200
txs, ls, ts = 12, 11, 12


# emotion=... highlights the bar of the expected label
chart_fear = simple_per_bar(
    source, title=title, X=X, Y=Y,
    width=w, height=h, emotion=emotion.capitalize(),
    text_size = txs, label_size = ls, title_size=ts)

### Surprise

In [23]:
emotion = 'surprise'
df_formated = emotion_df_formated(df_emo_answers, emotion) # photos whose expected label is `emotion`
df_formated_ans = df_formated.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1) # answers only
source = count_freq_labels(df_formated_ans, X="all")
title = 'Expected label: '+ emotion + ' | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 200
txs, ls, ts = 12, 11, 12


# emotion=... highlights the bar of the expected label
chart_surprise = simple_per_bar(
    source, title=title, X=X, Y=Y,
    width=w, height=h, emotion=emotion.capitalize(),
    text_size = txs, label_size = ls, title_size=ts)

### Happiness

In [24]:
emotion = 'happiness'
df_formated = emotion_df_formated(df_emo_answers, emotion) # photos whose expected label is `emotion`
df_formated_ans = df_formated.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1) # answers only
source = count_freq_labels(df_formated_ans, X="all")
title = 'Expected label: '+ emotion + ' | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 200
txs, ls, ts = 12, 11, 12


# emotion=... highlights the bar of the expected label
chart_happiness = simple_per_bar(
    source, title=title, X=X, Y=Y,
    width=w, height=h, emotion=emotion.capitalize(),
    text_size = txs, label_size = ls, title_size=ts)

### Sadness

In [25]:
emotion = 'sadness'
df_formated = emotion_df_formated(df_emo_answers, emotion) # photos whose expected label is `emotion`
df_formated_ans = df_formated.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1) # answers only
source = count_freq_labels(df_formated_ans, X="all")
title = 'Expected label: '+ emotion + ' | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 200
txs, ls, ts = 12, 11, 12


# emotion=... highlights the bar of the expected label
chart_sadness = simple_per_bar(
    source, title=title, X=X, Y=Y,
    width=w, height=h, emotion=emotion.capitalize(),
    text_size = txs, label_size = ls, title_size=ts)

### Uncertain

In [26]:
emotion = 'uncertain'
df_formated = emotion_df_formated(df_emo_answers, emotion) # photos whose expected label is `emotion`
df_formated_ans = df_formated.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1) # answers only
source = count_freq_labels(df_formated_ans, X="all")
title = 'Expected label: '+ emotion + ' | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 200
txs, ls, ts = 12, 11, 12


# emotion=... highlights the bar of the expected label
chart_uncertain = simple_per_bar(
    source, title=title, X=X, Y=Y,
    width=w, height=h, emotion=emotion.capitalize(),
    text_size = txs, label_size = ls, title_size=ts)

### Neutral

In [27]:
emotion = 'neutral'
df_formated = emotion_df_formated(df_emo_answers, emotion) # photos whose expected label is `emotion`
df_formated_ans = df_formated.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1) # answers only
source = count_freq_labels(df_formated_ans, X="all")
title = 'Expected label: '+ emotion + ' | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 200
txs, ls, ts = 12, 11, 12


# emotion=... highlights the bar of the expected label
chart_neutral = simple_per_bar(
    source, title=title, X=X, Y=Y,
    width=w, height=h, emotion=emotion.capitalize(),
    text_size = txs, label_size = ls, title_size=ts)

## Most frequently used word for each emotion category (grouping pictures by expected-emotion), by ethnicity group

In [28]:
def emotion_df_formated_et(df_emo_answers, emotion_label, ethnicity):
    """Answers for photos with a given expected label and photo ethnicity.

    Builds on ``emotion_df_formated`` (one row per photo, metadata columns
    appended, rows filtered by ``label``) and additionally keeps only the
    rows whose ``ethnicity`` metadata column equals ``ethnicity``.

    Parameters
    ----------
    df_emo_answers : DataFrame of answers, one column per photo.
    emotion_label : value the ``label`` column must equal.
    ethnicity : value the ``ethnicity`` column must equal.

    Returns
    -------
    DataFrame with the matching rows (original row positions kept as index).
    """
    df_label = emotion_df_formated(df_emo_answers, emotion_label)
    return df_label[df_label['ethnicity'] == ethnicity]

In [29]:
def wrapper_chart_emotion(df_emo_answers, emotion, ethnicity):
    """Chart the label shares for one expected emotion and photo ethnicity.

    Selects the photos with ``emotion_df_formated_et``, drops the metadata
    columns, tabulates the remaining answers with ``count_freq_labels`` and
    passes the table to ``simple_per_bar`` with the expected label
    (capitalized) as the highlighted bar.

    Returns whatever ``simple_per_bar`` returns.
    """
    meta_cols = ['photo_id', 'ethnicity', 'sex','age', 'label', 'url']
    subset = emotion_df_formated_et(df_emo_answers, emotion, ethnicity)
    label_counts = count_freq_labels(subset.drop(meta_cols, axis=1))

    n_answers = label_counts['counts'].sum().astype(str)
    chart_title = 'Expected label: ' + emotion + ' | n = ' + n_answers
    return simple_per_bar(
        label_counts, title=chart_title, emotion=emotion.capitalize())

### Anger

In [30]:
# One chart per photo-ethnicity group, built in the order bipoc, white.
chart_anger_bipoc, chart_anger_white = (
    wrapper_chart_emotion(df_emo_answers, 'anger', group)
    for group in ('bipoc', 'white')
)

### Disgust

In [31]:
# One chart per photo-ethnicity group, built in the order bipoc, white.
chart_disgust_bipoc, chart_disgust_white = (
    wrapper_chart_emotion(df_emo_answers, 'disgust', group)
    for group in ('bipoc', 'white')
)

### Fear

In [32]:
# One chart per photo-ethnicity group, built in the order bipoc, white.
chart_fear_bipoc, chart_fear_white = (
    wrapper_chart_emotion(df_emo_answers, 'fear', group)
    for group in ('bipoc', 'white')
)

### Surprise

In [33]:
# One chart per photo-ethnicity group, built in the order bipoc, white.
chart_surprise_bipoc, chart_surprise_white = (
    wrapper_chart_emotion(df_emo_answers, 'surprise', group)
    for group in ('bipoc', 'white')
)

### Happiness

In [34]:
# One chart per photo-ethnicity group, built in the order bipoc, white.
chart_happiness_bipoc, chart_happiness_white = (
    wrapper_chart_emotion(df_emo_answers, 'happiness', group)
    for group in ('bipoc', 'white')
)

### Sadness

In [35]:
# One chart per photo-ethnicity group, built in the order bipoc, white.
chart_sadness_bipoc, chart_sadness_white = (
    wrapper_chart_emotion(df_emo_answers, 'sadness', group)
    for group in ('bipoc', 'white')
)

### Neutral

In [36]:
# One chart per photo-ethnicity group, built in the order bipoc, white.
chart_neutral_bipoc, chart_neutral_white = (
    wrapper_chart_emotion(df_emo_answers, 'neutral', group)
    for group in ('bipoc', 'white')
)

### Uncertain/Other

In [37]:
# One chart per photo-ethnicity group, built in the order bipoc, white.
chart_uncertain_bipoc, chart_uncertain_white = (
    wrapper_chart_emotion(df_emo_answers, 'uncertain', group)
    for group in ('bipoc', 'white')
)

## Emotion percentages as feature vectors

In [38]:
# Placeholder frame: one (still empty) column per emotion word.
df_emo_features = pd.DataFrame(columns = emotion_words_list)
# Add the photo ids (the answer column names) as the first column; they act
# as the key when the vectors are filled in.
df_emo_features.insert(loc=0, column='photo_id', value=df_emo_answers.columns)

In [39]:
# NOTE(review): same slice as the one taken under "Overall results"; this
# looks redundant unless `df` changed in between -- confirm before removing.
df_emo_answers = df.loc[:, 'Q2.1':'Q195.1'] # subset photos

In [40]:
def fill_in_emotion_vectors(df_emo_features, df_emo_answers, emotion_words_list):
    """Fill the per-photo feature table with the share of each chosen label.

    For every word in ``emotion_words_list`` the photos whose expected label
    equals that word are obtained from ``emotion_df_formated``. For each of
    those photos the answers are tabulated with ``count_freq_labels`` and the
    resulting percentages are written into that photo's row.

    Photos are processed one at a time, so a label may have any number of
    photos (the previous fixed split into 24 chunks required exactly 24).

    Parameters
    ----------
    df_emo_features : DataFrame with a 'photo_id' column plus one column per
        emotion word.
    df_emo_answers : DataFrame of answers, one column per photo.
    emotion_words_list : list of lower-case emotion words.

    Returns
    -------
    DataFrame
        A new frame with the filled vectors, 'photo_id' as the last column
        and a fresh integer index. Labels nobody chose for a photo stay
        missing. If ``emotion_words_list`` is empty the input is returned
        unchanged.
    """
    if not emotion_words_list:
        return df_emo_features  # nothing to fill

    meta_cols = ['photo_id', 'ethnicity', 'sex', 'age', 'label', 'url']
    # Index by photo id once, so rows can be addressed by id inside the loops.
    features = df_emo_features.set_index('photo_id')

    for emotion in emotion_words_list:
        df_ans = emotion_df_formated(df_emo_answers, emotion)  # answers for all photos of this label

        for photo_id_str in df_ans['photo_id'].tolist():
            df_single = df_ans[df_ans['photo_id'] == photo_id_str]  # row of this photo
            df_emotion_ans = df_single.drop(meta_cols, axis=1)  # answers only
            source = count_freq_labels(df_emotion_ans, X="all")  # compute vector values
            source = source[['percent', 'emotion']].T
            source.columns = source.iloc[1].str.lower()  # emotion words as columns
            source.drop('emotion', axis=0, inplace=True)  # drop the emotion row
            source.reset_index(drop=True, inplace=True)
            source.columns.name = None
            source.insert(0, 'photo_id', photo_id_str)  # photo id as column
            # Row assignment aligns on column names: entries with no matching
            # column (e.g. 'photo_id') are ignored, absent labels stay missing.
            features.loc[photo_id_str] = pd.Series(source.T.to_dict()[0])

    features['photo_id'] = features.index  # index back to a column (placed last)
    features.index.name = None
    features.reset_index(drop=True, inplace=True)
    return features
            


In [41]:
# Build the per-photo emotion vectors from the placeholder frame.
df_emo_vectors = fill_in_emotion_vectors(df_emo_features, df_emo_answers, emotion_words_list)

In [42]:
# Entries still missing (presumably labels nobody chose for that photo) count as 0.
df_emo_vectors = df_emo_vectors.fillna(0)

In [43]:
# NOTE: not idempotent -- re-running this cell alone appends the metadata columns again.
df_emo_vectors = pd.concat([df_emo_vectors, df_labels], axis=1) # concat metadata

In [44]:
# Show the final table: emotion shares, photo id and photo metadata per row.
df_emo_vectors

Unnamed: 0,happiness,neutral,surprise,sadness,disgust,anger,fear,uncertain,photo_id,ethnicity,sex,age,label,url
0,0.019608,0.372549,0.019608,0.000000,0.098039,0.274510,0.000000,0.137255,Q2.1,bipoc,female,adult,anger,https://uwmadison.co1.qualtrics.com/ControlPan...
1,0.000000,0.019608,0.000000,0.000000,0.098039,0.784314,0.039216,0.039216,Q3.1,bipoc,female,child,anger,https://uwmadison.co1.qualtrics.com/ControlPan...
2,0.000000,0.039216,0.000000,0.000000,0.000000,0.921569,0.019608,0.000000,Q4.1,bipoc,male,adult,anger,https://uwmadison.co1.qualtrics.com/ControlPan...
3,0.000000,0.019608,0.058824,0.333333,0.039216,0.156863,0.078431,0.176471,Q5.1,bipoc,male,child,anger,https://uwmadison.co1.qualtrics.com/ControlPan...
4,0.000000,0.098039,0.039216,0.000000,0.019608,0.686275,0.019608,0.039216,Q6.1,white,female,adult,anger,https://uwmadison.co1.qualtrics.com/ControlPan...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,0.019608,0.078431,0.058824,0.000000,0.137255,0.019608,0.000000,0.254902,Q191.1,bipoc,male,child,uncertain,https://uwmadison.co1.qualtrics.com/ControlPan...
190,0.039216,0.019608,0.313725,0.000000,0.176471,0.098039,0.000000,0.156863,Q192.1,white,female,adult,uncertain,https://uwmadison.co1.qualtrics.com/ControlPan...
191,0.039216,0.078431,0.039216,0.019608,0.039216,0.352941,0.000000,0.254902,Q193.1,white,female,child,uncertain,https://uwmadison.co1.qualtrics.com/ControlPan...
192,0.019608,0.000000,0.019608,0.000000,0.254902,0.000000,0.000000,0.176471,Q194.1,white,male,adult,uncertain,https://uwmadison.co1.qualtrics.com/ControlPan...
