In [1]:
import pandas as pd
import numpy as np
import altair as alt

## Data ingestion and formatting

In [2]:
# Load the raw survey export and the photo metadata table.
# df_labels supplies the 'label' and 'ethnicity' columns that the
# per-emotion helpers further down filter on.
df = pd.read_csv('../data/emotion_forced_choice_uw_students.csv')
df_labels = pd.read_csv('../data/emotion_labels.csv')

In [3]:
# Filter out test rows: the first five rows of the export are discarded.
# NOTE: this rebinds `df`, so re-running the cell without reloading the CSV
# drops five more rows each time.
df = df.iloc[5:]

In [4]:
# Sanity check: row/column counts after dropping the test rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 5 to 85
Columns: 220 entries, StartDate to Q195.1
dtypes: object(220)
memory usage: 139.3+ KB


In [5]:
# Filter out incomplete surveys. 'Finished' is compared against the string
# 'True' because every column is read as object dtype (see df.info() output).
df = df.loc[df['Finished'].eq('True')]

In [6]:
# Sanity check: row count after keeping only completed surveys.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 5 to 85
Columns: 220 entries, StartDate to Q195.1
dtypes: object(220)
memory usage: 88.1+ KB


In [37]:
# Persist the filtered responses; index=False leaves the row index out of the file.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours, so it was last run after later cells — confirm it still works on
# Restart & Run All, and that the ../clean_data directory exists beforehand.
df.to_csv('../clean_data/forced_choice_emotion_uw_students.csv', index=False)

## Analysis plan:

### Exploratory analysis without grouping:

- Number of unique words used to describe facial expressions in the dataset
- Words by frequency in the dataset (ranking)
- Most frequently used word for each picture
- Most frequently used word for each emotion category (grouping pictures by expected-emotion)

### Exploratory analysis grouping with synonyms:

- Group words with their synonyms:
    - PyDictionary method
    - WordNet (NLTK) method
    - spaCy method
    - Word2Vec method
- Number of group-words after merging by synonyms
- Group-words by frequency in the dataset (ranking)
- Most frequently used group-word for each picture
- Most frequently used group-word for each emotion category

Libraries: https://github.com/johnbumgarner/synonyms_discovery_aggregation

### Comparison with expected word categories: 
- Comparing expected-word and given-word for each picture
- Comparing expected-word and given-word for each emotion category (grouping pictures by expected-emotion)

### Words by frequency for all images (ranking)

In [8]:
def count_freq_labels(df):
    """Tabulate how often each value occurs across all cells of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cell values are the labels to count. Missing values are
        not counted.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value, in ``value_counts`` order, with a fresh
        0..n-1 index and the columns ``counts`` (occurrences), ``emotion``
        (the value itself) and ``percent`` (``counts`` divided by the total
        of ``counts``).
    """
    # Flatten every cell into one long series, then count distinct values.
    tally = df.stack().reset_index(drop=True).value_counts()
    freq_table = (
        tally.to_frame('counts')
        .assign(emotion=tally.index)  # expose the counted value as a column
        .reset_index(drop=True)
    )
    freq_table['percent'] = freq_table['counts'] / freq_table['counts'].sum()
    return freq_table

In [9]:
def simple_per_bar(df, title='Title'):
    """Build a horizontal bar chart of the ``percent`` column per ``emotion``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``percent`` and ``emotion`` columns (as produced by
        ``count_freq_labels``).
    title : str
        Chart title.

    Returns
    -------
    Layered Altair chart: the bars plus a text layer showing each bar's
    percentage with one decimal place.
    """
    bars = alt.Chart(df, title=title).mark_bar().encode(
        x=alt.X('percent:Q', axis=alt.Axis(format='.0%')),
        y=alt.Y('emotion:N', sort='-x'),  # order bars by descending x value
    )

    # dx nudges the text to the right so it doesn't sit on top of the bar.
    value_labels = bars.mark_text(align='left', baseline='middle', dx=3).encode(
        text=alt.Text('percent:Q', format='.1%')
    )

    return bars + value_labels

In [10]:
def simple_count_bar(df, title='Title'):
    """Build a horizontal bar chart of the ``counts`` column per ``emotion``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``counts`` and ``emotion`` columns (as produced by
        ``count_freq_labels``).
    title : str
        Chart title.

    Returns
    -------
    Layered Altair chart: the bars plus a text layer showing each bar's count.
    """
    bars = alt.Chart(df, title=title).mark_bar().encode(
        x=alt.X('counts:Q'),
        y=alt.Y('emotion:N', sort='-x'),  # order bars by descending x value
    )

    # dx nudges the text to the right so it doesn't sit on top of the bar.
    value_labels = bars.mark_text(align='left', baseline='middle', dx=3).encode(
        text=alt.Text('counts:Q')
    )

    return bars + value_labels

In [11]:
# Subset the photo answer columns. Label-based slicing with .loc includes
# both endpoints, so 'Q2.1' and 'Q195.1' are part of the result.
df_emo_answers = df.loc[:, 'Q2.1':'Q195.1'] # subset photos

In [12]:
df_emo_overall = count_freq_labels(df_emo_answers) # label frequencies pooled over all photos and respondents
simple_per_bar(df_emo_overall, title='Most frequently selected labels')

In [13]:
# Same ranking as the previous chart, shown as raw counts instead of shares.
simple_count_bar(df_emo_overall, title='Most frequently selected labels')

## Most frequently used word for each emotion category (grouping pictures by expected-emotion)

In [14]:
def emotion_df_formated(df_emo_answers, emotion_label, labels=None):
    """Return the per-photo answer rows whose expected label is ``emotion_label``.

    The answers frame is transposed so that each row is a photo and each
    column a respondent; a ``photo_id`` column and the metadata columns are
    then appended on the right.

    Parameters
    ----------
    df_emo_answers : pandas.DataFrame
        Answers with one column per photo. Not modified.
    emotion_label : str
        Value to match in the metadata ``label`` column.
    labels : pandas.DataFrame, optional
        Photo metadata containing a ``label`` column. Defaults to the
        notebook-level ``df_labels`` so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Matching rows; columns are the respondent columns, then ``photo_id``,
        then the metadata columns.
    """
    if labels is None:
        labels = df_labels  # fall back to the notebook-level metadata table

    photos = (
        df_emo_answers.T  # rows become photos, columns become respondents
        .assign(photo_id=lambda frame: frame.index)  # keep the photo column name
        .reset_index(drop=True)
    )
    # Metadata is attached by row position (both frames carry a 0..n-1 index).
    # NOTE(review): assumes `labels` lists photos in the same order as the
    # answer columns — confirm against the metadata file.
    merged = pd.concat([photos, labels], axis=1)
    return merged[merged['label'] == emotion_label]

### Anger 

In [15]:
df_anger = emotion_df_formated(df_emo_answers, 'anger') # subset 'anger' rows
# After the transpose there is one column per respondent, named by the
# respondent's row label; select all of them instead of only the first four.
df_anger_ans = df_anger.loc[:, df_emo_answers.index] # extract answers only
df_anger_overall = count_freq_labels(df_anger_ans) # count label freq
chart_anger = simple_per_bar(df_anger_overall, title='Expected label: Anger')

### Disgust

In [16]:
df_disgust = emotion_df_formated(df_emo_answers, 'disgust') # subset 'disgust' rows
# After the transpose there is one column per respondent, named by the
# respondent's row label; select all of them instead of only the first four.
df_disgust_ans = df_disgust.loc[:, df_emo_answers.index] # extract answers only
df_disgust_overall = count_freq_labels(df_disgust_ans) # count label freq
chart_disgust = simple_per_bar(df_disgust_overall, title='Expected label: Disgust')

### Fear

In [17]:
df_fear = emotion_df_formated(df_emo_answers, 'fear') # subset 'fear' rows
# After the transpose there is one column per respondent, named by the
# respondent's row label; select all of them instead of only the first four.
df_fear_ans = df_fear.loc[:, df_emo_answers.index] # extract answers only
df_fear_overall = count_freq_labels(df_fear_ans) # count label freq
chart_fear = simple_per_bar(df_fear_overall, title='Expected label: Fear')

### Surprise

In [18]:
df_surprise = emotion_df_formated(df_emo_answers, 'surprise') # subset 'surprise' rows
# After the transpose there is one column per respondent, named by the
# respondent's row label; select all of them instead of only the first four.
df_surprise_ans = df_surprise.loc[:, df_emo_answers.index] # extract answers only
df_surprise_overall = count_freq_labels(df_surprise_ans) # count label freq
chart_surprise = simple_per_bar(df_surprise_overall, title='Expected label: Surprise')

### Happiness

In [19]:
df_happiness = emotion_df_formated(df_emo_answers, 'happiness') # subset 'happiness' rows
# After the transpose there is one column per respondent, named by the
# respondent's row label; select all of them instead of only the first four.
df_happiness_ans = df_happiness.loc[:, df_emo_answers.index] # extract answers only
df_happiness_overall = count_freq_labels(df_happiness_ans) # count label freq
chart_happiness = simple_per_bar(df_happiness_overall, title='Expected label: Happiness')

### Sadness

In [20]:
df_sadness = emotion_df_formated(df_emo_answers, 'sadness') # subset 'sadness' rows
# After the transpose there is one column per respondent, named by the
# respondent's row label; select all of them instead of only the first four.
df_sadness_ans = df_sadness.loc[:, df_emo_answers.index] # extract answers only
df_sadness_overall = count_freq_labels(df_sadness_ans) # count label freq
chart_sadness = simple_per_bar(df_sadness_overall, title='Expected label: Sadness')

### Uncertain

In [21]:
# This cell previously queried 'sadness', so the "Uncertain/Other" chart was a
# copy of the sadness chart and the df_sadness* variables were overwritten.
df_uncertain = emotion_df_formated(df_emo_answers, 'uncertain') # subset 'uncertain' rows
# After the transpose there is one column per respondent, named by the
# respondent's row label; select all of them instead of only the first four.
df_uncertain_ans = df_uncertain.loc[:, df_emo_answers.index] # extract answers only
df_uncertain_overall = count_freq_labels(df_uncertain_ans) # count label freq
chart_uncertain = simple_per_bar(df_uncertain_overall, title='Expected label: Uncertain/Other')

### Neutral

In [22]:
df_neutral = emotion_df_formated(df_emo_answers, 'neutral') # subset 'neutral' rows
# After the transpose there is one column per respondent, named by the
# respondent's row label; select all of them instead of only the first four.
df_neutral_ans = df_neutral.loc[:, df_emo_answers.index] # extract answers only
df_neutral_overall = count_freq_labels(df_neutral_ans) # count label freq
chart_neutral = simple_per_bar(df_neutral_overall, title='Expected label: Neutral')

In [23]:
# Lay the per-emotion charts out as a grid: `|` places charts side by side,
# `&` stacks the resulting rows vertically.
(
    (chart_anger | chart_disgust)
    & (chart_fear | chart_surprise)
    & (chart_happiness | chart_sadness)
    & (chart_neutral | chart_uncertain)
)

## Most frequently used word for each emotion category (grouping pictures by expected-emotion), by ethnicity group

In [24]:
def emotion_df_formated_2(df_emo_answers, emotion_label, ethnicity, labels=None):
    """Return the per-photo answer rows matching both an expected label and an ethnicity.

    The answers frame is transposed so that each row is a photo and each
    column a respondent; a ``photo_id`` column and the metadata columns are
    then appended on the right.

    Parameters
    ----------
    df_emo_answers : pandas.DataFrame
        Answers with one column per photo. Not modified.
    emotion_label : str
        Value to match in the metadata ``label`` column.
    ethnicity : str
        Value to match in the metadata ``ethnicity`` column.
    labels : pandas.DataFrame, optional
        Photo metadata containing ``label`` and ``ethnicity`` columns.
        Defaults to the notebook-level ``df_labels`` so existing calls keep
        working.

    Returns
    -------
    pandas.DataFrame
        Matching rows; columns are the respondent columns, then ``photo_id``,
        then the metadata columns.
    """
    if labels is None:
        labels = df_labels  # fall back to the notebook-level metadata table

    photos = (
        df_emo_answers.T  # rows become photos, columns become respondents
        .assign(photo_id=lambda frame: frame.index)  # keep the photo column name
        .reset_index(drop=True)
    )
    # Metadata is attached by row position (both frames carry a 0..n-1 index).
    # NOTE(review): assumes `labels` lists photos in the same order as the
    # answer columns — confirm against the metadata file.
    merged = pd.concat([photos, labels], axis=1)
    keep = (merged['label'] == emotion_label) & (merged['ethnicity'] == ethnicity)
    return merged[keep]

In [25]:
def wrapper_chart_emotion(df_emo_answers, emotion, ethnicity):
    """Chart the label shares for photos of one expected emotion and one ethnicity group.

    Parameters
    ----------
    df_emo_answers : pandas.DataFrame
        Answers with one column per photo.
    emotion : str
        Expected-emotion label to filter on; also used in the chart title.
    ethnicity : str
        Ethnicity group to filter on; also used in the chart title.

    Returns
    -------
    The chart produced by ``simple_per_bar`` for the selected photos.
    """
    subset = emotion_df_formated_2(df_emo_answers, emotion, ethnicity)
    # The transposed frame has one column per respondent, named by the
    # respondent's row label. Select all of them by label; the previous
    # positional slice (0:4) kept only the first four respondents.
    answers = subset.loc[:, df_emo_answers.index]
    freq = count_freq_labels(answers)  # count label freq
    return simple_per_bar(freq, title='Expected label: '+ emotion +' - ' + ethnicity)

### Anger

In [26]:
# Anger photos, BIPOC group.
chart_anger_bipoc = wrapper_chart_emotion(df_emo_answers, emotion='anger', ethnicity='bipoc')

In [27]:
# Anger photos, white group.
chart_anger_white = wrapper_chart_emotion(df_emo_answers, emotion='anger', ethnicity='white')

### Disgust

In [28]:
# One chart per ethnicity group for the 'disgust' photos.
chart_disgust_bipoc = wrapper_chart_emotion(df_emo_answers, emotion='disgust', ethnicity='bipoc')
chart_disgust_white = wrapper_chart_emotion(df_emo_answers, emotion='disgust', ethnicity='white')

### Fear

In [29]:
# One chart per ethnicity group for the 'fear' photos.
chart_fear_bipoc = wrapper_chart_emotion(df_emo_answers, emotion='fear', ethnicity='bipoc')
chart_fear_white = wrapper_chart_emotion(df_emo_answers, emotion='fear', ethnicity='white')

### Surprise

In [30]:
# One chart per ethnicity group for the 'surprise' photos.
chart_surprise_bipoc = wrapper_chart_emotion(df_emo_answers, emotion='surprise', ethnicity='bipoc')
chart_surprise_white = wrapper_chart_emotion(df_emo_answers, emotion='surprise', ethnicity='white')

### Happiness

In [31]:
# One chart per ethnicity group for the 'happiness' photos.
chart_happiness_bipoc = wrapper_chart_emotion(df_emo_answers, emotion='happiness', ethnicity='bipoc')
chart_happiness_white = wrapper_chart_emotion(df_emo_answers, emotion='happiness', ethnicity='white')

### Sadness

In [32]:
# One chart per ethnicity group for the 'sadness' photos.
chart_sadness_bipoc = wrapper_chart_emotion(df_emo_answers, emotion='sadness', ethnicity='bipoc')
chart_sadness_white = wrapper_chart_emotion(df_emo_answers, emotion='sadness', ethnicity='white')

### Neutral

In [33]:
# One chart per ethnicity group for the 'neutral' photos.
chart_neutral_bipoc = wrapper_chart_emotion(df_emo_answers, emotion='neutral', ethnicity='bipoc')
chart_neutral_white = wrapper_chart_emotion(df_emo_answers, emotion='neutral', ethnicity='white')

### Uncertain/Other

In [34]:
# One chart per ethnicity group for the 'uncertain' photos.
chart_uncertain_bipoc = wrapper_chart_emotion(df_emo_answers, emotion='uncertain', ethnicity='bipoc')
chart_uncertain_white = wrapper_chart_emotion(df_emo_answers, emotion='uncertain', ethnicity='white')

In [35]:
# One row per expected emotion (BIPOC on the left, white on the right):
# `|` places charts side by side, `&` stacks the rows vertically.
(
    (chart_anger_bipoc | chart_anger_white)
    & (chart_disgust_bipoc | chart_disgust_white)
    & (chart_fear_bipoc | chart_fear_white)
    & (chart_surprise_bipoc | chart_surprise_white)
    & (chart_happiness_bipoc | chart_happiness_white)
    & (chart_sadness_bipoc | chart_sadness_white)
    & (chart_neutral_bipoc | chart_neutral_white)
    & (chart_uncertain_bipoc | chart_uncertain_white)
)