In [1]:
import pandas as pd
import numpy as np
import altair as alt

## Data ingestion

In [2]:
# Load the survey responses and the table of emotion labels (paths are relative to this notebook).
df = pd.read_csv('../data/emotion_free_choice_uw_students.csv')
df_label = pd.read_csv('../data/emotion_labels.csv')

In [3]:
# Force the url column to plain strings.
df_label['url'] = df_label['url'].astype(str)

In [4]:
# Replicate every label row 4 times (consecutively) to match with the free responses.
# Rows are repeated by position rather than by rebuilding the frame from `.values`,
# so the column names and each column's dtype are kept instead of everything
# being coerced to object.
df_labels = df_label.iloc[np.arange(len(df_label)).repeat(4)].reset_index(drop=True)

In [5]:
# Save the replicated label table (the row index is not written).
df_labels.to_csv('../data/emotion_labels_free_choice.csv', index=False)

In [6]:
# Overview of the survey frame before any filtering.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Columns: 802 entries, StartDate to Q195.1_4
dtypes: object(802)
memory usage: 633.0+ KB


In [7]:
# Filter out test rows (row numbers 0-18). The slice is label-based on the
# original 0..n-1 row index, so re-running this cell is harmless; a positional
# `iloc[19:]` would drop a further 19 rows on every re-run.
df = df.loc[19:]

In [8]:
# Check the frame size after removing the test rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82 entries, 19 to 100
Columns: 802 entries, StartDate to Q195.1_4
dtypes: object(802)
memory usage: 513.9+ KB


In [9]:
# Keep completed surveys only; 'Finished' is compared against the string 'True'.
df = df.loc[df['Finished'].eq('True')]

In [10]:
# Check the frame size after keeping completed surveys only.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 19 to 95
Columns: 802 entries, StartDate to Q195.1_4
dtypes: object(802)
memory usage: 319.9+ KB


In [11]:
# Save the filtered survey responses (the row index is not written).
df.to_csv('../clean_data/free_choice_emotion_uw_students.csv', index=False)

## Formatting

In [12]:
# Label-based column slice; both end columns are included.
df_emo_answers = df.loc[:, 'Q2.1_1':'Q195.1_4'] # get cols with words only

In [13]:
def formating_words(df, len_words=1, len_letters=2):
    """Flatten a frame of free-text answers into a tidy table of normalized words.

    Every cell of ``df`` is stacked into a single ``emotion`` column, stripped
    of surrounding whitespace, lower-cased, and the answer ``'na'`` is recoded
    as ``'none'``. Cells that are not strings cannot be normalized and are
    discarded instead of raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells hold the raw answers.
    len_words : int, default 1
        Keep only answers made of exactly this many whitespace-separated words.
    len_letters : int, default 2
        Keep only answers with strictly more characters than this.

    Returns
    -------
    pandas.DataFrame
        Columns ``emotion``, ``len_words`` and ``len_letters``. The index holds
        each answer's position in the stacked series (so it has gaps), and the
        frame is an independent copy that callers can safely add columns to.
    """
    emotion = df.stack().reset_index(drop=True)  # stack as series
    # strip blanks, lower-case, then recode 'na'; non-string cells become NaN here
    emotion = emotion.str.strip().str.lower().replace({'na': 'none'})
    emotion = emotion.dropna()  # drop what could not be normalized

    df_stack = emotion.to_frame(name='emotion')
    df_stack['len_words'] = df_stack['emotion'].str.split().str.len()  # number of words
    df_stack['len_letters'] = df_stack['emotion'].str.len()  # number of characters

    # with the defaults: single words of 3 or more letters
    keep = (df_stack['len_words'] == len_words) & (df_stack['len_letters'] > len_letters)
    return df_stack[keep].copy()

In [14]:
# Normalized single-word answers pooled across all answer columns.
df_stack_single_word = formating_words(df_emo_answers)

## Spell checking
**NOTE**: spell checking has given poor results so far, so the cells below are left commented out.

In [15]:
# conda install -c conda-forge pattern 
# from pattern.en import suggest

In [16]:
# df_stack_single_word['emotion_spell_check'] = df_stack_single_word['emotion'].apply(lambda x: suggest(x)[0][0])
# df_stack_single_word['emotion'].size
# df_stack_single_word['emotion'].size - sum(df_stack_single_word['emotion'] == df_stack_single_word['emotion_spell_check']) # number of words changed

## Stemming

In [17]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer

In [18]:
# One instance per stemming algorithm; each is applied to the same words below.
ps = PorterStemmer()
ls = LancasterStemmer()
snowball = SnowballStemmer(language='english')

In [19]:
# Porter stemmer: stem every word and store the result in its own column
df_stack_single_word['emotion_ps_steamed'] = df_stack_single_word['emotion'].map(ps.stem)

In [None]:
# Lancaster stemmer: stem every word and store the result in its own column
df_stack_single_word['emotion_ls_steamed'] = df_stack_single_word['emotion'].map(ls.stem)

In [21]:
# Snowball stemmer: stem every word and store the result in its own column
df_stack_single_word['emotion_sb_steamed'] = df_stack_single_word['emotion'].map(snowball.stem)

In [22]:
# Compare the three stemmers side by side on the first rows.
df_stack_single_word.head(3)

Unnamed: 0,emotion,len_words,len_letters,emotion_ps_steamed,emotion_ls_steamed,emotion_sb_steamed
0,angry,1,5,angri,angry,angri
1,yelling,1,7,yell,yel,yell
2,yelling,1,7,yell,yel,yell


## Count frequency and plot

In [23]:
def count_freq_labels(df, col):
    """Build a frequency table for the values of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to count.
    col : str
        Name of the column whose values are counted.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value, most frequent first, on a fresh 0..n-1
        index, with columns ``counts`` (occurrences), ``emotion`` (the value)
        and ``percent`` (share of all counted values, as a fraction of 1).
    """
    freq = (
        df[col]
        .value_counts()               # occurrences per distinct value
        .rename_axis('emotion')       # the counted values become the 'emotion' column
        .reset_index(name='counts')
    )
    freq['percent'] = freq['counts'] / freq['counts'].sum()
    return freq.reindex(columns=['counts', 'emotion', 'percent'])

In [24]:
def simple_per_bar(df, X, Y, top, width=300, height=300, title='Title'):
    """Horizontal bar chart of the first ``top`` rows, with percent-formatted values.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to plot; only ``df.head(top)`` is charted.
    X : str
        Altair shorthand for the x channel (e.g. ``'percent:Q'``); the axis is
        formatted as whole percents and the bar labels with two decimals.
    Y : str
        Altair shorthand for the y channel; bars are sorted by descending x.
    top : int
        Number of leading rows to chart.
    width, height : int
        Chart size in pixels.
    title : str
        Chart title.

    Returns
    -------
    A layered Altair chart (bars plus value labels).
    """
    top_rows = df.head(top)

    bars = alt.Chart(top_rows, title=title).mark_bar().encode(
        x=alt.X(X, axis=alt.Axis(format='.0%')),
        y=alt.Y(Y, sort='-x'),
    )

    # dx nudges the labels to the right so they do not sit on top of the bars
    labels = bars.mark_text(align='left', baseline='middle', dx=3, fontSize=10).encode(
        text=alt.Text(X, format='.2%')
    )

    layered = bars + labels
    return layered.properties(width=width, height=height)

In [25]:
def simple_count_bar(df, X, Y, top, width=300, height=300, title='Title'):
    """Horizontal bar chart of the first ``top`` rows, labelled with the raw values.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to plot; only ``df.head(top)`` is charted.
    X : str
        Altair shorthand for the x channel (e.g. ``'counts:Q'``). The same
        field is used for the text label next to each bar.
    Y : str
        Altair shorthand for the y channel; bars are sorted by descending x.
    top : int
        Number of leading rows to chart.
    width, height : int
        Chart size in pixels.
    title : str
        Chart title.

    Returns
    -------
    A layered Altair chart (bars plus value labels).
    """
    bars = alt.Chart(df.head(top), title=title).mark_bar().encode(
        alt.X(X),
        y=alt.Y(Y, sort='-x'))

    text = bars.mark_text(
        align='left',
        baseline='middle',
        dx=3,  # Nudges text to right so it doesn't appear on top of the bar
        fontSize=10
    ).encode(
        # Label with the plotted field; a hard-coded 'counts:Q' would show the
        # wrong numbers whenever X is a different field.
        alt.Text(X)
    )

    chart = (bars + text)

    return chart.properties(width=width, height=height)

In [26]:
# Frequency table over the Porter-stemmed answers, all photos pooled.
df_emo_overall = count_freq_labels(df_stack_single_word, 'emotion_ps_steamed')


In [27]:
# Save the overall frequency table (the row index is not written).
df_emo_overall.to_csv('../clean_data/free_choice_emotion_uw_students_overall.csv', index=False)

In [28]:
# Rebuild the overall frequency table, then chart the 50 leading stems as percentages.
df_emo_overall = count_freq_labels(df_stack_single_word, 'emotion_ps_steamed')
simple_per_bar(
    df_emo_overall,
    'percent:Q',
    'emotion:N',
    top=50,
    width=300,
    height=500,
    title='Most frequently selected labels | n = ' + str(df_emo_overall['counts'].sum()),
)

In [29]:
# Same table as raw counts, limited to the 20 leading stems.
simple_count_bar(
    df_emo_overall,
    'counts:Q',
    'emotion:N',
    top=20,
    title='Most frequently selected labels | n = ' + str(df_emo_overall['counts'].sum()),
)

## Most frequently used word for each emotion category (grouping pictures by expected-emotion)

In [30]:
def df_add_label(df_emo_answers, emotion_label, labels=None):
    """Attach the label columns to the transposed answers and keep one category.

    The answers frame is transposed so that each original column becomes a row
    (its column name is stored in ``photo_id``). The label table is then placed
    next to it row by row, and only rows whose ``label`` equals
    ``emotion_label`` are returned.

    Parameters
    ----------
    df_emo_answers : pandas.DataFrame
        Answers, one column per answer slot. It is not modified.
    emotion_label : str
        Value of the ``label`` column to keep.
    labels : pandas.DataFrame, optional
        Label table with a ``label`` column, one row per column of
        ``df_emo_answers`` and in the same order. When omitted, the
        notebook-level ``df_labels`` is used.

    Returns
    -------
    pandas.DataFrame
        The matching rows: answer columns, ``photo_id`` and the label columns.

    Notes
    -----
    Rows are paired purely by position. If ``labels`` has a different number of
    rows than ``df_emo_answers`` has columns, the unmatched rows are padded
    with NaN rather than raising.
    """
    if labels is None:
        labels = df_labels  # label table defined earlier in the notebook

    answers_by_row = (
        df_emo_answers.T                        # one row per answer column
        .assign(photo_id=lambda t: t.index)     # keep the column name as data
        .reset_index(drop=True)
    )
    # Reset the label index too, so the pairing is by position even when the
    # label table does not carry a default 0..n-1 index.
    merged = pd.concat([answers_by_row, labels.reset_index(drop=True)], axis=1)

    return merged[merged['label'] == emotion_label]

In [31]:
    # Step-by-step example for one category ('anger'), following chart_wrapper.
    # The answers-only frame is used here: the full survey frame `df` also holds
    # non-answer columns, so its transposed rows would not line up with the labels.
    df_emotion = df_add_label(df_emo_answers, 'anger') # subset 'emotion' rows
    # 'url' is dropped with the other label columns so URLs are not counted as words
    df_emotion_ans = df_emotion.drop(['photo_id', 'ethnicity', 'sex', 'age', 'label', 'url'], axis=1)
    df_stack_emotion = formating_words(df_emotion_ans) # clean up
    df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) # stem
    df_stack_emotion_count = count_freq_labels(df_stack_emotion, 'emotion_ps_steamed')

In [32]:
def chart_wrapper(df, emotion, title):
    """Chart the most frequent stemmed answers for one expected-emotion category.

    Parameters
    ----------
    df : pandas.DataFrame
        Answers frame, passed on to ``df_add_label``.
    emotion : str
        Expected-emotion label whose rows are kept.
    title : str
        Chart title; the number of counted answers is appended as ``" | n= <n>"``.

    Returns
    -------
    The percentage bar chart built by ``simple_per_bar`` for the 20 leading stems.
    """
    df_emotion = df_add_label(df, emotion)  # subset 'emotion' rows

    # Drop every non-answer column before stacking. 'url' must go too:
    # otherwise each URL string is stacked, passes the single-word filter and
    # is counted as an answer, inflating n and shrinking every percentage.
    non_answer_cols = ['photo_id', 'ethnicity', 'sex', 'age', 'label', 'url']
    df_emotion_ans = df_emotion.drop(non_answer_cols, axis=1)

    df_stack_emotion = formating_words(df_emotion_ans)  # clean up
    df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x))  # stem
    df_stack_emotion_count = count_freq_labels(df_stack_emotion, 'emotion_ps_steamed')

    n = df_stack_emotion_count['counts'].sum()
    chart = simple_per_bar(df_stack_emotion_count, 'percent:Q', 'emotion:N', top=20, title=title+" | n= "+str(n))

    return chart

### Charts

In [33]:
# One chart per expected-emotion category; names and labels are paired by position.
(
    chart_anger,
    chart_disgust,
    chart_fear,
    chart_surprise,
    chart_happiness,
    chart_sadness,
    chart_uncertain,
    chart_neutral,
) = (
    chart_wrapper(df_emo_answers, label, 'Expected label: ' + ' ' + label)
    for label in (
        'anger', 'disgust', 'fear', 'surprise',
        'happiness', 'sadness', 'uncertain', 'neutral',
    )
)

In [34]:
# Two charts per row (`|` puts charts side by side), rows stacked vertically (`&`).
(chart_anger | chart_disgust) & (chart_fear | chart_surprise) & (chart_happiness | chart_sadness) & (chart_neutral | chart_uncertain) 

### Most frequently used labels by photo

In [35]:
def photo_chart(df_emo_answers, emotion,  i, n_photos=24):
    """Chart the most frequent stemmed answers for one photo, with the photo overlaid.

    Parameters
    ----------
    df_emo_answers : pandas.DataFrame
        Answers frame, passed on to ``df_add_label``.
    emotion : str
        Expected-emotion label whose rows are kept.
    i : int
        Position of the photo's chunk within the category (0-based).
    n_photos : int, default 24
        Number of consecutive chunks the category's ``photo_id`` list is split
        into; each chunk is treated as one photo.

    Returns
    -------
    The percentage bar chart for the chunk's 20 leading stems, layered with an
    image mark whose URL comes from the chunk's first row.
    """
    df = df_add_label(df_emo_answers, emotion) # add emotion label
    photo_id_list = df['photo_id'].tolist() # photo id to list
    splited_photo_id = np.array_split(photo_id_list, n_photos) # split photo id by photo
    df = df[df['photo_id'].isin(splited_photo_id[i])] # select photo rows
    df_url = df[['url']].head(1)  # get url for chart

    face = alt.Chart(df_url).mark_image(width=110, height=110, align='right', xOffset=0, yOffset=230).encode(url='url')

    df_emotion_ans = df.drop(['photo_id', 'ethnicity', 'sex', 'age', 'label', 'url'], axis=1) # clean cols
    df_stack_emotion = formating_words(df_emotion_ans) # clean up words
    df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) # stem
    df_stack_emotion_count = count_freq_labels(df_stack_emotion, 'emotion_ps_steamed') # group and count

    n = df_stack_emotion_count['counts'].sum()
    chart = simple_per_bar(df_stack_emotion_count, 'percent:Q', 'emotion:N', width=300, height=300, top=20, title="Expected: "+" "+emotion+" | n = "+str(n))

    return chart + face

In [36]:
# Example: the chart for chunk position 7 within the 'anger' category.
photo_chart(df_emo_answers, 'anger',  7)

In [37]:
def charts_emotion_faces(df_emo_answers, emotion):
    """Return the list of per-photo charts for one category.

    Calls ``photo_chart`` for positions 0 through 23, in order, and collects
    the 24 resulting charts in a list.
    """
    return [photo_chart(df_emo_answers, emotion, i) for i in range(24)]

In [38]:
def dashboard_emotion_faces(charts, rows, cols=3):
    """Arrange charts in a grid, filling it row by row.

    The first ``rows * cols`` items of ``charts`` are used. Items in a row are
    joined left to right with ``|`` and the rows are then joined top to bottom
    with ``&``, exactly as in ``(c0 | c1 | c2) & (c3 | c4 | c5) & ...``.

    Parameters
    ----------
    charts : sequence
        Charts (or any objects supporting ``|`` and ``&``), in display order.
    rows : int
        Number of rows in the grid; any value of at least 1 is supported.
    cols : int, default 3
        Number of charts per row.

    Returns
    -------
    The combined chart.

    Raises
    ------
    ValueError
        If ``rows`` or ``cols`` is smaller than 1.
    IndexError
        If ``charts`` holds fewer than ``rows * cols`` items.
    """
    if rows < 1 or cols < 1:
        raise ValueError("rows and cols must both be at least 1")
    needed = rows * cols
    if len(charts) < needed:
        raise IndexError(
            "need %d charts for a %d x %d grid, got %d" % (needed, rows, cols, len(charts))
        )

    dashboard = None
    for r in range(rows):
        start = r * cols
        row_chart = charts[start]
        for chart in charts[start + 1:start + cols]:
            row_chart = row_chart | chart  # side by side
        # stack this row under the rows built so far
        dashboard = row_chart if dashboard is None else dashboard & row_chart

    return dashboard

In [39]:
# Build the 24 per-photo charts for 'sadness' and lay them out in 8 rows of 3.
charts = charts_emotion_faces(df_emo_answers, 'sadness')
dashboard_emotion_faces(charts, 8)