In [198]:
import pandas as pd
import numpy as np
import altair as alt
import altair_saver

## Data ingestion

In [199]:
df = pd.read_csv('../data/emotion_free_choice_uw_students.csv')
df_label = pd.read_csv('../data/emotion_labels.csv')

In [200]:
df_label['url'] = df_label['url'].astype(str)

In [201]:
# replicate rows by 4 to match with free responses
df_labels = pd.DataFrame(np.repeat(df_label.values, 4, axis=0))
df_labels.columns = df_label.columns

In [202]:
df_labels.to_csv('../data/emotion_labels_free_choice.csv', index=False)

In [203]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Columns: 802 entries, StartDate to Q195.1_4
dtypes: object(802)
memory usage: 633.0+ KB


In [204]:
df = df.iloc[19:, :]  # filter out test rows

In [205]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82 entries, 19 to 100
Columns: 802 entries, StartDate to Q195.1_4
dtypes: object(802)
memory usage: 513.9+ KB


In [206]:
df = df[df['Finished'] =='True'] # filter out incomplete surveys

In [207]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 19 to 95
Columns: 802 entries, StartDate to Q195.1_4
dtypes: object(802)
memory usage: 319.9+ KB


In [208]:
df.rename(columns={
    'Q1.2': 'sex',
    'Q1.3_1': 'age',
    'Q1.4': 'ethnicity',
    'Q1.5': 'formal education',
    'Q1.6_1': 'income'}, inplace=True)

In [209]:
df.to_csv('../clean_data/free_choice_emotion_uw_students.csv', index=False)

In [210]:
# placeholders to save svg strings
svg_str = []
image_title = []

## Demographics

In [211]:
def count_freq_labels(df, X="all", col="emotion"):
    if X == "all":
        df_counts = df.stack().reset_index(drop=True).value_counts() # stack as series
        df_counts = df_counts.to_frame('counts') # get value_counts as df
        df_counts['emotion'] = df_counts.index # get index as col
    elif X == "stacked":
        df_counts = df[col].value_counts() # count word frequency
        df_counts = df_counts.to_frame('counts') # get value_counts as df
        df_counts['emotion'] = df_counts.index # get index as col    
    else:
        df_counts = df[X].reset_index(drop=True).value_counts() # stack as series
        df_counts = df_counts.to_frame('counts') # get value_counts as df
        df_counts[X] = df_counts.index # get index as col

    df_counts = df_counts.reset_index(drop=True) # clean index
    df_counts['percent'] = df_counts['counts'] / df_counts['counts'].sum() # compute percentage
    return df_counts

In [212]:
def simple_per_bar(
    df, title='Title', X='percent:Q', Y='emotion:N', \
    width=450, height=250, sort='-x', \
    text_size = 12, label_size = 11, title_size=12, \
    emotion='Some', color1='#0570b0', color2='orange'):
    
    bars = alt.Chart(df, title=title).mark_bar().encode(
        alt.X(X, axis=alt.Axis(format='.0%')),
        y=alt.Y(Y, sort=sort), 
        color=alt.condition(
            alt.datum.emotion == emotion,
            alt.value(color2),
            alt.value(color1)
        ))
    
    text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3,  # Nudges text to right so it doesn't appear on top of the bar
    fontSize=text_size
    ).encode(
        alt.Text(X, format='.1%')
    )
    
    chart = (bars + text).configure_axis(
            labelFontSize=label_size,
            titleFontSize=title_size).properties(
                width=width, 
                height=height)
    
    
    return chart

In [213]:
def simple_count_bar(
    df, title='Title', X='counts:Q', Y='emotion:N', \
    width=450, height=250, sort='-x', \
    text_size = 12, label_size = 11, title_size=12,
    emotion='Some', color1='#0570b0', color2='#orange'):
    
    bars = alt.Chart(df, title=title).mark_bar().encode(
        alt.X(X),
        y=alt.Y(Y, sort=sort), 
        color=alt.condition(
            alt.datum.emotion == emotion,
            alt.value(color2),
            alt.value(color1)
        ))
    
    text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3,  # Nudges text to right so it doesn't appear on top of the bar
    fontSize=text_size
    ).encode(
        alt.Text(X)
    )
    
    chart = (bars + text).configure_axis(
            labelFontSize=label_size,
            titleFontSize=title_size).properties(
                width=width, 
                height=height)
    
    
    return chart

In [214]:
source = count_freq_labels(df, X="sex") 
title = 'Sex | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'sex:N'
w, h= 450, 100
txs, ls, ts = 12, 12, 12

chart_sex = simple_per_bar(source, title=title, X=X, Y=Y, width=w, height=h, text_size = txs, label_size = ls, title_size=ts)

In [215]:
chart_sex.display(renderer='svg')

In [216]:
chart_sex_string = altair_saver.save(chart_sex, fmt='svg')
image_title.append('Participants by sex')
svg_str.append(chart_sex_string)

In [217]:
source = count_freq_labels(df, X="age") 
title = 'Age | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'age:N'
w, h= 450, 150
txs, ls, ts = 12, 11, 12

chart_age = simple_per_bar(
    source, title=title, X=X, Y=Y, \
    width=w, height=h, \
    text_size = txs, label_size = ls, title_size=ts)
chart_age.display(renderer='svg')

In [218]:
chart_age_str = altair_saver.save(chart_age, fmt='svg')
image_title.append('Participants by age')
svg_str.append(chart_age_str)

In [219]:
source = count_freq_labels(df, X="ethnicity") 
title = 'Ethnicity | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'ethnicity:N'
w, h= 450, 150
txs, ls, ts = 12, 11, 12

chart_ethnicity= simple_per_bar(
    source, title=title, X=X, Y=Y, \
    width=w, height=h, \
    text_size = txs, label_size = ls, title_size=ts)
chart_ethnicity.display(renderer='svg')

In [220]:
chart_et_str = altair_saver.save(chart_ethnicity, fmt='svg')
image_title.append('Participants by ethnicity')
svg_str.append(chart_et_str)

In [221]:
source = count_freq_labels(df, X="formal education") 
title = 'Formal education | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'formal education:N'
w, h= 450, 150
txs, ls, ts = 12, 11, 12

chart_formal_education= simple_per_bar(
    source, title=title, X=X, Y=Y, width=w, height=h, text_size = txs, label_size = ls, title_size=ts)
chart_formal_education.display(renderer='svg')

In [222]:
chart_formal_education_str = altair_saver.save(chart_formal_education, fmt='svg')
image_title.append('Participants by formal education')
svg_str.append(chart_formal_education_str)

## Formating

In [223]:
df_emo_answers = df.loc[:, 'Q2.1_1':'Q195.1_4'] # get cols with words only

In [224]:
def formating_words(df, len_words=1, len_letters=2):
    df_stack = df.stack().reset_index(drop=True) # stack as series
    df_stack = df_stack.to_frame(name='emotion') # as DF
    df_stack['emotion'] = df_stack['emotion'].str.strip() # remove blank spaces
    df_stack['emotion'] = df_stack['emotion'].str.lower() # as lower case
    df_stack['emotion'] = df_stack['emotion'].replace({'na':'none'}) 
    df_stack['len_words'] = df_stack['emotion'].str.split().apply(len) # cnt number of words
    df_stack['len_letters'] = df_stack['emotion'].apply(len) # cont number of letters
    # get df with single words of 3 or more letters
    df_stack_single_word = df_stack[(df_stack['len_words'] == len_words) & (df_stack['len_letters'] > len_letters)] 
    df_stack_single_word['emotion'] = df_stack_single_word['emotion'].str.replace('[^a-zA-Z]', '') # remove non-alphabetic characters
    df_stack_single_word = df_stack_single_word[~df_stack_single_word['emotion'].str.contains(r'[0-9]')] #drop words containing numbers
    df_stack_single_word = df_stack_single_word[df_stack_single_word.groupby('emotion').emotion.transform(len) > 1] # keep words that ocurr > 1
    return df_stack_single_word

In [225]:
df_stack_single_word = formating_words(df_emo_answers)

  df_stack_single_word['emotion'] = df_stack_single_word['emotion'].str.replace('[^a-zA-Z]', '') # remove non-alphabetic characters
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [226]:
df_stack_single_word['len_words'].value_counts()

1    13847
Name: len_words, dtype: int64

## Spell checking 
**NOTE**: poor results with automated spell-check, so I did it manually for now

In [227]:
# conda install -c conda-forge pattern 
# from pattern.en import suggest

In [228]:
# df_stack_single_word['emotion_spell_check'] = df_stack_single_word['emotion'].apply(lambda x: suggest(x)[0][0])
# df_stack_single_word['emotion'].size
# df_stack_single_word['emotion'].size - sum(df_stack_single_word['emotion'] == df_stack_single_word['emotion_spell_check']) # number of words changed

In [229]:
def manual_spell_check(df):
    df.replace(['happy', 'happu', 'hapy', 'happyy', 'happyb', 'happt', 'haapy', 'hapoy'], 'happiness', inplace=True)
    df.replace(['angry', 'angr'], 'anger', inplace=True)
    df.replace(['sad', 'sadd', 'sadness'], 'sadness', inplace=True)
    df.replace(['disgusted', 'disgust', 'disgusting', 'disguetsed', 'disugested'], 'disgust', inplace=True)
    df.replace(['surprise', 'surprised', 'surpsied', 'surpise', 'surprising', 'surprisef'], 'surprise', inplace=True)
    df.replace(['fear', 'fearful', 'fearfulness'], 'fear', inplace=True)
    df.replace(["frusturated"], "frustrated", inplace=True)
    
    return df

In [230]:
df_stack_single_word = manual_spell_check(df_stack_single_word)

## Grouping 

In [231]:
# importing the module
import json
 
# synsets 
with open('../clean_data/syn_dict_emotions.json') as json_file:
    syns = json.load(json_file)

# hyponyms 
with open('../clean_data/hyp_dict_emotions.json') as json_file:
    hyps = json.load(json_file)

In [232]:
df_stack_single_word['emotion'].value_counts()

happiness     1405
sadness       1129
anger          752
confused       594
surprise       528
              ... 
yellow           2
blurry           2
aggressive       2
youthful         2
eww              2
Name: emotion, Length: 481, dtype: int64

In [233]:
def update_dic(syns, emo, emo_variant):
    '''to merge key variants'''
    
    syns[emo].extend(syns[emo_variant])
    syns.pop(emo_variant)

In [234]:
update_dic(syns, emo='anger', emo_variant='angry')
update_dic(syns, emo='disgust', emo_variant='disgusted')
update_dic(syns, emo='sadness', emo_variant='sad')
update_dic(syns, emo='fear', emo_variant='fearful')
update_dic(syns, emo='surprise', emo_variant='surprised')
update_dic(syns, emo='happiness', emo_variant='happy')

In [235]:
syns.keys()

dict_keys(['anger', 'neutral', 'disgust', 'sadness', 'fear', 'surprise', 'happiness'])

In [236]:
def grouping(df, syns):
    '''to group (replace) by synsets and hyponyms'''
    
    for key in syns:
        synset = syns[key]
        df.replace(synset, key, inplace=True)
    
    return df

In [237]:
df_stack_single_word = grouping(df_stack_single_word, syns)

In [238]:
df_stack_single_word['emotion'].value_counts()

happiness    1410
sadness      1135
anger         766
confused      594
surprise      528
             ... 
yellow          2
blurry          2
arrested        2
youthful        2
guilt           2
Name: emotion, Length: 473, dtype: int64

In [239]:
df_stack_single_word = grouping(df_stack_single_word, hyps)

In [240]:
df_stack_single_word['emotion'].value_counts()

happiness    1412
sadness      1159
anger         782
confused      594
surprise      528
             ... 
tooth           2
arrested        2
yellow          2
blurry          2
guilt           2
Name: emotion, Length: 465, dtype: int64

## Steamming

In [241]:
# from nltk.stem import PorterStemmer
# from nltk.stem import LancasterStemmer
# from nltk.stem import SnowballStemmer

In [242]:
# ps = PorterStemmer()
# ls = LancasterStemmer()
# snowball = SnowballStemmer(language='english')

In [243]:
# # Porter stemmer
# df_stack_single_word['emotion_ps_steamed'] = df_stack_single_word['emotion'].apply(lambda x: ps.stem(x))

In [244]:
# # Lancaster stemmer
# df_stack_single_word['emotion_ls_steamed'] = df_stack_single_word['emotion'].apply(lambda x: ls.stem(x))

In [245]:
# # Snowball stemmer
# df_stack_single_word['emotion_sb_steamed'] = df_stack_single_word['emotion'].apply(lambda x: snowball.stem(x))

## Get stacked counts datasets

In [246]:
def emotion_df_formated(df_emo_answers, emotion_label):
    df_emo_cat = df_emo_answers.copy() 
    df_emo_cat_t = df_emo_cat.T # transpose
    df_emo_cat_t['photo_id'] = df_emo_cat_t.index # get index as col
    df_emo_cat_t = df_emo_cat_t.reset_index(drop=True) # clean index
    df_emo_cat_t_labels = pd.concat([df_emo_cat_t, df_labels], axis=1) # add metadata cols
    df_label =  df_emo_cat_t_labels[df_emo_cat_t_labels['label'] == emotion_label]
    
    return df_label

def wrapper_emotion_rank(df_emo_answers, emotion_label = None, top = None):
    df_emotion_cnt = emotion_df_formated(df_emo_answers, emotion_label=emotion_label)
    df_emotion_ans_cnt = df_emotion_cnt.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url', 'photoId'], axis=1)
    df_stack_emotion_cnt = formating_words(df_emotion_ans_cnt) # clean up
    df_stack_emotion_cnt = manual_spell_check(df_stack_emotion_cnt)
    df_stack_emotion_cnt_clean = count_freq_labels(df_stack_emotion_cnt, X='stacked', col='emotion')
    df_stack_emotion_cnt_clean_top = df_stack_emotion_cnt_clean[df_stack_emotion_cnt_clean['counts'] > top]
    
    return df_stack_emotion_cnt_clean_top

In [247]:
# df_anger_top = wrapper_emotion_rank(df_emo_answers, emotion_label= 'anger', top=10)
# df_disgust_top = wrapper_emotion_rank(df_emo_answers, emotion_label= 'disgust', top=10)
# df_fear_top = wrapper_emotion_rank(df_emo_answers, emotion_label= 'fear', top=10)
# df_happiness_top = wrapper_emotion_rank(df_emo_answers, emotion_label= 'happiness', top=10)
# df_sadness_top = wrapper_emotion_rank(df_emo_answers, emotion_label= 'sadness', top=10)
# df_surprise_top = wrapper_emotion_rank(df_emo_answers, emotion_label= 'surprise', top=10)

In [248]:
df_emo_overall_raw = count_freq_labels(df_stack_single_word, X="stacked", col='emotion')

In [249]:
# df_emo_overall_raw.to_csv('../clean_data/free_choice_emotion_uw_students_overall_count.csv', index=False)

# df_anger_top.to_csv('../clean_data/free_choice_emotion_uw_students_anger_rank.csv', index=False)
# df_disgust_top.to_csv('../clean_data/free_choice_emotion_uw_students_disgust_rank.csv', index=False)
# df_fear_top.to_csv('../clean_data/free_choice_emotion_uw_students_fear_rank.csv', index=False)
# df_happiness_top.to_csv('../clean_data/free_choice_emotion_uw_students_happiness_rank.csv', index=False)
# df_sadness_top.to_csv('../clean_data/free_choice_emotion_uw_students_sadness_rank.csv', index=False)
# df_surprise_top.to_csv('../clean_data/free_choice_emotion_uw_students_surprise_rank.csv', index=False)

## Count frequency and plot

In [250]:
df_emo_overall = count_freq_labels(df_stack_single_word, X="stacked", col='emotion')

In [251]:
free_choice_word_list = count_freq_labels(df_stack_single_word, 'emotion')['emotion']

In [252]:
free_choice_word_list.to_csv('../clean_data/free_choice_word_list.csv', index=False)

In [253]:
df_emo_overall.to_csv('../clean_data/free_choice_emotion_uw_students_overall.csv', index=False)

In [254]:
source = df_emo_overall[df_emo_overall['counts'] > 49]
title = 'Labels with 50 or more mentions | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 600
txs, ls, ts = 12, 11, 12

chart_overall_per = simple_per_bar(source, title=title, X=X, Y=Y, width=w, height=h, text_size = txs, label_size = ls, title_size=ts)
chart_overall_per.display(renderer='svg')

In [255]:
chart_overall_per_str = altair_saver.save(chart_overall_per, fmt='svg')
image_title.append('Overall results by expected label as %')
svg_str.append(chart_overall_per_str)

In [256]:
source = df_emo_overall[df_emo_overall['counts'] > 49]
title = 'Labels with 50 or more mentions | n = '+ source['counts'].sum().astype(str)
X, Y = 'counts:Q', 'emotion:N'
w, h= 450, 600
txs, ls, ts = 12, 11, 12

chart_overall_count = simple_count_bar(source, title=title, X=X, Y=Y, width=w, height=h, text_size = txs, label_size = ls, title_size=ts)
chart_overall_count.display(renderer='svg')

In [257]:
chart_overall_count_str = altair_saver.save(chart_overall_count, fmt='svg')
image_title.append('Overall results by expected label as count')
svg_str.append(chart_overall_count_str)

## Most frequently used word for each emotion category (grouping pictures by expected-emotion)

### Anger

In [258]:
emotion = 'anger'

df_emotion = emotion_df_formated(df_emo_answers, emotion) # subset 'emotion' rows
df_emotion_ans = df_emotion.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url', 'photoId'], axis=1)
df_stack_emotion = formating_words(df_emotion_ans) # clean up
df_stack_emotion = manual_spell_check(df_stack_emotion)
df_stack_emotion = grouping(df_stack_emotion, syns)
df_stack_emotion = grouping(df_stack_emotion, hyps)

df_stack_emotion.replace({'scared': 'fear'}, inplace=True)

source = count_freq_labels(df_stack_emotion,X='stacked', col='emotion')
source_10 = source[source['counts'] > 10]

title = 'Expcted label: '+ emotion + ' | Labels with 10 or more mentions | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 600
txs, ls, ts = 12, 11, 12


chart_anger = simple_per_bar(
    source_10, title=title, X=X, Y=Y, \
    width=w, height=h, emotion=emotion, \
    text_size = txs, label_size = ls, title_size=ts)
chart_anger.display(renderer='svg')

  df_stack_single_word['emotion'] = df_stack_single_word['emotion'].str.replace('[^a-zA-Z]', '') # remove non-alphabetic characters
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [259]:
chart_anger_str = altair_saver.save(chart_anger, fmt='svg')
image_title.append("Images depicting 'anger'")
svg_str.append(chart_anger_str)

### Disgust

In [260]:
emotion = 'disgust'

df_emotion = emotion_df_formated(df_emo_answers, emotion) # subset 'emotion' rows
df_emotion_ans = df_emotion.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url', 'photoId'], axis=1)
df_stack_emotion = formating_words(df_emotion_ans) # clean up
df_stack_emotion = manual_spell_check(df_stack_emotion)
df_stack_emotion = grouping(df_stack_emotion, syns)
df_stack_emotion = grouping(df_stack_emotion, hyps)

df_stack_emotion.replace({'scared': 'fear'}, inplace=True)

source = count_freq_labels(df_stack_emotion,X='stacked', col='emotion')
source_10 = source[source['counts'] > 10]

title = 'Expcted label: '+ emotion + ' | Labels with 10 or more mentions | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 600
txs, ls, ts = 12, 11, 12


chart_disgust = simple_per_bar(
    source_10, title=title, X=X, Y=Y, \
    width=w, height=h, emotion=emotion, \
    text_size = txs, label_size = ls, title_size=ts)
chart_disgust.display(renderer='svg')

  df_stack_single_word['emotion'] = df_stack_single_word['emotion'].str.replace('[^a-zA-Z]', '') # remove non-alphabetic characters
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [261]:
chart_disgust_str = altair_saver.save(chart_disgust, fmt='svg')
image_title.append("Images depicting 'disgust'")
svg_str.append(chart_disgust_str)

### Fear

In [262]:
emotion = 'fear'

df_emotion = emotion_df_formated(df_emo_answers, emotion) # subset 'emotion' rows
df_emotion_ans = df_emotion.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url', 'photoId'], axis=1)
df_stack_emotion = formating_words(df_emotion_ans) # clean up
df_stack_emotion = manual_spell_check(df_stack_emotion)
df_stack_emotion = grouping(df_stack_emotion, syns)
df_stack_emotion = grouping(df_stack_emotion, hyps)

df_stack_emotion.replace({'scared': 'fear'}, inplace=True)

source = count_freq_labels(df_stack_emotion,X='stacked', col='emotion')
source_10 = source[source['counts'] > 10]

title = 'Expcted label: '+ emotion + ' | Labels with 10 or more mentions | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 600
txs, ls, ts = 12, 11, 12


chart_fear = simple_per_bar(
    source_10, title=title, X=X, Y=Y, \
    width=w, height=h, emotion=emotion, \
    text_size = txs, label_size = ls, title_size=ts)
chart_fear.display(renderer='svg')

  df_stack_single_word['emotion'] = df_stack_single_word['emotion'].str.replace('[^a-zA-Z]', '') # remove non-alphabetic characters
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [263]:
chart_fear_str = altair_saver.save(chart_fear, fmt='svg')
image_title.append("Images depicting 'fear'")
svg_str.append(chart_fear_str)

### Surprise

In [264]:
emotion = 'surprise'

df_emotion = emotion_df_formated(df_emo_answers, emotion) # subset 'emotion' rows
df_emotion_ans = df_emotion.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url', 'photoId'], axis=1)
df_stack_emotion = formating_words(df_emotion_ans) # clean up
df_stack_emotion = manual_spell_check(df_stack_emotion)
df_stack_emotion = grouping(df_stack_emotion, syns)
df_stack_emotion = grouping(df_stack_emotion, hyps)

df_stack_emotion.replace({'scared': 'fear'}, inplace=True)

source = count_freq_labels(df_stack_emotion,X='stacked', col='emotion')
source_10 = source[source['counts'] > 10]

title = 'Expcted label: '+ emotion + ' | Labels with 10 or more mentions | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 600
txs, ls, ts = 12, 11, 12


chart_surprise = simple_per_bar(
    source_10, title=title, X=X, Y=Y, \
    width=w, height=h, emotion=emotion, \
    text_size = txs, label_size = ls, title_size=ts)
chart_surprise.display(renderer='svg')

  df_stack_single_word['emotion'] = df_stack_single_word['emotion'].str.replace('[^a-zA-Z]', '') # remove non-alphabetic characters
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [265]:
chart_surprise_str = altair_saver.save(chart_surprise, fmt='svg')
image_title.append("Images depicting 'surprise'")
svg_str.append(chart_surprise_str)

### Happiness

In [266]:
emotion = 'happiness'

df_emotion = emotion_df_formated(df_emo_answers, emotion) # subset 'emotion' rows
df_emotion_ans = df_emotion.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url', 'photoId'], axis=1)
df_stack_emotion = formating_words(df_emotion_ans) # clean up
df_stack_emotion = manual_spell_check(df_stack_emotion)
df_stack_emotion = grouping(df_stack_emotion, syns)
df_stack_emotion = grouping(df_stack_emotion, hyps)

df_stack_emotion.replace({'scared': 'fear'}, inplace=True)

source = count_freq_labels(df_stack_emotion,X='stacked', col='emotion')
source_10 = source[source['counts'] > 10]

title = 'Expcted label: '+ emotion + ' | Labels with 10 or more mentions | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 600
txs, ls, ts = 12, 11, 12


chart_happiness = simple_per_bar(
    source_10, title=title, X=X, Y=Y, \
    width=w, height=h, emotion=emotion, \
    text_size = txs, label_size = ls, title_size=ts)
chart_happiness.display(renderer='svg')

  df_stack_single_word['emotion'] = df_stack_single_word['emotion'].str.replace('[^a-zA-Z]', '') # remove non-alphabetic characters
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [267]:
chart_happiness_str = altair_saver.save(chart_happiness, fmt='svg')
image_title.append("Images depicting 'happiness'")
svg_str.append(chart_happiness_str)

### Sadness

In [268]:
emotion = 'sadness'

df_emotion = emotion_df_formated(df_emo_answers, emotion) # subset 'emotion' rows
df_emotion_ans = df_emotion.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url', 'photoId'], axis=1)
df_stack_emotion = formating_words(df_emotion_ans) # clean up
df_stack_emotion = manual_spell_check(df_stack_emotion)
df_stack_emotion = grouping(df_stack_emotion, syns)
df_stack_emotion = grouping(df_stack_emotion, hyps)

df_stack_emotion.replace({'scared': 'fear'}, inplace=True)

source = count_freq_labels(df_stack_emotion,X='stacked', col='emotion')
source_10 = source[source['counts'] > 10]

title = 'Expcted label: '+ emotion + ' | Labels with 10 or more mentions | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 600
txs, ls, ts = 12, 11, 12


chart_sadness = simple_per_bar(
    source_10, title=title, X=X, Y=Y, \
    width=w, height=h, emotion=emotion, \
    text_size = txs, label_size = ls, title_size=ts)
chart_sadness.display(renderer='svg')

  df_stack_single_word['emotion'] = df_stack_single_word['emotion'].str.replace('[^a-zA-Z]', '') # remove non-alphabetic characters
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [269]:
chart_sadness_str = altair_saver.save(chart_sadness, fmt='svg')
image_title.append("Images depicting 'sadness'")
svg_str.append(chart_sadness_str)

### Uncertain

In [270]:
emotion = 'uncertain'

df_emotion = emotion_df_formated(df_emo_answers, emotion) # subset 'emotion' rows
df_emotion_ans = df_emotion.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url', 'photoId'], axis=1)
df_stack_emotion = formating_words(df_emotion_ans) # clean up
df_stack_emotion = manual_spell_check(df_stack_emotion)
df_stack_emotion = grouping(df_stack_emotion, syns)
df_stack_emotion = grouping(df_stack_emotion, hyps)

df_stack_emotion.replace({'scared': 'fear'}, inplace=True)

source = count_freq_labels(df_stack_emotion,X='stacked', col='emotion')
source_10 = source[source['counts'] > 10]

title = 'Expcted label: '+ emotion + ' | Labels with 10 or more mentions | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 600
txs, ls, ts = 12, 11, 12


chart_uncertain = simple_per_bar(
    source_10, title=title, X=X, Y=Y, \
    width=w, height=h, emotion=emotion, \
    text_size = txs, label_size = ls, title_size=ts)
chart_uncertain.display(renderer='svg')

  df_stack_single_word['emotion'] = df_stack_single_word['emotion'].str.replace('[^a-zA-Z]', '') # remove non-alphabetic characters
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [271]:
chart_uncertain_str = altair_saver.save(chart_uncertain, fmt='svg')
image_title.append("Images depicting 'uncertain (unknown)'")
svg_str.append(chart_uncertain_str)

### Neutral

In [272]:
emotion = 'neutral'

df_emotion = emotion_df_formated(df_emo_answers, emotion) # subset 'emotion' rows
df_emotion_ans = df_emotion.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url', 'photoId'], axis=1)
df_stack_emotion = formating_words(df_emotion_ans) # clean up
df_stack_emotion = manual_spell_check(df_stack_emotion)
df_stack_emotion = grouping(df_stack_emotion, syns)
df_stack_emotion = grouping(df_stack_emotion, hyps)

df_stack_emotion.replace({'scared': 'fear'}, inplace=True)

source = count_freq_labels(df_stack_emotion,X='stacked', col='emotion')
source_10 = source[source['counts'] > 10]

title = 'Expcted label: '+ emotion + ' | Labels with 10 or more mentions | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 600
txs, ls, ts = 12, 11, 12

chart_neutral = simple_per_bar(
    source_10, title=title, X=X, Y=Y, \
    width=w, height=h, emotion=emotion, \
    text_size = txs, label_size = ls, title_size=ts)
chart_neutral.display(renderer='svg')

  df_stack_single_word['emotion'] = df_stack_single_word['emotion'].str.replace('[^a-zA-Z]', '') # remove non-alphabetic characters
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [273]:
chart_neutral_str = altair_saver.save(chart_neutral, fmt='svg')
image_title.append("Images depicting 'neutral'")
svg_str.append(chart_neutral_str)

## Most frequently used word for each emotion category (grouping pictures by expected-emotion), by ethnicity group

In [177]:
def simple_per_bar_concat(
    df, title='Title', X='percent:Q', Y='emotion:N', \
    width=450, height=250, sort='-x', \
    text_size = 12, label_size = 11, title_size=12, \
    emotion='Some', color1='#0570b0', color2='orange'):
    
    bars = alt.Chart(df, title=title).mark_bar().encode(
        alt.X(X, axis=alt.Axis(format='.0%')),
        y=alt.Y(Y, sort=sort), 
        color=alt.condition(
            alt.datum.emotion == emotion,
            alt.value(color2),
            alt.value(color1)
        ))
    
    text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3,  # Nudges text to right so it doesn't appear on top of the bar
    fontSize=text_size
    ).encode(
        alt.Text(X, format='.1%')
    )
    
    chart = (bars + text)
    
    return chart

In [178]:
def emotion_df_formated_et(df_emo_answers, emotion_label, ethnicity):
    df_emo_cat = df_emo_answers.copy() 
    df_emo_cat_t = df_emo_cat.T # transpose
    df_emo_cat_t['photo_id'] = df_emo_cat_t.index # get index as col
    df_emo_cat_t = df_emo_cat_t.reset_index(drop=True) # clean index
    df_emo_cat_t_labels = pd.concat([df_emo_cat_t, df_labels], axis=1) # add metadata cols
    df_label =  df_emo_cat_t_labels[(df_emo_cat_t_labels['label'] == emotion_label) & (df_emo_cat_t_labels['ethnicity'] == ethnicity)]
    return df_label

In [179]:
def wrapper_chart_emotion(df_emo_answers, emotion, ethnicity, emotion_st='angri', n_mentions=10):
    df = emotion_df_formated_et(df_emo_answers, emotion,  ethnicity) # subset 'anger' rows
    df_emotion_ans = df.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url', 'photoId'], axis=1)
    df_stack_emotion = formating_words(df_emotion_ans) # clean up
    df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) #steam
    df_count = count_freq_labels(df_stack_emotion, X='stacked', col='emotion_ps_steamed') # count label freq
    source = df_count[df_count['counts'] >= n_mentions]

    chart = simple_per_bar_concat(
        source, \
        title=f"Expected label: {emotion} | Labels with {n_mentions} or more mentions | n= {source['counts'].sum().astype(str)}", \
        emotion=emotion_st)
    return chart

### Anger

In [180]:
chart_anger_bipoc = wrapper_chart_emotion(df_emo_answers, 'anger', 'bipoc', emotion_st='angri', n_mentions=10)
chart_anger_white = wrapper_chart_emotion(df_emo_answers, 'anger', 'white', emotion_st='angri', n_mentions=10)

  df_stack_single_word['emotion'] = df_stack_single_word['emotion'].str.replace('[^a-zA-Z]', '') # remove non-alphabetic characters
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [181]:
chart_anger_layered =  (chart_anger_bipoc | chart_anger_white)

In [182]:
chart_anger_layered.display(renderer='svg')

In [183]:
chart_anger_layered_str = altair_saver.save(chart_anger_layered, fmt='svg')
image_title.append("'anger' by ethnicity")
svg_str.append(chart_anger_layered_str)

### Disgust

In [184]:
chart_disgust_bipoc = wrapper_chart_emotion(df_emo_answers, 'disgust', 'bipoc', emotion_st='disgust', n_mentions=10)
chart_disgust_white = wrapper_chart_emotion(df_emo_answers, 'disgust', 'white', emotion_st='disgust', n_mentions=10)

  df_stack_single_word['emotion'] = df_stack_single_word['emotion'].str.replace('[^a-zA-Z]', '') # remove non-alphabetic characters
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [185]:
chart_disgust_layered =  (chart_disgust_bipoc | chart_disgust_white)
chart_disgust_layered.display(renderer='svg')

In [186]:
chart_disgust_layered_str = altair_saver.save(chart_disgust_layered, fmt='svg')
image_title.append("'disgust' by ethnicity")
svg_str.append(chart_disgust_layered_str)

### Fear

In [187]:
chart_fear_bipoc = wrapper_chart_emotion(df_emo_answers, 'fear', 'bipoc', emotion_st='fear', n_mentions=10)
chart_fear_white = wrapper_chart_emotion(df_emo_answers, 'fear', 'white', emotion_st='fear', n_mentions=10)

  df_stack_single_word['emotion'] = df_stack_single_word['emotion'].str.replace('[^a-zA-Z]', '') # remove non-alphabetic characters
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [188]:
chart_fear_layered =  (chart_fear_bipoc | chart_fear_white)
chart_fear_layered.display(renderer='svg')

In [189]:
chart_fear_layered_str = altair_saver.save(chart_fear_layered, fmt='svg')
image_title.append("'fear' by ethnicity")
svg_str.append(chart_fear_layered_str)

### Surprise

In [190]:
chart_surprise_bipoc= wrapper_chart_emotion(df_emo_answers, 'surprise', 'bipoc', emotion_st='surpris', n_mentions=10)
chart_surprise_white = wrapper_chart_emotion(df_emo_answers, 'surprise', 'white', emotion_st='surpris', n_mentions=10)

  df_stack_single_word['emotion'] = df_stack_single_word['emotion'].str.replace('[^a-zA-Z]', '') # remove non-alphabetic characters
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [191]:
chart_surprise_layered =  (chart_surprise_bipoc | chart_surprise_white)
chart_surprise_layered.display(renderer='svg')

In [192]:
chart_surprise_layered_str = altair_saver.save(chart_surprise_layered, fmt='svg')
image_title.append("'surprise' by ethnicity")
svg_str.append(chart_surprise_layered_str)

### Happiness

In [193]:
chart_happiness_bipoc = wrapper_chart_emotion(df_emo_answers, 'happiness', 'bipoc', emotion_st='happi', n_mentions=10)
chart_happiness_white = wrapper_chart_emotion(df_emo_answers, 'happiness', 'white', emotion_st='happi', n_mentions=10)

  df_stack_single_word['emotion'] = df_stack_single_word['emotion'].str.replace('[^a-zA-Z]', '') # remove non-alphabetic characters
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [194]:
chart_happiness_layered =  (chart_happiness_bipoc | chart_happiness_white)
chart_happiness_layered.display(renderer='svg')

In [195]:
chart_happiness_layered_str = altair_saver.save(chart_happiness_layered, fmt='svg')
image_title.append("'happiness' by ethnicity")
svg_str.append(chart_happiness_layered_str)

### Sadness

In [196]:
chart_sadness_bipoc = wrapper_chart_emotion(df_emo_answers, 'sadness', 'bipoc',  emotion_st='sad', n_mentions=10)
chart_sadness_white = wrapper_chart_emotion(df_emo_answers, 'sadness', 'white', emotion_st='sad', n_mentions=10)

  df_stack_single_word['emotion'] = df_stack_single_word['emotion'].str.replace('[^a-zA-Z]', '') # remove non-alphabetic characters
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [197]:
chart_sadness_layered =  (chart_sadness_bipoc | chart_sadness_white)
chart_sadness_layered.display(renderer='svg')

In [198]:
chart_sadness_layered_str = altair_saver.save(chart_sadness_layered, fmt='svg')
image_title.append("'sadness' by ethnicity")
svg_str.append(chart_sadness_layered_str)

### Neutral

In [199]:
chart_neutral_bipoc = wrapper_chart_emotion(df_emo_answers, 'neutral', 'bipoc',  emotion_st='neutral', n_mentions=10)
chart_neutral_white = wrapper_chart_emotion(df_emo_answers, 'neutral', 'white', emotion_st='neutral', n_mentions=10)

  df_stack_single_word['emotion'] = df_stack_single_word['emotion'].str.replace('[^a-zA-Z]', '') # remove non-alphabetic characters
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [200]:
chart_neutral_layered =  (chart_neutral_bipoc | chart_neutral_white)
chart_neutral_layered.display(renderer='svg')

In [201]:
chart_neutral_layered_str = altair_saver.save(chart_neutral_layered, fmt='svg')
image_title.append("'neutral' by ethnicity")
svg_str.append(chart_neutral_layered_str)

### Uncertain/Other

In [202]:
chart_uncertain_bipoc = wrapper_chart_emotion(df_emo_answers, 'uncertain', 'bipoc',  emotion_st='neutral', n_mentions=10)
chart_uncertain_white = wrapper_chart_emotion(df_emo_answers, 'uncertain', 'white',  emotion_st='neutral', n_mentions=10)

  df_stack_single_word['emotion'] = df_stack_single_word['emotion'].str.replace('[^a-zA-Z]', '') # remove non-alphabetic characters
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [203]:
chart_uncertain_layered =  (chart_uncertain_bipoc | chart_uncertain_white)
chart_uncertain_layered.display(renderer='svg')

In [204]:
chart_uncertain_layered_str = altair_saver.save(chart_uncertain_layered, fmt='svg')
image_title.append("'uncertain' by ethnicity")
svg_str.append(chart_uncertain_layered_str)

### Most frequently used labels by photo

In [205]:
def photo_chart(df_emo_answers, emotion,  i, n_mentions=2):
    df = emotion_df_formated(df_emo_answers, emotion) # add emotion label
    photo_id_list = df['photo_id'].tolist() # photo id to list
    splited_photo_id = np.array_split(photo_id_list, 24) # split photo id by photo
    df = df[df['photo_id'].isin(splited_photo_id[i])] # select photo rows   
    df_url = df[['url']].head(1)  # get url for chart
    
    face = alt.Chart(df_url).mark_image(width=110, height=110, align='right', xOffset=0, yOffset=230).encode(url='url')
    
    df_emotion_ans = df.drop(['photo_id', 'ethnicity', 'sex', 'age', 'label', 'url', 'photoId'], axis=1) # clean cols
    df_stack_emotion = formating_words(df_emotion_ans) # clean up words
    df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) # steam
    df_stack_emotion_count = count_freq_labels(df_stack_emotion, X="stacked", col='emotion_ps_steamed') # group and count
    source = df_stack_emotion_count[df_stack_emotion_count['counts'] >= n_mentions]
    
    chart = simple_per_bar(source,\
                           title=f"Expected label: {emotion} | Labels with {n_mentions} or more mentions | n= {source['counts'].sum().astype(str)}", \
                           X='percent:Q', Y='emotion:N', width=300, height=300)
    
    return chart + face

In [206]:
photo_chart(df_emo_answers, 'anger',  0, n_mentions=2)

  df_stack_single_word['emotion'] = df_stack_single_word['emotion'].str.replace('[^a-zA-Z]', '') # remove non-alphabetic characters
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [93]:
# def charts_emotion_faces(df_emo_answers, emotion):
#     charts = []
#     for i in range(0, 24):
#         charts.append(photo_chart(df_emo_answers, emotion,  i))
#     return charts

In [94]:
# def dashboard_emotion_faces(charts, rows):
#     if rows == 1:
#         dashboard= (charts[0] | charts[1] | charts[2])
#     if rows == 2:
#         dashboard= (charts[0] | charts[1] | charts[2]) & \
#                    (charts[3] | charts[4] | charts[5]) 
#     if rows == 3:
#         dashboard= (charts[0] | charts[1] | charts[2]) & \
#                    (charts[3] | charts[4] | charts[5]) & \
#                    (charts[6] | charts[7] | charts[8])
#     if rows == 4:
#         dashboard= (charts[0] | charts[1] | charts[2]) & \
#                    (charts[3] | charts[4] | charts[5]) & \
#                    (charts[6] | charts[7] | charts[8]) & \
#                    (charts[9] | charts[10] | charts[11])
#     if rows == 5:
#         dashboard= (charts[0] | charts[1] | charts[2]) & \
#                    (charts[3] | charts[4] | charts[5]) & \
#                    (charts[6] | charts[7] | charts[8]) & \
#                    (charts[9] | charts[10] | charts[11]) & \
#                    (charts[12] | charts[13] | charts[14]) 
#     if rows == 6:
#         dashboard= (charts[0] | charts[1] | charts[2]) & \
#                    (charts[3] | charts[4] | charts[5]) & \
#                    (charts[6] | charts[7] | charts[8]) & \
#                    (charts[9] | charts[10] | charts[11]) & \
#                    (charts[12] | charts[13] | charts[14]) & \
#                    (charts[15] | charts[16] | charts[17])
#     if rows == 7:
#         dashboard= (charts[0] | charts[1] | charts[2]) & \
#                    (charts[3] | charts[4] | charts[5]) & \
#                    (charts[6] | charts[7] | charts[8]) & \
#                    (charts[9] | charts[10] | charts[11]) & \
#                    (charts[12] | charts[13] | charts[14]) & \
#                    (charts[15] | charts[16] | charts[17]) & \
#                    (charts[18] | charts[19] | charts[20]) 
#     if rows == 8:
#         dashboard= (charts[0] | charts[1] | charts[2]) & \
#                    (charts[3] | charts[4] | charts[5]) & \
#                    (charts[6] | charts[7] | charts[8]) & \
#                    (charts[9] | charts[10] | charts[11]) & \
#                    (charts[12] | charts[13] | charts[14]) & \
#                    (charts[15] | charts[16] | charts[17]) & \
#                    (charts[18] | charts[19] | charts[20]) & \
#                    (charts[21] | charts[22] | charts[23]) 
        
#     return dashboard

In [95]:
# charts = charts_emotion_faces(df_emo_answers, 'sadness')
# dashboard_emotion_faces(charts, 8)

## Emotion percentages as feature vectors

In [210]:
photo_ids = pd.read_csv('../clean_data/photo_ids.csv')

In [211]:
df_emo_overall_raw = count_freq_labels(df_stack_single_word, X='stacked', col='emotion')
emotion_words_list_steam = df_emo_overall_raw['emotion'].str.lower().tolist()
emotion_words_list = ['happiness','neutral', 'surprise','sadness', 'disgust', 'anger', 'fear', 'uncertain']

In [212]:
# placeholder dataframe
index_photos = photo_ids.iloc[:, 0] + '_1'
df_emo_features = pd.DataFrame(columns=emotion_words_list_steam) 
# add key-ids
df_emo_features.insert(loc=0, column='photo_id', value=index_photos)

In [213]:
def fill_in_emotion_vectors(df_emo_features, df_emo_answers, emotion_words_list):
    for emo in range(0, len(emotion_words_list)):
        
        emotion = emotion_words_list[emo] # get emotion word
        df_ans = emotion_df_formated(df_emo_answers, emotion) # ans for all pics of emo-word
        photo_id_list = df_ans['photo_id'].tolist() # photo ids for the emotion word
        splited_photo_id = np.array_split(photo_id_list, 24) # photo_id_list as an array 
        
        for ids in range(0, len(splited_photo_id)):
            photo_id = splited_photo_id[ids]                
            df_single = df_ans[df_ans['photo_id'].isin(photo_id)] # get row for nth photo
            df_emotion_ans = df_single.drop(['photo_id', 'ethnicity', 'sex', \
                                             'age', 'label', 'url', 'photoId'], axis=1) # clean up for calculation
            df_stack_emotion = formating_words(df_emotion_ans) 
            df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) # steam
            source = count_freq_labels(df_stack_emotion, X='stacked', col='emotion_ps_steamed') # group and count / compute vectors
            source = source[['percent', 'emotion']].T
            source.columns = source.iloc[1].str.lower() # emotion words as cols
            source.drop('emotion', axis=0, inplace=True) # clean up emotion row
            source.reset_index(drop=True, inplace=True) # clean up index
            source.columns.name = None
            photo_id_str = photo_id[0] # photo id as str
            source.insert(0, 'photo_id', photo_id_str) # photo id as col
            # set photo id as col for easier manipulation
            df_emo_features = df_emo_features.set_index('photo_id') 
            # fill in vectors where the value is available
            df_emo_features.loc[photo_id_str] = pd.Series(source.T.to_dict()[0])
            
            df_emo_features['photo_id'] = df_emo_features.index # index back to column
            df_emo_features.index.name = None 
            df_emo_features.reset_index(drop=True, inplace=True) # clean up index
    
    return df_emo_features
            

In [214]:
df_emo_vectors = fill_in_emotion_vectors(df_emo_features, df_emo_answers, emotion_words_list)
df_emo_vectors = df_emo_vectors.fillna(0)
df_label_raw = pd.read_csv('../data/emotion_labels.csv')
df_emo_vectors = pd.concat([df_emo_vectors, df_label_raw], axis=1) # concat metadata
df_emo_vectors.to_csv('../clean_data/free_choice_uw_students_vectors.csv', index=False)

  df_stack_single_word['emotion'] = df_stack_single_word['emotion'].str.replace('[^a-zA-Z]', '') # remove non-alphabetic characters
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


## Emotion percentages as feature vectors - count (for sentiment analysis)

In [215]:
# placeholder dataframe
index_photos = photo_ids.iloc[:, 0] + '_1'
df_emo_features = pd.DataFrame(columns=free_choice_word_list) # CHANGE TO NOT STEAMED
# add key-ids
df_emo_features.insert(loc=0, column='photo_id', value=index_photos)

In [216]:
def fill_in_emotion_counts(df_emo_features, df_emo_answers, emotion_words_list):
    for emo in range(0, len(emotion_words_list)):
        
        emotion = emotion_words_list[emo] # get emotion word
        df_ans = emotion_df_formated(df_emo_answers, emotion) # ans for all pics of emo-word
        photo_id_list = df_ans['photo_id'].tolist() # photo ids for the emotion word
        splited_photo_id = np.array_split(photo_id_list, 24) # photo_id_list as an array 
        
        for ids in range(0, len(splited_photo_id)):
            photo_id = splited_photo_id[ids]
            df_single = df_ans[df_ans['photo_id'].isin(photo_id)] # get row for nth photo
            df_emotion_ans = df_single.drop(['photo_id', 'ethnicity', 'sex', \
                                             'age', 'label', 'url', 'photoId'], axis=1) # clean up for calculation
            df_stack_emotion = formating_words(df_emotion_ans) 
            source = count_freq_labels(df_stack_emotion, X='stacked', col='emotion') # group and count / compute vectors   
            ## count or percet
            source = source[['counts', 'emotion']].T
            source.columns = source.iloc[1].str.lower() # emotion words as cols
            source.drop('emotion', axis=0, inplace=True) # clean up emotion row
            source.reset_index(drop=True, inplace=True) # clean up index
            source.columns.name = None
            photo_id_str = photo_id[0] # photo id as str
            source.insert(0, 'photo_id', photo_id_str) # photo id as col
            # set photo id as col for easier manipulation
            df_emo_features = df_emo_features.set_index('photo_id') 
            # fill in vectors where the value is available
            df_emo_features.loc[photo_id_str] = pd.Series(source.T.to_dict()[0])
            
            df_emo_features['photo_id'] = df_emo_features.index # index back to column
            df_emo_features.index.name = None 
            df_emo_features.reset_index(drop=True, inplace=True) # clean up index
    
    return df_emo_features
            

In [217]:
df_emo_counts = fill_in_emotion_counts(df_emo_features, df_emo_answers, emotion_words_list)
# df_emo_counts = df_emo_counts.fillna(0) # No need to fill NA as means need to be computed later ignoring them
df_label_raw = pd.read_csv('../data/emotion_labels.csv')
df_emo_counts = pd.concat([df_emo_counts, df_label_raw], axis=1) # concat metadata
df_emo_counts.to_csv('../clean_data/free_choice_uw_students_count_emotions.csv', index=False)

  df_stack_single_word['emotion'] = df_stack_single_word['emotion'].str.replace('[^a-zA-Z]', '') # remove non-alphabetic characters


# Clustering

In [218]:
from kneed import KneeLocator
from sklearn import preprocessing as pp
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
from sklearn import decomposition

In [219]:
X = df_emo_vectors.copy().drop(['photo_id', 'ethnicity', 'sex', \
                                   'age', 'label', 'url', 'photoId'], axis=1)

In [220]:
features = X.columns
sX = pp.MinMaxScaler(copy=True)
X.loc[:,features] = sX.fit_transform(X[features])

In [221]:
X.describe() # mean  = 0, SD =1

Unnamed: 0,happiness,sadness,anger,confused,surprise,disgust,upset,shocked,mad,scared,...,argumentative,ferocious,youthful,aggressive,furrowed,tooth,arrested,yellow,blurry,guilt
count,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,...,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0
mean,0.0,0.0,0.017728,0.0,0.0,0.06279,0.21865,0.0,0.160453,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,0.0,0.0,0.123626,0.0,0.0,0.168962,0.289025,0.0,0.264665,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.408934,0.0,0.252877,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [222]:
pca = decomposition.PCA(n_components=3)
pca.fit(X)
X_pca = pca.transform(X)

In [223]:
def k_means(n_clusters=3, n_inits=20, max_iter=1000, features=X):
    kmeans = KMeans(
    init="k-means++",
    n_clusters=n_clusters,
    n_init=n_inits,
    max_iter=max_iter,
    random_state=42)
    kmeans.fit(features)
    
    return kmeans

In [224]:
kmeans_kwargs = {
    "init": "k-means++",
    "n_init": 20,
    "max_iter": 1000,
    "random_state": 42,
}

# A list holds the SSE values for each k
sse_pca = []
for k in range(1, 100):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(X_pca)
    sse_pca.append(kmeans.inertia_)

In [225]:
kl_pca = KneeLocator(
    range(1, 100), sse_pca, curve="convex", direction="decreasing"
)

kl_pca.elbow 
# 14

14

In [226]:
source = pd.DataFrame({'y': sse_pca, 'x': range(1, 100)})

In [227]:
chart_elbow_pca = alt.Chart(source).mark_line().encode(
    alt.X('x:Q', title='Number of clusters - PCA'), 
    alt.Y('y:Q', title='SSE'))

In [228]:
# A list holds the silhouette coefficients for each k
silhouette_coefficients_pca = []

# Notice you start at 2 clusters for silhouette coefficient
for k in range(2, 100):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(X_pca)
    score = silhouette_score(X_pca, kmeans.labels_)
    silhouette_coefficients_pca.append(score)

In [229]:
source = pd.DataFrame({'y': silhouette_coefficients_pca, 'x': range(2, 100)})

In [230]:
chart_silhouette_pca = alt.Chart(source).mark_line().encode(
    alt.X('x:Q', title='Number of clusters - PCA'), 
    alt.Y('y:Q', title='Silhouette coefficients'))

In [231]:
k_means_eval_chart =  chart_silhouette_pca | chart_elbow_pca

In [232]:
k_means_eval_chart

In [233]:
k_means_eval_str = altair_saver.save(k_means_eval_chart, fmt='svg')
image_title.append('K-means evaluation')
svg_str.append(k_means_eval_str)

In [253]:
## RUN ONLY ONCE THEN SAVE

In [236]:
# k_means_run_pca = k_means(n_clusters=14, features=X_pca)

In [237]:
# df_label_raw['clusters_pca'] = k_means_run_pca.labels_

In [238]:
# dfs_kmeans_pca = [pd.DataFrame(y) for x, y in df_label_raw.groupby('clusters_pca', as_index=False)]

In [239]:
## add photo coordinates
# for i in range(0, len(dfs_kmeans_pca)):
#     num_items = len(dfs_kmeans_pca[i])
#     dfs_kmeans_pca[i]['x'] = np.linspace(0.1, 3.0, num=num_items)
#     dfs_kmeans_pca[i]['y'] = np.linspace(0.1, 3.0, num=num_items)

In [240]:
# def grid_photos(dfs_kmeans=dfs_kmeans_pca, nx=6, ny=6, cluster=0, width=600, height=600,title='title'):
    
#     nx, ny = (nx, ny)
#     x = np.linspace(0, 1, nx)
#     y = np.linspace(0, 1, ny)
#     xv, yv = np.meshgrid(x, y)
    
#     dfs_kmeans[cluster]['x'] = xv.ravel()[0:len(dfs_kmeans[cluster])]
#     dfs_kmeans[cluster]['y'] = yv.ravel()[0:len(dfs_kmeans[cluster])]
    
#     chart = alt.Chart(dfs_kmeans[cluster], title=title).mark_image(
#         width=50,
#         height=50
#     ).encode(
#         alt.X('x', axis=None),
#         alt.Y('y', axis=None),
#         url='url'
#     )
    
#     text = chart.mark_text(
#     align='center',
#     baseline='bottom',
#     yOffset = -25
#     ).encode(
#         alt.Text('label'),
#         color=alt.Color('label',
#                         scale=alt.Scale(
#                             domain=emotion_words_list,
#                             range=['#ff4444', '#4c809c', '#9ae354', '#0000AA', '#FFA500', '#E4D00A', '#c41cac', '#50C878']))
#     )
    
    
#     return (chart + text).properties(width=width, height=height)

In [241]:
# dims_clusters = []
# for i in range (0, 14):
#     dims_clusters.append(dfs_kmeans_pca[i].shape[0])
# dims_clusters

[5, 9, 17, 50, 8, 12, 11, 19, 9, 7, 8, 9, 13, 17]

In [250]:
# cluster_0 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=3, ny=3, cluster=0,  width=400, height=400, title='Cluster 1')
# cluster_1 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=3, ny=3, cluster=1,  width=400, height=400, title='Cluster 2')
# cluster_2 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=5, ny=5, cluster=2,  width=400, height=400, title='Cluster 3')
# cluster_3 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=8, ny=8, cluster=3,  width=400, height=400, title='Cluster 4')
# cluster_4 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=3, ny=3, cluster=4,  width=400, height=400, title='Cluster 5')
# cluster_5 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=4, ny=4, cluster=5,  width=400, height=400, title='Cluster 6')
# cluster_6 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=4, ny=4, cluster=6,  width=400, height=400, title='Cluster 7')
# cluster_7 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=5, ny=5, cluster=7,  width=400, height=400, title='Cluster 8')
# cluster_8 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=3, ny=3, cluster=8,  width=400, height=400, title='Cluster 9')
# cluster_9 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=3, ny=3, cluster=9,  width=400, height=400, title='Cluster 10')
# cluster_10 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=3, ny=3, cluster=10,  width=300, height=300, title='Cluster 11')
# cluster_11 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=3, ny=3, cluster=11,  width=300, height=300, title='Cluster 12')
# cluster_12 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=4, ny=4, cluster=10,  width=300, height=300, title='Cluster 13')
# cluster_13 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=5, ny=5, cluster=11,  width=300, height=300, title='Cluster 14')

In [251]:
# k_means_clusters = ((cluster_0 | cluster_1) & \
# (cluster_2 | cluster_3) & \
# (cluster_4 | cluster_5) & \
# (cluster_6 | cluster_7) & \
# (cluster_8 | cluster_9) & \
# (cluster_10 | cluster_11) & \
# (cluster_12 | cluster_13)).configure_axis(
#         grid=False
#     ).configure_view(
#         strokeOpacity=0
#     ).configure_concat(
#     spacing=50
# )

In [252]:
# k_means_clusters.display(renderer='svg')

# Dataframe for dashboard

In [246]:
images_strings = pd.DataFrame({'image_title': image_title, 'svg': svg_str})

In [247]:
images_strings.to_csv('../clean_data/free_choice_svg_strings.csv', index=False)
images_strings.to_csv('../../emotions_dashboard/data/free_choice_svg_strings.csv', index=False)

In [248]:
df_svg = pd.read_csv('../clean_data/free_choice_svg_strings.csv')

In [249]:
df_svg['image_title']

0                            Participants by sex
1                            Participants by age
2                      Participants by ethnicity
3               Participants by formal education
4         Overall results by expected label as %
5     Overall results by expected label as count
6                       Images depicting 'anger'
7                     Images depicting 'disgust'
8                        Images depicting 'fear'
9                    Images depicting 'surprise'
10                  Images depicting 'happiness'
11                    Images depicting 'sadness'
12        Images depicting 'uncertain (unknown)'
13                    Images depicting 'neutral'
14                          'anger' by ethnicity
15                        'disgust' by ethnicity
16                           'fear' by ethnicity
17                       'surprise' by ethnicity
18                      'happiness' by ethnicity
19                        'sadness' by ethnicity
20                  