In [4]:
import pandas as pd
import numpy as np
import altair as alt
import altair_saver

## Data ingestion

In [5]:
df = pd.read_csv('../data/emotion_free_choice_uw_students.csv')
df_label = pd.read_csv('../data/emotion_labels.csv')

In [6]:
df_label['url'] = df_label['url'].astype(str)

In [7]:
# replicate rows by 4 to match with free responses
df_labels = pd.DataFrame(np.repeat(df_label.values, 4, axis=0))
df_labels.columns = df_label.columns

In [8]:
df_labels.to_csv('../data/emotion_labels_free_choice.csv', index=False)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Columns: 802 entries, StartDate to Q195.1_4
dtypes: object(802)
memory usage: 633.0+ KB


In [10]:
df = df.iloc[19:, :]  # filter out test rows

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82 entries, 19 to 100
Columns: 802 entries, StartDate to Q195.1_4
dtypes: object(802)
memory usage: 513.9+ KB


In [12]:
df = df[df['Finished'] =='True'] # filter out incomplete surveys

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 19 to 95
Columns: 802 entries, StartDate to Q195.1_4
dtypes: object(802)
memory usage: 319.9+ KB


In [14]:
df.rename(columns={
    'Q1.2': 'sex',
    'Q1.3_1': 'age',
    'Q1.4': 'ethnicity',
    'Q1.5': 'formal education',
    'Q1.6_1': 'income'}, inplace=True)

In [15]:
df.to_csv('../clean_data/free_choice_emotion_uw_students.csv', index=False)

In [16]:
# placeholders to save svg strings
svg_str = []
image_title = []

## Demographics

In [17]:
def count_freq_labels(df, X="all", col="emotion"):
    if X == "all":
        df_counts = df.stack().reset_index(drop=True).value_counts() # stack as series
        df_counts = df_counts.to_frame('counts') # get value_counts as df
        df_counts['emotion'] = df_counts.index # get index as col
    elif X == "stacked":
        df_counts = df[col].value_counts() # count word frequency
        df_counts = df_counts.to_frame('counts') # get value_counts as df
        df_counts['emotion'] = df_counts.index # get index as col    
    else:
        df_counts = df[X].reset_index(drop=True).value_counts() # stack as series
        df_counts = df_counts.to_frame('counts') # get value_counts as df
        df_counts[X] = df_counts.index # get index as col

    df_counts = df_counts.reset_index(drop=True) # clean index
    df_counts['percent'] = df_counts['counts'] / df_counts['counts'].sum() # compute percentage
    return df_counts

In [18]:
def simple_per_bar(
    df, title='Title', X='percent:Q', Y='emotion:N', \
    width=450, height=250, sort='-x', \
    text_size = 12, label_size = 11, title_size=12, \
    emotion='Some', color1='#0570b0', color2='orange'):
    
    bars = alt.Chart(df, title=title).mark_bar().encode(
        alt.X(X, axis=alt.Axis(format='.0%')),
        y=alt.Y(Y, sort=sort), 
        color=alt.condition(
            alt.datum.emotion == emotion,
            alt.value(color2),
            alt.value(color1)
        ))
    
    text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3,  # Nudges text to right so it doesn't appear on top of the bar
    fontSize=text_size
    ).encode(
        alt.Text(X, format='.1%')
    )
    
    chart = (bars + text).configure_axis(
            labelFontSize=label_size,
            titleFontSize=title_size).properties(
                width=width, 
                height=height)
    
    
    return chart

In [19]:
def simple_count_bar(
    df, title='Title', X='counts:Q', Y='emotion:N', \
    width=450, height=250, sort='-x', \
    text_size = 12, label_size = 11, title_size=12,
    emotion='Some', color1='#0570b0', color2='#orange'):
    
    bars = alt.Chart(df, title=title).mark_bar().encode(
        alt.X(X),
        y=alt.Y(Y, sort=sort), 
        color=alt.condition(
            alt.datum.emotion == emotion,
            alt.value(color2),
            alt.value(color1)
        ))
    
    text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3,  # Nudges text to right so it doesn't appear on top of the bar
    fontSize=text_size
    ).encode(
        alt.Text(X)
    )
    
    chart = (bars + text).configure_axis(
            labelFontSize=label_size,
            titleFontSize=title_size).properties(
                width=width, 
                height=height)
    
    
    return chart

In [20]:
source = count_freq_labels(df, X="sex") 
title = 'Sex | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'sex:N'
w, h= 450, 100
txs, ls, ts = 12, 12, 12

chart_sex = simple_per_bar(source, title=title, X=X, Y=Y, width=w, height=h, text_size = txs, label_size = ls, title_size=ts)

In [21]:
chart_sex.display(renderer='svg')

In [22]:
chart_sex_string = altair_saver.save(chart_sex, fmt='svg')
image_title.append('Participants by sex')
svg_str.append(chart_sex_string)

In [23]:
source = count_freq_labels(df, X="age") 
title = 'Age | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'age:N'
w, h= 450, 150
txs, ls, ts = 12, 11, 12

chart_age = simple_per_bar(
    source, title=title, X=X, Y=Y, \
    width=w, height=h, \
    text_size = txs, label_size = ls, title_size=ts)
chart_age.display(renderer='svg')

In [24]:
chart_age_str = altair_saver.save(chart_age, fmt='svg')
image_title.append('Participants by age')
svg_str.append(chart_age_str)

In [25]:
source = count_freq_labels(df, X="ethnicity") 
title = 'Ethnicity | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'ethnicity:N'
w, h= 450, 150
txs, ls, ts = 12, 11, 12

chart_ethnicity= simple_per_bar(
    source, title=title, X=X, Y=Y, \
    width=w, height=h, \
    text_size = txs, label_size = ls, title_size=ts)
chart_ethnicity.display(renderer='svg')

In [26]:
chart_et_str = altair_saver.save(chart_ethnicity, fmt='svg')
image_title.append('Participants by ethnicity')
svg_str.append(chart_et_str)

In [27]:
source = count_freq_labels(df, X="formal education") 
title = 'Formal education | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'formal education:N'
w, h= 450, 150
txs, ls, ts = 12, 11, 12

chart_formal_education= simple_per_bar(
    source, title=title, X=X, Y=Y, width=w, height=h, text_size = txs, label_size = ls, title_size=ts)
chart_formal_education.display(renderer='svg')

In [28]:
chart_formal_education_str = altair_saver.save(chart_formal_education, fmt='svg')
image_title.append('Participants by formal education')
svg_str.append(chart_formal_education_str)

## Formating

In [29]:
df_emo_answers = df.loc[:, 'Q2.1_1':'Q195.1_4'] # get cols with words only

In [143]:
df_emo_answers.replace(['happy', 'happu', 'hapy', 'happyy', 'happyb', 'happt', 'haapy', 'hapoy'], 'happiness')
df_emo_answers.replace(['angry', 'angr'], 'anger')
df_emo_answers.replace(['sad', 'sadd', 'sadness'], 'sadness')
# [disgusted, disgust, disgusting, disguetsed, disugested]
# [surprise, surprised, surpsied, surpise, surprising, surprisef]
# [fear, fearful, fearfulness]



Unnamed: 0,Q2.1_1,Q2.1_2,Q2.1_3,Q2.1_4,Q3.1_1,Q3.1_2,Q3.1_3,Q3.1_4,Q4.1_1,Q4.1_2,...,Q193.1_3,Q193.1_4,Q194.1_1,Q194.1_2,Q194.1_3,Q194.1_4,Q195.1_1,Q195.1_2,Q195.1_3,Q195.1_4
19,angry,,,,yelling,,,,yelling,,...,,,funny,,,,silly,,,
20,,,,,mad,,,,mad,,...,,,dazed,,,,silly,,,
21,,,,,,,,,angry,,...,,,,,,,,,,
22,don't know,,,,?,,,,mad,,...,,,annoying,,,,,,,
23,Angry,Dissapointed,,,Angry,Spiteful,,,Angry,Frusturated,...,,,annoyed,,,,bored,,,
24,frustrated,,,,annoyed,,,,disappointed,,...,,,silly,,,,,,,
25,peaceful,,,,mad,,,,angry,,...,,,silly,,,,silly,,,
26,,,,,mad,,,,furious,,...,,,weird,,,,,,,na
27,Mad,,,,Mad,Frustrated,,,Angry,,...,,na,,,,na,,,,na
28,Calm,Happy,,,Angry,,,,Angry,,...,,,Disgust,,,,Wordless,,,


In [30]:
def formating_words(df, len_words=1, len_letters=2):
    df_stack = df.stack().reset_index(drop=True) # stack as series
    df_stack = df_stack.to_frame(name='emotion') # as DF
    df_stack['emotion'] = df_stack['emotion'].str.strip() # remove blank spaces
    df_stack['emotion'] = df_stack['emotion'].str.lower() # as lower case
    df_stack['emotion'] = df_stack['emotion'].replace({'na':'none'}) 
    df_stack['len_words'] = df_stack['emotion'].str.split().apply(len) # cnt number of words
    df_stack['len_letters'] = df_stack['emotion'].apply(len) # cont number of letters
    # get df with single words of 3 or more letters
    df_stack_single_word = df_stack[(df_stack['len_words'] == len_words) & (df_stack['len_letters'] > len_letters)] 
    return df_stack_single_word

In [31]:
df_stack_single_word = formating_words(df_emo_answers)

## Spell checking 
**NOTE**: poor results so far with this

In [32]:
# conda install -c conda-forge pattern 
# from pattern.en import suggest

In [33]:
# df_stack_single_word['emotion_spell_check'] = df_stack_single_word['emotion'].apply(lambda x: suggest(x)[0][0])
# df_stack_single_word['emotion'].size
# df_stack_single_word['emotion'].size - sum(df_stack_single_word['emotion'] == df_stack_single_word['emotion_spell_check']) # number of words changed

## Steamming

In [34]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer

In [35]:
ps = PorterStemmer()
ls = LancasterStemmer()
snowball = SnowballStemmer(language='english')

In [36]:
# Porter stemmer
df_stack_single_word['emotion_ps_steamed'] = df_stack_single_word['emotion'].apply(lambda x: ps.stem(x))

In [37]:
# Lancaster stemmer
df_stack_single_word['emotion_ls_steamed'] = df_stack_single_word['emotion'].apply(lambda x: ls.stem(x))

In [38]:
# Snowball stemmer
df_stack_single_word['emotion_sb_steamed'] = df_stack_single_word['emotion'].apply(lambda x: snowball.stem(x))

In [39]:
df_stack_single_word.head(3)

Unnamed: 0,emotion,len_words,len_letters,emotion_ps_steamed,emotion_ls_steamed,emotion_sb_steamed
0,angry,1,5,angri,angry,angri
1,yelling,1,7,yell,yel,yell
2,yelling,1,7,yell,yel,yell


## Count frequency and plot

In [40]:
df_emo_overall = count_freq_labels(df_stack_single_word, X="stacked", col='emotion_ps_steamed')

In [41]:
free_choice_word_list = count_freq_labels(df_stack_single_word, 'emotion')['emotion']

In [141]:
# count_freq_labels(df_stack_single_word, X="stacked", col='emotion')['emotion'].to_csv('../clean_data/free_word_list_raw.csv', index=False)

In [42]:
# free_choice_word_list.to_csv('../clean_data/free_choice_word_list.csv', index=False)

In [43]:
# df_emo_overall.to_csv('../clean_data/free_choice_emotion_uw_students_overall.csv', index=False)

In [44]:
source = df_emo_overall[df_emo_overall['counts'] > 49]
title = 'Labels with 50 or more mentions (steamed) | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 600
txs, ls, ts = 12, 11, 12

chart_overall_per = simple_per_bar(source, title=title, X=X, Y=Y, width=w, height=h, text_size = txs, label_size = ls, title_size=ts)
chart_overall_per.display(renderer='svg')

In [45]:
source = df_emo_overall[df_emo_overall['counts'] > 49]
title = 'Labels with 50 or more mentions (steamed) | n = '+ source['counts'].sum().astype(str)
X, Y = 'counts:Q', 'emotion:N'
w, h= 450, 600
txs, ls, ts = 12, 11, 12

chart_overall_per = simple_count_bar(source, title=title, X=X, Y=Y, width=w, height=h, text_size = txs, label_size = ls, title_size=ts)
chart_overall_per.display(renderer='svg')

## Most frequently used word for each emotion category (grouping pictures by expected-emotion)

In [46]:
def emotion_df_formated(df_emo_answers, emotion_label):
    df_emo_cat = df_emo_answers.copy() 
    df_emo_cat_t = df_emo_cat.T # transpose
    df_emo_cat_t['photo_id'] = df_emo_cat_t.index # get index as col
    df_emo_cat_t = df_emo_cat_t.reset_index(drop=True) # clean index
    df_emo_cat_t_labels = pd.concat([df_emo_cat_t, df_labels], axis=1) # add metadata cols
    df_label =  df_emo_cat_t_labels[df_emo_cat_t_labels['label'] == emotion_label]
    
    return df_label

### Anger

In [47]:
emotion = 'anger'
emotion_st = 'angri'
df_emotion = emotion_df_formated(df_emo_answers, emotion) # subset 'emotion' rows
df_emotion_ans = df_emotion.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1)
df_stack_emotion = formating_words(df_emotion_ans) # clean up
df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) #steam
source = count_freq_labels(df_stack_emotion,X='stacked', col='emotion_ps_steamed')
source_10 = source[source['counts'] > 10]

title = 'Expcted label: '+ emotion + ' | Labels with 10 or more mentions | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 600
txs, ls, ts = 12, 11, 12


chart_anger = simple_per_bar(
    source_10, title=title, X=X, Y=Y, \
    width=w, height=h, emotion=emotion_st, \
    text_size = txs, label_size = ls, title_size=ts)
chart_anger.display(renderer='svg')

In [48]:
chart_anger_str = altair_saver.save(chart_anger, fmt='svg')
image_title.append("Images depicting 'anger'")
svg_str.append(chart_anger_str)

### Disgust

In [49]:
emotion = 'disgust'
emotion_st = 'disgust'
df_emotion = emotion_df_formated(df_emo_answers, emotion) # subset 'emotion' rows
df_emotion_ans = df_emotion.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1)
df_stack_emotion = formating_words(df_emotion_ans) # clean up
df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) #steam
source = count_freq_labels(df_stack_emotion,X='stacked', col='emotion_ps_steamed')
source_10 = source[source['counts'] > 10]

title = 'Expcted label: '+ emotion + ' | Labels with 10 or more mentions | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 600
txs, ls, ts = 12, 11, 12


chart_disgust = simple_per_bar(
    source_10, title=title, X=X, Y=Y, \
    width=w, height=h, emotion=emotion_st, \
    text_size = txs, label_size = ls, title_size=ts)
chart_disgust.display(renderer='svg')

In [50]:
chart_disgust_str = altair_saver.save(chart_disgust, fmt='svg')
image_title.append("Images depicting 'disgust'")
svg_str.append(chart_disgust_str)

### Fear

In [51]:
emotion = 'fear'
emotion_st = 'fear'
df_emotion = emotion_df_formated(df_emo_answers, emotion) # subset 'emotion' rows
df_emotion_ans = df_emotion.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1)
df_stack_emotion = formating_words(df_emotion_ans) # clean up
df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) #steam
source = count_freq_labels(df_stack_emotion,X='stacked', col='emotion_ps_steamed')
source_10 = source[source['counts'] > 10]

title = 'Expcted label: '+ emotion + ' | Labels with 10 or more mentions | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 600
txs, ls, ts = 12, 11, 12


chart_fear = simple_per_bar(
    source_10, title=title, X=X, Y=Y, \
    width=w, height=h, emotion=emotion_st, \
    text_size = txs, label_size = ls, title_size=ts)
chart_fear.display(renderer='svg')

In [52]:
chart_fear_str = altair_saver.save(chart_fear, fmt='svg')
image_title.append("Images depicting 'fear'")
svg_str.append(chart_fear_str)

### Surprise

In [53]:
emotion = 'surprise'
emotion_st = 'surpris'
df_emotion = emotion_df_formated(df_emo_answers, emotion) # subset 'emotion' rows
df_emotion_ans = df_emotion.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1)
df_stack_emotion = formating_words(df_emotion_ans) # clean up
df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) #steam
source = count_freq_labels(df_stack_emotion,X='stacked', col='emotion_ps_steamed')
source_10 = source[source['counts'] > 10]

title = 'Expcted label: '+ emotion + ' | Labels with 10 or more mentions | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 600
txs, ls, ts = 12, 11, 12


chart_surprise = simple_per_bar(
    source_10, title=title, X=X, Y=Y, \
    width=w, height=h, emotion=emotion_st, \
    text_size = txs, label_size = ls, title_size=ts)
chart_surprise.display(renderer='svg')

In [54]:
chart_surprise_str = altair_saver.save(chart_surprise, fmt='svg')
image_title.append("Images depicting 'surprise'")
svg_str.append(chart_surprise_str)

### Happiness

In [55]:
emotion = 'happiness'
emotion_st = 'happi'
df_emotion = emotion_df_formated(df_emo_answers, emotion) # subset 'emotion' rows
df_emotion_ans = df_emotion.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1)
df_stack_emotion = formating_words(df_emotion_ans) # clean up
df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) #steam
source = count_freq_labels(df_stack_emotion,X='stacked', col='emotion_ps_steamed')
source_10 = source[source['counts'] > 10]

title = 'Expcted label: '+ emotion + ' | Labels with 10 or more mentions | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 600
txs, ls, ts = 12, 11, 12


chart_happiness = simple_per_bar(
    source_10, title=title, X=X, Y=Y, \
    width=w, height=h, emotion=emotion_st, \
    text_size = txs, label_size = ls, title_size=ts)
chart_happiness.display(renderer='svg')

In [56]:
chart_happiness_str = altair_saver.save(chart_happiness, fmt='svg')
image_title.append("Images depicting 'happiness'")
svg_str.append(chart_happiness_str)

### Sadness

In [57]:
emotion = 'sadness'
emotion_st = 'sad'
df_emotion = emotion_df_formated(df_emo_answers, emotion) # subset 'emotion' rows
df_emotion_ans = df_emotion.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1)
df_stack_emotion = formating_words(df_emotion_ans) # clean up
df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) #steam
source = count_freq_labels(df_stack_emotion,X='stacked', col='emotion_ps_steamed')
source_10 = source[source['counts'] > 10]

title = 'Expcted label: '+ emotion + ' | Labels with 10 or more mentions | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 600
txs, ls, ts = 12, 11, 12


chart_sadness = simple_per_bar(
    source_10, title=title, X=X, Y=Y, \
    width=w, height=h, emotion=emotion_st, \
    text_size = txs, label_size = ls, title_size=ts)
chart_sadness.display(renderer='svg')

In [58]:
chart_sadness_str = altair_saver.save(chart_sadness, fmt='svg')
image_title.append("Images depicting 'sadness'")
svg_str.append(chart_sadness_str)

### Uncertain

In [59]:
emotion = 'uncertain'
emotion_st = 'uncertain'
df_emotion = emotion_df_formated(df_emo_answers, emotion) # subset 'emotion' rows
df_emotion_ans = df_emotion.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1)
df_stack_emotion = formating_words(df_emotion_ans) # clean up
df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) #steam
source = count_freq_labels(df_stack_emotion,X='stacked', col='emotion_ps_steamed')
source_10 = source[source['counts'] > 10]

title = 'Expcted label: '+ emotion + ' | Labels with 10 or more mentions | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 600
txs, ls, ts = 12, 11, 12


chart_uncertain = simple_per_bar(
    source_10, title=title, X=X, Y=Y, \
    width=w, height=h, emotion=emotion_st, \
    text_size = txs, label_size = ls, title_size=ts)
chart_uncertain.display(renderer='svg')

In [60]:
chart_uncertain_str = altair_saver.save(chart_uncertain, fmt='svg')
image_title.append("Images depicting 'uncertain (unknown)'")
svg_str.append(chart_uncertain_str)

### Neutral

In [61]:
emotion = 'neutral'
emotion_st = 'neutral'
df_emotion = emotion_df_formated(df_emo_answers, emotion) # subset 'emotion' rows
df_emotion_ans = df_emotion.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1)
df_stack_emotion = formating_words(df_emotion_ans) # clean up
df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) #steam
source = count_freq_labels(df_stack_emotion, X='stacked', col='emotion_ps_steamed')
source_10 = source[source['counts'] > 10]

title = 'Expcted label: '+ emotion + ' | Labels with 10 or more mentions | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 600
txs, ls, ts = 12, 11, 12


chart_neutral = simple_per_bar(
    source_10, title=title, X=X, Y=Y, \
    width=w, height=h, emotion=emotion_st, \
    text_size = txs, label_size = ls, title_size=ts)
chart_neutral.display(renderer='svg')

In [62]:
chart_neutral_str = altair_saver.save(chart_neutral, fmt='svg')
image_title.append("Images depicting 'neutral'")
svg_str.append(chart_neutral_str)

## Most frequently used word for each emotion category (grouping pictures by expected-emotion), by ethnicity group

In [63]:
def simple_per_bar_concat(
    df, title='Title', X='percent:Q', Y='emotion:N', \
    width=450, height=250, sort='-x', \
    text_size = 12, label_size = 11, title_size=12, \
    emotion='Some', color1='#0570b0', color2='orange'):
    
    bars = alt.Chart(df, title=title).mark_bar().encode(
        alt.X(X, axis=alt.Axis(format='.0%')),
        y=alt.Y(Y, sort=sort), 
        color=alt.condition(
            alt.datum.emotion == emotion,
            alt.value(color2),
            alt.value(color1)
        ))
    
    text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3,  # Nudges text to right so it doesn't appear on top of the bar
    fontSize=text_size
    ).encode(
        alt.Text(X, format='.1%')
    )
    
    chart = (bars + text)
    
    return chart

In [64]:
def emotion_df_formated_et(df_emo_answers, emotion_label, ethnicity):
    df_emo_cat = df_emo_answers.copy() 
    df_emo_cat_t = df_emo_cat.T # transpose
    df_emo_cat_t['photo_id'] = df_emo_cat_t.index # get index as col
    df_emo_cat_t = df_emo_cat_t.reset_index(drop=True) # clean index
    df_emo_cat_t_labels = pd.concat([df_emo_cat_t, df_labels], axis=1) # add metadata cols
    df_label =  df_emo_cat_t_labels[(df_emo_cat_t_labels['label'] == emotion_label) & (df_emo_cat_t_labels['ethnicity'] == ethnicity)]
    return df_label

In [65]:
def wrapper_chart_emotion(df_emo_answers, emotion, ethnicity, emotion_st='angri', n_mentions=10):
    df = emotion_df_formated_et(df_emo_answers, emotion,  ethnicity) # subset 'anger' rows
    df_emotion_ans = df.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1)
    df_stack_emotion = formating_words(df_emotion_ans) # clean up
    df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) #steam
    df_count = count_freq_labels(df_stack_emotion, X='stacked', col='emotion_ps_steamed') # count label freq
    source = df_count[df_count['counts'] >= n_mentions]

    chart = simple_per_bar_concat(
        source, \
        title=f"Expected label: {emotion} | Labels with {n_mentions} or more mentions | n= {source['counts'].sum().astype(str)}", \
        emotion=emotion_st)
    return chart

### Anger

In [66]:
chart_anger_bipoc = wrapper_chart_emotion(df_emo_answers, 'anger', 'bipoc', emotion_st='angri', n_mentions=10)
chart_anger_white = wrapper_chart_emotion(df_emo_answers, 'anger', 'white', emotion_st='angri', n_mentions=10)

In [67]:
chart_anger_layered =  (chart_anger_bipoc | chart_anger_white)

In [68]:
chart_anger_layered.display(renderer='svg')

In [69]:
chart_anger_layered_str = altair_saver.save(chart_anger_layered, fmt='svg')
image_title.append("'anger' by ethnicity")
svg_str.append(chart_anger_layered_str)

### Disgust

In [70]:
chart_disgust_bipoc = wrapper_chart_emotion(df_emo_answers, 'disgust', 'bipoc', emotion_st='disgust', n_mentions=10)
chart_disgust_white = wrapper_chart_emotion(df_emo_answers, 'disgust', 'white', emotion_st='disgust', n_mentions=10)

In [71]:
chart_disgust_layered =  (chart_disgust_bipoc | chart_disgust_white)
chart_disgust_layered.display(renderer='svg')

In [72]:
chart_disgust_layered_str = altair_saver.save(chart_disgust_layered, fmt='svg')
image_title.append("'disgust' by ethnicity")
svg_str.append(chart_disgust_layered_str)

### Fear

In [73]:
chart_fear_bipoc = wrapper_chart_emotion(df_emo_answers, 'fear', 'bipoc', emotion_st='disgust', n_mentions=10)
chart_fear_white = wrapper_chart_emotion(df_emo_answers, 'fear', 'white', emotion_st='disgust', n_mentions=10)

In [74]:
chart_fear_layered =  (chart_fear_bipoc | chart_fear_white)
chart_fear_layered.display(renderer='svg')

In [75]:
chart_fear_layered_str = altair_saver.save(chart_fear_layered, fmt='svg')
image_title.append("'fear' by ethnicity")
svg_str.append(chart_fear_layered_str)

### Surprise

In [76]:
chart_surprise_bipoc= wrapper_chart_emotion(df_emo_answers, 'surprise', 'bipoc', emotion_st='surpris', n_mentions=10)
chart_surprise_white = wrapper_chart_emotion(df_emo_answers, 'surprise', 'white', emotion_st='surpris', n_mentions=10)

In [77]:
chart_surprise_layered =  (chart_surprise_bipoc | chart_surprise_white)
chart_surprise_layered.display(renderer='svg')

In [78]:
chart_surprise_layered_str = altair_saver.save(chart_surprise_layered, fmt='svg')
image_title.append("'surprise' by ethnicity")
svg_str.append(chart_surprise_layered_str)

### Happiness

In [79]:
chart_happiness_bipoc = wrapper_chart_emotion(df_emo_answers, 'happiness', 'bipoc', emotion_st='happi', n_mentions=10)
chart_happiness_white = wrapper_chart_emotion(df_emo_answers, 'happiness', 'white', emotion_st='happi', n_mentions=10)

In [80]:
chart_happiness_layered =  (chart_happiness_bipoc | chart_happiness_white)
chart_happiness_layered.display(renderer='svg')

In [81]:
chart_happiness_layered_str = altair_saver.save(chart_happiness_layered, fmt='svg')
image_title.append("'happiness' by ethnicity")
svg_str.append(chart_happiness_layered_str)

### Sadness

In [82]:
chart_sadness_bipoc = wrapper_chart_emotion(df_emo_answers, 'sadness', 'bipoc',  emotion_st='sad', n_mentions=10)
chart_sadness_white = wrapper_chart_emotion(df_emo_answers, 'sadness', 'white', emotion_st='sad', n_mentions=10)

In [83]:
chart_sadness_layered =  (chart_sadness_bipoc | chart_sadness_white)
chart_sadness_layered.display(renderer='svg')

In [84]:
chart_sadness_layered_str = altair_saver.save(chart_sadness_layered, fmt='svg')
image_title.append("'sadness' by ethnicity")
svg_str.append(chart_sadness_layered_str)

### Neutral

In [85]:
chart_neutral_bipoc = wrapper_chart_emotion(df_emo_answers, 'neutral', 'bipoc',  emotion_st='neutral', n_mentions=10)
chart_neutral_white = wrapper_chart_emotion(df_emo_answers, 'neutral', 'white', emotion_st='neutral', n_mentions=10)

In [86]:
chart_neutral_layered =  (chart_neutral_bipoc | chart_neutral_white)
chart_neutral_layered.display(renderer='svg')

In [87]:
chart_neutral_layered_str = altair_saver.save(chart_neutral_layered, fmt='svg')
image_title.append("'neutral' by ethnicity")
svg_str.append(chart_neutral_layered_str)

### Uncertain/Other

In [88]:
chart_uncertain_bipoc = wrapper_chart_emotion(df_emo_answers, 'uncertain', 'bipoc',  emotion_st='neutral', n_mentions=10)
chart_uncertain_white = wrapper_chart_emotion(df_emo_answers, 'uncertain', 'white',  emotion_st='neutral', n_mentions=10)

In [89]:
chart_uncertain_layered =  (chart_uncertain_bipoc | chart_uncertain_white)
chart_uncertain_layered.display(renderer='svg')

In [90]:
chart_uncertain_layered_str = altair_saver.save(chart_uncertain_layered, fmt='svg')
image_title.append("'uncertain' by ethnicity")
svg_str.append(chart_uncertain_layered_str)

### Most frequently used labels by photo

In [93]:
def photo_chart(df_emo_answers, emotion,  i, n_mentions=2):
    df = emotion_df_formated(df_emo_answers, emotion) # add emotion label
    photo_id_list = df['photo_id'].tolist() # photo id to list
    splited_photo_id = np.array_split(photo_id_list, 24) # split photo id by photo
    df = df[df['photo_id'].isin(splited_photo_id[i])] # select photo rows   
    df_url = df[['url']].head(1)  # get url for chart
    
    face = alt.Chart(df_url).mark_image(width=110, height=110, align='right', xOffset=0, yOffset=230).encode(url='url')
    
    df_emotion_ans = df.drop(['photo_id', 'ethnicity', 'sex', 'age', 'label', 'url'], axis=1) # clean cols
    df_stack_emotion = formating_words(df_emotion_ans) # clean up words
    df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) # steam
    df_stack_emotion_count = count_freq_labels(df_stack_emotion, X="stacked", col='emotion_ps_steamed') # group and count
    source = df_stack_emotion_count[df_stack_emotion_count['counts'] >= n_mentions]
    
    chart = simple_per_bar(source,\
                           title=f"Expected label: {emotion} | Labels with {n_mentions} or more mentions | n= {source['counts'].sum().astype(str)}", \
                           X='percent:Q', Y='emotion:N', width=300, height=300)
    
    return chart + face

In [94]:
photo_chart(df_emo_answers, 'anger',  7)

In [None]:
# def charts_emotion_faces(df_emo_answers, emotion):
#     charts = []
#     for i in range(0, 24):
#         charts.append(photo_chart(df_emo_answers, emotion,  i))
#     return charts

In [None]:
# def dashboard_emotion_faces(charts, rows):
#     if rows == 1:
#         dashboard= (charts[0] | charts[1] | charts[2])
#     if rows == 2:
#         dashboard= (charts[0] | charts[1] | charts[2]) & \
#                    (charts[3] | charts[4] | charts[5]) 
#     if rows == 3:
#         dashboard= (charts[0] | charts[1] | charts[2]) & \
#                    (charts[3] | charts[4] | charts[5]) & \
#                    (charts[6] | charts[7] | charts[8])
#     if rows == 4:
#         dashboard= (charts[0] | charts[1] | charts[2]) & \
#                    (charts[3] | charts[4] | charts[5]) & \
#                    (charts[6] | charts[7] | charts[8]) & \
#                    (charts[9] | charts[10] | charts[11])
#     if rows == 5:
#         dashboard= (charts[0] | charts[1] | charts[2]) & \
#                    (charts[3] | charts[4] | charts[5]) & \
#                    (charts[6] | charts[7] | charts[8]) & \
#                    (charts[9] | charts[10] | charts[11]) & \
#                    (charts[12] | charts[13] | charts[14]) 
#     if rows == 6:
#         dashboard= (charts[0] | charts[1] | charts[2]) & \
#                    (charts[3] | charts[4] | charts[5]) & \
#                    (charts[6] | charts[7] | charts[8]) & \
#                    (charts[9] | charts[10] | charts[11]) & \
#                    (charts[12] | charts[13] | charts[14]) & \
#                    (charts[15] | charts[16] | charts[17])
#     if rows == 7:
#         dashboard= (charts[0] | charts[1] | charts[2]) & \
#                    (charts[3] | charts[4] | charts[5]) & \
#                    (charts[6] | charts[7] | charts[8]) & \
#                    (charts[9] | charts[10] | charts[11]) & \
#                    (charts[12] | charts[13] | charts[14]) & \
#                    (charts[15] | charts[16] | charts[17]) & \
#                    (charts[18] | charts[19] | charts[20]) 
#     if rows == 8:
#         dashboard= (charts[0] | charts[1] | charts[2]) & \
#                    (charts[3] | charts[4] | charts[5]) & \
#                    (charts[6] | charts[7] | charts[8]) & \
#                    (charts[9] | charts[10] | charts[11]) & \
#                    (charts[12] | charts[13] | charts[14]) & \
#                    (charts[15] | charts[16] | charts[17]) & \
#                    (charts[18] | charts[19] | charts[20]) & \
#                    (charts[21] | charts[22] | charts[23]) 
        
#     return dashboard

In [None]:
# charts = charts_emotion_faces(df_emo_answers, 'sadness')
# dashboard_emotion_faces(charts, 8)

## Emotion percentages as feature vectors

In [125]:
photo_ids = pd.read_csv('../clean_data/photo_ids.csv')

In [126]:
df_emo_overall_raw = count_freq_labels(df_stack_single_word, X='stacked', col='emotion_ps_steamed')
emotion_words_list_steam = df_emo_overall_raw['emotion'].str.lower().tolist()
emotion_words_list = ['happiness','neutral', 'surprise','sadness', 'disgust', 'anger', 'fear', 'uncertain']

In [127]:
# placeholder dataframe
index_photos = photo_ids.iloc[:, 0] + '_1'
df_emo_features = pd.DataFrame(columns=emotion_words_list_steam) 
# add key-ids
df_emo_features.insert(loc=0, column='photo_id', value=index_photos)

In [128]:
def fill_in_emotion_vectors(df_emo_features, df_emo_answers, emotion_words_list):
    for emo in range(0, len(emotion_words_list)):
        
        emotion = emotion_words_list[emo] # get emotion word
        df_ans = emotion_df_formated(df_emo_answers, emotion) # ans for all pics of emo-word
        photo_id_list = df_ans['photo_id'].tolist() # photo ids for the emotion word
        splited_photo_id = np.array_split(photo_id_list, 24) # photo_id_list as an array 
        
        for ids in range(0, len(splited_photo_id)):
            photo_id = splited_photo_id[ids]                
            df_single = df_ans[df_ans['photo_id'].isin(photo_id)] # get row for nth photo
            df_emotion_ans = df_single.drop(['photo_id', 'ethnicity', 'sex', \
                                             'age', 'label', 'url'], axis=1) # clean up for calculation
            df_stack_emotion = formating_words(df_emotion_ans) 
            df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) # steam
            source = count_freq_labels(df_stack_emotion, X='stacked', col='emotion_ps_steamed') # group and count / compute vectors
            source = source[['percent', 'emotion']].T
            source.columns = source.iloc[1].str.lower() # emotion words as cols
            source.drop('emotion', axis=0, inplace=True) # clean up emotion row
            source.reset_index(drop=True, inplace=True) # clean up index
            source.columns.name = None
            photo_id_str = photo_id[0] # photo id as str
            source.insert(0, 'photo_id', photo_id_str) # photo id as col
            # set photo id as col for easier manipulation
            df_emo_features = df_emo_features.set_index('photo_id') 
            # fill in vectors where the value is available
            df_emo_features.loc[photo_id_str] = pd.Series(source.T.to_dict()[0])
            
            df_emo_features['photo_id'] = df_emo_features.index # index back to column
            df_emo_features.index.name = None 
            df_emo_features.reset_index(drop=True, inplace=True) # clean up index
    
    return df_emo_features
            

In [129]:
df_emo_vectors = fill_in_emotion_vectors(df_emo_features, df_emo_answers, emotion_words_list)
df_emo_vectors = df_emo_vectors.fillna(0)
df_label_raw = pd.read_csv('../data/emotion_labels.csv')
df_emo_vectors = pd.concat([df_emo_vectors, df_label_raw], axis=1) # concat metadata
# df_emo_vectors.to_csv('../clean_data/free_choice_uw_students_vectors.csv', index=False)

In [130]:
df_emo_vectors

Unnamed: 0,happi,sad,angri,confus,shock,surpris,upset,mad,disgust,scare,...,moodi,broken,sinist,schock,photo_id,ethnicity,sex,age,label,url
0,0.014085,0.000000,0.140845,0.042254,0.014085,0.014085,0.014085,0.070423,0.000000,0.000000,...,0.0,0.0,0.0,0.0,Q2.1_1,bipoc,female,adult,anger,https://uwmadison.co1.qualtrics.com/ControlPan...
1,0.012987,0.000000,0.337662,0.012987,0.000000,0.000000,0.116883,0.142857,0.000000,0.000000,...,0.0,0.0,0.0,0.0,Q3.1_1,bipoc,female,child,anger,https://uwmadison.co1.qualtrics.com/ControlPan...
2,0.000000,0.000000,0.395062,0.000000,0.000000,0.000000,0.086420,0.111111,0.000000,0.000000,...,0.0,0.0,0.0,0.0,Q4.1_1,bipoc,male,adult,anger,https://uwmadison.co1.qualtrics.com/ControlPan...
3,0.000000,0.211765,0.035294,0.188235,0.011765,0.011765,0.047059,0.023529,0.000000,0.011765,...,0.0,0.0,0.0,0.0,Q5.1_1,bipoc,male,child,anger,https://uwmadison.co1.qualtrics.com/ControlPan...
4,0.024691,0.000000,0.246914,0.000000,0.049383,0.000000,0.061728,0.135802,0.000000,0.000000,...,0.0,0.0,0.0,0.0,Q6.1_1,white,female,adult,anger,https://uwmadison.co1.qualtrics.com/ControlPan...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,0.031250,0.000000,0.062500,0.156250,0.000000,0.000000,0.031250,0.015625,0.000000,0.000000,...,0.0,0.0,0.0,0.0,Q191.1_1,bipoc,male,child,uncertain,https://uwmadison.co1.qualtrics.com/ControlPan...
190,0.000000,0.016129,0.016129,0.306452,0.096774,0.032258,0.016129,0.016129,0.016129,0.000000,...,0.0,0.0,0.0,0.0,Q192.1_1,white,female,adult,uncertain,https://uwmadison.co1.qualtrics.com/ControlPan...
191,0.016393,0.000000,0.147541,0.016393,0.000000,0.000000,0.032787,0.049180,0.000000,0.000000,...,0.0,0.0,0.0,0.0,Q193.1_1,white,female,child,uncertain,https://uwmadison.co1.qualtrics.com/ControlPan...
192,0.046875,0.000000,0.015625,0.015625,0.000000,0.000000,0.015625,0.015625,0.015625,0.000000,...,0.0,0.0,0.0,0.0,Q194.1_1,white,male,adult,uncertain,https://uwmadison.co1.qualtrics.com/ControlPan...


In [131]:
# placeholder dataframe
index_photos = photo_ids.iloc[:, 0] + '_1'
df_emo_features = pd.DataFrame(columns=free_choice_word_list) # CHANGE TO NOT STEAMED
# add key-ids
df_emo_features.insert(loc=0, column='photo_id', value=index_photos)

In [133]:
def fill_in_emotion_counts(df_emo_features, df_emo_answers, emotion_words_list):
    for emo in range(0, len(emotion_words_list)):
        
        emotion = emotion_words_list[emo] # get emotion word
        df_ans = emotion_df_formated(df_emo_answers, emotion) # ans for all pics of emo-word
        photo_id_list = df_ans['photo_id'].tolist() # photo ids for the emotion word
        splited_photo_id = np.array_split(photo_id_list, 24) # photo_id_list as an array 
        
        for ids in range(0, len(splited_photo_id)):
            photo_id = splited_photo_id[ids]
            df_single = df_ans[df_ans['photo_id'].isin(photo_id)] # get row for nth photo
            df_emotion_ans = df_single.drop(['photo_id', 'ethnicity', 'sex', \
                                             'age', 'label', 'url'], axis=1) # clean up for calculation
            df_stack_emotion = formating_words(df_emotion_ans) 
            source = count_freq_labels(df_stack_emotion, X='stacked', col='emotion') # group and count / compute vectors   
            ## count or percet
            source = source[['counts', 'emotion']].T
            source.columns = source.iloc[1].str.lower() # emotion words as cols
            source.drop('emotion', axis=0, inplace=True) # clean up emotion row
            source.reset_index(drop=True, inplace=True) # clean up index
            source.columns.name = None
            photo_id_str = photo_id[0] # photo id as str
            source.insert(0, 'photo_id', photo_id_str) # photo id as col
            # set photo id as col for easier manipulation
            df_emo_features = df_emo_features.set_index('photo_id') 
            # fill in vectors where the value is available
            df_emo_features.loc[photo_id_str] = pd.Series(source.T.to_dict()[0])
            
            df_emo_features['photo_id'] = df_emo_features.index # index back to column
            df_emo_features.index.name = None 
            df_emo_features.reset_index(drop=True, inplace=True) # clean up index
    
    return df_emo_features
            

In [134]:
df_emo_counts = fill_in_emotion_counts(df_emo_features, df_emo_answers, emotion_words_list)
# df_emo_counts = df_emo_counts.fillna(0) # No need to fill NA as means need to be computed later ignoring them
df_label_raw = pd.read_csv('../data/emotion_labels.csv')
df_emo_counts = pd.concat([df_emo_counts, df_label_raw], axis=1) # concat metadata
# df_emo_counts.to_csv('../clean_data/free_choice_uw_students_count_emotions.csv', index=False)

# Clustering

In [None]:
from kneed import KneeLocator
from sklearn import preprocessing as pp
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
from sklearn import decomposition

In [None]:
X = df_emo_vectors.copy().drop(['photo_id', 'ethnicity', 'sex', \
                                   'age', 'label', 'url'], axis=1)

In [None]:
features = X.columns
sX = pp.MinMaxScaler(copy=True)
X.loc[:,features] = sX.fit_transform(X[features])

In [None]:
# X.describe() # mean  = 0, SD =1

In [None]:
# pca = decomposition.PCA(n_components=3)
# pca.fit(X)
# X_pca = pca.transform(X)

In [None]:
# def k_means(n_clusters=3, n_inits=20, max_iter=1000, features=X):
#     kmeans = KMeans(
#     init="k-means++",
#     n_clusters=n_clusters,
#     n_init=n_inits,
#     max_iter=max_iter,
#     random_state=42)
#     kmeans.fit(features)
    
#     return kmeans

In [None]:
# kmeans_kwargs = {
#     "init": "k-means++",
#     "n_init": 20,
#     "max_iter": 1000,
#     "random_state": 42,
# }

# # A list holds the SSE values for each k
# sse_pca = []
# for k in range(1, 100):
#     kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
#     kmeans.fit(X_pca)
#     sse_pca.append(kmeans.inertia_)

In [None]:
# kl_pca = KneeLocator(
#     range(1, 100), sse_pca, curve="convex", direction="decreasing"
# )

# kl_pca.elbow

In [None]:
# source = pd.DataFrame({'y': sse_pca, 'x': range(1, 100)})

In [None]:
# chart_elbow_pca = alt.Chart(source).mark_line().encode(
#     alt.X('x:Q', title='Number of clusters - PCA'), 
#     alt.Y('y:Q', title='SSE'))

In [None]:
# # A list holds the silhouette coefficients for each k
# silhouette_coefficients_pca = []

# # Notice you start at 2 clusters for silhouette coefficient
# for k in range(2, 100):
#     kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
#     kmeans.fit(X_pca)
#     score = silhouette_score(X_pca, kmeans.labels_)
#     silhouette_coefficients_pca.append(score)

In [None]:
# source = pd.DataFrame({'y': silhouette_coefficients_pca, 'x': range(2, 100)})

In [None]:
# chart_silhouette_pca = alt.Chart(source).mark_line().encode(
#     alt.X('x:Q', title='Number of clusters - PCA'), 
#     alt.Y('y:Q', title='Silhouette coefficients'))

In [None]:
k_means_eval_chart =  chart_silhouette_pca | chart_elbow_pca

In [None]:
k_means_eval_chart

In [None]:
k_means_eval_str = altair_saver.save(k_means_eval_chart, fmt='svg')
image_title.append('K-means evaluation')
svg_str.append(k_means_eval_str)

In [None]:
# k_means_run_pca = k_means(n_clusters=10, features=X_pca)

In [None]:
# df_label_raw['clusters_pca'] = k_means_run_pca.labels_

In [None]:
# dfs_kmeans_pca = [pd.DataFrame(y) for x, y in df_label_raw.groupby('clusters_pca', as_index=False)]

In [None]:
# ## add photo coordinates
# for i in range(0, len(dfs_kmeans_pca)):
#     num_items = len(dfs_kmeans_pca[i])
#     dfs_kmeans_pca[i]['x'] = np.linspace(0.1, 3.0, num=num_items)
#     dfs_kmeans_pca[i]['y'] = np.linspace(0.1, 3.0, num=num_items)

In [None]:
# def grid_photos(dfs_kmeans=dfs_kmeans_pca, nx=6, ny=6, cluster=0, width=600, height=600,title='title'):
    
#     nx, ny = (nx, ny)
#     x = np.linspace(0, 1, nx)
#     y = np.linspace(0, 1, ny)
#     xv, yv = np.meshgrid(x, y)
    
#     dfs_kmeans[cluster]['x'] = xv.ravel()[0:len(dfs_kmeans[cluster])]
#     dfs_kmeans[cluster]['y'] = yv.ravel()[0:len(dfs_kmeans[cluster])]
    
#     chart = alt.Chart(dfs_kmeans[cluster], title=title).mark_image(
#         width=50,
#         height=50
#     ).encode(
#         alt.X('x', axis=None),
#         alt.Y('y', axis=None),
#         url='url'
#     )
    
#     text = chart.mark_text(
#     align='center',
#     baseline='bottom',
#     yOffset = -25
#     ).encode(
#         alt.Text('label'),
#         color=alt.Color('label',
#                         scale=alt.Scale(
#                             domain=emotion_words_list,
#                             range=['#ff4444', '#4c809c', '#9ae354', '#0000AA', '#FFA500', '#E4D00A', '#c41cac', '#50C878']))
#     )
    
    
#     return (chart + text).properties(width=width, height=height)

In [None]:
# dims_clusters = []
# for i in range (0, 10):
#     dims_clusters.append(dfs_kmeans_pca[i].shape[0])
# dims_clusters

In [None]:
cluster_0 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=6, ny=6, cluster=0,  width=400, height=400, title='Cluster 1')
cluster_1 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=6, ny=6, cluster=1,  width=400, height=400, title='Cluster 2')
cluster_2 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=3, ny=3, cluster=2,  width=400, height=400, title='Cluster 3')
cluster_3 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=6, ny=6, cluster=3,  width=400, height=400, title='Cluster 4')
cluster_4 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=4, ny=4, cluster=4,  width=400, height=400, title='Cluster 5')
cluster_5 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=5, ny=5, cluster=5,  width=400, height=400, title='Cluster 6')
cluster_6 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=5, ny=5, cluster=6,  width=400, height=400, title='Cluster 7')
cluster_7 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=4, ny=4, cluster=7,  width=400, height=400, title='Cluster 8')
cluster_8 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=4, ny=4, cluster=8,  width=400, height=400, title='Cluster 9')
cluster_9 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=5, ny=5, cluster=9,  width=400, height=400, title='Cluster 10')
# cluster_10 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=4, ny=4, cluster=10,  width=300, height=300, title='Cluster 11')
# cluster_11 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=1, ny=1, cluster=11,  width=300, height=300, title='Cluster 12')

In [None]:
k_means_clusters = ((cluster_0 | cluster_1) & \
(cluster_2 | cluster_3) & \
(cluster_4 | cluster_5) & \
(cluster_6 | cluster_7) & \
(cluster_8 | cluster_9)).configure_axis(
        grid=False
    ).configure_view(
        strokeOpacity=0
    ).configure_concat(
    spacing=50
)

In [None]:
k_means_clusters.display(renderer='svg')

# Dataframe for dashboard

In [None]:
images_strings = pd.DataFrame({'image_title': image_title, 'svg': svg_str})

In [None]:
images_strings.to_csv('../clean_data/free_choice_svg_strings.csv', index=False)
images_strings.to_csv('../../emotions_dashboard/data/free_choice_svg_strings.csv', index=False)

In [None]:
df_svg = pd.read_csv('../clean_data/free_choice_svg_strings.csv')

In [3]:
df_svg

NameError: name 'df_svg' is not defined