In [312]:
import pandas as pd
import numpy as np
import altair as alt

## Data ingestion

In [313]:
df = pd.read_csv('../data/emotion_free_choice_uw_students.csv')
df_label = pd.read_csv('../data/emotion_labels.csv')

In [314]:
df_label['url'] = df_label['url'].astype(str)

In [315]:
# replicate rows by 4 to match with free responses
df_labels = pd.DataFrame(np.repeat(df_label.values, 4, axis=0))
df_labels.columns = df_label.columns

In [316]:
df_labels.to_csv('../data/emotion_labels_free_choice.csv', index=False)

In [317]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Columns: 802 entries, StartDate to Q195.1_4
dtypes: object(802)
memory usage: 633.0+ KB


In [318]:
df = df.iloc[19:, :]  # filter out test rows

In [319]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82 entries, 19 to 100
Columns: 802 entries, StartDate to Q195.1_4
dtypes: object(802)
memory usage: 513.9+ KB


In [320]:
df = df[df['Finished'] =='True'] # filter out incomplete surveys

In [321]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 19 to 95
Columns: 802 entries, StartDate to Q195.1_4
dtypes: object(802)
memory usage: 319.9+ KB


In [322]:
df.rename(columns={
    'Q1.2': 'sex',
    'Q1.3_1': 'age',
    'Q1.4': 'ethnicity',
    'Q1.5': 'formal education',
    'Q1.6_1': 'income'}, inplace=True)

In [323]:
df.to_csv('../clean_data/free_choice_emotion_uw_students.csv', index=False)

In [324]:
# placeholders to save svg strings
svg_str = []
image_title = []

## Demographics

In [325]:
def count_freq_labels(df, X="all" ):
    if X == "all":
        df_counts = df.stack().reset_index(drop=True).value_counts() # stack as series
        df_counts = df_counts.to_frame('counts') # get value_counts as df
        df_counts['emotion'] = df_counts.index # get index as col
    else:
        df_counts = df[X].reset_index(drop=True).value_counts() # stack as series
        df_counts = df_counts.to_frame('counts') # get value_counts as df
        df_counts[X] = df_counts.index # get index as col

    df_counts = df_counts.reset_index(drop=True) # clean index
    df_counts['percent'] = df_counts['counts'] / df_counts['counts'].sum() # compute percentage
    return df_counts

In [326]:
def simple_per_bar(
    df, title='Title', X='percent:Q', Y='emotion:N', \
    width=450, height=250, sort='-x', \
    text_size = 12, label_size = 11, title_size=12, \
    emotion='Some', color1='#0570b0', color2='orange'):
    
    bars = alt.Chart(df, title=title).mark_bar().encode(
        alt.X(X, axis=alt.Axis(format='.0%')),
        y=alt.Y(Y, sort=sort), 
        color=alt.condition(
            alt.datum.emotion == emotion,
            alt.value(color2),
            alt.value(color1)
        ))
    
    text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3,  # Nudges text to right so it doesn't appear on top of the bar
    fontSize=text_size
    ).encode(
        alt.Text(X, format='.1%')
    )
    
    chart = (bars + text).configure_axis(
            labelFontSize=label_size,
            titleFontSize=title_size).properties(
                width=width, 
                height=height)
    
    
    return chart

In [327]:
source = count_freq_labels(df, X="sex") 
title = 'Sex | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'sex:N'
w, h= 450, 100
txs, ls, ts = 12, 12, 12

chart_sex = simple_per_bar(source, title=title, X=X, Y=Y, width=w, height=h, text_size = txs, label_size = ls, title_size=ts)

In [328]:
chart_sex.display(renderer='svg')

In [329]:
chart_sex_string = altair_saver.save(chart_sex, fmt='svg')
image_title.append('Participants by sex')
svg_str.append(chart_sex_string)

In [330]:
source = count_freq_labels(df, X="age") 
title = 'Age | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'age:N'
w, h= 450, 150
txs, ls, ts = 12, 11, 12

chart_age = simple_per_bar(
    source, title=title, X=X, Y=Y, \
    width=w, height=h, \
    text_size = txs, label_size = ls, title_size=ts)
chart_age.display(renderer='svg')

In [331]:
chart_age_str = altair_saver.save(chart_age, fmt='svg')
image_title.append('Participants by age')
svg_str.append(chart_age_str)

In [332]:
source = count_freq_labels(df, X="ethnicity") 
title = 'Ethnicity | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'ethnicity:N'
w, h= 450, 150
txs, ls, ts = 12, 11, 12

chart_ethnicity= simple_per_bar(
    source, title=title, X=X, Y=Y, \
    width=w, height=h, \
    text_size = txs, label_size = ls, title_size=ts)
chart_ethnicity.display(renderer='svg')

In [333]:
chart_et_str = altair_saver.save(chart_ethnicity, fmt='svg')
image_title.append('Participants by ethnicity')
svg_str.append(chart_et_str)

In [334]:
source = count_freq_labels(df, X="formal education") 
title = 'Formal education | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'formal education:N'
w, h= 450, 150
txs, ls, ts = 12, 11, 12

chart_formal_education= simple_per_bar(
    source, title=title, X=X, Y=Y, width=w, height=h, text_size = txs, label_size = ls, title_size=ts)
chart_formal_education.display(renderer='svg')

In [335]:
chart_formal_education_str = altair_saver.save(chart_formal_education, fmt='svg')
image_title.append('Participants by formal education')
svg_str.append(chart_formal_education_str)

## Formating

In [336]:
df_emo_answers = df.loc[:, 'Q2.1_1':'Q195.1_4'] # get cols with words only

In [337]:
def formating_words(df, len_words=1, len_letters=2):
    df_stack = df.stack().reset_index(drop=True) # stack as series
    df_stack = df_stack.to_frame(name='emotion') # as DF
    df_stack['emotion'] = df_stack['emotion'].str.strip() # remove blank spaces
    df_stack['emotion'] = df_stack['emotion'].str.lower() # as lower case
    df_stack['emotion'] = df_stack['emotion'].replace({'na':'none'}) 
    df_stack['len_words'] = df_stack['emotion'].str.split().apply(len) # cnt number of words
    df_stack['len_letters'] = df_stack['emotion'].apply(len) # cont number of letters
    # get df with single words of 3 or more letters
    df_stack_single_word = df_stack[(df_stack['len_words'] == len_words) & (df_stack['len_letters'] > len_letters)] 
    return df_stack_single_word

In [338]:
df_stack_single_word = formating_words(df_emo_answers)

## Spell checking 
**NOTE**: poor results so far with this

In [339]:
# conda install -c conda-forge pattern 
# from pattern.en import suggest

In [340]:
# df_stack_single_word['emotion_spell_check'] = df_stack_single_word['emotion'].apply(lambda x: suggest(x)[0][0])
# df_stack_single_word['emotion'].size
# df_stack_single_word['emotion'].size - sum(df_stack_single_word['emotion'] == df_stack_single_word['emotion_spell_check']) # number of words changed

## Steamming

In [341]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer

In [342]:
ps = PorterStemmer()
ls = LancasterStemmer()
snowball = SnowballStemmer(language='english')

In [343]:
# Porter stemmer
df_stack_single_word['emotion_ps_steamed'] = df_stack_single_word['emotion'].apply(lambda x: ps.stem(x))

In [344]:
# Lancaster stemmer
df_stack_single_word['emotion_ls_steamed'] = df_stack_single_word['emotion'].apply(lambda x: ls.stem(x))

In [345]:
# Snowball stemmer
df_stack_single_word['emotion_sb_steamed'] = df_stack_single_word['emotion'].apply(lambda x: snowball.stem(x))

In [346]:
df_stack_single_word.head(3)

Unnamed: 0,emotion,len_words,len_letters,emotion_ps_steamed,emotion_ls_steamed,emotion_sb_steamed
0,angry,1,5,angri,angry,angri
1,yelling,1,7,yell,yel,yell
2,yelling,1,7,yell,yel,yell


## Count frequency and plot

In [347]:
def count_freq_labels(df, col):
    df_counts = df[col].value_counts() # count word frequency 
    df_counts = df_counts.to_frame('counts') # get value_counts as df
    df_counts['emotion'] = df_counts.index # get index as col
    df_counts = df_counts.reset_index(drop=True) # clean index
    df_counts['percent'] = df_counts['counts'] / df_counts['counts'].sum() # compute percentage
    return df_counts

In [348]:
def simple_per_bar(df, X, Y, top, width=300, height=300, title='Title'):
    bars = alt.Chart(df.head(top), title=title).mark_bar().encode(
        alt.X(X, axis=alt.Axis(format='.0%')),
        y=alt.Y(Y, sort='-x'))
    
    text = bars.mark_text(
        align='left',
        baseline='middle',
        dx=3,  # Nudges text to right so it doesn't appear on top of the bar
        fontSize=10
    ).encode(
        alt.Text(X, format='.2%')
    )
    
    chart = (bars + text)
    
    return chart.properties(width=width, height=height)

In [349]:
def simple_count_bar(df, X, Y, top, width=300, height=300, title='Title'):
    bars = alt.Chart(df.head(top), title=title).mark_bar().encode(
        alt.X(X),
        y=alt.Y(Y, sort='-x'))
    
    text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3,  # Nudges text to right so it doesn't appear on top of the bar
    fontSize=10
    ).encode(
        alt.Text('counts:Q')
    )
    
    chart = (bars + text)
    
    return chart.properties(width=width, height=height)

In [350]:
df_emo_overall = count_freq_labels(df_stack_single_word, 'emotion_ps_steamed')

In [351]:
free_choice_word_list = count_freq_labels(df_stack_single_word, 'emotion')['emotion']

In [352]:
free_choice_word_list.to_csv('../clean_data/free_choice_word_list.csv', index=False)

In [353]:
df_emo_overall.to_csv('../clean_data/free_choice_emotion_uw_students_overall.csv', index=False)

In [354]:
df_emo_overall = count_freq_labels(df_stack_single_word, 'emotion_ps_steamed')
simple_per_bar(df_emo_overall, 'percent:Q', 'emotion:N', width=300, height=500, top=50, title='Most frequently selected labels | n = '+df_emo_overall['counts'].sum().astype(str))

In [355]:
simple_count_bar(df_emo_overall, 'counts:Q', 'emotion:N', top=20, title='Most frequently selected labels | n = '+df_emo_overall['counts'].sum().astype(str))

## Most frequently used word for each emotion category (grouping pictures by expected-emotion)

In [356]:
def df_add_label(df_emo_answers, emotion_label):
    df_emo_cat = df_emo_answers.copy() 
    df_emo_cat_t = df_emo_cat.T # transpose
    df_emo_cat_t['photo_id'] = df_emo_cat_t.index # get index as col
    df_emo_cat_t = df_emo_cat_t.reset_index(drop=True) # clean index
    df_emo_cat_t_labels = pd.concat([df_emo_cat_t, df_labels], axis=1) # add metadata cols
    df_label =  df_emo_cat_t_labels[df_emo_cat_t_labels['label'] == emotion_label]
    
    return df_label

In [357]:
    df_emotion = df_add_label(df, 'anger') # subset 'emotion' rows
    df_emotion_ans = df_emotion.drop(['photo_id', 'ethnicity', 'sex','age', 'label'], axis=1)
    df_stack_emotion = formating_words(df_emotion_ans) # clean up
    # df_stack_anger['emotion_spell_check'] = df_stack_single_word['emotion'].apply(lambda x: suggest(x)[0][0])
    df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) #steam
    df_stack_emotion_count = count_freq_labels(df_stack_emotion, 'emotion_ps_steamed')

In [358]:
def chart_wrapper(df, emotion, title):
    df_emotion = df_add_label(df, emotion) # subset 'emotion' rows
    df_emotion_ans = df_emotion.drop(['photo_id', 'ethnicity', 'sex','age', 'label'], axis=1)
    df_stack_emotion = formating_words(df_emotion_ans) # clean up
    # df_stack_anger['emotion_spell_check'] = df_stack_single_word['emotion'].apply(lambda x: suggest(x)[0][0])
    df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) #steam
    df_stack_emotion_count = count_freq_labels(df_stack_emotion, 'emotion_ps_steamed')
    n = df_stack_emotion_count['counts'].sum()    
    chart = simple_per_bar(df_stack_emotion_count, 'percent:Q', 'emotion:N', top=20, title=title+" | n= "+n.astype(str))
    
    return chart

### Charts

In [359]:
chart_anger = chart_wrapper(df_emo_answers, 'anger', 'Expected label: '+ ' anger')
chart_disgust = chart_wrapper(df_emo_answers, 'disgust', 'Expected label: '+ ' disgust')
chart_fear = chart_wrapper(df_emo_answers, 'fear', 'Expected label: '+ ' fear')
chart_surprise = chart_wrapper(df_emo_answers, 'surprise', 'Expected label: '+ ' surprise')
chart_happiness = chart_wrapper(df_emo_answers, 'happiness', 'Expected label: '+ ' happiness')
chart_sadness = chart_wrapper(df_emo_answers, 'sadness', 'Expected label: '+ ' sadness')
chart_uncertain = chart_wrapper(df_emo_answers, 'uncertain', 'Expected label: '+ ' uncertain')
chart_neutral = chart_wrapper(df_emo_answers, 'neutral', 'Expected label: '+ ' neutral')

In [360]:
(chart_anger | chart_disgust) & (chart_fear | chart_surprise) & (chart_happiness | chart_sadness) & (chart_neutral | chart_uncertain) 

### Most frequently used labels by photo

In [361]:
def photo_chart(df_emo_answers, emotion,  i):
    df = df_add_label(df_emo_answers, emotion) # add emotion label
    photo_id_list = df['photo_id'].tolist() # photo id to list
    splited_photo_id = np.array_split(photo_id_list, 24) # split photo id by photo
    df = df[df['photo_id'].isin(splited_photo_id[i])] # select photo rows   
    df_url = df[['url']].head(1)  # get url for chart
    
    face = alt.Chart(df_url).mark_image(width=110, height=110, align='right', xOffset=0, yOffset=230).encode(url='url')
    
    df_emotion_ans = df.drop(['photo_id', 'ethnicity', 'sex', 'age', 'label', 'url'], axis=1) # clean cols
    df_stack_emotion = formating_words(df_emotion_ans) # clean up words
    df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) # steam
    df_stack_emotion_count = count_freq_labels(df_stack_emotion, 'emotion_ps_steamed') # group and count
    
    chart = simple_per_bar(df_stack_emotion_count, 'percent:Q', 'emotion:N', width=300, height=300, top=20, title="Expected: "+" "+emotion+" | n = "+    df_stack_emotion_count['counts'].sum().astype(str))
    
    return chart + face

In [362]:
photo_chart(df_emo_answers, 'anger',  7)

In [363]:
def charts_emotion_faces(df_emo_answers, emotion):
    charts = []
    for i in range(0, 24):
        charts.append(photo_chart(df_emo_answers, emotion,  i))
    return charts

In [364]:
def dashboard_emotion_faces(charts, rows):
    if rows == 1:
        dashboard= (charts[0] | charts[1] | charts[2])
    if rows == 2:
        dashboard= (charts[0] | charts[1] | charts[2]) & \
                   (charts[3] | charts[4] | charts[5]) 
    if rows == 3:
        dashboard= (charts[0] | charts[1] | charts[2]) & \
                   (charts[3] | charts[4] | charts[5]) & \
                   (charts[6] | charts[7] | charts[8])
    if rows == 4:
        dashboard= (charts[0] | charts[1] | charts[2]) & \
                   (charts[3] | charts[4] | charts[5]) & \
                   (charts[6] | charts[7] | charts[8]) & \
                   (charts[9] | charts[10] | charts[11])
    if rows == 5:
        dashboard= (charts[0] | charts[1] | charts[2]) & \
                   (charts[3] | charts[4] | charts[5]) & \
                   (charts[6] | charts[7] | charts[8]) & \
                   (charts[9] | charts[10] | charts[11]) & \
                   (charts[12] | charts[13] | charts[14]) 
    if rows == 6:
        dashboard= (charts[0] | charts[1] | charts[2]) & \
                   (charts[3] | charts[4] | charts[5]) & \
                   (charts[6] | charts[7] | charts[8]) & \
                   (charts[9] | charts[10] | charts[11]) & \
                   (charts[12] | charts[13] | charts[14]) & \
                   (charts[15] | charts[16] | charts[17])
    if rows == 7:
        dashboard= (charts[0] | charts[1] | charts[2]) & \
                   (charts[3] | charts[4] | charts[5]) & \
                   (charts[6] | charts[7] | charts[8]) & \
                   (charts[9] | charts[10] | charts[11]) & \
                   (charts[12] | charts[13] | charts[14]) & \
                   (charts[15] | charts[16] | charts[17]) & \
                   (charts[18] | charts[19] | charts[20]) 
    if rows == 8:
        dashboard= (charts[0] | charts[1] | charts[2]) & \
                   (charts[3] | charts[4] | charts[5]) & \
                   (charts[6] | charts[7] | charts[8]) & \
                   (charts[9] | charts[10] | charts[11]) & \
                   (charts[12] | charts[13] | charts[14]) & \
                   (charts[15] | charts[16] | charts[17]) & \
                   (charts[18] | charts[19] | charts[20]) & \
                   (charts[21] | charts[22] | charts[23]) 
        
    return dashboard

In [365]:
# charts = charts_emotion_faces(df_emo_answers, 'sadness')
# dashboard_emotion_faces(charts, 8)

## Emotion percentages as feature vectors

In [378]:
photo_ids = pd.read_csv('../clean_data/photo_ids.csv')

In [379]:
df_emo_overall_raw = count_freq_labels(df_stack_single_word, 'emotion_ps_steamed')
emotion_words_list_steam = df_emo_overall_raw['emotion'].str.lower().tolist()
emotion_words_list = ['happiness','neutral', 'surprise','sadness', 'disgust', 'anger', 'fear', 'uncertain']

In [380]:
# placeholder dataframe
index_photos = photo_ids.iloc[:, 0] + '_1'
df_emo_features = pd.DataFrame(columns=emotion_words_list_steam) 
# add key-ids
df_emo_features.insert(loc=0, column='photo_id', value=index_photos)

In [381]:
def fill_in_emotion_vectors(df_emo_features, df_emo_answers, emotion_words_list):
    for emo in range(0, len(emotion_words_list)):
        
        emotion = emotion_words_list[emo] # get emotion word
        df_ans = df_add_label(df_emo_answers, emotion) # ans for all pics of emo-word
        photo_id_list = df_ans['photo_id'].tolist() # photo ids for the emotion word
        splited_photo_id = np.array_split(photo_id_list, 24) # photo_id_list as an array 
        
        for ids in range(0, len(splited_photo_id)):
            photo_id = splited_photo_id[ids]
            df_single = df_ans[df_ans['photo_id'].isin(photo_id)] # get row for nth photo
            df_emotion_ans = df_single.drop(['photo_id', 'ethnicity', 'sex', \
                                             'age', 'label', 'url'], axis=1) # clean up for calculation
            df_stack_emotion = formating_words(df_emotion_ans) 
            df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) # steam
            source = count_freq_labels(df_stack_emotion, 'emotion_ps_steamed') # group and count / compute vectors
            source = source[['percent', 'emotion']].T
            source.columns = source.iloc[1].str.lower() # emotion words as cols
            source.drop('emotion', axis=0, inplace=True) # clean up emotion row
            source.reset_index(drop=True, inplace=True) # clean up index
            source.columns.name = None
            photo_id_str = photo_id[0] # photo id as str
            source.insert(0, 'photo_id', photo_id_str) # photo id as col
            # set photo id as col for easier manipulation
            df_emo_features = df_emo_features.set_index('photo_id') 
            # fill in vectors where the value is available
            df_emo_features.loc[photo_id_str] = pd.Series(source.T.to_dict()[0])
            
            df_emo_features['photo_id'] = df_emo_features.index # index back to column
            df_emo_features.index.name = None 
            df_emo_features.reset_index(drop=True, inplace=True) # clean up index
    
    return df_emo_features
            

In [383]:
df_emo_vectors = fill_in_emotion_vectors(df_emo_features, df_emo_answers, emotion_words_list)# df_emo_vectors = df_emo_vectors.fillna(0)
df_emo_vectors = df_emo_vectors.fillna(0)
df_label_raw = pd.read_csv('../data/emotion_labels.csv')
df_emo_vectors = pd.concat([df_emo_vectors, df_label_raw], axis=1) # concat metadata
df_emo_vectors.to_csv('../clean_data/free_choice_uw_students_vectors.csv', index=False)

In [400]:
# placeholder dataframe
index_photos = photo_ids.iloc[:, 0] + '_1'
df_emo_features = pd.DataFrame(columns=free_choice_word_list) # CHANGE TO NOT STEAMED
# add key-ids
df_emo_features.insert(loc=0, column='photo_id', value=index_photos)

In [406]:
def fill_in_emotion_counts(df_emo_features, df_emo_answers, emotion_words_list):
    for emo in range(0, len(emotion_words_list)):
        
        emotion = emotion_words_list[emo] # get emotion word
        df_ans = df_add_label(df_emo_answers, emotion) # ans for all pics of emo-word
        photo_id_list = df_ans['photo_id'].tolist() # photo ids for the emotion word
        splited_photo_id = np.array_split(photo_id_list, 24) # photo_id_list as an array 
        
        for ids in range(0, len(splited_photo_id)):
            photo_id = splited_photo_id[ids]
            df_single = df_ans[df_ans['photo_id'].isin(photo_id)] # get row for nth photo
            df_emotion_ans = df_single.drop(['photo_id', 'ethnicity', 'sex', \
                                             'age', 'label', 'url'], axis=1) # clean up for calculation
            df_stack_emotion = formating_words(df_emotion_ans) 
            source = count_freq_labels(df_stack_emotion, 'emotion') # group and count / compute vectors   
            ## count or percet
            source = source[['counts', 'emotion']].T
            source.columns = source.iloc[1].str.lower() # emotion words as cols
            source.drop('emotion', axis=0, inplace=True) # clean up emotion row
            source.reset_index(drop=True, inplace=True) # clean up index
            source.columns.name = None
            photo_id_str = photo_id[0] # photo id as str
            source.insert(0, 'photo_id', photo_id_str) # photo id as col
            # set photo id as col for easier manipulation
            df_emo_features = df_emo_features.set_index('photo_id') 
            # fill in vectors where the value is available
            df_emo_features.loc[photo_id_str] = pd.Series(source.T.to_dict()[0])
            
            df_emo_features['photo_id'] = df_emo_features.index # index back to column
            df_emo_features.index.name = None 
            df_emo_features.reset_index(drop=True, inplace=True) # clean up index
    
    return df_emo_features
            

In [407]:
df_emo_counts = fill_in_emotion_counts(df_emo_features, df_emo_answers, emotion_words_list)
# df_emo_counts = df_emo_counts.fillna(0) # replace NAs with 0
# df_label_raw = pd.read_csv('../data/emotion_labels.csv')
# df_emo_counts = pd.concat([df_emo_counts, df_label_raw], axis=1) # concat metadata
# df_emo_counts.to_csv('../clean_data/free_choice_uw_students_count_emotions.csv', index=False)

In [408]:
df_emo_counts

emotion,happy,sad,angry,confused,upset,shocked,surprised,mad,scared,none,...,broken,moody,sexy,anrgy,touring,tounge,unknowing,pyscho,schocked,photo_id
0,1,,10,2,1,,,5,,2,...,,,,,,,,,,Q2.1_1
1,1,,26,1,9,,,11,,,...,,,,,,,,,,Q3.1_1
2,,,32,,7,,,9,,,...,,,,,,,,,,Q4.1_1
3,,17,3,13,4,1,1,2,1,,...,,,,,,,,,,Q5.1_1
4,2,,20,,5,4,,11,,,...,,,,,,,,,,Q6.1_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,2,,4,10,2,,,1,,5,...,,,,,,,,,,Q191.1_1
190,,1,1,18,1,6,2,1,,4,...,,,,,,,,,,Q192.1_1
191,1,,9,1,2,,,3,,4,...,,,,,,,,,,Q193.1_1
192,3,,1,1,1,,,1,,4,...,,,,,,,,,,Q194.1_1


# Clustering

In [None]:
from kneed import KneeLocator
from sklearn import preprocessing as pp
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler
from sklearn import decomposition

In [None]:
X = df_emo_vectors.copy().drop(['photo_id', 'ethnicity', 'sex', \
                                   'age', 'label', 'url'], axis=1)

In [None]:
features = X.columns
sX = pp.MinMaxScaler(copy=True)
X.loc[:,features] = sX.fit_transform(X[features])

In [None]:
# X.describe() # mean  = 0, SD =1

In [None]:
# pca = decomposition.PCA(n_components=3)
# pca.fit(X)
# X_pca = pca.transform(X)

In [None]:
# def k_means(n_clusters=3, n_inits=20, max_iter=1000, features=X):
#     kmeans = KMeans(
#     init="k-means++",
#     n_clusters=n_clusters,
#     n_init=n_inits,
#     max_iter=max_iter,
#     random_state=42)
#     kmeans.fit(features)
    
#     return kmeans

In [None]:
# kmeans_kwargs = {
#     "init": "k-means++",
#     "n_init": 20,
#     "max_iter": 1000,
#     "random_state": 42,
# }

# # A list holds the SSE values for each k
# sse_pca = []
# for k in range(1, 100):
#     kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
#     kmeans.fit(X_pca)
#     sse_pca.append(kmeans.inertia_)

In [None]:
# kl_pca = KneeLocator(
#     range(1, 100), sse_pca, curve="convex", direction="decreasing"
# )

# kl_pca.elbow

In [None]:
# source = pd.DataFrame({'y': sse_pca, 'x': range(1, 100)})

In [None]:
# chart_elbow_pca = alt.Chart(source).mark_line().encode(
#     alt.X('x:Q', title='Number of clusters - PCA'), 
#     alt.Y('y:Q', title='SSE'))

In [None]:
# # A list holds the silhouette coefficients for each k
# silhouette_coefficients_pca = []

# # Notice you start at 2 clusters for silhouette coefficient
# for k in range(2, 100):
#     kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
#     kmeans.fit(X_pca)
#     score = silhouette_score(X_pca, kmeans.labels_)
#     silhouette_coefficients_pca.append(score)

In [None]:
# source = pd.DataFrame({'y': silhouette_coefficients_pca, 'x': range(2, 100)})

In [None]:
# chart_silhouette_pca = alt.Chart(source).mark_line().encode(
#     alt.X('x:Q', title='Number of clusters - PCA'), 
#     alt.Y('y:Q', title='Silhouette coefficients'))

In [None]:
k_means_eval_chart =  chart_silhouette_pca | chart_elbow_pca

In [None]:
k_means_eval_chart

In [None]:
k_means_eval_str = altair_saver.save(k_means_eval_chart, fmt='svg')
image_title.append('K-means evaluation')
svg_str.append(k_means_eval_str)

In [None]:
# k_means_run_pca = k_means(n_clusters=10, features=X_pca)

In [None]:
# df_label_raw['clusters_pca'] = k_means_run_pca.labels_

In [None]:
# dfs_kmeans_pca = [pd.DataFrame(y) for x, y in df_label_raw.groupby('clusters_pca', as_index=False)]

In [None]:
# ## add photo coordinates
# for i in range(0, len(dfs_kmeans_pca)):
#     num_items = len(dfs_kmeans_pca[i])
#     dfs_kmeans_pca[i]['x'] = np.linspace(0.1, 3.0, num=num_items)
#     dfs_kmeans_pca[i]['y'] = np.linspace(0.1, 3.0, num=num_items)

In [None]:
# def grid_photos(dfs_kmeans=dfs_kmeans_pca, nx=6, ny=6, cluster=0, width=600, height=600,title='title'):
    
#     nx, ny = (nx, ny)
#     x = np.linspace(0, 1, nx)
#     y = np.linspace(0, 1, ny)
#     xv, yv = np.meshgrid(x, y)
    
#     dfs_kmeans[cluster]['x'] = xv.ravel()[0:len(dfs_kmeans[cluster])]
#     dfs_kmeans[cluster]['y'] = yv.ravel()[0:len(dfs_kmeans[cluster])]
    
#     chart = alt.Chart(dfs_kmeans[cluster], title=title).mark_image(
#         width=50,
#         height=50
#     ).encode(
#         alt.X('x', axis=None),
#         alt.Y('y', axis=None),
#         url='url'
#     )
    
#     text = chart.mark_text(
#     align='center',
#     baseline='bottom',
#     yOffset = -25
#     ).encode(
#         alt.Text('label'),
#         color=alt.Color('label',
#                         scale=alt.Scale(
#                             domain=emotion_words_list,
#                             range=['#ff4444', '#4c809c', '#9ae354', '#0000AA', '#FFA500', '#E4D00A', '#c41cac', '#50C878']))
#     )
    
    
#     return (chart + text).properties(width=width, height=height)

In [None]:
# dims_clusters = []
# for i in range (0, 10):
#     dims_clusters.append(dfs_kmeans_pca[i].shape[0])
# dims_clusters

In [None]:
cluster_0 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=6, ny=6, cluster=0,  width=400, height=400, title='Cluster 1')
cluster_1 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=6, ny=6, cluster=1,  width=400, height=400, title='Cluster 2')
cluster_2 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=3, ny=3, cluster=2,  width=400, height=400, title='Cluster 3')
cluster_3 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=6, ny=6, cluster=3,  width=400, height=400, title='Cluster 4')
cluster_4 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=4, ny=4, cluster=4,  width=400, height=400, title='Cluster 5')
cluster_5 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=5, ny=5, cluster=5,  width=400, height=400, title='Cluster 6')
cluster_6 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=5, ny=5, cluster=6,  width=400, height=400, title='Cluster 7')
cluster_7 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=4, ny=4, cluster=7,  width=400, height=400, title='Cluster 8')
cluster_8 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=4, ny=4, cluster=8,  width=400, height=400, title='Cluster 9')
cluster_9 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=5, ny=5, cluster=9,  width=400, height=400, title='Cluster 10')
# cluster_10 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=4, ny=4, cluster=10,  width=300, height=300, title='Cluster 11')
# cluster_11 = grid_photos(dfs_kmeans = dfs_kmeans_pca, nx=1, ny=1, cluster=11,  width=300, height=300, title='Cluster 12')

In [None]:
k_means_clusters = ((cluster_0 | cluster_1) & \
(cluster_2 | cluster_3) & \
(cluster_4 | cluster_5) & \
(cluster_6 | cluster_7) & \
(cluster_8 | cluster_9)).configure_axis(
        grid=False
    ).configure_view(
        strokeOpacity=0
    ).configure_concat(
    spacing=50
)

In [None]:
k_means_clusters.display(renderer='svg')

## PCA - 2D

In [None]:
pca = decomposition.PCA(n_components=2)
pca.fit(X)
X_pca_2 = pca.transform(X)

In [None]:
df_label_raw['x_pca'], df_label_raw['y_pca'] = X_pca_2[:, 0], X_pca_2[:, 1]

In [None]:
pca_images_free = alt.Chart(df_label_raw, width=600, height=600).mark_image(
    width=35,
    height=35
).encode(
    alt.X('x_pca:Q'),
    alt.Y('y_pca:Q'),
    url='url'
)

In [None]:
pca_images_free

In [None]:
## read image to concat from other notebook
%store -r pca_images_forced

In [None]:
pca_images_forced | pca_images_free

In [None]:
pca_text_free = alt.Chart(df_label_raw, width=600, height=600).mark_text(
    align='center',
    baseline='bottom',
    fontSize=13,
    ).encode(
        alt.X('x_pca'),
        alt.Y('y_pca'),
        alt.Text('label'),
        color=alt.Color('label',
                        scale=alt.Scale(
                            domain=emotion_words_list,
                            range=['#ff4444', '#4c809c', 'green', '#ffc286', '#FFA500', '#bd6499', '#c41cac', 'black']),
                       legend=None)
    )

In [None]:
pca_text_free

In [None]:
## read image to concat from other notebook
%store -r pca_text_forced

In [None]:
pca_text_forced | pca_text_free

## PCA - 3D

In [None]:
import plotly.express as px
from IPython.display import HTML

In [None]:
pca = decomposition.PCA(n_components=3)
pca.fit(X)
X_pca_3 = pca.transform(X)

In [None]:
df_label_raw['x_pca_3'], df_label_raw['y_pca_3'], df_label_raw['z_pca_3'] = X_pca_3[:, 0], X_pca_3[:, 1], X_pca_3[:, 2]

In [None]:
df_label_raw.to_csv('../clean_data/free_labeling_pca_3.csv', index=False)

In [None]:
fig = px.scatter_3d(df_label_raw, width=700, height=600, x='x_pca_3', y='y_pca_3', z='z_pca_3',
              color='label')

fig.update_traces(marker=dict(size=8,
                              line=dict(width=1,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))

fig.update_layout(
    scene = dict(
        xaxis = dict(nticks=8, range=[-1.6,1.6],),
        yaxis = dict(nticks=8, range=[-1.6,1.6],),
        zaxis = dict(nticks=8, range=[-1.6,1.6],)))

HTML(fig.to_html())

# Valence analysis

In [None]:
!pip install afinn

In [None]:
from afinn import Afinn

In [None]:
afinn = Afinn(language='en')

In [None]:
afinn.score('irritated')

# Dataframe for dashboard

In [None]:
images_strings = pd.DataFrame({'image_title': image_title, 'svg': svg_str})

In [None]:
images_strings.to_csv('../clean_data/free_choice_svg_strings.csv', index=False)
images_strings.to_csv('../../emotions_dashboard/data/free_choice_svg_strings.csv', index=False)

In [None]:
df_svg = pd.read_csv('../clean_data/free_choice_svg_strings.csv')

In [311]:
df_svg

Unnamed: 0,image_title,svg
0,Participants by sex,"<svg xmlns=""http://www.w3.org/2000/svg"" xmlns:..."
1,Participants by age,"<svg xmlns=""http://www.w3.org/2000/svg"" xmlns:..."
2,Participants by ethnicity,"<svg xmlns=""http://www.w3.org/2000/svg"" xmlns:..."
3,Participants by formal education,"<svg xmlns=""http://www.w3.org/2000/svg"" xmlns:..."
4,K-means evaluation,"<svg xmlns=""http://www.w3.org/2000/svg"" xmlns:..."
5,K-means evaluation,"<svg xmlns=""http://www.w3.org/2000/svg"" xmlns:..."
6,K-means evaluation,"<svg xmlns=""http://www.w3.org/2000/svg"" xmlns:..."
