In [2]:
import pandas as pd
import numpy as np
import altair as alt

## Data ingestion

In [97]:
df = pd.read_csv('../data/emotion_free_choice_uw_students.csv')
df_label = pd.read_csv('../data/emotion_labels.csv')

In [4]:
df_label['url'] = df_label['url'].astype(str)

In [5]:
# replicate rows by 4 to match with free responses
df_labels = pd.DataFrame(np.repeat(df_label.values, 4, axis=0))
df_labels.columns = df_label.columns

In [6]:
df_labels.to_csv('../data/emotion_labels_free_choice.csv', index=False)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Columns: 802 entries, StartDate to Q195.1_4
dtypes: object(802)
memory usage: 633.0+ KB


In [8]:
df = df.iloc[19:, :]  # filter out test rows

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82 entries, 19 to 100
Columns: 802 entries, StartDate to Q195.1_4
dtypes: object(802)
memory usage: 513.9+ KB


In [10]:
df = df[df['Finished'] =='True'] # filter out incomplete surveys

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 19 to 95
Columns: 802 entries, StartDate to Q195.1_4
dtypes: object(802)
memory usage: 319.9+ KB


In [12]:
df.to_csv('../clean_data/free_choice_emotion_uw_students.csv', index=False)

## Formating

In [13]:
df_emo_answers = df.loc[:, 'Q2.1_1':'Q195.1_4'] # get cols with words only

In [14]:
def formating_words(df, len_words=1, len_letters=2):
    df_stack = df.stack().reset_index(drop=True) # stack as series
    df_stack = df_stack.to_frame(name='emotion') # as DF
    df_stack['emotion'] = df_stack['emotion'].str.strip() # remove blank spaces
    df_stack['emotion'] = df_stack['emotion'].str.lower() # as lower case
    df_stack['emotion'] = df_stack['emotion'].replace({'na':'none'}) 
    df_stack['len_words'] = df_stack['emotion'].str.split().apply(len) # cnt number of words
    df_stack['len_letters'] = df_stack['emotion'].apply(len) # cont number of letters
    # get df with single words of 3 or more letters
    df_stack_single_word = df_stack[(df_stack['len_words'] == len_words) & (df_stack['len_letters'] > len_letters)] 
    return df_stack_single_word

In [15]:
df_stack_single_word = formating_words(df_emo_answers)

## Spell checking 
**NOTE**: poor results so far with this

In [16]:
# conda install -c conda-forge pattern 
# from pattern.en import suggest

In [17]:
# df_stack_single_word['emotion_spell_check'] = df_stack_single_word['emotion'].apply(lambda x: suggest(x)[0][0])
# df_stack_single_word['emotion'].size
# df_stack_single_word['emotion'].size - sum(df_stack_single_word['emotion'] == df_stack_single_word['emotion_spell_check']) # number of words changed

## Steamming

In [18]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer

In [19]:
ps = PorterStemmer()
ls = LancasterStemmer()
snowball = SnowballStemmer(language='english')

In [20]:
# Porter stemmer
df_stack_single_word['emotion_ps_steamed'] = df_stack_single_word['emotion'].apply(lambda x: ps.stem(x))

In [21]:
# Lancaster stemmer
df_stack_single_word['emotion_ls_steamed'] = df_stack_single_word['emotion'].apply(lambda x: ls.stem(x))

In [22]:
# Snowball stemmer
df_stack_single_word['emotion_sb_steamed'] = df_stack_single_word['emotion'].apply(lambda x: snowball.stem(x))

In [23]:
df_stack_single_word.head(3)

Unnamed: 0,emotion,len_words,len_letters,emotion_ps_steamed,emotion_ls_steamed,emotion_sb_steamed
0,angry,1,5,angri,angry,angri
1,yelling,1,7,yell,yel,yell
2,yelling,1,7,yell,yel,yell


## Count frequency and plot

In [24]:
def count_freq_labels(df, col):
    df_counts = df[col].value_counts() # count word frequency 
    df_counts = df_counts.to_frame('counts') # get value_counts as df
    df_counts['emotion'] = df_counts.index # get index as col
    df_counts = df_counts.reset_index(drop=True) # clean index
    df_counts['percent'] = df_counts['counts'] / df_counts['counts'].sum() # compute percentage
    return df_counts

In [25]:
def simple_per_bar(df, X, Y, top, width=300, height=300, title='Title'):
    bars = alt.Chart(df.head(top), title=title).mark_bar().encode(
        alt.X(X, axis=alt.Axis(format='.0%')),
        y=alt.Y(Y, sort='-x'))
    
    text = bars.mark_text(
        align='left',
        baseline='middle',
        dx=3,  # Nudges text to right so it doesn't appear on top of the bar
        fontSize=10
    ).encode(
        alt.Text(X, format='.2%')
    )
    
    chart = (bars + text)
    
    return chart.properties(width=width, height=height)

In [26]:
def simple_count_bar(df, X, Y, top, width=300, height=300, title='Title'):
    bars = alt.Chart(df.head(top), title=title).mark_bar().encode(
        alt.X(X),
        y=alt.Y(Y, sort='-x'))
    
    text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3,  # Nudges text to right so it doesn't appear on top of the bar
    fontSize=10
    ).encode(
        alt.Text('counts:Q')
    )
    
    chart = (bars + text)
    
    return chart.properties(width=width, height=height)

In [27]:
df_emo_overall = count_freq_labels(df_stack_single_word, 'emotion_ps_steamed')

In [28]:
df_emo_overall.to_csv('../clean_data/free_choice_emotion_uw_students_overall.csv', index=False)

In [31]:
df_emo_overall = count_freq_labels(df_stack_single_word, 'emotion_ps_steamed')
simple_per_bar(df_emo_overall, 'percent:Q', 'emotion:N', width=300, height=500, top=50, title='Most frequently selected labels | n = '+df_emo_overall['counts'].sum().astype(str))

In [32]:
simple_count_bar(df_emo_overall, 'counts:Q', 'emotion:N', top=20, title='Most frequently selected labels | n = '+df_emo_overall['counts'].sum().astype(str))

## Most frequently used word for each emotion category (grouping pictures by expected-emotion)

In [33]:
def df_add_label(df_emo_answers, emotion_label):
    df_emo_cat = df_emo_answers.copy() 
    df_emo_cat_t = df_emo_cat.T # transpose
    df_emo_cat_t['photo_id'] = df_emo_cat_t.index # get index as col
    df_emo_cat_t = df_emo_cat_t.reset_index(drop=True) # clean index
    df_emo_cat_t_labels = pd.concat([df_emo_cat_t, df_labels], axis=1) # add metadata cols
    df_label =  df_emo_cat_t_labels[df_emo_cat_t_labels['label'] == emotion_label]
    
    return df_label

In [34]:
    df_emotion = df_add_label(df, 'anger') # subset 'emotion' rows
    df_emotion_ans = df_emotion.drop(['photo_id', 'ethnicity', 'sex','age', 'label'], axis=1)
    df_stack_emotion = formating_words(df_emotion_ans) # clean up
    # df_stack_anger['emotion_spell_check'] = df_stack_single_word['emotion'].apply(lambda x: suggest(x)[0][0])
    df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) #steam
    df_stack_emotion_count = count_freq_labels(df_stack_emotion, 'emotion_ps_steamed')

In [35]:
def chart_wrapper(df, emotion, title):
    df_emotion = df_add_label(df, emotion) # subset 'emotion' rows
    df_emotion_ans = df_emotion.drop(['photo_id', 'ethnicity', 'sex','age', 'label'], axis=1)
    df_stack_emotion = formating_words(df_emotion_ans) # clean up
    # df_stack_anger['emotion_spell_check'] = df_stack_single_word['emotion'].apply(lambda x: suggest(x)[0][0])
    df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) #steam
    df_stack_emotion_count = count_freq_labels(df_stack_emotion, 'emotion_ps_steamed')
    n = df_stack_emotion_count['counts'].sum()    
    chart = simple_per_bar(df_stack_emotion_count, 'percent:Q', 'emotion:N', top=20, title=title+" | n= "+n.astype(str))
    
    return chart

### Charts

In [36]:
chart_anger = chart_wrapper(df_emo_answers, 'anger', 'Expected label: '+ ' anger')
chart_disgust = chart_wrapper(df_emo_answers, 'disgust', 'Expected label: '+ ' disgust')
chart_fear = chart_wrapper(df_emo_answers, 'fear', 'Expected label: '+ ' fear')
chart_surprise = chart_wrapper(df_emo_answers, 'surprise', 'Expected label: '+ ' surprise')
chart_happiness = chart_wrapper(df_emo_answers, 'happiness', 'Expected label: '+ ' happiness')
chart_sadness = chart_wrapper(df_emo_answers, 'sadness', 'Expected label: '+ ' sadness')
chart_uncertain = chart_wrapper(df_emo_answers, 'uncertain', 'Expected label: '+ ' uncertain')
chart_neutral = chart_wrapper(df_emo_answers, 'neutral', 'Expected label: '+ ' neutral')

In [37]:
(chart_anger | chart_disgust) & (chart_fear | chart_surprise) & (chart_happiness | chart_sadness) & (chart_neutral | chart_uncertain) 

### Most frequently used labels by photo

In [38]:
def photo_chart(df_emo_answers, emotion,  i):
    df = df_add_label(df_emo_answers, emotion) # add emotion label
    photo_id_list = df['photo_id'].tolist() # photo id to list
    splited_photo_id = np.array_split(photo_id_list, 24) # split photo id by photo
    df = df[df['photo_id'].isin(splited_photo_id[i])] # select photo rows   
    df_url = df[['url']].head(1)  # get url for chart
    
    face = alt.Chart(df_url).mark_image(width=110, height=110, align='right', xOffset=0, yOffset=230).encode(url='url')
    
    df_emotion_ans = df.drop(['photo_id', 'ethnicity', 'sex', 'age', 'label', 'url'], axis=1) # clean cols
    df_stack_emotion = formating_words(df_emotion_ans) # clean up words
    df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) # steam
    df_stack_emotion_count = count_freq_labels(df_stack_emotion, 'emotion_ps_steamed') # group and count
    
    chart = simple_per_bar(df_stack_emotion_count, 'percent:Q', 'emotion:N', width=300, height=300, top=20, title="Expected: "+" "+emotion+" | n = "+    df_stack_emotion_count['counts'].sum().astype(str))
    
    return chart + face

In [41]:
photo_chart(df_emo_answers, 'anger',  7)

In [42]:
def charts_emotion_faces(df_emo_answers, emotion):
    charts = []
    for i in range(0, 24):
        charts.append(photo_chart(df_emo_answers, emotion,  i))
    return charts

In [43]:
def dashboard_emotion_faces(charts, rows):
    if rows == 1:
        dashboard= (charts[0] | charts[1] | charts[2])
    if rows == 2:
        dashboard= (charts[0] | charts[1] | charts[2]) & \
                   (charts[3] | charts[4] | charts[5]) 
    if rows == 3:
        dashboard= (charts[0] | charts[1] | charts[2]) & \
                   (charts[3] | charts[4] | charts[5]) & \
                   (charts[6] | charts[7] | charts[8])
    if rows == 4:
        dashboard= (charts[0] | charts[1] | charts[2]) & \
                   (charts[3] | charts[4] | charts[5]) & \
                   (charts[6] | charts[7] | charts[8]) & \
                   (charts[9] | charts[10] | charts[11])
    if rows == 5:
        dashboard= (charts[0] | charts[1] | charts[2]) & \
                   (charts[3] | charts[4] | charts[5]) & \
                   (charts[6] | charts[7] | charts[8]) & \
                   (charts[9] | charts[10] | charts[11]) & \
                   (charts[12] | charts[13] | charts[14]) 
    if rows == 6:
        dashboard= (charts[0] | charts[1] | charts[2]) & \
                   (charts[3] | charts[4] | charts[5]) & \
                   (charts[6] | charts[7] | charts[8]) & \
                   (charts[9] | charts[10] | charts[11]) & \
                   (charts[12] | charts[13] | charts[14]) & \
                   (charts[15] | charts[16] | charts[17])
    if rows == 7:
        dashboard= (charts[0] | charts[1] | charts[2]) & \
                   (charts[3] | charts[4] | charts[5]) & \
                   (charts[6] | charts[7] | charts[8]) & \
                   (charts[9] | charts[10] | charts[11]) & \
                   (charts[12] | charts[13] | charts[14]) & \
                   (charts[15] | charts[16] | charts[17]) & \
                   (charts[18] | charts[19] | charts[20]) 
    if rows == 8:
        dashboard= (charts[0] | charts[1] | charts[2]) & \
                   (charts[3] | charts[4] | charts[5]) & \
                   (charts[6] | charts[7] | charts[8]) & \
                   (charts[9] | charts[10] | charts[11]) & \
                   (charts[12] | charts[13] | charts[14]) & \
                   (charts[15] | charts[16] | charts[17]) & \
                   (charts[18] | charts[19] | charts[20]) & \
                   (charts[21] | charts[22] | charts[23]) 
        
    return dashboard

In [116]:
# charts = charts_emotion_faces(df_emo_answers, 'sadness')
# dashboard_emotion_faces(charts, 8)

## Emotion percentages as feature vectors

In [230]:
photo_ids = pd.read_csv('../clean_data/photo_ids.csv')

In [231]:
df_emo_overall_raw = count_freq_labels(df_stack_single_word, 'emotion_ps_steamed')
emotion_words_list_steam = df_emo_overall_raw['emotion'].str.lower().tolist()
emotion_words_list = ['happiness','neutral', 'surprise','sadness', 'disgust', 'anger', 'fear', 'uncertain']

In [232]:
# placeholder dataframe
index_photos = photo_ids.iloc[:, 0] + '_1'
df_emo_features = pd.DataFrame(columns=emotion_words_list_steam) 
# add key-ids
df_emo_features.insert(loc=0, column='photo_id', value=index_photos) 

In [234]:
def fill_in_emotion_vectors(df_emo_features, df_emo_answers, emotion_words_list):
    for emo in range(0, len(emotion_words_list)):
        
        emotion = emotion_words_list[emo] # get emotion word
        df_ans = df_add_label(df_emo_answers, emotion) # ans for all pics of emo-word
        photo_id_list = df_ans['photo_id'].tolist() # photo ids for the emotion word
        splited_photo_id = np.array_split(photo_id_list, 24) # photo_id_list as an array 
        
        for ids in range(0, len(splited_photo_id)):
            photo_id = splited_photo_id[ids]
            df_single = df_ans[df_ans['photo_id'].isin(photo_id)] # get row for nth photo
            df_emotion_ans = df_single.drop(['photo_id', 'ethnicity', 'sex', \
                                             'age', 'label', 'url'], axis=1) # clean up for calculation
            df_stack_emotion = formating_words(df_emotion_ans) 
            df_stack_emotion['emotion_ps_steamed'] = df_stack_emotion['emotion'].apply(lambda x: ps.stem(x)) # steam
            source = count_freq_labels(df_stack_emotion, 'emotion_ps_steamed') # group and count / compute vectors
            source = source[['percent', 'emotion']].T
            source.columns = source.iloc[1].str.lower() # emotion words as cols
            source.drop('emotion', axis=0, inplace=True) # clean up emotion row
            source.reset_index(drop=True, inplace=True) # clean up index
            source.columns.name = None
            photo_id_str = photo_id[0] # photo id as str
            source.insert(0, 'photo_id', photo_id_str) # photo id as col
            # set photo id as col for easier manipulation
            df_emo_features = df_emo_features.set_index('photo_id') 
            # fill in vectors where the value is available
            df_emo_features.loc[photo_id_str] = pd.Series(source.T.to_dict()[0])
            
            df_emo_features['photo_id'] = df_emo_features.index # index back to column
            df_emo_features.index.name = None 
            df_emo_features.reset_index(drop=True, inplace=True) # clean up index
    
    return df_emo_features 
            

In [235]:
df_emo_vectors = fill_in_emotion_vectors(df_emo_features, df_emo_answers, emotion_words_list)

In [237]:
df_emo_vectors = df_emo_vectors.fillna(0)

In [240]:
df_label_raw = pd.read_csv('../data/emotion_labels.csv')

In [241]:
df_emo_vectors = pd.concat([df_emo_vectors, df_label_raw], axis=1) # concat metadata

In [243]:
df_emo_vectors.to_csv('../clean_data/free_choice_uw_students_vectors.csv', index=False)

# Clustering

In [244]:
from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn import preprocessing as pp

In [245]:
X = df_emo_vectors.copy().drop(['photo_id', 'ethnicity', 'sex', \
                                   'age', 'label', 'url'], axis=1)

In [247]:
features = X.columns
sX = pp.StandardScaler(copy=True)
X.loc[:,features] = sX.fit_transform(X[features])

In [248]:
X.describe() # mean  = 0, SD =1

Unnamed: 0,happi,sad,angri,confus,shock,surpris,upset,mad,disgust,scare,...,pyscho,unknow,toung,tour,anrgi,sexi,moodi,broken,sinist,schock
count,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,...,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0
mean,-1.087332e-16,-6.066167e-17,1.333412e-16,-7.411025e-17,-7.210727e-17,2.632488e-17,1.0301040000000001e-17,7.267955000000001e-17,-1.250432e-16,-1.619552e-16,...,1.7168400000000002e-17,-3.076005e-17,2.632488e-17,6.295079000000001e-17,1.907123e-16,2.466526e-16,-8.355287000000001e-17,9.957670000000001e-17,-4.6926950000000006e-17,2.548076e-16
std,1.002587,1.002587,1.002587,1.002587,1.002587,1.002587,1.002587,1.002587,1.002587,1.002587,...,1.002587,1.002587,1.002587,1.002587,1.002587,1.002587,1.002587,1.002587,1.002587,1.002587
min,-0.6050522,-0.569823,-0.5612858,-0.647896,-0.5533402,-0.5480567,-0.8230025,-0.6452364,-0.418172,-0.4808068,...,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158
25%,-0.6050522,-0.569823,-0.5612858,-0.647896,-0.5533402,-0.5480567,-0.8230025,-0.6452364,-0.418172,-0.4808068,...,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158
50%,-0.5140442,-0.453603,-0.4280381,-0.4443574,-0.5533402,-0.5480567,-0.4350043,-0.6452364,-0.418172,-0.4808068,...,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158
75%,-0.02927102,0.01539813,0.07902204,0.1228584,0.04528947,0.04752725,0.5812946,0.2749819,-0.1834496,-0.05692729,...,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158,-0.07198158
max,3.035271,3.957941,3.992014,4.930494,3.957329,4.127494,3.077062,3.752866,5.632451,6.057532,...,13.89244,13.89244,13.89244,13.89244,13.89244,13.89244,13.89244,13.89244,13.89244,13.89244


In [270]:
def k_means(n_clusters=3, n_inits=30, max_iter=1000):
    kmeans = KMeans(
    init="random",
    n_clusters=n_clusters,
    n_init=n_inits,
    max_iter=max_iter,
    random_state=42)
    kmeans.fit(X)
    
    return kmeans

In [271]:
k_means_run = k_means()

In [272]:
kmeans_kwargs = {
    "init": "random",
    "n_init": 20,
    "max_iter": 1000,
    "random_state": 42,
}

# A list holds the SSE values for each k
sse = []
for k in range(1, 100):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(X)
    sse.append(kmeans.inertia_)

In [273]:
source = pd.DataFrame({'y': sse, 'x': range(1, 100)})

In [274]:
chart_elbow = alt.Chart(source).mark_line().encode(
    alt.X('x:Q', title='Number of clusters'), 
    alt.Y('y:Q', title='SSE'))

In [275]:
chart_elbow

In [277]:
kl = KneeLocator(
    range(1, 100), sse, curve="convex", direction="decreasing"
)

kl.elbow

20

In [278]:
# A list holds the silhouette coefficients for each k
silhouette_coefficients = []

# Notice you start at 2 clusters for silhouette coefficient
for k in range(2, 100):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(X)
    score = silhouette_score(X, kmeans.labels_)
    silhouette_coefficients.append(score)

In [279]:
source = pd.DataFrame({'y': silhouette_coefficients, 'x': range(2, 100)})

In [280]:
chart_silhouette = alt.Chart(source).mark_line().encode(
    alt.X('x:Q', title='Number of clusters'), 
    alt.Y('y:Q', title='Silhouette coefficients'))

In [281]:
chart_silhouette | chart_elbow

In [282]:
k_means_run = k_means(n_clusters=20)

In [285]:
df_label_raw['clusters'] = k_means_run.labels_

In [286]:
dfs_kmeans = [pd.DataFrame(y) for x, y in df_label_raw.groupby('clusters', as_index=False)]

In [287]:
for i in range(0, len(dfs_kmeans)):
    num_items = len(dfs_kmeans[i])
    dfs_kmeans[i]['x'] = np.linspace(0.1, 3.0, num=num_items)
    dfs_kmeans[i]['y'] = np.linspace(0.1, 3.0, num=num_items)

In [288]:
def grid_photos(nx=6, ny=6, cluster=0, title='title'):
    
    nx, ny = (nx, ny)
    x = np.linspace(0, 1, nx)
    y = np.linspace(0, 1, ny)
    xv, yv = np.meshgrid(x, y)
    
    dfs_kmeans[cluster]['x'] = xv.ravel()[0:len(dfs_kmeans[cluster])]
    dfs_kmeans[cluster]['y'] = yv.ravel()[0:len(dfs_kmeans[cluster])]
    
    chart = alt.Chart(dfs_kmeans[cluster], title=title).mark_image(
        width=50,
        height=50
    ).encode(
        alt.X('x', axis=None),
        alt.Y('y', axis=None),
        url='url'
    )
    
    text = chart.mark_text(
    align='center',
    baseline='bottom',
    yOffset = -25
    #dx=3,  # Nudges text to right so it doesn't appear on top of the bar
    ).encode(
        alt.Text('label')
    )
    
    return (chart + text)

In [295]:
cluster_2 = grid_photos(nx=10, ny=10, cluster=2, title='Cluster 1')

In [297]:
cluster_2