In [83]:
import pandas as pd
import numpy as np
import altair as alt

## Data ingestion and formating

In [84]:
df = pd.read_csv('../data/emotion_forced_choice_uw_students.csv')
df_labels = pd.read_csv('../data/emotion_labels.csv')

In [85]:
df = df.iloc[5:, :] # filter out test rows

In [86]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 5 to 85
Columns: 220 entries, StartDate to Q195.1
dtypes: object(220)
memory usage: 139.3+ KB


In [87]:
df = df[df['Finished'] =='True'] # filter out incomplete surveys

In [88]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51 entries, 5 to 85
Columns: 220 entries, StartDate to Q195.1
dtypes: object(220)
memory usage: 88.1+ KB


In [89]:
df.rename(columns={
    'Q1.2': 'sex',
    'Q1.3_1': 'age',
    'Q1.4': 'ethnicity',
    'Q1.5': 'formal education',
    'Q1.6_1': 'income'}, inplace=True)

In [90]:
df.to_csv('../clean_data/forced_choice_emotion_uw_students.csv', index=False)

### Words by frequency for all images (ranking)

In [91]:
def count_freq_labels(df, X="all" ):
    if X == "all":
        df_counts = df.stack().reset_index(drop=True).value_counts() # stack as series
        df_counts = df_counts.to_frame('counts') # get value_counts as df
        df_counts['emotion'] = df_counts.index # get index as col
    else:
        df_counts = df[X].reset_index(drop=True).value_counts() # stack as series
        df_counts = df_counts.to_frame('counts') # get value_counts as df
        df_counts[X] = df_counts.index # get index as col

    df_counts = df_counts.reset_index(drop=True) # clean index
    df_counts['percent'] = df_counts['counts'] / df_counts['counts'].sum() # compute percentage
    return df_counts

In [92]:
def simple_per_bar(
    df, title='Title', X='percent:Q', Y='emotion:N', \
    width=450, height=250, sort='-x', \
    text_size = 12, label_size = 11, title_size=12, \
    emotion='Some', color1='#0570b0', color2='orange'):
    
    bars = alt.Chart(df, title=title).mark_bar().encode(
        alt.X(X, axis=alt.Axis(format='.0%')),
        y=alt.Y(Y, sort=sort), 
        color=alt.condition(
            alt.datum.emotion == emotion,
            alt.value(color2),
            alt.value(color1)
        ))
    
    text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3,  # Nudges text to right so it doesn't appear on top of the bar
    fontSize=text_size
    ).encode(
        alt.Text(X, format='.1%')
    )
    
    chart = (bars + text).configure_axis(
            labelFontSize=label_size,
            titleFontSize=title_size).properties(
                width=width, 
                height=height).display(renderer='svg')
    
    
    return chart

In [93]:
def simple_count_bar(
    
    df, title='Title', X='counts:Q', Y='emotion:N', \
    width=450, height=250, sort='-x', \
    text_size = 12, label_size = 11, title_size=12,
    emotion='Some', color1='#0570b0', color2='#orange'):
    
    bars = alt.Chart(df, title=title).mark_bar().encode(
        alt.X(X),
        y=alt.Y(Y, sort=sort), 
        color=alt.condition(
            alt.datum.emotion == emotion,
            alt.value(color2),
            alt.value(color1)
        ))
    
    text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3,  # Nudges text to right so it doesn't appear on top of the bar
    fontSize=text_size
    ).encode(
        alt.Text(X)
    )
    
    chart = (bars + text).configure_axis(
            labelFontSize=label_size,
            titleFontSize=title_size).properties(
                width=width, 
                height=height).display(renderer='svg')
    
    
    return chart

## Demographics

In [94]:
source = count_freq_labels(df, X="sex") 
title = 'Sex | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'sex:N'
w, h= 450, 100
txs, ls, ts = 12, 12, 12

chart_sex = simple_per_bar(source, title=title, X=X, Y=Y, width=w, height=h, text_size = txs, label_size = ls, title_size=ts)

In [95]:
source = count_freq_labels(df, X="age") 
title = 'Age | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'age:N'
w, h= 450, 150
txs, ls, ts = 12, 11, 12

chart_age = simple_per_bar(
    source, title=title, X=X, Y=Y, \
    width=w, height=h, \
    text_size = txs, label_size = ls, title_size=ts)
chart_age

In [96]:
source = count_freq_labels(df, X="ethnicity") 
title = 'Ethnicity | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'ethnicity:N'
w, h= 450, 150
txs, ls, ts = 12, 11, 12

chart_ethnicity= simple_per_bar(
    source, title=title, X=X, Y=Y, \
    width=w, height=h, \
    text_size = txs, label_size = ls, title_size=ts)
chart_ethnicity

In [97]:
source = count_freq_labels(df, X="formal education") 
title = 'Formal education | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'formal education:N'
w, h= 450, 150
txs, ls, ts = 12, 11, 12

chart_formal_education= simple_per_bar(
    source, title=title, X=X, Y=Y, width=w, height=h, text_size = txs, label_size = ls, title_size=ts)
chart_formal_education

## Overall results

In [98]:
df_emo_answers = df.loc[:, 'Q2.1':'Q195.1'] # subset photos

In [99]:
source = count_freq_labels(df_emo_answers, X="all") 

# save list of emotions words to avoid repeating this computation later
emotion_words_list = source['emotion'].str.lower().tolist()
emotion_words_list.remove('other')

title = 'Labels frequency | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 200
txs, ls, ts = 12, 11, 12

chart_overall_per = simple_per_bar(
    source, title=title, X=X, Y=Y, width=w, height=h, text_size = txs, label_size = ls, title_size=ts)
chart_overall_per

In [100]:
source = count_freq_labels(df_emo_answers, X="all") 
title = 'Labels frequency | n = '+ source['counts'].sum().astype(str)
X, Y = 'counts:Q', 'emotion:N'
w, h= 450, 200
txs, ls, ts = 12, 11, 12

chart_overall_count = simple_count_bar(
    source, title=title, X=X, Y=Y, width=w, height=h, text_size = txs, label_size = ls, title_size=ts)
chart_overall_count

## Most frequently used word for each emotion category (grouping pictures by expected-emotion)

In [101]:
def emotion_df_formated(df_emo_answers, emotion_label):
    df_emo_cat = df_emo_answers.copy() 
    df_emo_cat_t = df_emo_cat.T # transpose
    df_emo_cat_t['photo_id'] = df_emo_cat_t.index # get index as col
    df_emo_cat_t = df_emo_cat_t.reset_index(drop=True) # clean index
    df_emo_cat_t_labels = pd.concat([df_emo_cat_t, df_labels], axis=1) # add metadata cols
    df_label =  df_emo_cat_t_labels[df_emo_cat_t_labels['label'] == emotion_label]
    return df_label

In [102]:
emotion = 'anger'
df_formated = emotion_df_formated(df_emo_answers, emotion) # subset 'anger' rows
df_formated_ans = df_formated.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1)
source = count_freq_labels(df_formated_ans, X="all") 
title = 'Expcted label: '+ emotion + ' | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 200
txs, ls, ts = 12, 11, 12


chart_anger = simple_per_bar(
    source, title=title, X=X, Y=Y, \
    width=w, height=h, emotion=emotion.capitalize(), \
    text_size = txs, label_size = ls, title_size=ts)

### Disgust

In [103]:
emotion = 'disgust'
df_formated = emotion_df_formated(df_emo_answers, emotion) # subset 'anger' rows
df_formated_ans = df_formated.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1)
source = count_freq_labels(df_formated_ans, X="all") 
title = 'Expcted label: '+ emotion + ' | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 200
txs, ls, ts = 12, 11, 12


chart_disgust = simple_per_bar(
    source, title=title, X=X, Y=Y, \
    width=w, height=h,  emotion=emotion.capitalize(),\
    text_size = txs, label_size = ls, title_size=ts)

### Fear

In [104]:
emotion = 'fear'
df_formated = emotion_df_formated(df_emo_answers, emotion) # subset 'anger' rows
df_formated_ans = df_formated.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1)
source = count_freq_labels(df_formated_ans, X="all") 
title = 'Expcted label: '+ emotion + ' | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 200
txs, ls, ts = 12, 11, 12


chart_fear = simple_per_bar(
    source, title=title, X=X, Y=Y, \
    width=w, height=h,  emotion=emotion.capitalize(),\
    text_size = txs, label_size = ls, title_size=ts)

### Surprise

In [105]:
emotion = 'surprise'
df_formated = emotion_df_formated(df_emo_answers, emotion) # subset 'anger' rows
df_formated_ans = df_formated.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1)
source = count_freq_labels(df_formated_ans, X="all") 
title = 'Expcted label: '+ emotion + ' | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 200
txs, ls, ts = 12, 11, 12


chart_surprise = simple_per_bar(
    source, title=title, X=X, Y=Y, \
    width=w, height=h,  emotion=emotion.capitalize(), \
    text_size = txs, label_size = ls, title_size=ts)

### Happiness

In [106]:
emotion = 'happiness'
df_formated = emotion_df_formated(df_emo_answers, emotion) # subset 'anger' rows
df_formated_ans = df_formated.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1)
source = count_freq_labels(df_formated_ans, X="all") 
title = 'Expcted label: '+ emotion + ' | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 200
txs, ls, ts = 12, 11, 12


chart_happiness = simple_per_bar(
    source, title=title, X=X, Y=Y, \
    width=w, height=h,  emotion=emotion.capitalize(),\
    text_size = txs, label_size = ls, title_size=ts)

### Sadness

In [107]:
emotion = 'sadness'
df_formated = emotion_df_formated(df_emo_answers, emotion) # subset 'anger' rows
df_formated_ans = df_formated.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1)
source = count_freq_labels(df_formated_ans, X="all") 
title = 'Expcted label: '+ emotion + ' | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 200
txs, ls, ts = 12, 11, 12


chart_sadness = simple_per_bar(
    source, title=title, X=X, Y=Y, \
    width=w, height=h,  emotion=emotion.capitalize(),\
    text_size = txs, label_size = ls, title_size=ts)

### Uncertain

In [108]:
emotion = 'uncertain'
df_formated = emotion_df_formated(df_emo_answers, emotion) # subset 'anger' rows
df_formated_ans = df_formated.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1)
source = count_freq_labels(df_formated_ans, X="all") 
title = 'Expcted label: '+ emotion + ' | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 200
txs, ls, ts = 12, 11, 12


chart_uncertain = simple_per_bar(
    source, title=title, X=X, Y=Y, \
    width=w, height=h,  emotion=emotion.capitalize(),\
    text_size = txs, label_size = ls, title_size=ts)

### Neutral

In [109]:
emotion = 'neutral'
df_formated = emotion_df_formated(df_emo_answers, emotion) # subset 'anger' rows
df_formated_ans = df_formated.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1)
source = count_freq_labels(df_formated_ans, X="all") 
title = 'Expcted label: '+ emotion + ' | n = '+ source['counts'].sum().astype(str)
X, Y = 'percent:Q', 'emotion:N'
w, h= 450, 200
txs, ls, ts = 12, 11, 12


chart_neutral = simple_per_bar(
    source, title=title, X=X, Y=Y, \
    width=w, height=h,  emotion=emotion.capitalize(),\
    text_size = txs, label_size = ls, title_size=ts)

## Most frequently used word for each emotion category (grouping pictures by expected-emotion), by ethnicity group

In [110]:
def emotion_df_formated_et(df_emo_answers, emotion_label, ethnicity):
    df_emo_cat = df_emo_answers.copy() 
    df_emo_cat_t = df_emo_cat.T # transpose
    df_emo_cat_t['photo_id'] = df_emo_cat_t.index # get index as col
    df_emo_cat_t = df_emo_cat_t.reset_index(drop=True) # clean index
    df_emo_cat_t_labels = pd.concat([df_emo_cat_t, df_labels], axis=1) # add metadata cols
    df_label =  df_emo_cat_t_labels[(df_emo_cat_t_labels['label'] == emotion_label) & (df_emo_cat_t_labels['ethnicity'] == ethnicity)]
    return df_label

In [111]:
def wrapper_chart_emotion(df_emo_answers, emotion, ethnicity):
    df = emotion_df_formated_et(df_emo_answers, emotion,  ethnicity) # subset 'anger' rows
    df_formated_ans = df.drop(['photo_id', 'ethnicity', 'sex','age', 'label', 'url'], axis=1)
    df_count = count_freq_labels(df_formated_ans) # count label freq
    chart = simple_per_bar(
        df_count, \
        title='Expected label: '+ emotion + ' | n = '+ df_count['counts'].sum().astype(str), \
         emotion=emotion.capitalize())
    return chart

## Anger

In [112]:
chart_anger_bipoc = wrapper_chart_emotion(df_emo_answers, 'anger', 'bipoc')
chart_anger_white = wrapper_chart_emotion(df_emo_answers, 'anger', 'white')

### Disgust

In [113]:
chart_disgust_bipoc = wrapper_chart_emotion(df_emo_answers, 'disgust', 'bipoc')
chart_disgust_white = wrapper_chart_emotion(df_emo_answers, 'disgust', 'white')

### Fear

In [114]:
chart_fear_bipoc = wrapper_chart_emotion(df_emo_answers, 'fear', 'bipoc')
chart_fear_white = wrapper_chart_emotion(df_emo_answers, 'fear', 'white')

### Surprise

In [115]:
chart_surprise_bipoc= wrapper_chart_emotion(df_emo_answers, 'surprise', 'bipoc')
chart_surprise_white = wrapper_chart_emotion(df_emo_answers, 'surprise', 'white')

### Happiness

In [116]:
chart_happiness_bipoc = wrapper_chart_emotion(df_emo_answers, 'happiness', 'bipoc')
chart_happiness_white = wrapper_chart_emotion(df_emo_answers, 'happiness', 'white')

### Sadness

In [117]:
chart_sadness_bipoc = wrapper_chart_emotion(df_emo_answers, 'sadness', 'bipoc')
chart_sadness_white = wrapper_chart_emotion(df_emo_answers, 'sadness', 'white')

### Neutral

In [118]:
chart_neutral_bipoc = wrapper_chart_emotion(df_emo_answers, 'neutral', 'bipoc')
chart_neutral_white = wrapper_chart_emotion(df_emo_answers, 'neutral', 'white')

### Uncertain/Other

In [119]:
chart_uncertain_bipoc = wrapper_chart_emotion(df_emo_answers, 'uncertain', 'bipoc')
chart_uncertain_white = wrapper_chart_emotion(df_emo_answers, 'uncertain', 'white')

## Emotion percentages as feature vectors

In [120]:
# placeholder dataframe
df_emo_features = pd.DataFrame(columns = emotion_words_list) 
# add key-ids
df_emo_features.insert(loc=0, column='photo_id', value=df_emo_answers.columns) 

In [121]:
df_emo_answers = df.loc[:, 'Q2.1':'Q195.1'] # subset photos

In [122]:
def fill_in_emotion_vectors(df_emo_features, df_emo_answers, emotion_words_list):
    for emo in range(0, len(emotion_words_list)):
        
        emotion = emotion_words_list[emo] # get emotion word
        df_ans = emotion_df_formated(df_emo_answers, emotion) # ans for all pics of emo-word
        photo_id_list = df_ans['photo_id'].tolist() # photo ids for the emotion word
        splited_photo_id = np.array_split(photo_id_list, 24) # photo_id_list as an array 
        
        for ids in range(0, len(splited_photo_id)):
            photo_id = splited_photo_id[ids]  
            df_single = df_ans[df_ans['photo_id'].isin(photo_id)] # get row for nth photo
            df_emotion_ans = df_single.drop(['photo_id', 'ethnicity', 'sex', \
                                             'age', 'label', 'url'], axis=1) # clean up for calculation
            source = count_freq_labels(df_emotion_ans, X="all") # compute vector-values
            source = source[['percent', 'emotion']].T
            source.columns = source.iloc[1].str.lower() # emotion words as cols
            source.drop('emotion', axis=0, inplace=True) # clean up emotion row
            source.reset_index(drop=True, inplace=True) # clean up index
            source.columns.name = None
            photo_id_str = photo_id[0] # photo id as str
            source.insert(0, 'photo_id', photo_id_str) # photo id as col
            # set photo id as col for easier manipulation
            df_emo_features = df_emo_features.set_index('photo_id') 
            # fill in vectors where the value is available
#             print(pd.Series(df_emo_features.loc[photo_id_str]))
            df_emo_features.loc[photo_id_str] = pd.Series(source.T.to_dict()[0])
            
            df_emo_features['photo_id'] = df_emo_features.index # index back to column
            df_emo_features.index.name = None 
            df_emo_features.reset_index(drop=True, inplace=True) # clean up index
    
    return df_emo_features 
            


In [123]:
df_emo_vectors = fill_in_emotion_vectors(df_emo_features, df_emo_answers, emotion_words_list)

In [124]:
df_emo_vectors = df_emo_vectors.fillna(0)

In [125]:
df_emo_vectors = pd.concat([df_emo_vectors, df_labels], axis=1) # concat metadata

In [126]:
df_emo_vectors.to_csv('../clean_data/forced_choice_uw_students_vectors.csv', index=False)

# Clustering

In [150]:
from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

## K means

### Standarize feature matrix X

In [151]:
df_emo_vectors.describe()

Unnamed: 0,happiness,neutral,surprise,sadness,disgust,anger,fear,uncertain
count,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0
mean,0.156155,0.142713,0.135739,0.123711,0.112492,0.11128,0.067415,0.065595
std,0.296686,0.230102,0.236164,0.257818,0.197826,0.214703,0.128459,0.065076
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019608
50%,0.019608,0.029412,0.019608,0.019608,0.019608,0.019608,0.019608,0.039216
75%,0.093137,0.137255,0.117647,0.058824,0.098039,0.098039,0.053922,0.098039
max,0.980392,0.843137,0.921569,0.960784,0.921569,0.921569,0.666667,0.294118


In [152]:
df_emo_vectors.head()

Unnamed: 0,happiness,neutral,surprise,sadness,disgust,anger,fear,uncertain,photo_id,ethnicity,sex,age,label,url
0,0.019608,0.372549,0.019608,0.0,0.098039,0.27451,0.0,0.137255,Q2.1,bipoc,female,adult,anger,https://uwmadison.co1.qualtrics.com/ControlPan...
1,0.0,0.019608,0.0,0.0,0.098039,0.784314,0.039216,0.039216,Q3.1,bipoc,female,child,anger,https://uwmadison.co1.qualtrics.com/ControlPan...
2,0.0,0.039216,0.0,0.0,0.0,0.921569,0.019608,0.0,Q4.1,bipoc,male,adult,anger,https://uwmadison.co1.qualtrics.com/ControlPan...
3,0.0,0.019608,0.058824,0.333333,0.039216,0.156863,0.078431,0.176471,Q5.1,bipoc,male,child,anger,https://uwmadison.co1.qualtrics.com/ControlPan...
4,0.0,0.098039,0.039216,0.0,0.019608,0.686275,0.019608,0.039216,Q6.1,white,female,adult,anger,https://uwmadison.co1.qualtrics.com/ControlPan...


In [153]:
X = df_emo_vectors.copy().drop(['photo_id', 'ethnicity', 'sex', \
                                   'age', 'label', 'url'], axis=1)

In [154]:
features = X.columns
sX = pp.StandardScaler(copy=True)

In [155]:
X.loc[:, features] = sX.fit_transform(X[features])

In [156]:
X.describe() # mean  = 0, SD =1

Unnamed: 0,happiness,neutral,surprise,sadness,disgust,anger,fear,uncertain
count,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0
mean,-5.2649750000000004e-17,2.323456e-16,-3.4909070000000004e-17,5.722798999999999e-19,3.862889e-17,-3.7770470000000003e-17,7.153499e-19,-7.325183000000001e-17
std,1.002587,1.002587,1.002587,1.002587,1.002587,1.002587,1.002587,1.002587
min,-0.5276933,-0.6218201,-0.5762526,-0.481082,-0.570114,-0.5196366,-0.5261521,-1.010593
25%,-0.5276933,-0.6218201,-0.5762526,-0.481082,-0.570114,-0.5196366,-0.5261521,-0.7085053
50%,-0.4614328,-0.4936688,-0.4930113,-0.4048321,-0.4707411,-0.4280748,-0.3731183,-0.4064173
75%,-0.2129559,-0.02378066,-0.07680507,-0.2523322,-0.07324915,-0.06182778,-0.1053093,0.4998466
max,2.785333,3.051851,3.336086,3.255164,4.100416,3.783766,4.676995,3.520726


In [157]:
kmeans = KMeans(
    init="random",
    n_clusters=3,
    n_init=10,
    max_iter=300,
    random_state=42
)

In [158]:
kmeans.fit(X)

KMeans(init='random', n_clusters=3, random_state=42)

In [159]:
kmeans.inertia_

1082.329849776996

In [160]:
kmeans.cluster_centers_

array([[-0.44429646, -0.39055855, -0.38537178, -0.33384075,  0.96331379,
         0.91219985, -0.39158792,  0.4581793 ],
       [-0.3920904 , -0.43307012,  1.50865039, -0.34276817, -0.33439209,
        -0.41955743,  1.22127963, -0.31508841],
       [ 0.45837723,  0.44381087, -0.45720864,  0.36668596, -0.44616494,
        -0.37490991, -0.32046156, -0.14006019]])

In [161]:
kmeans.n_iter_

6

In [163]:
kmeans.labels_[:100]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 2, 2, 0, 2, 0, 0, 2, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 1, 1, 1, 1,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int32)

In [192]:
kmeans_kwargs = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42,
}

# A list holds the SSE values for each k
sse = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(X)
    sse.append(kmeans.inertia_)

In [193]:
source = pd.DataFrame({'y': sse, 'x': range(1, 11)})

In [194]:
chart_elbow = alt.Chart(source).mark_line().encode(
    alt.X('x:Q', title='Number of clusters'), 
    alt.Y('y:Q', title='SSE'))

In [195]:
kl = KneeLocator(
    range(1, 11), sse, curve="convex", direction="decreasing"
)

kl.elbow

6

In [196]:
# A list holds the silhouette coefficients for each k
silhouette_coefficients = []

# Notice you start at 2 clusters for silhouette coefficient
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(X)
    score = silhouette_score(X, kmeans.labels_)
    silhouette_coefficients.append(score)

In [197]:
silhouette_coefficients

[0.2040875897986279,
 0.24472340545858115,
 0.3228956651130875,
 0.3747794923947065,
 0.447073665139646,
 0.4808994764984768,
 0.49809610111994734,
 0.4784383471750964,
 0.4718231719970103]

In [187]:
source = pd.DataFrame({'y': silhouette_coefficients, 'x': range(2, 11)})

In [188]:
chart_silhouette = alt.Chart(source).mark_line().encode(
    alt.X('x:Q', title='Number of clusters'), 
    alt.Y('y:Q', title='Silhouette coefficients'))

In [199]:
chart_silhouette | chart_elbow