In [None]:
import nltk
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
import altair as alt
from sklearn import preprocessing

# Forced-choice sentiment scores

In [738]:
# Count table for the forced-choice data: the emotion-word columns are weighted
# by sentiment_matrix, and sex / age / ethnicity are read for the regression frame.
forced_choice_df = pd.read_csv(
    filepath_or_buffer='../clean_data/forced_choice_uw_students_count_emotions.csv'
)

In [740]:
# SentimentIntensityAnalyzer loads the VADER lexicon from the local NLTK data
# directory and raises LookupError when it is missing (e.g. on a fresh
# environment). Probe for the exact resource it reads and download it only
# if it is not already installed.
try:
    nltk.data.find('sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt')
except LookupError:
    nltk.download('vader_lexicon')

# Shared analyzer used to score every emotion word in this notebook.
sia = SentimentIntensityAnalyzer()

In [741]:
# Emotion words scored for the forced-choice data; each one is also used as a
# column name of the count table when the score frames are built.
forced_choice_emo_list = [
    'happiness',
    'neutral',
    'surprise',
    'sadness',
    'disgust',
    'anger',
    'fear',
    'uncertain',
]

In [742]:
# Placeholder dict: one key per emotion word, every value None until scored.
emo_scores_forced = {emotion: None for emotion in forced_choice_emo_list}

In [743]:
# Replace each placeholder value with the analyzer's score dict for that word
# (the keys used later are 'compound', 'neg', 'pos' and 'neu').
emo_scores_forced.update(
    {emotion: sia.polarity_scores(emotion) for emotion in emo_scores_forced}
)

In [744]:
def sentiment_matrix(dic_scores=None, count_matrix=None, sentiment='compound', emo_list=None):
    """Weight per-row emotion-word counts by a sentiment score.

    Parameters
    ----------
    dic_scores : dict, optional
        Maps each emotion word to a dict of sentiment scores (the notebook
        fills it with ``sia.polarity_scores(word)``). Defaults to the
        notebook-level ``emo_scores_forced``.
    count_matrix : pandas.DataFrame, optional
        Table with one column per key of ``dic_scores``; a missing column
        raises ``KeyError``. Defaults to the notebook-level ``forced_choice_df``.
    sentiment : str
        Key selecting which score to use from each entry of ``dic_scores``
        (the notebook uses 'compound', 'neg', 'pos' and 'neu').
    emo_list : list of str, optional
        Column names of the output frame. Defaults to the notebook-level
        ``forced_choice_emo_list``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``count_matrix``. Each emotion column holds
        ``count * score``; the extra ``"mean-<sentiment>"`` column is the
        row-wise mean of those columns (NaN cells are skipped, an all-NaN
        row yields NaN).
    """
    # Resolve defaults at call time so that re-running the cells that rebuild
    # the globals is picked up without having to re-define this function.
    if dic_scores is None:
        dic_scores = emo_scores_forced
    if count_matrix is None:
        count_matrix = forced_choice_df
    if emo_list is None:
        emo_list = forced_choice_emo_list

    # Placeholder frame shaped after the input: reusing count_matrix.index
    # (instead of a fixed range of 194 rows) keeps every input row and adds no
    # phantom ones; float dtype keeps never-filled columns numeric for mean().
    scores_df = pd.DataFrame(index=count_matrix.index, columns=list(emo_list), dtype=float)
    for emo in dic_scores:
        scores_df[emo] = count_matrix[emo] * dic_scores[emo][sentiment]
    scores_df[f"mean-{sentiment}"] = scores_df.mean(axis=1)
    return scores_df

In [745]:
# One score frame per sentiment key, all built from the same forced-choice
# counts and word scores; unpacked in the order of the tuple below.
forced_compound, forced_neg, forced_pos, forced_neu = (
    sentiment_matrix(
        dic_scores=emo_scores_forced,
        count_matrix=forced_choice_df,
        sentiment=score_key,
        emo_list=forced_choice_emo_list,
    )
    for score_key in ('compound', 'neg', 'pos', 'neu')
)

In [746]:
# Per-column summary statistics of the count-weighted compound scores,
# including the row-wise "mean-compound" column.
forced_compound.describe()

Unnamed: 0,happiness,neutral,surprise,sadness,disgust,anger,fear,uncertain,mean-compound
count,114.0,130.0,115.0,109.0,133.0,110.0,121.0,159.0,192.0
mean,7.554237,0.0,3.190501,-4.945409,-5.016032,-5.724199,-2.722573,-1.208201,-0.825831
std,9.886917,0.0,3.761548,7.007716,6.745409,7.424197,3.745245,0.955946,4.400171
min,0.5574,0.0,0.2732,-21.5796,-28.1718,-26.8793,-16.7926,-4.44,-11.55895
25%,0.5574,0.0,0.2732,-6.606,-5.3946,-7.863625,-2.9634,-1.776,-3.651467
50%,1.1148,0.0,1.0928,-0.8808,-1.7982,-1.7157,-0.9878,-0.888,-1.1203
75%,14.4924,0.0,5.464,-0.4404,-0.5994,-0.5719,-0.4939,-0.592,1.124152
max,27.87,0.0,12.8404,-0.4404,-0.5994,-0.5719,-0.4939,-0.296,13.68805


In [674]:
# Disabled export.
# NOTE(review): as written this would save forced_choice_df (the count table
# loaded from CSV) under a "sentiment_scores" filename; the computed score
# frames are forced_compound / forced_neg / forced_pos / forced_neu - confirm
# which frame is meant before re-enabling.
# forced_choice_df.to_csv('../clean_data/forced_choice_df_uw_students_sentiment_scores.csv', index=False)

# Free-labeling sentiment scores

In [747]:
# Count table for the free-labeling data.
free_labeling_df = pd.read_csv(
    filepath_or_buffer='../clean_data/free_choice_uw_students_count_emotions.csv'
)

# Word list: the 'emotion' column of the word-list file, as a plain Python list.
free_labeling_emo_list = pd.read_csv(
    filepath_or_buffer='../clean_data/free_choice_word_list.csv'
)['emotion'].tolist()

In [748]:
# Placeholder dict: one key per distinct free-labeling word, values None until scored.
emo_scores_free = {emotion: None for emotion in free_labeling_emo_list}

In [749]:
# Replace each placeholder value with the analyzer's score dict for that word.
emo_scores_free.update(
    {emotion: sia.polarity_scores(emotion) for emotion in emo_scores_free}
)

In [750]:
# One score frame per sentiment key, all built from the same free-labeling
# counts and word scores; unpacked in the order of the tuple below.
free_compound, free_neg, free_pos, free_neu = (
    sentiment_matrix(
        dic_scores=emo_scores_free,
        count_matrix=free_labeling_df,
        sentiment=score_key,
        emo_list=free_labeling_emo_list,
    )
    for score_key in ('compound', 'neg', 'pos', 'neu')
)

In [751]:
# Per-column summary statistics of the count-weighted compound scores,
# including the row-wise "mean-compound" column.
free_compound.describe()

Unnamed: 0,happy,sad,angry,confused,upset,shocked,surprised,mad,scared,none,...,broken,moody,sexy,anrgy,touring,tounge,unknowing,pyscho,schocked,mean-compound
count,114.0,115.0,101.0,118.0,118.0,82.0,84.0,94.0,81.0,131.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,192.0
mean,7.008283,-4.248848,-3.538812,-1.601786,-1.543378,-1.878156,1.252732,-2.085939,-1.881215,0.0,...,-0.4767,-0.3612,0.5267,0.0,0.0,0.0,0.0,0.0,0.0,-0.201522
std,7.795894,5.361947,3.892807,1.655734,1.200521,1.750446,1.194602,1.714219,2.123546,0.0,...,,,,,,,,,,0.657242
min,0.5719,-20.0214,-16.3392,-8.9096,-4.9634,-7.0004,0.2263,-6.9146,-11.01,0.0,...,-0.4767,-0.3612,0.5267,0.0,0.0,0.0,0.0,0.0,0.0,-1.459473
25%,1.1438,-4.767,-4.5954,-2.2274,-2.2908,-2.8638,0.2263,-2.9634,-2.6424,0.0,...,-0.4767,-0.3612,0.5267,0.0,0.0,0.0,0.0,0.0,0.0,-0.692994
50%,3.14545,-1.4301,-2.0424,-0.9546,-1.1454,-1.2728,0.6789,-1.4817,-0.8808,0.0,...,-0.4767,-0.3612,0.5267,0.0,0.0,0.0,0.0,0.0,0.0,-0.259728
75%,12.438825,-0.4767,-0.5106,-0.3182,-0.3818,-0.3182,1.640675,-0.4939,-0.4404,0.0,...,-0.4767,-0.3612,0.5267,0.0,0.0,0.0,0.0,0.0,0.0,0.066164
max,24.0198,-0.4767,-0.5106,-0.3182,-0.3818,-0.3182,4.9786,-0.4939,-0.4404,0.0,...,-0.4767,-0.3612,0.5267,0.0,0.0,0.0,0.0,0.0,0.0,1.579425


In [680]:
# Disabled export.
# NOTE(review): as written this would save free_labeling_df (the count table
# loaded from CSV) under a "sentiment_scores" filename; the computed score
# frames are free_compound / free_neg / free_pos / free_neu - confirm which
# frame is meant before re-enabling.
# free_labeling_df.to_csv('../clean_data/free_labeling_df_uw_students_sentiment_scores.csv', index=False)

# Regression analysis - Free-labeling

In [752]:
# Regression frame: the three demographic columns from the count table next to
# the four row-wise mean scores (columns are aligned on the row index).
reg_free = pd.DataFrame({
    **{demographic: free_labeling_df[demographic] for demographic in ('sex', 'age', 'ethnicity')},
    'mean-compound': free_compound['mean-compound'],
    'mean-neg': free_neg['mean-neg'],
    'mean-pos': free_pos['mean-pos'],
    'mean-neu': free_neu['mean-neu'],
})

In [753]:
# Keep only complete rows: a row is dropped if any of its columns is missing.
reg_free = reg_free.dropna(axis=0, how='any')

## Descriptive plots

In [754]:
source = reg_free

# Histogram of the row-wise mean compound score (at most 30 bins).
hist_free_all_com = (
    alt.Chart(source)
    .mark_bar()
    .encode(
        x=alt.X('mean-compound:Q', bin=alt.Bin(maxbins=30)),
        y=alt.Y('count()'),
    )
)

# Red rule at the mean of the plotted scores.
rule = (
    alt.Chart(source)
    .mark_rule(color='red')
    .encode(x=alt.X('mean-compound:Q', aggregate='mean'))
)

alt.layer(hist_free_all_com, rule)

### Sentiment by sex

In [755]:
source = reg_free

# Step-style area histograms of the mean compound score, coloured by sex;
# stack=None overlays the groups instead of stacking them.
hist_free_sex_com = (
    alt.Chart(source)
    .mark_area(opacity=0.5, interpolate='step')
    .encode(
        x=alt.X('mean-compound:Q', bin=alt.Bin(maxbins=30)),
        y=alt.Y('count()', stack=None),
        color=alt.Color('sex'),
    )
)

# Mean rule, with the same colour encoding as the histogram.
rule = (
    alt.Chart(source)
    .mark_rule(color='red')
    .encode(
        x=alt.X('mean-compound:Q', aggregate='mean'),
        color=alt.Color('sex'),
    )
)

alt.layer(hist_free_sex_com, rule)

### Sentiment by ethnicity

In [756]:
source = reg_free

# Step-style area histograms of the mean compound score, coloured by
# ethnicity; stack=None overlays the groups instead of stacking them.
hist_free_ethnicity_com = (
    alt.Chart(source)
    .mark_area(opacity=0.5, interpolate='step')
    .encode(
        x=alt.X('mean-compound:Q', bin=alt.Bin(maxbins=30)),
        y=alt.Y('count()', stack=None),
        color=alt.Color('ethnicity'),
    )
)

# Mean rule, with the same colour encoding as the histogram.
rule = (
    alt.Chart(source)
    .mark_rule(color='red')
    .encode(
        x=alt.X('mean-compound:Q', aggregate='mean'),
        color=alt.Color('ethnicity'),
    )
)

alt.layer(hist_free_ethnicity_com, rule)

### Sentiment by age

In [757]:
source = reg_free

# Step-style area histograms of the mean compound score, coloured by age;
# stack=None overlays the groups instead of stacking them.
hist_free_age_com = (
    alt.Chart(source)
    .mark_area(opacity=0.5, interpolate='step')
    .encode(
        x=alt.X('mean-compound:Q', bin=alt.Bin(maxbins=30)),
        y=alt.Y('count()', stack=None),
        color=alt.Color('age'),
    )
)

# Mean rule, with the same colour encoding as the histogram.
rule = (
    alt.Chart(source)
    .mark_rule(color='red')
    .encode(
        x=alt.X('mean-compound:Q', aggregate='mean'),
        color=alt.Color('age'),
    )
)

alt.layer(hist_free_age_com, rule)

## Mean and error bar showing confidence interval

In [758]:
# Label for each ethnicity x sex x age cell, joined as "<ethnicity>_<sex>_<age>";
# used as the y axis of the error-bar charts.
reg_free['groups'] = (
    reg_free['ethnicity'] + '_' + reg_free['sex'] + '_' + reg_free['age']
)

In [759]:
# Confidence-interval error bars of the mean compound score, one per group.
error_bars = (
    alt.Chart(reg_free)
    .mark_errorbar(extent='ci')
    .encode(
        x=alt.X('mean-compound:Q', title='mean-compound-sentiment', scale=alt.Scale(zero=False)),
        y=alt.Y('groups:N'),
        color=alt.Color('ethnicity:N'),
    )
)

# Group means drawn as filled points on the same axes.
points = (
    alt.Chart(reg_free)
    .mark_point(filled=True, color='black')
    .encode(
        x=alt.X('mean-compound:Q', aggregate='mean', title='mean-compound-sentiment'),
        y=alt.Y('groups:N'),
        color=alt.Color('ethnicity:N'),
    )
)

# Rule at the mean over all rows (no y or colour encoding).
rule = (
    alt.Chart(reg_free)
    .mark_rule(color='red')
    .encode(x=alt.X('mean-compound:Q', aggregate='mean'))
)

In [760]:
# Layer order: error bars, then group means, then the overall-mean rule.
alt.layer(error_bars, points, rule)

In [761]:
# Confidence-interval error bars of the mean negative score, one per group.
error_bars = (
    alt.Chart(reg_free)
    .mark_errorbar(extent='ci')
    .encode(
        x=alt.X('mean-neg:Q', title='mean-neg-sentiment', scale=alt.Scale(zero=False)),
        y=alt.Y('groups:N'),
        color=alt.Color('ethnicity:N'),
    )
)

# Group means drawn as filled points on the same axes.
points = (
    alt.Chart(reg_free)
    .mark_point(filled=True, color='black')
    .encode(
        x=alt.X('mean-neg:Q', aggregate='mean', title='mean-neg-sentiment'),
        y=alt.Y('groups:N'),
        color=alt.Color('ethnicity:N'),
    )
)

# Rule at the mean over all rows (no y or colour encoding).
rule = (
    alt.Chart(reg_free)
    .mark_rule(color='red')
    .encode(x=alt.X('mean-neg:Q', aggregate='mean'))
)

In [762]:
# Layer order: error bars, then group means, then the overall-mean rule.
alt.layer(error_bars, points, rule)

In [763]:
# Confidence-interval error bars of the mean positive score, one per group.
error_bars = (
    alt.Chart(reg_free)
    .mark_errorbar(extent='ci')
    .encode(
        x=alt.X('mean-pos:Q', title='mean-pos-sentiment', scale=alt.Scale(zero=False)),
        y=alt.Y('groups:N'),
        color=alt.Color('ethnicity:N'),
    )
)

# Group means drawn as filled points on the same axes.
points = (
    alt.Chart(reg_free)
    .mark_point(filled=True, color='black')
    .encode(
        x=alt.X('mean-pos:Q', aggregate='mean', title='mean-pos-sentiment'),
        y=alt.Y('groups:N'),
        color=alt.Color('ethnicity:N'),
    )
)

# Rule at the mean over all rows (no y or colour encoding).
rule = (
    alt.Chart(reg_free)
    .mark_rule(color='red')
    .encode(x=alt.X('mean-pos:Q', aggregate='mean'))
)

In [764]:
# Layer order: error bars, then group means, then the overall-mean rule.
alt.layer(error_bars, points, rule)

In [765]:
# Confidence-interval error bars of the mean neutral score, one per group.
error_bars = (
    alt.Chart(reg_free)
    .mark_errorbar(extent='ci')
    .encode(
        x=alt.X('mean-neu:Q', title='mean-neu-sentiment', scale=alt.Scale(zero=False)),
        y=alt.Y('groups:N'),
        color=alt.Color('ethnicity:N'),
    )
)

# Group means drawn as filled points on the same axes.
points = (
    alt.Chart(reg_free)
    .mark_point(filled=True, color='black')
    .encode(
        x=alt.X('mean-neu:Q', aggregate='mean', title='mean-neu-sentiment'),
        y=alt.Y('groups:N'),
        color=alt.Color('ethnicity:N'),
    )
)

# Rule at the mean over all rows (no y or colour encoding).
rule = (
    alt.Chart(reg_free)
    .mark_rule(color='red')
    .encode(x=alt.X('mean-neu:Q', aggregate='mean'))
)

In [766]:
# Layer order: error bars, then group means, then the overall-mean rule.
alt.layer(error_bars, points, rule)

## Model - Free-labeling

In [767]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.graphics.api import interaction_plot, abline_plot
from statsmodels.stats.anova import anova_lm

In [768]:
# center independent variables
reg_free['sexC'] = reg_free['sex'].replace({'female': -0.5, 'male': 0.5})
reg_free['ageC'] = reg_free['age'].replace({'child': -0.5, 'adult': 0.5})
reg_free['ethnicityC'] = reg_free['ethnicity'].replace({'bipoc': -0.5, 'white': 0.5})

### Compound sentiment

In [769]:
# Copy of the regression frame with the outcome under a formula-friendly name
# (presumably because the hyphen in "mean-compound" would be read as an
# operator inside a model formula - confirm).
reg_free_r = reg_free.rename({"mean-compound": "sentimentCom"}, axis="columns")

In [770]:
# Additive model: compound sentiment on the three centered predictors.
formula = "sentimentCom ~ sexC+ethnicityC+ageC"
lm = ols(formula=formula, data=reg_free_r).fit()

print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:           sentimentCom   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                 -0.011
Method:                 Least Squares   F-statistic:                    0.3055
Date:                Wed, 03 Nov 2021   Prob (F-statistic):              0.821
Time:                        12:27:21   Log-Likelihood:                -190.89
No. Observations:                 192   AIC:                             389.8
Df Residuals:                     188   BIC:                             402.8
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.2015      0.048     -4.225      0.0

In [771]:
# Influence diagnostics object for the additive model `lm`; the summary-table
# print below is left disabled.
infl = lm.get_influence()
# print(infl.summary_table())

##############################
# Check values for influence #
##############################

In [772]:
# Full-factorial model: the `*` terms add every two-way and the three-way
# interaction to the three main effects.
lm_interX = ols(
    formula="sentimentCom ~ sexC*ethnicityC*ageC",
    data=reg_free_r,
).fit()

print(lm_interX.summary())

                            OLS Regression Results                            
Dep. Variable:           sentimentCom   R-squared:                       0.012
Model:                            OLS   Adj. R-squared:                 -0.025
Method:                 Least Squares   F-statistic:                    0.3272
Date:                Wed, 03 Nov 2021   Prob (F-statistic):              0.941
Time:                        12:27:26   Log-Likelihood:                -190.16
No. Observations:                 192   AIC:                             396.3
Df Residuals:                     184   BIC:                             422.4
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept               -0.2015 

In [773]:
# Influence diagnostics object for the interaction model `lm_interX`; the
# summary-table print below is left disabled.
infl_interX = lm_interX.get_influence()
# print(infl_interX.summary_table())

##############################
# Check values for influence #
##############################

In [715]:
# Compare the additive model against the interaction model.
# NOTE(review): this cell's saved execution count is lower than those of the
# two model-fitting cells, so the stored output may come from older fits -
# re-run after fitting `lm` and `lm_interX`.
table1 = anova_lm(lm, lm_interX)
print(table1)

   df_resid       ssr  df_diff   ss_diff        F    Pr(>F)
0     188.0  0.068595      0.0       NaN      NaN       NaN
1     184.0  0.068192      4.0  0.000404  0.27226  0.895585


### Negative sentiment

In [716]:
# reg_free_r = reg_free.rename(columns={"mean-neg": "sentimentNeg" })

In [717]:
# formula = "sentimentNeg ~ sexC+ethnicityC+ageC"
# lm = ols(formula, data=reg_free_r).fit()
# print(lm.summary())

In [718]:
# lm_interX = ols("sentimentNeg ~ sexC*ethnicityC*ageC", data=reg_free_r).fit()
# print(lm_interX.summary())

### Positive sentiment

In [719]:
# reg_free_r = reg_free.rename(columns={"mean-pos": "sentimentPos" })

In [720]:
# formula = "sentimentPos ~ sexC+ethnicityC+ageC"
# lm = ols(formula, data=reg_free_r).fit()
# print(lm.summary())

In [721]:
# lm_interX = ols("sentimentPos ~ sexC*ethnicityC*ageC", data=reg_free_r).fit()
# print(lm_interX.summary())

### Neutral sentiment

In [722]:
# reg_free_r = reg_free.rename(columns={"mean-neu": "sentimentNeu" })

In [723]:
# formula = "sentimentNeu ~ sexC+ethnicityC+ageC"
# lm = ols(formula, data=reg_free_r).fit()
# print(lm.summary())

In [724]:
# lm_interX = ols("sentimentNeu ~ sexC*ethnicityC*ageC", data=reg_free_r).fit()
# print(lm_interX.summary())

# Regression analysis - Forced-choice

In [725]:
# Regression frame: the three demographic columns from the count table next to
# the four row-wise mean scores (columns are aligned on the row index).
reg_forced = pd.DataFrame({
    **{demographic: forced_choice_df[demographic] for demographic in ('sex', 'age', 'ethnicity')},
    'mean-compound': forced_compound['mean-compound'],
    'mean-neg': forced_neg['mean-neg'],
    'mean-pos': forced_pos['mean-pos'],
    'mean-neu': forced_neu['mean-neu'],
})

In [726]:
# Keep only complete rows: a row is dropped if any of its columns is missing.
reg_forced = reg_forced.dropna(axis=0, how='any')

## Descriptive plots

In [727]:
source = reg_forced

# Histogram of the row-wise mean compound score (at most 30 bins).
hist_forced_all_com = (
    alt.Chart(source)
    .mark_bar()
    .encode(
        x=alt.X('mean-compound:Q', bin=alt.Bin(maxbins=30)),
        y=alt.Y('count()'),
    )
)

# Red rule at the mean of the plotted scores.
rule = (
    alt.Chart(source)
    .mark_rule(color='red')
    .encode(x=alt.X('mean-compound:Q', aggregate='mean'))
)

alt.layer(hist_forced_all_com, rule)

### Sentiment by sex

In [728]:
source = reg_forced

# Step-style area histograms of the mean compound score, coloured by sex;
# stack=None overlays the groups instead of stacking them.
hist_forced_sex_com = (
    alt.Chart(source)
    .mark_area(opacity=0.5, interpolate='step')
    .encode(
        x=alt.X('mean-compound:Q', bin=alt.Bin(maxbins=30)),
        y=alt.Y('count()', stack=None),
        color=alt.Color('sex'),
    )
)

# Mean rule, with the same colour encoding as the histogram.
rule = (
    alt.Chart(source)
    .mark_rule(color='red')
    .encode(
        x=alt.X('mean-compound:Q', aggregate='mean'),
        color=alt.Color('sex'),
    )
)

alt.layer(hist_forced_sex_com, rule)

### Sentiment by ethnicity

In [729]:
source = reg_forced

# Step-style area histograms of the mean compound score, coloured by
# ethnicity; stack=None overlays the groups instead of stacking them.
# NOTE(review): the variable name breaks the hist_forced_<factor>_com pattern
# of the neighbouring cells; kept unchanged in case it is referenced elsewhere.
hist_ethnicity_sex_com = (
    alt.Chart(source)
    .mark_area(opacity=0.5, interpolate='step')
    .encode(
        x=alt.X('mean-compound:Q', bin=alt.Bin(maxbins=30)),
        y=alt.Y('count()', stack=None),
        color=alt.Color('ethnicity'),
    )
)

# Mean rule, with the same colour encoding as the histogram.
rule = (
    alt.Chart(source)
    .mark_rule(color='red')
    .encode(
        x=alt.X('mean-compound:Q', aggregate='mean'),
        color=alt.Color('ethnicity'),
    )
)

alt.layer(hist_ethnicity_sex_com, rule)

### Sentiment by age

In [730]:
source = reg_forced

# Step-style area histograms of the mean compound score, coloured by age;
# stack=None overlays the groups instead of stacking them.
hist_forced_age_com = (
    alt.Chart(source)
    .mark_area(opacity=0.5, interpolate='step')
    .encode(
        x=alt.X('mean-compound:Q', bin=alt.Bin(maxbins=30)),
        y=alt.Y('count()', stack=None),
        color=alt.Color('age'),
    )
)

# Mean rule, with the same colour encoding as the histogram.
rule = (
    alt.Chart(source)
    .mark_rule(color='red')
    .encode(
        x=alt.X('mean-compound:Q', aggregate='mean'),
        color=alt.Color('age'),
    )
)

alt.layer(hist_forced_age_com, rule)

## Mean and error bar showing confidence interval

In [731]:
# Label for each ethnicity x sex x age cell, joined as "<ethnicity>_<sex>_<age>";
# used as the y axis of the error-bar charts.
reg_forced['groups'] = (
    reg_forced['ethnicity'] + '_' + reg_forced['sex'] + '_' + reg_forced['age']
)

### compound

In [732]:
source = reg_forced

# Confidence-interval error bars of the mean compound score, one per group.
error_bars = (
    alt.Chart(source)
    .mark_errorbar(extent='ci')
    .encode(
        x=alt.X('mean-compound:Q', title='mean-compound-sentiment', scale=alt.Scale(zero=False)),
        y=alt.Y('groups:N'),
        color=alt.Color('ethnicity:N'),
    )
)

# Group means drawn as filled points on the same axes.
points = (
    alt.Chart(source)
    .mark_point(filled=True, color='black')
    .encode(
        x=alt.X('mean-compound:Q', aggregate='mean', title='mean-compound-sentiment'),
        y=alt.Y('groups:N'),
        color=alt.Color('ethnicity:N'),
    )
)

# Rule at the mean over all rows (no y or colour encoding).
rule = (
    alt.Chart(source)
    .mark_rule(color='red')
    .encode(x=alt.X('mean-compound:Q', aggregate='mean'))
)

In [733]:
# Layer order: error bars, then group means, then the overall-mean rule.
alt.layer(error_bars, points, rule)

### negative

In [734]:
source = reg_forced

# Confidence-interval error bars of the mean negative score, one per group.
error_bars = (
    alt.Chart(source)
    .mark_errorbar(extent='ci')
    .encode(
        x=alt.X('mean-neg:Q', title='mean-neg-sentiment', scale=alt.Scale(zero=False)),
        y=alt.Y('groups:N'),
        color=alt.Color('ethnicity:N'),
    )
)

# Group means drawn as filled points on the same axes.
points = (
    alt.Chart(source)
    .mark_point(filled=True, color='black')
    .encode(
        x=alt.X('mean-neg:Q', aggregate='mean', title='mean-neg-sentiment'),
        y=alt.Y('groups:N'),
        color=alt.Color('ethnicity:N'),
    )
)

# Rule at the mean over all rows (no y or colour encoding).
rule = (
    alt.Chart(source)
    .mark_rule(color='red')
    .encode(x=alt.X('mean-neg:Q', aggregate='mean'))
)

In [735]:
# Layer order: error bars, then group means, then the overall-mean rule.
alt.layer(error_bars, points, rule)

### positive

In [736]:
source = reg_forced

# Confidence-interval error bars of the mean positive score, one per group.
error_bars = (
    alt.Chart(source)
    .mark_errorbar(extent='ci')
    .encode(
        x=alt.X('mean-pos:Q', title='mean-pos-sentiment', scale=alt.Scale(zero=False)),
        y=alt.Y('groups:N'),
        color=alt.Color('ethnicity:N'),
    )
)

# Group means drawn as filled points on the same axes.
points = (
    alt.Chart(source)
    .mark_point(filled=True, color='black')
    .encode(
        x=alt.X('mean-pos:Q', aggregate='mean', title='mean-pos-sentiment'),
        y=alt.Y('groups:N'),
        color=alt.Color('ethnicity:N'),
    )
)

# Rule at the mean over all rows (no y or colour encoding).
rule = (
    alt.Chart(source)
    .mark_rule(color='red')
    .encode(x=alt.X('mean-pos:Q', aggregate='mean'))
)

In [737]:
# Layer order: error bars, then group means, then the overall-mean rule.
alt.layer(error_bars, points, rule)

### neutral

In [778]:
source = reg_forced

# Confidence-interval error bars of the mean neutral score, one per group.
error_bars = (
    alt.Chart(source)
    .mark_errorbar(extent='ci')
    .encode(
        x=alt.X('mean-neu:Q', title='mean-neu-sentiment', scale=alt.Scale(zero=False)),
        y=alt.Y('groups:N'),
        color=alt.Color('ethnicity:N'),
    )
)

# Group means drawn as filled points on the same axes.
points = (
    alt.Chart(source)
    .mark_point(filled=True, color='black')
    .encode(
        x=alt.X('mean-neu:Q', aggregate='mean', title='mean-neu-sentiment'),
        y=alt.Y('groups:N'),
        color=alt.Color('ethnicity:N'),
    )
)

# Rule at the mean over all rows (no y or colour encoding).
rule = (
    alt.Chart(source)
    .mark_rule(color='red')
    .encode(x=alt.X('mean-neu:Q', aggregate='mean'))
)

In [779]:
# Layer order: error bars, then group means, then the overall-mean rule.
alt.layer(error_bars, points, rule)

## Model - Forced-choice

In [None]:
# 