In [180]:
import nltk
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
import altair as alt

# Forced choice sentiment scores

In [115]:
# Load the forced-choice emotion-count table from the cleaned-data folder.
# Later cells index this frame by the emotion names in forced_choice_emo_list.
forced_choice_df = pd.read_csv('../clean_data/forced_choice_uw_students_count_emotions.csv')

In [116]:
# Single shared NLTK sentiment analyzer, reused to score every emotion word below.
# NOTE(review): presumably requires the NLTK 'vader_lexicon' resource to be downloaded
# beforehand — confirm for fresh environments.
sia = SentimentIntensityAnalyzer()

In [117]:
# Emotion labels of the forced-choice task; also used as the score-matrix columns.
forced_choice_emo_list = [
    'happiness',
    'neutral',
    'surprise',
    'sadness',
    'disgust',
    'anger',
    'fear',
    'uncertain',
]

In [118]:
# Placeholder mapping: one key per forced-choice emotion, every value None for now.
emo_scores_forced = {emotion: None for emotion in forced_choice_emo_list}

In [119]:
# Fill the placeholder: each emotion word maps to its polarity-score dict from `sia`.
emo_scores_forced.update(
    {emotion: sia.polarity_scores(emotion) for emotion in emo_scores_forced}
)

In [138]:
def sentiment_matrix(dic_scores=None, count_matrix=None, sentiment='compound', emo_list=None):
    """Weight per-emotion counts by a sentiment score and average across emotions.

    Parameters
    ----------
    dic_scores : dict, optional
        Maps each emotion word to a dict of sentiment scores, indexed here by
        ``sentiment``. Defaults to the notebook-level ``emo_scores_forced``.
    count_matrix : pandas.DataFrame, optional
        Frame holding one count column per emotion in ``dic_scores``.
        Defaults to the notebook-level ``forced_choice_df``.
    sentiment : str, default 'compound'
        Key of the score to use inside each ``dic_scores`` entry.
    emo_list : list of str, optional
        Column names (and order) of the returned frame. Defaults to the
        notebook-level ``forced_choice_emo_list``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``count_matrix`` (same index), one column per
        emotion holding ``count * score``, plus a ``mean-<sentiment>`` column
        with the row-wise mean over the emotion columns.
    """
    # Resolve defaults at call time so re-running a data-loading cell is picked
    # up, instead of freezing whatever objects existed when this cell was run.
    if dic_scores is None:
        dic_scores = emo_scores_forced
    if count_matrix is None:
        count_matrix = forced_choice_df
    if emo_list is None:
        emo_list = forced_choice_emo_list

    # Size the result from the input itself: a hard-coded row count produces
    # phantom all-NaN rows (or misaligned scores) for any other input length/index.
    scores_df = pd.DataFrame(index=count_matrix.index, columns=emo_list, dtype=float)
    for emo, polarity in dic_scores.items():
        scores_df[emo] = count_matrix[emo] * polarity[sentiment]
    scores_df[f"mean-{sentiment}"] = scores_df.mean(axis=1)
    return scores_df

In [142]:
# One weighted-score matrix per sentiment key, built in the order
# compound -> neg -> pos -> neu from the forced-choice counts.
forced_compound, forced_neg, forced_pos, forced_neu = (
    sentiment_matrix(
        dic_scores=emo_scores_forced,
        count_matrix=forced_choice_df,
        sentiment=score_key,
        emo_list=forced_choice_emo_list,
    )
    for score_key in ('compound', 'neg', 'pos', 'neu')
)

In [93]:
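# NOTE(review): as written this would save forced_choice_df (the count table), not the
# sentiment-score matrices computed above — confirm which frame should be exported.
# forced_choice_df.to_csv('../clean_data/forced_choice_df_uw_students_sentiment_scores.csv', index=False)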

# Free labeling sentiment scores

In [165]:
# Free-labeling emotion counts.
free_labeling_df = pd.read_csv(
    '../clean_data/free_choice_uw_students_count_emotions.csv'
)
# Free-labeling vocabulary: the 'emotion' column of the word-list file as a plain list.
free_labeling_emo_list = (
    pd.read_csv('../clean_data/free_choice_word_list.csv')['emotion']
    .tolist()
)

In [166]:
# Placeholder mapping: one key per free-labeling emotion word, every value None for now.
emo_scores_free = {emotion: None for emotion in free_labeling_emo_list}

In [169]:
# Fill the placeholder: each emotion word maps to its polarity-score dict from `sia`.
emo_scores_free.update(
    {emotion: sia.polarity_scores(emotion) for emotion in emo_scores_free}
)

In [171]:
# One weighted-score matrix per sentiment key, built in the order
# compound -> neg -> pos -> neu from the free-labeling counts.
free_compound, free_neg, free_pos, free_neu = (
    sentiment_matrix(
        dic_scores=emo_scores_free,
        count_matrix=free_labeling_df,
        sentiment=score_key,
        emo_list=free_labeling_emo_list,
    )
    for score_key in ('compound', 'neg', 'pos', 'neu')
)

In [94]:
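# NOTE(review): as written this would save free_labeling_df (the count table), not the
# sentiment-score matrices computed above — confirm which frame should be exported.
# free_labeling_df.to_csv('../clean_data/free_labeling_df_uw_students_sentiment_scores.csv', index=False)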

# Regression analysis - Free-labeling

In [192]:
# Regression table: the three demographic columns from the count frame next to the
# row-wise mean compound score.
reg_free = pd.DataFrame({
    **{column: free_labeling_df[column] for column in ('sex', 'age', 'ethnicity')},
    'mean-compound': free_compound['mean-compound'],
})
# X_free = pd.DataFrame({'sex':free_labeling_df['sex'], 'age':free_labeling_df['age'], 'ethnicity':free_labeling_df['ethnicity']})
# Y_free = pd.DataFrame({'mean-compound':free_compound['mean-compound']})

In [229]:
# Keep only complete rows: any row with a missing value in any column is removed
# before plotting and modelling.
reg_free = reg_free.dropna()

## Descriptives plots

In [231]:
# Histogram of the mean compound score over all rows (at most 30 bins).
hist_free_all = (
    alt.Chart(reg_free)
    .mark_bar()
    .encode(
        x=alt.X('mean-compound:Q', bin=alt.Bin(maxbins=30)),
        y='count()',
    )
)
hist_free_all

### Sex

In [232]:
# source = reg_free[reg_free['sex']=='female']
# hist_free_female = alt.Chart(source).mark_bar().encode(
#     alt.X('mean-compound:Q',bin=alt.Bin(maxbins=30)),
#     y='count()')
# source = reg_free[reg_free['sex']=='male']
# hist_free_male = alt.Chart(source).mark_bar().encode(
#     alt.X('mean-compound:Q',bin=alt.Bin(maxbins=30)),
#     y='count()')

In [235]:
# Overlaid (unstacked) step histograms of the mean compound score, one per sex.
source = reg_free
(
    alt.Chart(source)
    .mark_area(opacity=0.5, interpolate='step')
    .encode(
        x=alt.X('mean-compound:Q', bin=alt.Bin(maxbins=30)),
        y=alt.Y('count()', stack=None),
        color=alt.Color('sex'),
    )
)

### Ethnicity

In [238]:
# source = reg_free[reg_free['ethnicity']=='bipoc']
# hist_free_bipoc = alt.Chart(source).mark_bar().encode(
#     alt.X('mean-compound:Q',bin=alt.Bin(maxbins=30)),
#     y='count()')
# source = reg_free[reg_free['ethnicity']=='white']
# hist_free_white = alt.Chart(source).mark_bar().encode(
#     alt.X('mean-compound:Q',bin=alt.Bin(maxbins=30)),
#     y='count()')

In [240]:
# Overlaid (unstacked) step histograms of the mean compound score, one per ethnicity.
source = reg_free
(
    alt.Chart(source)
    .mark_area(opacity=0.5, interpolate='step')
    .encode(
        x=alt.X('mean-compound:Q', bin=alt.Bin(maxbins=30)),
        y=alt.Y('count()', stack=None),
        color=alt.Color('ethnicity'),
    )
)

### Age

In [241]:
# Overlaid (unstacked) step histograms of the mean compound score, one per age group.
source = reg_free
(
    alt.Chart(source)
    .mark_area(opacity=0.5, interpolate='step')
    .encode(
        x=alt.X('mean-compound:Q', bin=alt.Bin(maxbins=30)),
        y=alt.Y('count()', stack=None),
        color=alt.Color('age'),
    )
)

## Mean and error bar showing confidence interval

In [319]:
# Combined cell label "<ethnicity>_<sex>_<age>" for each row; used as the
# categorical y-axis of the error-bar plot below.
reg_free['groups'] = reg_free['ethnicity']+'_'+reg_free['sex']+'_'+reg_free['age']

In [334]:
# Layer 1: per-group error bars (extent='ci') of the mean compound score.
error_bars = (
    alt.Chart(reg_free)
    .mark_errorbar(extent='ci')
    .encode(
        x=alt.X('mean-compound:Q', scale=alt.Scale(zero=False), title='mean-sentiment'),
        y=alt.Y('groups:N'),
        color=alt.Color('ethnicity:N'),
    )
)

# Layer 2: per-group mean as a filled point.
# NOTE(review): the mark-level color='black' is presumably overridden by the
# ethnicity color encoding — confirm whether black points were intended.
points = (
    alt.Chart(reg_free)
    .mark_point(filled=True, color='black')
    .encode(
        x=alt.X('mean-compound:Q', aggregate='mean', title='mean-sentiment'),
        y=alt.Y('groups:N'),
        color=alt.Color('ethnicity:N'),
    )
)

# Layer 3: red rule at the mean over all rows, as a reference line.
rule = (
    alt.Chart(reg_free)
    .mark_rule(color='red')
    .encode(x=alt.X('mean-compound:Q', aggregate='mean'))
)

In [335]:
# Overlay the three layers (error bars, group means, overall-mean rule) and display them.
(error_bars + points + rule)

## Model

In [367]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.graphics.api import interaction_plot, abline_plot
from statsmodels.stats.anova import anova_lm

In [372]:
# center independent variables
reg_free['sexC'] = reg_free['sex'].replace({'female': -0.5, 'male': 0.5})
reg_free['ageC'] = reg_free['age'].replace({'child': -0.5, 'adult': 0.5})
reg_free['ethnicityC'] = reg_free['ethnicity'].replace({'bipoc': -0.5, 'white': 0.5})

In [375]:
# Renamed copy for modelling: the formulas below refer to the outcome as `sentiment`.
# NOTE(review): presumably needed because a hyphenated column name such as
# "mean-compound" cannot be written as a plain term in a formula string — confirm.
reg_free_r = reg_free.rename(columns={"mean-compound": "sentiment" })

Unnamed: 0,sex,age,ethnicity,sentiment,groups,sexC,ageC,ethnicityC
0,female,adult,bipoc,-0.008748,bipoc_female_adult,-0.5,0.5,-0.5
1,female,child,bipoc,-0.025491,bipoc_female_child,-0.5,-0.5,-0.5
2,male,adult,bipoc,-0.031301,bipoc_male_adult,0.5,0.5,-0.5
3,male,child,bipoc,-0.020020,bipoc_male_child,0.5,-0.5,-0.5
4,female,adult,white,-0.018725,white_female_adult,-0.5,0.5,0.5
...,...,...,...,...,...,...,...,...
189,male,child,bipoc,-0.010073,bipoc_male_child,0.5,-0.5,-0.5
190,female,adult,white,-0.013303,white_female_adult,-0.5,0.5,0.5
191,female,child,white,-0.008703,white_female_child,-0.5,-0.5,0.5
192,male,adult,white,-0.006714,white_male_adult,0.5,0.5,0.5


In [379]:
# Main-effects model: sentiment regressed on the three centered predictors,
# without interaction terms; the fitted summary is printed below.
formula = "sentiment ~ sexC+ethnicityC+ageC"
lm = ols(formula, data=reg_free_r).fit()
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:              sentiment   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                 -0.009
Method:                 Least Squares   F-statistic:                    0.4204
Date:                Tue, 02 Nov 2021   Prob (F-statistic):              0.739
Time:                        18:35:43   Log-Likelihood:                 489.52
No. Observations:                 192   AIC:                            -971.0
Df Residuals:                     188   BIC:                            -958.0
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.0076      0.001     -5.487      0.0

In [390]:
# Influence diagnostics for the main-effects model; printing the (long) summary
# table is left disabled.
infl = lm.get_influence()
# print(infl.summary_table())

In [380]:
# Interaction model: `*` between the three centered predictors adds their
# interaction terms on top of the main effects (Df Model is 7 in the output below).
lm_interX = ols("sentiment ~ sexC*ethnicityC*ageC", data=reg_free_r).fit()
print(lm_interX.summary())

                            OLS Regression Results                            
Dep. Variable:              sentiment   R-squared:                       0.013
Model:                            OLS   Adj. R-squared:                 -0.025
Method:                 Least Squares   F-statistic:                    0.3330
Date:                Tue, 02 Nov 2021   Prob (F-statistic):              0.938
Time:                        18:36:02   Log-Likelihood:                 490.09
No. Observations:                 192   AIC:                            -964.2
Df Residuals:                     184   BIC:                            -938.1
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept               -0.0076 

In [392]:
# Influence diagnostics for the interaction model; printing the (long) summary
# table is left disabled.
infl_interX = lm_interX.get_influence()
# print(infl_interX.summary_table())

In [393]:
# Compare the two fitted models: main-effects (row 0) against the model with
# interactions (row 1); the table reports the F statistic for the added terms.
table1 = anova_lm(lm, lm_interX)
print(table1)

   df_resid       ssr  df_diff   ss_diff        F    Pr(>F)
0     188.0  0.068595      0.0       NaN      NaN       NaN
1     184.0  0.068192      4.0  0.000404  0.27226  0.895585


# Regression analysis - Forced-choice

In [None]:
# Predictor frame (sex, age, ethnicity) and outcome frame (mean compound score).
# NOTE(review): this cell sits in the forced-choice regression section, yet it reads
# from the free-labeling objects (free_labeling_df / free_compound) and uses *_free
# names. Presumably the forced-choice counterparts (forced_choice_df /
# forced_compound) were intended — confirm before running this cell.
X_free = pd.DataFrame({'sex':free_labeling_df['sex'], 'age':free_labeling_df['age'], 'ethnicity':free_labeling_df['ethnicity']})
Y_free = pd.DataFrame({'mean-compound':free_compound['mean-compound']})