In [2]:
import nltk
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
import altair as alt
from sklearn import preprocessing
import altair_saver

# Forced choice sentiment scores

In [3]:
# Load the forced-choice count matrix from the cleaned-data folder.
# Later cells read one column per emotion word plus 'sex', 'age' and 'ethnicity' from this frame.
forced_choice_df = pd.read_csv('../clean_data/forced_choice_uw_students_count_emotions.csv')

In [4]:
# Sentiment scorer from nltk; its polarity_scores() is applied to every emotion word below.
# NOTE(review): presumably needs the VADER lexicon to be downloaded beforehand — confirm on a fresh environment.
sia = SentimentIntensityAnalyzer()

In [5]:
# Emotion labels of the forced-choice task; each one is also a column of the count matrix.
forced_choice_emo_list = [
    'happiness',
    'neutral',
    'surprise',
    'sadness',
    'disgust',
    'anger',
    'fear',
    'uncertain',
]

In [6]:
# Placeholder dict: one key per emotion word, every value None until the scores are filled in.
emo_scores_forced = {emotion: None for emotion in forced_choice_emo_list}

In [7]:
# Replace each placeholder with the dict returned by polarity_scores() for that emotion word.
emo_scores_forced.update({word: sia.polarity_scores(word) for word in emo_scores_forced})

In [8]:
def sentiment_matrix(dic_scores=None, count_matrix=None, sentiment='compound', emo_list=None):
    """Weight each emotion's count column by that emotion word's sentiment score.

    Parameters
    ----------
    dic_scores : dict, optional
        Maps emotion word -> dict of sentiment scores; the entry under ``sentiment``
        is used as the weight. Defaults to the notebook-level ``emo_scores_forced``.
    count_matrix : pandas.DataFrame, optional
        Frame with one column per emotion word in ``dic_scores``.
        Defaults to the notebook-level ``forced_choice_df``.
    sentiment : str
        Key of the score to use inside each ``dic_scores`` entry (default ``'compound'``).
    emo_list : list of str, optional
        Column names of the placeholder frame (fixes the column order).
        Defaults to the notebook-level ``forced_choice_emo_list``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``count_matrix``: one ``count * score`` column per emotion plus a
        ``mean-<sentiment>`` column holding the row-wise mean of those columns.
    """
    # Resolve defaults when called (not when the cell defining the function runs), so the
    # function always sees the current notebook objects.
    if dic_scores is None:
        dic_scores = emo_scores_forced
    if count_matrix is None:
        count_matrix = forced_choice_df
    if emo_list is None:
        emo_list = forced_choice_emo_list

    # Placeholder built on the input's own index. The previous hard-coded range(0, 194)
    # only matched one specific file and added all-NaN rows (or misaligned rows) otherwise.
    scores_df = pd.DataFrame(columns=emo_list, index=count_matrix.index)
    for emo in dic_scores:
        scores_df[emo] = count_matrix[emo] * dic_scores[emo][sentiment]
    # Row mean over the emotion columns; computed before the mean column itself is added.
    scores_df[f"mean-{sentiment}"] = scores_df.mean(axis=1)
    return scores_df

In [9]:
# One weighted-score matrix per sentiment key (compound, neg, pos, neu), all built from
# the forced-choice counts and the forced-choice word scores.
forced_compound, forced_neg, forced_pos, forced_neu = (
    sentiment_matrix(
        dic_scores=emo_scores_forced,
        count_matrix=forced_choice_df,
        sentiment=score_key,
        emo_list=forced_choice_emo_list,
    )
    for score_key in ('compound', 'neg', 'pos', 'neu')
)

In [10]:
# Summary statistics for every weighted emotion column and for the row-wise mean column.
forced_compound.describe()

Unnamed: 0,happiness,neutral,surprise,sadness,disgust,anger,fear,uncertain,mean-compound
count,114.0,130.0,115.0,109.0,133.0,110.0,121.0,159.0,192.0
mean,7.554237,0.0,3.190501,-4.945409,-5.016032,-5.724199,-2.722573,-1.208201,-0.825831
std,9.886917,0.0,3.761548,7.007716,6.745409,7.424197,3.745245,0.955946,4.400171
min,0.5574,0.0,0.2732,-21.5796,-28.1718,-26.8793,-16.7926,-4.44,-11.55895
25%,0.5574,0.0,0.2732,-6.606,-5.3946,-7.863625,-2.9634,-1.776,-3.651467
50%,1.1148,0.0,1.0928,-0.8808,-1.7982,-1.7157,-0.9878,-0.888,-1.1203
75%,14.4924,0.0,5.464,-0.4404,-0.5994,-0.5719,-0.4939,-0.592,1.124152
max,27.87,0.0,12.8404,-0.4404,-0.5994,-0.5719,-0.4939,-0.296,13.68805


In [11]:
# Display the weighted forced-choice score matrix for the compound key.
forced_compound

Unnamed: 0,happiness,neutral,surprise,sadness,disgust,anger,fear,uncertain,mean-compound
0,0.5574,0.0,0.2732,,-2.9970,-8.0066,,-2.072,-2.040833
1,,0.0,,,-2.9970,-22.8760,-0.9878,-0.592,-5.490560
2,,0.0,,,,-26.8793,-0.4939,,-9.124400
3,,0.0,0.8196,-7.4868,-1.1988,-4.5752,-1.9756,-2.664,-2.440114
4,,0.0,0.5464,,-0.5994,-20.0165,-0.4939,-0.592,-3.525900
...,...,...,...,...,...,...,...,...,...
189,0.5574,0.0,0.8196,,-4.1958,-0.5719,,-3.848,-1.206450
190,1.1148,0.0,4.3712,,-5.3946,-2.8595,,-2.368,-0.856017
191,1.1148,0.0,0.5464,-0.4404,-1.1988,-10.2942,,-3.848,-2.017171
192,0.5574,,0.2732,,-7.7922,,,-2.664,-2.406400


In [10]:
# forced_choice_df.to_csv('../clean_data/forced_choice_df_uw_students_sentiment_scores.csv', index=False)

# Free labeling sentiment scores

In [11]:
# Free-labeling count matrix, plus the list of free-text emotion words taken from the
# 'emotion' column of the word-list file.
free_labeling_df = pd.read_csv('../clean_data/free_choice_uw_students_count_emotions.csv')
free_labeling_emo_list = pd.read_csv('../clean_data/free_choice_word_list.csv')['emotion'].tolist()

In [12]:
# Placeholder dict: one key per free-labeling word, every value None until scored.
emo_scores_free = {emotion: None for emotion in free_labeling_emo_list}

In [13]:
# Replace each placeholder with the dict returned by polarity_scores() for that word.
emo_scores_free.update({word: sia.polarity_scores(word) for word in emo_scores_free})

In [14]:
# One weighted-score matrix per sentiment key (compound, neg, pos, neu), all built from
# the free-labeling counts and the free-labeling word scores.
free_compound, free_neg, free_pos, free_neu = (
    sentiment_matrix(
        dic_scores=emo_scores_free,
        count_matrix=free_labeling_df,
        sentiment=score_key,
        emo_list=free_labeling_emo_list,
    )
    for score_key in ('compound', 'neg', 'pos', 'neu')
)

In [15]:
# Summary statistics for every weighted free-labeling word column and for the row-wise mean column.
free_compound.describe()

Unnamed: 0,happy,sad,angry,confused,upset,shocked,surprised,mad,scared,none,...,broken,moody,sexy,anrgy,touring,tounge,unknowing,pyscho,schocked,mean-compound
count,114.0,115.0,101.0,118.0,118.0,82.0,84.0,94.0,81.0,131.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,192.0
mean,7.008283,-4.248848,-3.538812,-1.601786,-1.543378,-1.878156,1.252732,-2.085939,-1.881215,0.0,...,-0.4767,-0.3612,0.5267,0.0,0.0,0.0,0.0,0.0,0.0,-0.201522
std,7.795894,5.361947,3.892807,1.655734,1.200521,1.750446,1.194602,1.714219,2.123546,0.0,...,,,,,,,,,,0.657242
min,0.5719,-20.0214,-16.3392,-8.9096,-4.9634,-7.0004,0.2263,-6.9146,-11.01,0.0,...,-0.4767,-0.3612,0.5267,0.0,0.0,0.0,0.0,0.0,0.0,-1.459473
25%,1.1438,-4.767,-4.5954,-2.2274,-2.2908,-2.8638,0.2263,-2.9634,-2.6424,0.0,...,-0.4767,-0.3612,0.5267,0.0,0.0,0.0,0.0,0.0,0.0,-0.692994
50%,3.14545,-1.4301,-2.0424,-0.9546,-1.1454,-1.2728,0.6789,-1.4817,-0.8808,0.0,...,-0.4767,-0.3612,0.5267,0.0,0.0,0.0,0.0,0.0,0.0,-0.259728
75%,12.438825,-0.4767,-0.5106,-0.3182,-0.3818,-0.3182,1.640675,-0.4939,-0.4404,0.0,...,-0.4767,-0.3612,0.5267,0.0,0.0,0.0,0.0,0.0,0.0,0.066164
max,24.0198,-0.4767,-0.5106,-0.3182,-0.3818,-0.3182,4.9786,-0.4939,-0.4404,0.0,...,-0.4767,-0.3612,0.5267,0.0,0.0,0.0,0.0,0.0,0.0,1.579425


In [16]:
# free_labeling_df.to_csv('../clean_data/free_labeling_df_uw_students_sentiment_scores.csv', index=False)

# Regression analysis - Free-labeling

In [17]:
# Regression table for free labeling: the three grouping columns from the count matrix
# next to the row-wise mean of each sentiment key.
reg_free = pd.DataFrame({
    **{col: free_labeling_df[col] for col in ('sex', 'age', 'ethnicity')},
    'mean-compound': free_compound['mean-compound'],
    'mean-neg': free_neg['mean-neg'],
    'mean-pos': free_pos['mean-pos'],
    'mean-neu': free_neu['mean-neu'],
})

In [18]:
# Keep only rows where every column (grouping variables and all four mean scores) is present.
reg_free = reg_free.dropna()

## Descriptive plots

In [120]:
source = reg_free

# Red rule at the overall mean of the compound score.
rule = alt.Chart(source).mark_rule(color='red').encode(
    x=alt.X('mean-compound:Q', aggregate='mean'),
)

# Histogram (at most 30 bins) of the row-wise mean compound score, with the mean rule on top.
hist_free_all_com = alt.layer(
    alt.Chart(source).mark_bar().encode(
        x=alt.X('mean-compound:Q', bin=alt.Bin(maxbins=30), title="mean sentiment distribution"),
        y=alt.Y('count()'),
    ),
    rule,
)
hist_free_all_com.display(renderer='svg')

In [121]:
# svg_string(hist_free_all_com, 'Overall mean sentiment distribution')

Saved!


### Sentiment by sex

In [122]:
source = reg_free

# Mean of the compound score, split by sex through the color encoding.
rule = alt.Chart(source).mark_rule(color='red').encode(
    x=alt.X('mean-compound:Q', aggregate='mean'),
    color=alt.Color('sex'),
)

# Overlaid (unstacked) step-area histograms of the mean compound score, one per sex.
hist_free_sex_com = alt.layer(
    alt.Chart(source).mark_area(opacity=0.5, interpolate='step').encode(
        x=alt.X('mean-compound:Q', bin=alt.Bin(maxbins=30), title='mean sentiment distribution'),
        y=alt.Y('count()', stack=None),
        color=alt.Color('sex'),
    ),
    rule,
)
hist_free_sex_com

In [123]:
# svg_string(hist_free_sex_com, 'Mean sentiment by sex')

Saved!


### Sentiment by ethnicity

In [124]:
source = reg_free

# Mean of the compound score, split by ethnicity through the color encoding.
rule = alt.Chart(source).mark_rule(color='red').encode(
    x=alt.X('mean-compound:Q', aggregate='mean'),
    color=alt.Color('ethnicity'),
)

# Overlaid (unstacked) step-area histograms of the mean compound score, one per ethnicity.
hist_free_ethnicity_com = alt.layer(
    alt.Chart(source).mark_area(opacity=0.5, interpolate='step').encode(
        x=alt.X('mean-compound:Q', bin=alt.Bin(maxbins=30), title='mean sentiment distribution'),
        y=alt.Y('count()', stack=None),
        color=alt.Color('ethnicity'),
    ),
    rule,
)
hist_free_ethnicity_com

In [136]:
# svg_string(hist_free_ethnicity_com, 'Mean sentiment by ethnicity')

### Sentiment by age

In [126]:
source = reg_free

# Mean of the compound score, split by age group through the color encoding.
rule = alt.Chart(source).mark_rule(color='red').encode(
    x=alt.X('mean-compound:Q', aggregate='mean'),
    color=alt.Color('age'),
)

# Overlaid (unstacked) step-area histograms of the mean compound score, one per age group.
hist_free_age_com = alt.layer(
    alt.Chart(source).mark_area(opacity=0.5, interpolate='step').encode(
        x=alt.X('mean-compound:Q', bin=alt.Bin(maxbins=30), title='mean sentiment distribution'),
        y=alt.Y('count()', stack=None),
        color=alt.Color('age'),
    ),
    rule,
)
hist_free_age_com

In [127]:
# svg_string(hist_free_age_com, 'Mean sentiment by age group')

Saved!


## Mean and error bar showing confidence interval

In [139]:
# Label each row with its ethnicity_sex_age combination; used as the y axis of the error-bar charts.
reg_free['groups'] = reg_free['ethnicity']+'_'+reg_free['sex']+'_'+reg_free['age']

In [267]:
# Per-group error bars. extent='ci' draws the confidence interval of the mean, so the axis
# title names the CI (it previously said "std", which is not what is plotted).
error_bars = alt.Chart(reg_free).transform_calculate(
    categories="datum.ethnicity + '-' + datum.sex").mark_errorbar(extent='ci').encode(
    x=alt.X('mean-compound:Q', scale=alt.Scale(zero=False), title='compound sentiment score - mean and 95% CI'),
    y=alt.Y('groups:N'),
    color=alt.Color('categories:N')
)

# Mean compound score per `groups` value, coloured by the ethnicity-sex category.
points = alt.Chart(reg_free).transform_calculate(
    categories="datum.ethnicity + '-' + datum.sex").mark_point(filled=True, color='black').encode(
    x=alt.X('mean-compound:Q', aggregate='mean'),
    y=alt.Y('groups:N'),
    color=alt.Color('categories:N'))

# Rule at the mean over all rows, as a reference line.
rule = alt.Chart(reg_free).mark_rule(color='red').encode(
    x=alt.X('mean-compound:Q', aggregate='mean'))

In [268]:
# Layer the CI bars, the group means and the overall-mean rule into one chart and display it.
free_error_bar_com = error_bars + points + rule
free_error_bar_com

In [None]:
# svg_string(error_bar_chart, 'Compound sentiment score by group')

In [269]:
# Per-group error bars. extent='ci' draws the confidence interval of the mean, so the axis
# title names the CI (it previously said "std", which is not what is plotted).
error_bars = alt.Chart(reg_free).transform_calculate(
    categories="datum.ethnicity + '-' + datum.sex").mark_errorbar(extent='ci').encode(
    x=alt.X('mean-neg:Q', scale=alt.Scale(zero=False), title='negative sentiment score - mean and 95% CI'),
    y=alt.Y('groups:N'),
    color=alt.Color('categories:N')
)

# Mean negative score per `groups` value, coloured by the ethnicity-sex category.
points = alt.Chart(reg_free).transform_calculate(
    categories="datum.ethnicity + '-' + datum.sex").mark_point(filled=True, color='black').encode(
    x=alt.X('mean-neg:Q', aggregate='mean'),
    y=alt.Y('groups:N'),
    color=alt.Color('categories:N'))

# Rule at the mean over all rows, as a reference line.
rule = alt.Chart(reg_free).mark_rule(color='red').encode(
    x=alt.X('mean-neg:Q', aggregate='mean'))

In [270]:
# Layer the CI bars, the group means and the overall-mean rule into one chart and display it.
free_error_bar_neg = error_bars + points + rule
free_error_bar_neg

In [271]:
# Per-group error bars. extent='ci' draws the confidence interval of the mean, so the axis
# title names the CI (it previously said "std", which is not what is plotted).
error_bars = alt.Chart(reg_free).transform_calculate(
    categories="datum.ethnicity + '-' + datum.sex").mark_errorbar(extent='ci').encode(
    x=alt.X('mean-pos:Q', scale=alt.Scale(zero=False), title='positive sentiment score - mean and 95% CI'),
    y=alt.Y('groups:N'),
    color=alt.Color('categories:N')
)

# Mean positive score per `groups` value, coloured by the ethnicity-sex category.
points = alt.Chart(reg_free).transform_calculate(
    categories="datum.ethnicity + '-' + datum.sex").mark_point(filled=True, color='black').encode(
    x=alt.X('mean-pos:Q', aggregate='mean'),
    y=alt.Y('groups:N'),
    color=alt.Color('categories:N'))

# Rule at the mean over all rows, as a reference line.
rule = alt.Chart(reg_free).mark_rule(color='red').encode(
    x=alt.X('mean-pos:Q', aggregate='mean'))

In [272]:
# Layer the CI bars, the group means and the overall-mean rule into one chart and display it.
free_error_bar_pos = error_bars + points + rule
free_error_bar_pos

In [273]:
# Per-group error bars. extent='ci' draws the confidence interval of the mean, so the axis
# title names the CI (it previously said "std", which is not what is plotted).
error_bars = alt.Chart(reg_free).transform_calculate(
    categories="datum.ethnicity + '-' + datum.sex").mark_errorbar(extent='ci').encode(
    x=alt.X('mean-neu:Q', scale=alt.Scale(zero=False), title='neutral sentiment score - mean and 95% CI'),
    y=alt.Y('groups:N'),
    color=alt.Color('categories:N')
)

# Mean neutral score per `groups` value, coloured by the ethnicity-sex category.
points = alt.Chart(reg_free).transform_calculate(
    categories="datum.ethnicity + '-' + datum.sex").mark_point(filled=True, color='black').encode(
    x=alt.X('mean-neu:Q', aggregate='mean'),
    y=alt.Y('groups:N'),
    color=alt.Color('categories:N'))

# Rule at the mean over all rows, as a reference line.
rule = alt.Chart(reg_free).mark_rule(color='red').encode(
    x=alt.X('mean-neu:Q', aggregate='mean'))

In [274]:
# Layer the CI bars, the group means and the overall-mean rule into one chart and display it.
free_error_bar_neu = error_bars + points + rule
free_error_bar_neu

## Model - Free-labeling

In [81]:
from statsmodels.compat import lzip
import statsmodels.api as sm
import statsmodels.stats.api as sms
from statsmodels.formula.api import ols
from statsmodels.graphics.api import interaction_plot, abline_plot
from statsmodels.stats.anova import anova_lm

In [33]:
# center independent variables: code each two-level factor as -0.5 / +0.5
# Series.map returns the float codes directly. Series.replace on a string column only
# produced floats through pandas' object-dtype downcasting, which is deprecated; without it
# the coded columns would stay object-typed and the formula would treat them as categorical.
# NOTE: a label missing from a mapping becomes NaN here instead of being kept as a string.
reg_free['sexC'] = reg_free['sex'].map({'female': -0.5, 'male': 0.5})
reg_free['ageC'] = reg_free['age'].map({'child': -0.5, 'adult': 0.5})
reg_free['ethnicityC'] = reg_free['ethnicity'].map({'bipoc': -0.5, 'white': 0.5})

### Compound sentiment

In [73]:
# Copy of the table with the outcome renamed to a formula-friendly identifier.
# NOTE(review): presumably because the hyphen in 'mean-compound' would be read as a minus sign in the model formula.
reg_free_r = reg_free.rename(columns={"mean-compound": "sentimentCom" })

In [74]:
# Additive OLS model: compound sentiment regressed on the three coded predictors.
formula = "sentimentCom ~ sexC+ethnicityC+ageC"
lm = ols(formula, data=reg_free_r).fit()
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:           sentimentCom   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                 -0.011
Method:                 Least Squares   F-statistic:                    0.3055
Date:                Thu, 04 Nov 2021   Prob (F-statistic):              0.821
Time:                        07:39:59   Log-Likelihood:                -190.89
No. Observations:                 192   AIC:                             389.8
Df Residuals:                     188   BIC:                             402.8
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.2015      0.048     -4.225      0.0

In [83]:
# Influence diagnostics for the additive model (printing the full table is left disabled).
infl = lm.get_influence()
# print(infl.summary_table())

##############################
# Check values for influence #
##############################

# Influence - Cook's d: nothing (criteria: greater than 1)
# Outliers - student residual: nothing (criteria: higher +/- 3)
# Leverage - hat diag: all values are the same (criteria: 3 times larger than the mean leverage value)


In [37]:
# Full factorial model: the three main effects plus all two-way and three-way interactions.
lm_interX = ols("sentimentCom ~ sexC*ethnicityC*ageC", data=reg_free_r).fit()
print(lm_interX.summary())

                            OLS Regression Results                            
Dep. Variable:           sentimentCom   R-squared:                       0.012
Model:                            OLS   Adj. R-squared:                 -0.025
Method:                 Least Squares   F-statistic:                    0.3272
Date:                Wed, 03 Nov 2021   Prob (F-statistic):              0.941
Time:                        12:43:20   Log-Likelihood:                -190.16
No. Observations:                 192   AIC:                             396.3
Df Residuals:                     184   BIC:                             422.4
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept               -0.2015 

In [90]:
# Influence diagnostics for the interaction model (printing the full table is left disabled).
infl_interX = lm_interX.get_influence()
# print(infl_interX.summary_table())

##############################
# Check values for influence #
##############################

# Influence - Cook's d: nothing (criteria: greater than 1)
# Outliers - student residual: nothing (criteria: higher +/- 3)
# Leverage - hat diag: all values are the same (criteria: 3 times larger than the mean leverage value)


In [39]:
# Model comparison: additive model (row 0) against the interaction model (row 1).
table1 = anova_lm(lm, lm_interX)
print(table1)

   df_resid        ssr  df_diff   ss_diff         F    Pr(>F)
0     188.0  82.105595      0.0       NaN       NaN       NaN
1     184.0  81.491548      4.0  0.614047  0.346614  0.846149


In [231]:
import matplotlib.pyplot as plt

In [278]:
# lm_interX.model.exog
# lm_interX.model.exog_names

In [277]:
# reg_free_r_chart = reg_free_r[['sex','age','ethnicity','sentimentCom']]
# factor_groups = reg_free_r_chart.groupby(['sex','age', 'ethnicity'])

In [275]:
# for values, group in factor_groups:
#     i,j,k = values
#     print(i, j, k)
#     idx = group.index
#     print(idx)

In [276]:
# infl = lm_interX.get_influence()
# resid = infl.resid_studentized_internal
# plt.figure(figsize=(6,6))
# for values, group in factor_groups:
#     i,j,k = values
#     idx = group.index
#     plt.scatter(resid[idx], s=144, edgecolors='black')
# plt.xlabel('X');
# plt.ylabel('standardized resids');

# Regression analysis - Forced-choice

In [49]:
# Regression table for forced choice: the three grouping columns from the count matrix
# next to the row-wise mean of each sentiment key.
reg_forced = pd.DataFrame({
    **{col: forced_choice_df[col] for col in ('sex', 'age', 'ethnicity')},
    'mean-compound': forced_compound['mean-compound'],
    'mean-neg': forced_neg['mean-neg'],
    'mean-pos': forced_pos['mean-pos'],
    'mean-neu': forced_neu['mean-neu'],
})

In [50]:
# Keep only rows where every column (grouping variables and all four mean scores) is present.
reg_forced = reg_forced.dropna()

## Descriptive plots

In [188]:
source = reg_forced

# Red rule at the overall mean of the compound score.
rule = alt.Chart(source).mark_rule(color='red').encode(
    x=alt.X('mean-compound:Q', aggregate='mean'),
)

# Histogram (at most 30 bins) of the row-wise mean compound score, with the mean rule on top.
hist_forced_all_com = alt.layer(
    alt.Chart(source).mark_bar().encode(
        x=alt.X('mean-compound:Q', bin=alt.Bin(maxbins=30), title='mean sentiment distribution'),
        y=alt.Y('count()'),
    ),
    rule,
)
hist_forced_all_com

In [189]:
test = error_bar_chart | hist_forced_all_com

In [190]:
svg_string(test, 'test')

Saved!


### Sentiment by sex

In [191]:
source = reg_forced

# Mean of the compound score, split by sex through the color encoding.
rule = alt.Chart(source).mark_rule(color='red').encode(
    x=alt.X('mean-compound:Q', aggregate='mean'),
    color=alt.Color('sex'),
)

# Overlaid (unstacked) step-area histograms of the mean compound score, one per sex.
hist_forced_sex_com = alt.layer(
    alt.Chart(source).mark_area(opacity=0.5, interpolate='step').encode(
        x=alt.X('mean-compound:Q', bin=alt.Bin(maxbins=30), title='mean sentiment distribution'),
        y=alt.Y('count()', stack=None),
        color=alt.Color('sex'),
    ),
    rule,
)
hist_forced_sex_com

### Sentiment by ethnicity

In [192]:
source = reg_forced

# Mean of the compound score, split by ethnicity through the color encoding.
rule = alt.Chart(source).mark_rule(color='red').encode(
    x=alt.X('mean-compound:Q', aggregate='mean'),
    color=alt.Color('ethnicity'),
)

# Overlaid (unstacked) step-area histograms of the mean compound score, one per ethnicity.
hist_forced_ethnicity_com = alt.layer(
    alt.Chart(source).mark_area(opacity=0.5, interpolate='step').encode(
        x=alt.X('mean-compound:Q', bin=alt.Bin(maxbins=30), title='mean sentiment distribution'),
        y=alt.Y('count()', stack=None),
        color=alt.Color('ethnicity'),
    ),
    rule,
)
hist_forced_ethnicity_com

### Sentiment by age

In [193]:
source = reg_forced

# Mean of the compound score, split by age group through the color encoding.
rule = alt.Chart(source).mark_rule(color='red').encode(
    x=alt.X('mean-compound:Q', aggregate='mean'),
    color=alt.Color('age'),
)

# Overlaid (unstacked) step-area histograms of the mean compound score, one per age group.
hist_forced_age_com = alt.layer(
    alt.Chart(source).mark_area(opacity=0.5, interpolate='step').encode(
        x=alt.X('mean-compound:Q', bin=alt.Bin(maxbins=30), title='mean sentiment distribution'),
        y=alt.Y('count()', stack=None),
        color=alt.Color('age'),
    ),
    rule,
)
hist_forced_age_com

## Mean and error bar showing confidence interval

In [194]:
# Label each row with its ethnicity_sex_age combination; used as the y axis of the error-bar charts.
reg_forced['groups'] = reg_forced['ethnicity']+'_'+reg_forced['sex']+'_'+reg_forced['age']

### compound

In [279]:
source = reg_forced

# Per-group error bars. extent='ci' draws the confidence interval of the mean, so the axis
# title names the CI (it previously said "std", which is not what is plotted).
error_bars = alt.Chart(source).transform_calculate(
    categories="datum.ethnicity + '-' + datum.sex").mark_errorbar(extent='ci').encode(
    x=alt.X('mean-compound:Q', scale=alt.Scale(zero=False),title='compound sentiment score - mean and 95% CI'),
    y=alt.Y('groups:N'),
    color=alt.Color('categories:N')
)

# Mean compound score per `groups` value, coloured by the ethnicity-sex category.
points = alt.Chart(source).transform_calculate(
    categories="datum.ethnicity + '-' + datum.sex").mark_point(filled=True, color='black').encode(
    x=alt.X('mean-compound:Q', aggregate='mean'),
    y=alt.Y('groups:N'),
    color=alt.Color('categories:N'))

# Rule at the mean over all rows, as a reference line.
rule = alt.Chart(source).mark_rule(color='red').encode(
    x=alt.X('mean-compound:Q', aggregate='mean'))

In [280]:
# Layer the CI bars, the group means and the overall-mean rule into one chart and display it.
forced_error_bar_com  = error_bars+points+rule
forced_error_bar_com 

### negative

In [281]:
source = reg_forced

# Per-group error bars. extent='ci' draws the confidence interval of the mean, so the axis
# title names the CI (it previously said "std", which is not what is plotted).
error_bars = alt.Chart(source).transform_calculate(
    categories="datum.ethnicity + '-' + datum.sex").mark_errorbar(extent='ci').encode(
    x=alt.X('mean-neg:Q', scale=alt.Scale(zero=False), title='negative sentiment score - mean and 95% CI'),
    y=alt.Y('groups:N'),
    color=alt.Color('categories:N')
)

# Mean negative score per `groups` value, coloured by the ethnicity-sex category.
points = alt.Chart(source).transform_calculate(
    categories="datum.ethnicity + '-' + datum.sex").mark_point(filled=True, color='black').encode(
    x=alt.X('mean-neg:Q', aggregate='mean'),
    y=alt.Y('groups:N'),
    color=alt.Color('categories:N'))

# Rule at the mean over all rows, as a reference line.
rule = alt.Chart(source).mark_rule(color='red').encode(
    x=alt.X('mean-neg:Q', aggregate='mean'))

In [282]:
# Layer the CI bars, the group means and the overall-mean rule into one chart and display it.
forced_error_bar_neg = error_bars+points+rule
forced_error_bar_neg 

### positive

In [283]:
source = reg_forced

# Per-group error bars. extent='ci' draws the confidence interval of the mean, so the axis
# title names the CI (it previously said "std", which is not what is plotted).
error_bars = alt.Chart(source).transform_calculate(
    categories="datum.ethnicity + '-' + datum.sex").mark_errorbar(extent='ci').encode(
    x=alt.X('mean-pos:Q', scale=alt.Scale(zero=False), title='positive sentiment score - mean and 95% CI'),
    y=alt.Y('groups:N'),
    color=alt.Color('categories:N')
)

# Mean positive score per `groups` value, coloured by the ethnicity-sex category.
points = alt.Chart(source).transform_calculate(
    categories="datum.ethnicity + '-' + datum.sex").mark_point(filled=True, color='black').encode(
    x=alt.X('mean-pos:Q', aggregate='mean', title='mean-pos-sentiment'),
    y=alt.Y('groups:N'),
    color=alt.Color('categories:N'))

# Rule at the mean over all rows, as a reference line.
rule = alt.Chart(source).mark_rule(color='red').encode(
    x=alt.X('mean-pos:Q', aggregate='mean'))

In [284]:
# Layer the CI bars, the group means and the overall-mean rule into one chart and display it.
forced_error_bar_pos = error_bars+points+rule
forced_error_bar_pos 

### neutral

In [285]:
source = reg_forced

# Per-group error bars. extent='ci' draws the confidence interval of the mean, so the axis
# title names the CI (it previously said "std", which is not what is plotted).
error_bars = alt.Chart(source).transform_calculate(
    categories="datum.ethnicity + '-' + datum.sex").mark_errorbar(extent='ci').encode(
    x=alt.X('mean-neu:Q', scale=alt.Scale(zero=False),title='neutral sentiment score - mean and 95% CI'),
    y=alt.Y('groups:N'),
    color=alt.Color('categories:N')
)

# Mean neutral score per `groups` value, coloured by the ethnicity-sex category.
points = alt.Chart(source).transform_calculate(
    categories="datum.ethnicity + '-' + datum.sex").mark_point(filled=True, color='black').encode(
    x=alt.X('mean-neu:Q', aggregate='mean', title='mean-neu-sentiment'),
    y=alt.Y('groups:N'),
    color=alt.Color('categories:N'))

# Rule at the mean over all rows, as a reference line.
rule = alt.Chart(source).mark_rule(color='red').encode(
    x=alt.X('mean-neu:Q', aggregate='mean'))

In [286]:
# Layer the CI bars, the group means and the overall-mean rule into one chart and display it.
forced_error_bar_neu = error_bars+points+rule
forced_error_bar_neu 

## Model - Forced-choice

In [66]:
# center independent variables: code each two-level factor as -0.5 / +0.5
# Series.map returns the float codes directly. Series.replace on a string column only
# produced floats through pandas' object-dtype downcasting, which is deprecated; without it
# the coded columns would stay object-typed and the formula would treat them as categorical.
# NOTE: a label missing from a mapping becomes NaN here instead of being kept as a string.
reg_forced['sexC'] = reg_forced['sex'].map({'female': -0.5, 'male': 0.5})
reg_forced['ageC'] = reg_forced['age'].map({'child': -0.5, 'adult': 0.5})
reg_forced['ethnicityC'] = reg_forced['ethnicity'].map({'bipoc': -0.5, 'white': 0.5})

In [67]:
# Copy of the table with the outcome renamed to a formula-friendly identifier.
# NOTE(review): presumably because the hyphen in 'mean-compound' would be read as a minus sign in the model formula.
reg_forced_r = reg_forced.rename(columns={"mean-compound": "sentimentCom" })

In [68]:
# Additive OLS model on the forced-choice table.
# Rebinds `formula` and `lm`, so the free-labeling fit is no longer reachable under these names.
formula = "sentimentCom ~ sexC+ethnicityC+ageC"
lm = ols(formula, data=reg_forced_r).fit()
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:           sentimentCom   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                 -0.012
Method:                 Least Squares   F-statistic:                    0.2528
Date:                Wed, 03 Nov 2021   Prob (F-statistic):              0.859
Time:                        13:55:27   Log-Likelihood:                -556.02
No. Observations:                 192   AIC:                             1120.
Df Residuals:                     188   BIC:                             1133.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.8258      0.319     -2.585      0.0

In [69]:
# Influence diagnostics for the forced-choice additive model (printing the full table is left disabled).
infl = lm.get_influence()
# print(infl.summary_table())

##############################
# Check values for influence #
##############################

In [70]:
# Full factorial model on the forced-choice table: main effects plus all two-way and three-way interactions.
# Rebinds `lm_interX`, replacing the free-labeling interaction fit.
lm_interX = ols("sentimentCom ~ sexC*ethnicityC*ageC", data=reg_forced_r).fit()
print(lm_interX.summary())

                            OLS Regression Results                            
Dep. Variable:           sentimentCom   R-squared:                       0.011
Model:                            OLS   Adj. R-squared:                 -0.027
Method:                 Least Squares   F-statistic:                    0.2843
Date:                Wed, 03 Nov 2021   Prob (F-statistic):              0.960
Time:                        13:57:35   Log-Likelihood:                -555.38
No. Observations:                 192   AIC:                             1127.
Df Residuals:                     184   BIC:                             1153.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept               -0.8258 

In [71]:
# Influence diagnostics for the forced-choice interaction model (printing the full table is left disabled).
infl_interX = lm_interX.get_influence()
# print(infl_interX.summary_table())

##############################
# Check values for influence #
##############################

In [72]:
# Model comparison for forced choice: additive model (row 0) against the interaction model (row 1).
table1 = anova_lm(lm, lm_interX)
print(table1)

   df_resid          ssr  df_diff    ss_diff         F    Pr(>F)
0     188.0  3683.187856      0.0        NaN       NaN       NaN
1     184.0  3658.479182      4.0  24.708674  0.310675  0.870597


# Save charts to a dataframe for the dashboard

In [287]:
# Module-level accumulators filled by svg_string(): one rendered chart and one title per call.
svg_str = []
image_title = []

def svg_string(chart, title):
    """Render ``chart`` with altair_saver and record the result together with ``title``.

    The value returned by ``altair_saver.save(chart, fmt='svg')`` is appended to
    ``svg_str`` and ``title`` to ``image_title``, so matching entries share the same
    position in both lists. Prints a confirmation message and returns None.
    """
    rendered = altair_saver.save(chart, fmt='svg')
    svg_str.append(rendered)
    image_title.append(title)
    print('Saved!')

In [288]:
# Side-by-side panels: forced-choice chart on the left, free-labeling chart on the right.
hist_all = alt.hconcat(hist_forced_all_com, hist_free_all_com)
hist_sex = alt.hconcat(hist_forced_sex_com, hist_free_sex_com)
hist_ethnicity = alt.hconcat(hist_forced_ethnicity_com, hist_free_ethnicity_com)
hist_age = alt.hconcat(hist_forced_age_com, hist_free_age_com)
error_bar_com = alt.hconcat(forced_error_bar_com, free_error_bar_com)
error_bar_neg = alt.hconcat(forced_error_bar_neg, free_error_bar_neg)
error_bar_pos = alt.hconcat(forced_error_bar_pos, free_error_bar_pos)
error_bar_neu = alt.hconcat(forced_error_bar_neu, free_error_bar_neu)

In [289]:
# Render every combined panel and record it under the title it is stored with.
# Each call appends to svg_str / image_title, so re-running this cell without re-running
# the cell that resets those lists adds duplicate entries.
# NOTE(review): the last title ends with a trailing space — confirm whether any consumer matches on it before changing it.
svg_string(hist_all, 'Overall mean sentiment distribution')
svg_string(hist_sex, 'Mean sentiment distribution by sex')
svg_string(hist_ethnicity, 'Mean sentiment distribution by ethnicity')
svg_string(hist_age, 'Mean sentiment distribution by age group')
svg_string(error_bar_com, 'Compounded sentiment score')
svg_string(error_bar_neg, 'Negative sentiment score')
svg_string(error_bar_pos, 'Positive sentiment score')
svg_string(error_bar_neu, 'Neutral sentiment score ')

Saved!
Saved!
Saved!
Saved!
Saved!
Saved!
Saved!
Saved!


In [290]:
# Two-column table: chart title and the rendered chart recorded for it, in call order.
images_strings = pd.DataFrame({'image_title': image_title, 'svg': svg_str})

In [291]:
# Write the same table twice: to this project's clean_data folder and to the data folder
# of the emotions_dashboard directory two levels up.
images_strings.to_csv('../clean_data/sentiment_svg_strings.csv', index=False)
images_strings.to_csv('../../emotions_dashboard/data/sentiment_svg_strings.csv', index=False)

In [292]:
# Read the exported file back in (presumably as a check that the export round-trips).
df_svg = pd.read_csv('../clean_data/sentiment_svg_strings.csv')

In [293]:
# List the titles stored in the exported file.
df_svg['image_title']

0         Overall mean sentiment distribution
1          Mean sentiment distribution by sex
2    Mean sentiment distribution by ethnicity
3    Mean sentiment distribution by age group
4                  Compounded sentiment score
5                    Negative sentiment score
6                    Positive sentiment score
7                    Neutral sentiment score 
Name: image_title, dtype: object