In [49]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
from scipy.stats import levene

Load data

In [50]:
data = pd.read_csv("data/engineered_morality.csv")
data.columns

  data = pd.read_csv("data/engineered_morality.csv")


Index(['Unnamed: 0.1', 'Unnamed: 0', 'id', 'created', 'author', 'score',
       'num_comments', 'link', 'cleaned_text', 'word_count', 'type', 'link_id',
       'year', 'month', 'Segment_1', 'i', 'you', 'shehe', 'they', 'emo_pos',
       'emo_neg', 'emo_anx', 'emo_anger', 'emo_sad', 'moral', 'Segment',
       'Care_Virtue', 'Care_Vice', 'Fairness_Virtue', 'Fairness_Vice',
       'Loyalty_Virtue', 'Loyalty_Vice', 'Authority_Virtue', 'Authority_Vice',
       'Sanctity_Virtue', 'Sanctity_Vice', 'Care_total', 'Fairness_total',
       'Loyalty_total', 'Authority_total', 'Sanctity_total', 'Virtue_total',
       'Vice_total', 'Foundations_total_score', 'Subreddit'],
      dtype='object')

## Research Question 1: Is moral language present in discussions about self-improvement?

### LIWC moral score

Observe means

In [3]:
sub_means = data.groupby('Subreddit')['moral'].mean()
print(sub_means)

Subreddit
homeowners         0.154216
investing          0.193725
selfimprovement    0.330980
Name: moral, dtype: float64


Observe variance of moral scores

In [4]:
sub_variances = data.groupby('Subreddit')['moral'].var()
print(sub_variances)

Subreddit
homeowners         0.243729
investing          0.303137
selfimprovement    0.490467
Name: moral, dtype: float64


Statistically check differences in variances

In [5]:
# Split the data by subreddit
group1 = data[data['Subreddit'] == 'selfimprovement']['moral']
group2 = data[data['Subreddit'] == 'homeowners']['moral']
group3 = data[data['Subreddit'] == 'investing']['moral']

# Levene's test for equal variances
stat, p_value = levene(group1, group2, group3)

print(f"Levene’s test statistic: {stat:.4f}")
print(f"p-value: {p_value:.4f}")

Levene’s test statistic: 12522.9065
p-value: 0.0000


Heterogeneous variances

Create a binary variable: 1 if morality score > 0, else 0. This is to try to model the likelihood of containing *any* moral language.

In [55]:
data['moral_present'] = (data['moral'] > 0).astype(int)

Check binary distribution

In [7]:
# Get value counts 
counts = data['moral_present'].value_counts()

# Convert to percentages
percentages = counts / counts.sum() * 100

summary_df = pd.DataFrame({
    'Count': counts,
    'Percentage': percentages.round(2)
})

print(summary_df)

                 Count  Percentage
moral_present                     
0              1211271        80.3
1               297194        19.7


Check distribution by subreddit

In [8]:
subreddit_percent = data.groupby('Subreddit')['moral_present'].value_counts(normalize=True).unstack().fillna(0) * 100
subreddit_percent = subreddit_percent.round(2)
print(subreddit_percent)

moral_present        0      1
Subreddit                    
homeowners       86.98  13.02
investing        83.79  16.21
selfimprovement  70.25  29.75


Make the subreddit column a categorical variable that will be used as a categorical predictor (X)

In [9]:
data['Subreddit'] = data['Subreddit'].astype('category')

Logistic model --> Is moral language present?

Run the logistic regression

In [12]:
logit_model = smf.glm(
    formula='moral_present ~ C(Subreddit, Treatment(reference="selfimprovement"))',
    data=data,
    family=sm.families.Binomial()
).fit(cov_type='HC1')  # HC1 is a covariance estimator that adjusts for 
                          #heteroskedasticity

print(logit_model.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:          moral_present   No. Observations:              1508465
Model:                            GLM   Df Residuals:                  1508462
Model Family:                Binomial   Df Model:                            2
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -7.2422e+05
Date:                Sun, 06 Apr 2025   Deviance:                   1.4484e+06
Time:                        17:27:00   Pearson chi2:                 1.51e+06
No. Iterations:                     5   Pseudo R-squ. (CS):            0.03176
Covariance Type:                  HC1                                         
                                                                         coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------

Marginal effects

In [13]:
mfx = logit_model.get_margeff()
print(mfx.summary())

         GLM Marginal Effects        
Dep. Variable:          moral_present
Method:                          dydx
At:                           overall
                                                                        dy/dx    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------------------------------------
C(Subreddit, Treatment(reference="selfimprovement"))[T.homeowners]    -0.1590      0.001   -204.931      0.000      -0.161      -0.158
C(Subreddit, Treatment(reference="selfimprovement"))[T.investing]     -0.1198      0.001   -162.795      0.000      -0.121      -0.118


Get probabilities of showing moral language, to make it more intuitive

In [14]:
# Coefficients from the GLM output
intercept = -0.8593  # Selfimprovement (baseline)
coef_homeowners = -1.0399
coef_investing = -0.7834

Compute log-odds for each subreddit

In [15]:
logit_selfimprovement = intercept 
logit_homeowners = intercept + coef_homeowners
logit_investing = intercept + coef_investing


print("Log-Odds:")
print(f"Selfimprovement: {logit_selfimprovement}")
print(f"Homeowners: {logit_homeowners}")
print(f"Investing: {logit_investing}")


Log-Odds:
Selfimprovement: -0.8593
Homeowners: -1.8992
Investing: -1.6427


Convert log-odds with predicted probabilities

In [17]:
# Logistic function
def logit_to_prob(logit):
    return np.exp(logit) / (1 + np.exp(logit))

# Apply to each group
prob_selfimprovement = logit_to_prob(logit_selfimprovement)
prob_homeowners = logit_to_prob(logit_homeowners)
prob_investing = logit_to_prob(logit_investing)


print("\nPredicted Probabilities:")
print(f"Selfimprovement: {prob_selfimprovement:.4f}")
print(f"Homeowners: {prob_homeowners:.4f}")
print(f"Investing: {prob_investing:.4f}")


Predicted Probabilities:
Selfimprovement: 0.2975
Homeowners: 0.1302
Investing: 0.1621


### Moral Foundations Dictionary Score

Obseve means

In [18]:
sub_means_mfd = data.groupby('Subreddit')['Foundations_total_score'].mean()
print(sub_means_mfd)

Subreddit
homeowners         1.446587
investing          1.652188
selfimprovement    1.958449
Name: Foundations_total_score, dtype: float64


  sub_means_mfd = data.groupby('Subreddit')['Foundations_total_score'].mean()


Observe variances

In [19]:
sub_variances_mfd = data.groupby('Subreddit')['Foundations_total_score'].var()
print(sub_variances_mfd)

Subreddit
homeowners         2.641538
investing          3.255616
selfimprovement    3.195955
Name: Foundations_total_score, dtype: float64


  sub_variances_mfd = data.groupby('Subreddit')['Foundations_total_score'].var()


Levene tests to check differences in variances

In [20]:
# Split the data by subreddit
group1 = data[data['Subreddit'] == 'selfimprovement']['Foundations_total_score']
group2 = data[data['Subreddit'] == 'homeowners']['Foundations_total_score']
group3 = data[data['Subreddit'] == 'investing']['Foundations_total_score']

# Levene's test for equal variances
stat, p_value = levene(group1, group2, group3)

print(f"Levene’s test statistic: {stat:.4f}")
print(f"p-value: {p_value:.4f}")


Levene’s test statistic: 1558.6439
p-value: 0.0000


Variances are different here too. 

Create a binary variable: 1 if morality score > 0, else 0. This is to try to model the likelihood of containing *any* moral language.

In [21]:
data['moral_present_mfd'] = (data['Foundations_total_score'] > 0).astype(int)

Check binary distribution

In [22]:
# Get value counts 
counts = data['moral_present_mfd'].value_counts()

# Convert to percentages
percentages = counts / counts.sum() * 100

summary_df = pd.DataFrame({
    'Count': counts,
    'Percentage': percentages.round(2)
})

print(summary_df)

                     Count  Percentage
moral_present_mfd                     
1                  1069640       70.91
0                   438825       29.09


Check distribution by subreddit

In [23]:
subreddit_percent = data.groupby('Subreddit')['moral_present_mfd'].value_counts(normalize=True).unstack().fillna(0) * 100
subreddit_percent = subreddit_percent.round(2)
print(subreddit_percent)

moral_present_mfd      0      1
Subreddit                      
homeowners         35.04  64.96
investing          32.06  67.94
selfimprovement    20.28  79.72


  subreddit_percent = data.groupby('Subreddit')['moral_present_mfd'].value_counts(normalize=True).unstack().fillna(0) * 100


Logistic model

In [24]:
logit_model_mfd = smf.glm(
    formula='moral_present_mfd ~ C(Subreddit, Treatment(reference="selfimprovement"))',
    data=data,
    family=sm.families.Binomial()
).fit(cov_type='HC1')  # HC1 again for heteroskedasticity

print(logit_model_mfd.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:      moral_present_mfd   No. Observations:              1508465
Model:                            GLM   Df Residuals:                  1508462
Model Family:                Binomial   Df Model:                            2
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -8.9414e+05
Date:                Sun, 06 Apr 2025   Deviance:                   1.7883e+06
Time:                        17:33:28   Pearson chi2:                 1.51e+06
No. Iterations:                     5   Pseudo R-squ. (CS):            0.02023
Covariance Type:                  HC1                                         
                                                                         coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------

Marginal effects

In [25]:
mfx = logit_model_mfd.get_margeff()
print(mfx.summary())

         GLM Marginal Effects        
Dep. Variable:      moral_present_mfd
Method:                          dydx
At:                           overall
                                                                        dy/dx    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------------------------------------
C(Subreddit, Treatment(reference="selfimprovement"))[T.homeowners]    -0.1520      0.001   -168.534      0.000      -0.154      -0.150
C(Subreddit, Treatment(reference="selfimprovement"))[T.investing]     -0.1249      0.001   -136.124      0.000      -0.127      -0.123


Get probabilities of showing moral language, to make it more intuitive

In [30]:
# Coefficients from the GLM output
intercept = 1.3688 #Selfimprovement (baseline)
coef_homeowners = -0.7516
coef_investing = -0.6178


Compute log-odds for each subreddit

In [31]:
logit_selfimprovement = intercept 
logit_homeowners = intercept + coef_homeowners
logit_investing = intercept + coef_investing

print("Log-Odds:")
print(f"Selfimprovement: {logit_selfimprovement}")
print(f"Homeowners: {logit_homeowners}")
print(f"Investing: {logit_investing}")

Log-Odds:
Selfimprovement: 1.3688
Homeowners: 0.6172
Investing: 0.751


Convert log-odds with predicted probabilities

Guide: https://sebastiansauer.github.io/convert_logit2prob/

In [32]:
# Logistic function
def logit_to_prob(logit):
    return np.exp(logit) / (1 + np.exp(logit))

# Apply to each group
prob_selfimprovement = logit_to_prob(logit_selfimprovement)
prob_homeowners = logit_to_prob(logit_homeowners)
prob_investing = logit_to_prob(logit_investing)


print("\nPredicted Probabilities:")
print(f"Selfimprovement: {prob_selfimprovement:.4f}")
print(f"Homeowners: {prob_homeowners:.4f}")
print(f"Investing: {prob_investing:.4f}")



Predicted Probabilities:
Selfimprovement: 0.7972
Homeowners: 0.6496
Investing: 0.6794


### Conclusion RQ1

According to both the LIWC's moral dimension and The Moral Foundations dictionary, the subreddit r/selfimprovement has a higher probability of showing moral language.

## RQ 3. How is the use of moral language associated with emotions in the context of self-improvement?

In [51]:
selfimpr = data[data.Subreddit == "selfimprovement"]
selfimpr.shape

(506574, 45)

In [57]:
selfimpr['moral_present'] = np.where(selfimpr['moral'] > 0, 'present', 'not present')
selfimpr['moral_present'] = selfimpr['moral_present'].astype('category')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selfimpr['moral_present'] = np.where(selfimpr['moral'] > 0, 'present', 'not present')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selfimpr['moral_present'] = selfimpr['moral_present'].astype('category')


In [20]:
selfimpr.moral_present.value_counts()

moral_present
not present    355876
present        150698
Name: count, dtype: int64

### General emo_neg

Check variance

In [24]:
# Split the data by morality presence
group1 = selfimpr[selfimpr['moral_present'] == 'not present']['emo_neg']
group2 = selfimpr[selfimpr['moral_present'] == 'present']['emo_neg']

# Levene's test for equal variances
stat, p_value = levene(group1, group2)

print(f"Levene’s test statistic: {stat:.4f}")
print(f"p-value: {p_value:.4f}")

Levene’s test statistic: 10.8353
p-value: 0.0010


Heterogeneous variances

Create binary label for negative emotion score

In [21]:
selfimpr['emo_neg_present'] = (selfimpr['emo_neg'] > 0).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selfimpr['emo_neg_present'] = (selfimpr['emo_neg'] > 0).astype(int)


In [25]:
selfimpr.emo_neg_present.value_counts()

emo_neg_present
1    286026
0    220548
Name: count, dtype: int64

Logistic regression

In [27]:
logit_model = smf.glm(
    formula='emo_neg_present ~ C(moral_present, Treatment(reference="not present"))',
    data=selfimpr,
    family=sm.families.Binomial()
).fit(cov_type='HC1')  # HC1 is a covariance estimator that adjusts for 
                          #heteroskedasticity

print(logit_model.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:        emo_neg_present   No. Observations:               506574
Model:                            GLM   Df Residuals:                   506572
Model Family:                Binomial   Df Model:                            1
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -3.3893e+05
Date:                Sun, 06 Apr 2025   Deviance:                   6.7786e+05
Time:                        21:41:15   Pearson chi2:                 5.07e+05
No. Iterations:                     5   Pseudo R-squ. (CS):            0.03093
Covariance Type:                  HC1                                         
                                                                      coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------

Calculate odds ratio

In [62]:
coeff = 0.8056
np.exp(coeff)

2.2380389194051817

Posts with moral language are approximately 2.23 times more likely to express anger language

### Anger

Binary label for anger score

In [52]:
selfimpr['emo_ang_present'] = (selfimpr['emo_anger'] > 0).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selfimpr['emo_ang_present'] = (selfimpr['emo_anger'] > 0).astype(int)


In [53]:
selfimpr.emo_ang_present.value_counts()

emo_ang_present
0    427030
1     79544
Name: count, dtype: int64

Logistic regression

In [58]:
logit_model = smf.glm(
    formula='emo_ang_present ~ C(moral_present, Treatment(reference="not present"))',
    data=selfimpr,
    family=sm.families.Binomial()
).fit(cov_type='HC1')  

print(logit_model.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:        emo_ang_present   No. Observations:               506574
Model:                            GLM   Df Residuals:                   506572
Model Family:                Binomial   Df Model:                            1
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -2.1340e+05
Date:                Sat, 26 Apr 2025   Deviance:                   4.2680e+05
Time:                        19:38:19   Pearson chi2:                 5.07e+05
No. Iterations:                     5   Pseudo R-squ. (CS):            0.02652
Covariance Type:                  HC1                                         
                                                                      coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------

Calculate odds ratio

In [61]:
coeff = 0.9331
np.exp(coeff)

2.542378346979088

Posts with moral language are approximately 2.54 times more likely to express anger language