In [28]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np
from scipy.stats import levene

Load data

In [43]:
data = pd.read_csv("data/engineered_morality.csv")
data.columns

  data = pd.read_csv("data/engineered_morality.csv")


Index(['Unnamed: 0.1', 'Unnamed: 0', 'id', 'created', 'author', 'score',
       'num_comments', 'link', 'cleaned_text', 'word_count', 'type', 'link_id',
       'year', 'month', 'Segment_1', 'i', 'you', 'shehe', 'they', 'emo_pos',
       'emo_neg', 'emo_anx', 'emo_anger', 'emo_sad', 'moral', 'Segment',
       'Care_Virtue', 'Care_Vice', 'Fairness_Virtue', 'Fairness_Vice',
       'Loyalty_Virtue', 'Loyalty_Vice', 'Authority_Virtue', 'Authority_Vice',
       'Sanctity_Virtue', 'Sanctity_Vice', 'Care_total', 'Fairness_total',
       'Loyalty_total', 'Authority_total', 'Sanctity_total', 'Virtue_total',
       'Vice_total', 'Foundations_total_score', 'Subreddit'],
      dtype='object')

### LIWC moral score

Observe means

In [44]:
sub_means = data.groupby('Subreddit')['moral'].mean()
print(sub_means)

Subreddit
homeowners         0.154216
investing          0.193725
selfimprovement    0.330980
Name: moral, dtype: float64


Observe variance of moral scores

In [45]:
sub_variances = data.groupby('Subreddit')['moral'].var()
print(sub_variances)

Subreddit
homeowners         0.243729
investing          0.303137
selfimprovement    0.490467
Name: moral, dtype: float64


Statistically check differences in variances

In [46]:
# Split the data by subreddit
group1 = data[data['Subreddit'] == 'selfimprovement']['moral']
group2 = data[data['Subreddit'] == 'homeowners']['moral']
group3 = data[data['Subreddit'] == 'investing']['moral']

# Levene's test for equal variances
stat, p_value = levene(group1, group2, group3)

print(f"Levene’s test statistic: {stat:.4f}")
print(f"p-value: {p_value:.4f}")

Levene’s test statistic: 12522.9065
p-value: 0.0000


Heterogeneous variances

Create a binary variable: 1 if morality score > 0, else 0. This is to try to model the likelihood of containing *any* moral language.

In [47]:
data['moral_present'] = (data['moral'] > 0).astype(int)

Check binary distribution

In [48]:
# Get value counts 
counts = data['moral_present'].value_counts()

# Convert to percentages
percentages = counts / counts.sum() * 100

summary_df = pd.DataFrame({
    'Count': counts,
    'Percentage': percentages.round(2)
})

print(summary_df)

                 Count  Percentage
moral_present                     
0              1211271        80.3
1               297194        19.7


Check distribution by subreddit

In [49]:
subreddit_percent = data.groupby('Subreddit')['moral_present'].value_counts(normalize=True).unstack().fillna(0) * 100
subreddit_percent = subreddit_percent.round(2)
print(subreddit_percent)

moral_present        0      1
Subreddit                    
homeowners       86.98  13.02
investing        83.79  16.21
selfimprovement  70.25  29.75


Make the subreddit column a categorical variable that will be used as a categorical predictor (X)

In [50]:
data['Subreddit'] = data['Subreddit'].astype('category')

Logistic model --> Is moral language present?

Run the logistic regression

In [51]:
logit_model = smf.glm(
    formula='moral_present ~ C(Subreddit)',
    data=data,
    family=sm.families.Binomial()
).fit(cov_type='HC1')  # HC1 is a covariance estimator that adjusts for 
                          #heteroskedasticity

print(logit_model.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:          moral_present   No. Observations:              1508465
Model:                            GLM   Df Residuals:                  1508462
Model Family:                Binomial   Df Model:                            2
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -7.2422e+05
Date:                Sun, 06 Apr 2025   Deviance:                   1.4484e+06
Time:                        15:20:06   Pearson chi2:                 1.51e+06
No. Iterations:                     5   Pseudo R-squ. (CS):            0.03176
Covariance Type:                  HC1                                         
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept 

Marginal effects

In [39]:
mfx = logit_model.get_margeff()
print(mfx.summary())

         GLM Marginal Effects        
Dep. Variable:          moral_present
Method:                          dydx
At:                           overall
                                     dy/dx    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
C(Subreddit)[T.investing]           0.0392      0.001     45.141      0.000       0.038       0.041
C(Subreddit)[T.selfimprovement]     0.1590      0.001    204.931      0.000       0.158       0.161


Get probabilities of showing moral language, to make it more intuitive

In [52]:
# Coefficients from the GLM output
intercept = -1.8992  # Homeowners (baseline)
coef_investing = 0.2565
coef_selfimprovement = 1.0399

Compute log-odds for each subreddit

In [54]:
logit_homeowners = intercept 
logit_investing = intercept + coef_investing
logit_selfimprovement = intercept + coef_selfimprovement

print("Log-Odds:")
print(f"Homeowners: {logit_homeowners}")
print(f"Investing: {logit_investing}")
print(f"Selfimprovement: {logit_selfimprovement}")


Log-Odds:
Homeowners: -1.8992
Investing: -1.6427
Selfimprovement: -0.8593


Convert log-odds with predicted probabilities

In [55]:
# Logistic function
def logit_to_prob(logit):
    return np.exp(logit) / (1 + np.exp(logit))

# Apply to each group
prob_homeowners = logit_to_prob(logit_homeowners)
prob_investing = logit_to_prob(logit_investing)
prob_selfimprovement = logit_to_prob(logit_selfimprovement)

print("\nPredicted Probabilities:")
print(f"Homeowners: {prob_homeowners:.4f}")
print(f"Investing: {prob_investing:.4f}")
print(f"Selfimprovement: {prob_selfimprovement:.4f}")



Predicted Probabilities:
Homeowners: 0.1302
Investing: 0.1621
Selfimprovement: 0.2975


### Moral Foundations Dictionary Score

Obseve means

In [56]:
sub_means_mfd = data.groupby('Subreddit')['Foundations_total_score'].mean()
print(sub_means_mfd)

Subreddit
homeowners         1.446587
investing          1.652188
selfimprovement    1.958449
Name: Foundations_total_score, dtype: float64


  sub_means_mfd = data.groupby('Subreddit')['Foundations_total_score'].mean()


Observe variances

In [57]:
sub_variances_mfd = data.groupby('Subreddit')['Foundations_total_score'].var()
print(sub_variances_mfd)

Subreddit
homeowners         2.641538
investing          3.255616
selfimprovement    3.195955
Name: Foundations_total_score, dtype: float64


  sub_variances_mfd = data.groupby('Subreddit')['Foundations_total_score'].var()


Levene tests to check differences in variances

In [58]:
# Split the data by subreddit
group1 = data[data['Subreddit'] == 'selfimprovement']['Foundations_total_score']
group2 = data[data['Subreddit'] == 'homeowners']['Foundations_total_score']
group3 = data[data['Subreddit'] == 'investing']['Foundations_total_score']

# Levene's test for equal variances
stat, p_value = levene(group1, group2, group3)

print(f"Levene’s test statistic: {stat:.4f}")
print(f"p-value: {p_value:.4f}")


Levene’s test statistic: 1558.6439
p-value: 0.0000


Variances are different here too. 

Create a binary variable: 1 if morality score > 0, else 0. This is to try to model the likelihood of containing *any* moral language.

In [59]:
data['moral_present_mfd'] = (data['Foundations_total_score'] > 0).astype(int)

Check binary distribution

In [60]:
# Get value counts 
counts = data['moral_present_mfd'].value_counts()

# Convert to percentages
percentages = counts / counts.sum() * 100

summary_df = pd.DataFrame({
    'Count': counts,
    'Percentage': percentages.round(2)
})

print(summary_df)

                     Count  Percentage
moral_present_mfd                     
1                  1069640       70.91
0                   438825       29.09


Check distribution by subreddit

In [61]:
subreddit_percent = data.groupby('Subreddit')['moral_present_mfd'].value_counts(normalize=True).unstack().fillna(0) * 100
subreddit_percent = subreddit_percent.round(2)
print(subreddit_percent)

moral_present_mfd      0      1
Subreddit                      
homeowners         35.04  64.96
investing          32.06  67.94
selfimprovement    20.28  79.72


  subreddit_percent = data.groupby('Subreddit')['moral_present_mfd'].value_counts(normalize=True).unstack().fillna(0) * 100


Logistic model

In [62]:
logit_model_mfd = smf.glm(
    formula='moral_present_mfd ~ C(Subreddit)',
    data=data,
    family=sm.families.Binomial()
).fit(cov_type='HC1')  # HC1 again for heteroskedasticity

print(logit_model_mfd.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:      moral_present_mfd   No. Observations:              1508465
Model:                            GLM   Df Residuals:                  1508462
Model Family:                Binomial   Df Model:                            2
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -8.9414e+05
Date:                Sun, 06 Apr 2025   Deviance:                   1.7883e+06
Time:                        15:21:47   Pearson chi2:                 1.51e+06
No. Iterations:                     5   Pseudo R-squ. (CS):            0.02023
Covariance Type:                  HC1                                         
                                      coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
Intercept 

Marginal effects

In [63]:
mfx = logit_model_mfd.get_margeff()
print(mfx.summary())

         GLM Marginal Effects        
Dep. Variable:      moral_present_mfd
Method:                          dydx
At:                           overall
                                     dy/dx    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------------------
C(Subreddit)[T.investing]           0.0270      0.001     31.626      0.000       0.025       0.029
C(Subreddit)[T.selfimprovement]     0.1520      0.001    168.534      0.000       0.150       0.154


Get probabilities of showing moral language, to make it more intuitive

In [64]:
# Coefficients from the GLM output
intercept = 0.6172 #Homeowners (baseline)
coef_investing = 0.1138
coef_selfimprovement = 0.7516

Compute log-odds for each subreddit

In [65]:
logit_homeowners = intercept 
logit_investing = intercept + coef_investing
logit_selfimprovement = intercept + coef_selfimprovement

print("Log-Odds:")
print(f"Homeowners: {logit_homeowners}")
print(f"Investing: {logit_investing}")
print(f"Selfimprovement: {logit_selfimprovement}")

Log-Odds:
Homeowners: 0.6172
Investing: 0.731
Selfimprovement: 1.3688


Convert log-odds with predicted probabilities

In [67]:
# Logistic function
def logit_to_prob(logit):
    return np.exp(logit) / (1 + np.exp(logit))

# Apply to each group
prob_homeowners = logit_to_prob(logit_homeowners)
prob_investing = logit_to_prob(logit_investing)
prob_selfimprovement = logit_to_prob(logit_selfimprovement)

print("\nPredicted Probabilities:")
print(f"Homeowners: {prob_homeowners:.4f}")
print(f"Investing: {prob_investing:.4f}")
print(f"Selfimprovement: {prob_selfimprovement:.4f}")


Predicted Probabilities:
Homeowners: 0.6496
Investing: 0.6750
Selfimprovement: 0.7972


## Conclusion

According to both the LIWC's moral dimension and The Moral Foundations dictionary, the subreddit r/selfimprovement has a higher probability of showing moral language.