### Import Libraries

In [1]:
import pandas as pd
from scipy.stats import spearmanr, pointbiserialr
from nltk import word_tokenize, pos_tag, sent_tokenize
from lexical_diversity import lex_div as ld
from collections import Counter
import glob
import matplotlib.pyplot as plt

plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.serif'] = ['Helvetica'] + plt.rcParams['font.serif']
plt.rcParams['font.size'] = 14
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42

### Read the Files

In [2]:
df = pd.read_parquet('../data/submissions.parquet')

In [3]:
# keep only the test set for analysis
df = df[df['train_valid_test'] == 'test']

## **Extract Features**

#### Break Submissions into Sentences

In [19]:
df['clean_body_sentences'] = df['clean_body'].apply(lambda x: sent_tokenize(x))

In [None]:
df['clean_body_tokens'] = df['clean_body'].apply(lambda x: word_tokenize(x))
df['clean_body_tokens'] = df['clean_body_tokens'].apply(lambda x: [i for i in x if i!=''])

### **Question marks**

In [29]:
df['is_question'] = df['clean_body_sentences'].apply(lambda x: 1 if '?' in x else 0)
df['question_ratio'] = df['clean_body_sentences'].apply(lambda x: len([i for i in x if '?' in i])/len(x))

### **Gratitude**

In [32]:
lexicon = {"thanks", "contented", "blessed", "thank you", "thankful for", "grateful for", "greatful for", "my gratitude", "i appreciate", "made me smile", "make me smile", "i super appreciate", "i deeply appreciate", "i really appreciate", "bless your soul", "made my day", "tysm", "thx", "shout out to"}

In [None]:
# binary
df['gratitude'] = df['clean_body'].apply(lambda x: 1 if any([i in x.lower() for i in lexicon]) else 0)

In [None]:
# get the counts
df['gratitude_count'] = df['clean_body'].apply(lambda x: sum([x.lower().count(w) for w in lexicon]))

# get the ratio
df['gratitude_ratio'] = df['gratitude_count'] / df['clean_body_tokens'].apply(lambda x: len(x) if len(x) > 0 else 1)

### **Proper Nouns from POS Tags**

In [69]:
df['pos_tags'] = df['clean_body'].apply(lambda x: pos_tag(word_tokenize(x, language='english')))

In [70]:
df['pos_tag_counts'] = df['pos_tags'].apply(lambda x: Counter([i[1] for i in x]))

In [71]:
df['proper_noun_count'] = df['pos_tag_counts'].apply(lambda x: x.get("NNP",0) + x.get("NNPS",0))

df['proper_noun_ratio'] = df['proper_noun_count'] / df['pos_tags'].apply(lambda x: len(x) if len(x) > 0 else 1)

### **Elaboration**

In [74]:
df['text_length'] = df['pos_tag_counts'].apply(lambda x: sum(x.values()))

In [None]:
lexical_items = {'NN', 'NNS', 'NNP', 'NNPS', 'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS'}

df['lexical_item_count'] = df['pos_tags'].apply(lambda x: len(set([t for t in x if t[1] in lexical_items])))

In [123]:
df['clean_body_tokens'] = df['clean_body'].apply(lambda x: ld.tokenize(x))

In [125]:
df['mtld'] = df['clean_body_tokens'].apply(lambda x: ld.mtld(x))

### **Hedge Words**

In [76]:
lexicon = []
# read the files and dump the contents into the list "lexicon"
for _ in glob.glob('../data/hedge_resources/*'):
    name = _.split('/')[-1].split('.')[0]
    with open(_, 'r') as f:
        temp = f.read().splitlines()
        temp = [i for i in temp if i!='' and i[0]!='#']

        if name == 'booster_words':
            temp = [f"not {i}" for i in temp] + [f"without {i}" for i in temp]
        lexicon.extend(temp)

# lowercase the lexicon
lexicon = [i.lower() for i in lexicon]

In [None]:
df['hedge'] = df['clean_body'].apply(lambda x: 1 if any([i in x.lower() for i in lexicon]) else 0)

# get the counts
df['hedge_count'] = df['clean_body'].apply(lambda x: sum([x.lower().count(w) for w in lexicon]))

df['hedge_ratio'] = df['hedge_count'] / df['pos_tags'].apply(lambda x: len(x))

### **Sentiment**

In [78]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [79]:
sid = SentimentIntensityAnalyzer()

In [80]:
df['polarity'] = df['clean_body'].apply(lambda x: sid.polarity_scores(x)['compound'])

In [81]:
df['positive_polarity'] = 0
df['negative_polarity'] = 0
df.loc[df['polarity']>0, 'positive_polarity'] = 1
df.loc[df['polarity']<0, 'negative_polarity'] = 1

## **Check Correlations**

In [87]:
c_median = df['c_score'].median()

In [88]:
FILTER = df['c_score']>c_median

In [89]:
df[FILTER]['hedge'].sum() / len(df[FILTER])
df[~FILTER]['hedge'].sum() / len(df[~FILTER])

0.560849985667126

In [42]:
for col in ['is_question', 'gratitude', 'hedge', 'positive_polarity', 'negative_polarity']:
    print(col)
    print("controversial:", pointbiserialr(df[FILTER][col], df[FILTER]['ta_score']))
    print(f'percentage of occurrence: {round(df[FILTER][col].sum()/len(df[FILTER]),2)*100}%')
    
    print("non-controversial:", pointbiserialr(df[~FILTER][col], df[~FILTER]['ta_score']))
    print(f'percentage of occurrence: {round(df[~FILTER][col].sum()/len(df[~FILTER]),2)*100}%')
    print('-----------------------------------')

is_question
controversial: SignificanceResult(statistic=-0.2997438110394871, pvalue=0.0)
percentage of occurrence: 40.0%
non-controversial: SignificanceResult(statistic=-0.2713689094556239, pvalue=0.0)
percentage of occurrence: 43.0%
-----------------------------------
gratitude
controversial: SignificanceResult(statistic=-0.026114766055494044, pvalue=7.435967524729462e-10)
percentage of occurrence: 3.0%
non-controversial: SignificanceResult(statistic=-0.05178714486500907, pvalue=2.589252454852214e-34)
percentage of occurrence: 5.0%
-----------------------------------
hedge
controversial: SignificanceResult(statistic=-0.10210276705398079, pvalue=1.2236794759213476e-128)
percentage of occurrence: 52.0%
non-controversial: SignificanceResult(statistic=-0.14696688144137235, pvalue=8.367918990359034e-266)
percentage of occurrence: 52.0%
-----------------------------------
positive_polarity
controversial: SignificanceResult(statistic=-0.10166432547299345, pvalue=1.5089553959678863e-127)
perc

In [142]:
for col in ['question_ratio', 'gratitude_ratio', 'proper_noun_ratio', 'text_length', 'lexical_item_count', 'mtld', 'submission_openai', 'polarity', 'hedge_ratio']:
    print(col)
    print("controversial:", spearmanr(df[FILTER][col], df[FILTER]['ta_score']))
    print(f'percentage of occurrence: {round((df[FILTER][col]!=0).sum()/len(df[FILTER]),2)*100}%')
    print("non-controversial:", spearmanr(df[~FILTER][col], df[~FILTER]['ta_score']))
    print(f'percentage of occurrence: {round((df[~FILTER][col]!=0).sum()/len(df[~FILTER]),2)*100}%')
    print('-----------------------------------')

question_ratio
controversial: SignificanceResult(statistic=-0.4389331973250033, pvalue=0.0)
percentage of occurrence: 47.0%
non-controversial: SignificanceResult(statistic=-0.3786187406989999, pvalue=0.0)
percentage of occurrence: 50.0%
-----------------------------------
gratitude_ratio
controversial: SignificanceResult(statistic=-0.06333147156947244, pvalue=1.6124441617310633e-96)
percentage of occurrence: 3.0%
non-controversial: SignificanceResult(statistic=-0.10826713691935491, pvalue=2.9334393419763457e-279)
percentage of occurrence: 5.0%
-----------------------------------
proper_noun_ratio
controversial: SignificanceResult(statistic=0.35727541911536714, pvalue=0.0)
percentage of occurrence: 81.0%
non-controversial: SignificanceResult(statistic=0.002093977937221086, pvalue=0.4910752747427096)
percentage of occurrence: 64.0%
-----------------------------------
text_length
controversial: SignificanceResult(statistic=-0.24512543007594223, pvalue=0.0)
percentage of occurrence: 100.0%

### Correlation Between Elaboration Metrics

In [145]:
spearmanr(df['mtld'], df['lexical_item_count'])

SignificanceResult(statistic=0.7980426656494866, pvalue=0.0)

In [146]:
spearmanr(df['text_length'], df['lexical_item_count'])

SignificanceResult(statistic=0.9779860625958424, pvalue=0.0)

## Regression Analysis

In [109]:
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

### Variation Inflation Factor

In [115]:
# the independent variables set
X = df[['c_score','question_ratio','gratitude_ratio','proper_noun_ratio','lexical_item_count','hedge_ratio','polarity']]

# VIF dataframe
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns

# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                          for i in range(len(X.columns))]

vif_data

Unnamed: 0,feature,VIF
0,c_score,2.467624
1,question_ratio,1.470034
2,gratitude_ratio,1.005694
3,proper_noun_ratio,1.600044
4,lexical_item_count,1.16393
5,hedge_ratio,1.400745
6,polarity,1.054376


In [116]:
# prepare X and y
X = sm.add_constant(X)

y = df['ta_score']

# initialize and fit the model
model = sm.OLS(y, X)
results = model.fit()

# see the results
results.summary()

0,1,2,3
Dep. Variable:,ta_score,R-squared:,0.371
Model:,OLS,Adj. R-squared:,0.371
Method:,Least Squares,F-statistic:,18190.0
Date:,"Tue, 22 Apr 2025",Prob (F-statistic):,0.0
Time:,21:12:12,Log-Likelihood:,248480.0
No. Observations:,216286,AIC:,-496900.0
Df Residuals:,216278,BIC:,-496900.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.0537,0.000,128.575,0.000,0.053,0.055
c_score,0.3437,0.001,264.250,0.000,0.341,0.346
question_ratio,-0.0667,0.001,-130.057,0.000,-0.068,-0.066
gratitude_ratio,-0.0175,0.022,-0.800,0.424,-0.060,0.025
proper_noun_ratio,0.0461,0.001,43.530,0.000,0.044,0.048
lexical_item_count,1.001e-06,1.45e-06,0.692,0.489,-1.83e-06,3.84e-06
hedge_ratio,0.0172,0.004,4.886,0.000,0.010,0.024
polarity,-0.0185,0.000,-61.206,0.000,-0.019,-0.018

0,1,2,3
Omnibus:,28978.938,Durbin-Watson:,1.534
Prob(Omnibus):,0.0,Jarque-Bera (JB):,44342.61
Skew:,0.966,Prob(JB):,0.0
Kurtosis:,4.091,Cond. No.,16800.0


## **Qualitative Analysis** (Table 2)

### Find BERTopic topic names associated with highly controversial topics
Topics are chosen from 2023 Gallup survey (https://news.gallup.com/poll/509129/update-partisan-gaps-expand-government-power-climate.aspx)
- Abortion
- Healthcare
- Gun Laws
- LGBTQ+
- Climate Change

In [95]:
for keyword in ['abortion', 'healthcare', 'medicare', 'guns', 'gay', 'climate']:
    print(f"{keyword}:")
    print(df[df['bertopic'].apply(lambda x: keyword in x)]['bertopic'].value_counts())
    print('----------------')

abortion:
bertopic
18_abortion_ban_rights_texas          406
257_walker_georgia_senate_abortion     40
Name: count, dtype: int64
----------------
healthcare:
bertopic
54_insurance_healthcare_health_medical    173
Name: count, dtype: int64
----------------
medicare:
bertopic
271_medicare_medicaid_expansion_coverage    37
Name: count, dtype: int64
----------------
guns:
bertopic
17_gun_guns_shootings_shooting    412
Name: count, dtype: int64
----------------
gay:
bertopic
71_gender_gay_nonbinary_bisexual       141
223_cup_world_football_gay              45
234_cases_outbreak_monkey_gay           44
329_samesex_marriage_gay_court          30
622_homophobic_gay_homophobia_queer     14
Name: count, dtype: int64
----------------
climate:
bertopic
32_climate_warming_change_global       258
135_climate_heat_hottest_warming        75
728_activists_climate_painting_pour     12
771_activists_climate_traffic_block     11
Name: count, dtype: int64
----------------


In [None]:
# prepare the filter to find high and low TA submissions
ta_median = df['ta_score'].median()
TA_FILTER = df['ta_score']>ta_median

for bertopic in ['18_abortion_ban_rights_texas', '271_medicare_medicaid_expansion_coverage', '17_gun_guns_shootings_shooting','71_gender_gay_nonbinary_bisexual', '32_climate_warming_change_global']:
    
    # given the topic, filter out the submissions
    BERTOPIC_FILTER = df['bertopic'].apply(lambda x: bertopic in x)

    # see examples combining both filters
    print(bertopic)
    print("High TA:", df[BERTOPIC_FILTER & TA_FILTER]['clean_body'].sample(1).iloc[0], '\n--------')
    print("Low TA:", df[BERTOPIC_FILTER & ~TA_FILTER]['clean_body'].sample(1).iloc[0], '\n--------\n')

18_abortion_ban_rights_texas
High TA: Cherokee Nation: Governor’s claim of ‘abortion on-demand’ on Tribal lands is ‘irresponsible’ 
--------
Low TA: How soon could US states outlaw abortions if Roe v Wade is overturned? | Roe v Wade | The Guardian 
--------

271_medicare_medicaid_expansion_coverage
High TA: GOP dominated Wisconsin legislature yet again rejects ACA Medicaid expansion for WI, (the only Midwestern state to reject it), because someone on Medicaid might win the lottery but still remain on Medicaid 
--------
Low TA: Democrats want to rescue union pensions from the party's failed bailout plan 
--------

17_gun_guns_shootings_shooting
High TA: They Want to Take Your Guns; CBS Cheers Canada’s Mass Gun Bans, Confiscation Plans 
--------
Low TA: If guns aren’t the issue, people are, then wouldn’t that somewhat insinuate that we’re purposefully giving the wrong people guns? Asking because that’s how I’m starting to view it. I’m more than likely missing something here so I’d like s