# 1 The preprocessing procedure.
Import the necessary libraries required for data processing.

In [None]:
import nltk
import numpy as np
import pandas as pd
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers=8)
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import r2_score
import itertools
from matplotlib.colors import Normalize, LogNorm
import scipy.stats as stats
from adjustText import adjust_text
from cliffs_delta import cliffs_delta
from statsmodels.nonparametric.bandwidths import bw_silverman
import diptest

# Panel labels ('a', 'b', ...) written in the corner of each subplot, indexed by axes position.
letters = list('abcdefghijklmnopqrstu')

# Matplotlib defaults shared by every figure drawn below.
plt.rcParams['axes.titlesize'] = 16
plt.rcParams['axes.labelsize'] = 16
plt.rcParams['xtick.labelsize'] = 14
plt.rcParams['ytick.labelsize'] = 14
plt.rcParams['figure.dpi'] = 300
plt.rcParams['font.weight'] = 'normal'

# Per-party colours: one set for fitted lines, a lighter set for scatter points / violins.
line_color = dict(Democrats='blue', Republicans='red')
scatter_color = dict(Democrats='#66CCFF', Republicans='#FF3333')

In [None]:
# Calculate Word Entropy
def entropy(text):
    """Return the Shannon entropy (base 2) of the token distribution of ``text``.

    The text is lower-cased and split with ``nltk.word_tokenize``; the
    probability of each distinct token is its count divided by the total
    number of tokens.
    """
    tokens = nltk.word_tokenize(text.lower())
    token_counts = nltk.probability.FreqDist(tokens)
    probabilities = np.array(list(token_counts.values())) / len(tokens)
    return -np.sum(probabilities * np.log2(probabilities))

In [None]:
# Load the per-user dataset (texts aggregated by author) from the working directory.
dataset = pd.read_csv("userDataset.csv")

# Disabled step: would compute the word entropy of every user's text with `entropy`.
# NOTE(review): later cells read dataset['entropy']; presumably userDataset.csv
# already ships that column — confirm before relying on it.
# dataset['entropy'] = dataset['text'].map(entropy)

## 2 The phenomenon of polarization across different platforms.

In [None]:
# Draw the fig4
# Reshape to long form (one row per user and metric) so that toxicity and
# pessimism can be drawn as separate facet rows.
jointDataset = dataset.melt(id_vars=['party','Theme','Platform', "core_status"], var_name='ValueType', value_name='Value',value_vars=['pessimism','toxicity'],)
# Fold the metric name into the platform label: each facet row is one platform/metric pair.
jointDataset['Platform'] =  jointDataset['Platform'] + ' (' + jointDataset['ValueType'] + ')'

g = sns.FacetGrid(jointDataset, row='Platform', col='Theme',
                  row_order=['Twitter (toxicity)','Reddit (toxicity)',
                             'Twitter (pessimism)', 'Reddit (pessimism)',],
                  height=4, aspect=1.5 , sharex='col', sharey=False,
                  margin_titles=True)

g.set_titles(col_template="{col_name}\n", row_template="{row_name}", size=20, fontweight='bold')

# NOTE(review): the statistical-test cells further down filter core_status on
# '1 degree' / '2 core' (space instead of hyphen); confirm which spelling the data uses.
g.map_dataframe(sns.violinplot, x="core_status", y="Value",
                hue='party',hue_order = ["Democrats", "Republicans"],
                palette=scatter_color,
                order = ['1-degree', '2-core'],
                dodge=True)

# `figure` is the attribute used elsewhere in this notebook; `fig` is its deprecated alias.
g.figure.subplots_adjust(wspace=0.1, hspace=0.2)

# Margin titles read "<platform> (<metric>)"; keep only the platform name.
for text_obj in g._margin_titles_texts:
    original_title = text_obj.get_text()
    text_obj.set_text(original_title.split()[0])

for index, ax in enumerate(g.axes.flat):
    ax.set_ylabel('')
    # Axes are enumerated row by row (3 columns): 0 and 3 start the toxicity rows,
    # 6 and 9 start the pessimism rows.
    if index in [0, 3]:
        ax.set_ylabel('Language toxicity', fontsize=20, fontweight='bold')
    if index in [6, 9]:
        ax.set_ylabel('Pessimism', fontsize=20, fontweight='bold')
    # Bottom row: embolden the x tick labels.
    if index in [9, 10 ,11]:
        for label in ax.get_xticklabels():
            label.set_fontweight('bold')

    # Panel letter just above the top-left corner of the axes.
    ax.text(0, 1.1, letters[index], transform=ax.transAxes,
        fontsize=20, fontweight='bold', va='top', ha='right')
g.add_legend(label_order=['Republicans','Democrats'], loc='upper center', title=None, bbox_to_anchor=(0.35, 1.04), ncol=2, fontsize=20, markerscale=2.5)
g.set_xlabels('', fontsize=20, fontweight='bold')
plt.savefig('fig4.png', bbox_inches='tight')
plt.savefig('fig4.pdf', bbox_inches='tight')
plt.show()

## 3 The phenomenon of polarization over time.

In [None]:
# Need to read dataset over time.
# Daily statistics files, keyed by "<theme>(<platform>)".
TopicsWithDatasetPath = {
    '2016 U.S. presidential election(Twitter)' : 'Twitter/daily_statistics[pessimism][debunking=keywords][lang=en][topic=POTUS2016][platform=Twitter].csv',
    '2020 U.S. presidential election(Twitter)' : 'Twitter/daily_statistics[pessimism][debunking=keywords][lang=en][topic=POTUS2020][platform=Twitter].csv',
    'QAnon(Twitter)'     : 'Twitter/daily_statistics[pessimism][debunking=keywords][lang=en][topic=QAnon][platform=Twitter].csv',
    '2016 U.S. presidential election(Reddit)'  : 'Reddit/daily_statistics[pessimism][debunking=keywords][lang=en][topic=POTUS2016][platform=Reddit].csv',
    '2020 U.S. presidential election(Reddit)'  : 'Reddit/daily_statistics[pessimism][debunking=keywords][lang=en][topic=POTUS2020][platform=Reddit].csv',
    'QAnon(Reddit)'      : 'Reddit/daily_statistics[pessimism][debunking=keywords][lang=en][topic=QAnon][platform=Reddit].csv',
}

TopicsWithDatasetRaw = {topic: pd.read_csv(datasetPath) for topic, datasetPath in TopicsWithDatasetPath.items()}
# Columns kept from the raw files and the short names they are given.
ColumnsNames = ['date','user_count','compound_mean_no_extreme','toxicity_mean_no_extreme','pessimism_mean_no_extreme']
RenamedColumns = {
    'date':'date',
    'user_count':'user_count',
    'compound_mean_no_extreme':'sentiment',
    'toxicity_mean_no_extreme':'toxicity',
    'pessimism_mean_no_extreme':'pessimism'
}

TopicsWithDataset = {topic: datasetRaw[ColumnsNames].rename(columns=RenamedColumns) for topic, datasetRaw in TopicsWithDatasetRaw.items()}
# The loop variable must not be called `dataset`: a `for` target is a module-level
# name in a notebook, and rebinding `dataset` would replace the per-user frame
# that the later cells still read.
for topic, dailyStats in TopicsWithDataset.items():
    dailyStats['user_count'] = np.log10(dailyStats['user_count'])
    dailyStats['date'] = pd.to_datetime(dailyStats['date'])

# Keep only the 2020 Twitter rows dated 2020-09-06 or later.
twitter2020 = '2020 U.S. presidential election(Twitter)'
TopicsWithDataset[twitter2020] = TopicsWithDataset[twitter2020][TopicsWithDataset[twitter2020]['date'] >= pd.to_datetime('2020-09-06')]
TopicsWithDatasetInDate = {topic: dailyStats.set_index('date') for topic, dailyStats in TopicsWithDataset.items()}


# Alignment
# For every theme, cut the Reddit and Twitter series to their common date range.
ThemeType = ['2016 U.S. presidential election','2020 U.S. presidential election',"QAnon"]
PlatformType = ['Reddit','Twitter']
PartyType = ['Democrats','Republicans']
for theme in ThemeType:
    redditKey = f'{theme}(Reddit)'
    twitterKey = f'{theme}(Twitter)'
    maxData = min(TopicsWithDatasetInDate[redditKey].index.max(), TopicsWithDatasetInDate[twitterKey].index.max())
    minData = max(TopicsWithDatasetInDate[redditKey].index.min(), TopicsWithDatasetInDate[twitterKey].index.min())
    TopicsWithDatasetInDate[redditKey] = TopicsWithDatasetInDate[redditKey][minData:maxData]
    TopicsWithDatasetInDate[twitterKey] = TopicsWithDatasetInDate[twitterKey][minData:maxData]

# Due to the presence of gaps in the middle, the non-gap portions of time were extracted to ensure consistency.
# 'date' goes back to being an ordinary column.
TopicsWithDatasetInDate = {topic: dailyStats.reset_index() for topic, dailyStats in TopicsWithDatasetInDate.items()}

In [None]:
# Key event date of every topic (election days and 2021-01-06 for QAnon).
PointDate = {
    '2016 U.S. presidential election(Twitter)' : pd.to_datetime("2016-11-8"),
    '2020 U.S. presidential election(Twitter)' : pd.to_datetime("2020-11-3"),
    'QAnon(Twitter)' : pd.to_datetime("2021-1-6"),
    '2016 U.S. presidential election(Reddit)' : pd.to_datetime("2016-11-8"),
    '2020 U.S. presidential election(Reddit)' : pd.to_datetime("2020-11-3"),
    'QAnon(Reddit)' : pd.to_datetime("2021-1-6"),
}
# end
days_d = {}    # topic -> list of window summaries (before + central + after)
keyDays = {}   # topic -> raw toxicity / pessimism series of the central window
TopicsWithDatasetInDateIndex = {topic: frame.set_index('date') for topic, frame in TopicsWithDatasetInDate.items()}

# The loop variable is `d`, not `dataset`: a `for` target is a module-level name in
# a notebook, and rebinding `dataset` would replace the per-user frame used later.
for topic, d in TopicsWithDatasetInDateIndex.items():
    target_date = PointDate[topic]
    days = d.index
    times = 2  # maximum number of extra windows taken on each side
    one_day = pd.Timedelta(days=1)
    one_week = pd.Timedelta(days=7)
    label_shift = pd.Timedelta(days=15)

    # Central window: from 7 days before to 7 days after the key date.
    start_date = target_date - one_week
    end_date = target_date + one_week
    central = d.loc[start_date:end_date]
    central_median = central.median()
    middle = [{
        'date' : target_date,
        'toxicity' : central_median['toxicity'],
        'pessimism' : central_median['pessimism']
    }]

    # Windows after the central one: 7 days each, starting the day after the
    # previous window ends, as long as that day exists in the index.
    s = end_date
    ends = []
    i = 0
    while s + one_day in days and i < times:
        i += 1
        e = s + one_week
        s = s + one_day
        t = s + label_shift  # date under which this window is reported
        window_median = d.loc[s:e].median()
        ends.append({
            'date' : t,
            'toxicity' : window_median['toxicity'],
            'pessimism' : window_median['pessimism']
        })
        s = e

    # Windows before the central one, walking backwards with the same rule.
    starts = []
    e = start_date
    i = 0
    while e - one_day in days and i < times:
        i += 1
        s = e - one_week
        e = e - one_day
        t = s - label_shift  # date under which this window is reported
        window_median = d.loc[s:e].median()
        starts.append({
            'date' : t,
            'toxicity' : window_median['toxicity'],
            'pessimism' : window_median['pessimism']
        })
        e = s

    days_d[topic] = starts + middle + ends
    keyDays[topic] = {
        'date' : target_date,
        'toxicity' : central['toxicity'],
        'pessimism' : central['pessimism']
    }

In [None]:
## Calculate k and b by computing the linear fit.
# Flatten the per-topic window summaries and fit toxicity = k * pessimism + b.
a = list(itertools.chain.from_iterable(days_d.values()))
b = list(map(lambda x: (x['toxicity'], x['pessimism']), a))
toxicity, pessimism = zip(*b)
# NOTE: `b` is rebound here from the list of pairs to the fitted intercept.
k, b = np.polyfit(pessimism, toxicity, deg=1)
# Points of the fitted line, drawn over a fixed pessimism range.
x = np.linspace(0.055, 0.075, 1000)
y = k*x + b

# Key event dates, used to pick the points that receive an event label.
v = list(PointDate.values())
# `a` is rebound: first a dict of per-topic frames, then one long frame with a 'topic' column.
a = {topic : pd.DataFrame(data) for topic, data in days_d.items()}
# Latest and earliest window date of every topic; these points get a date label.
rangeday = [[data['date'].max(), data['date'].min()] for topic, data in a.items()]
rangeday = list(itertools.chain.from_iterable(rangeday))
a = pd.concat(a.values(), keys=a.keys())
a = a.reset_index(level=0)
a = a.rename(columns={"level_0": "topic"})
# Date order matters: the line plot below is drawn with sort=False.
a = a.sort_values('date')

labeldays = a[a['date'].isin(v)]
rangedays = a[a['date'].isin(rangeday)]


# Draw the Fig6
plt.figure(figsize=(12,8))
sns.scatterplot(a, y='toxicity', x='pessimism', hue='topic', s=150,
                hue_order=['2016 U.S. presidential election(Twitter)','2020 U.S. presidential election(Twitter)','QAnon(Twitter)','2016 U.S. presidential election(Reddit)','2020 U.S. presidential election(Reddit)','QAnon(Reddit)'])
# Toxicity / Pessimism
# Connect the points of each topic in the row order of `a`.
sns.lineplot(a, y='toxicity', x='pessimism', hue='topic', alpha=0.5, legend=False, sort=False, linewidth=4,
             hue_order=['2016 U.S. presidential election(Twitter)','2020 U.S. presidential election(Twitter)','QAnon(Twitter)','2016 U.S. presidential election(Reddit)','2020 U.S. presidential election(Reddit)','QAnon(Reddit)'])

# Dashed red line: the linear fit computed above; the boxed platform names sit at fixed data coordinates.
sns.lineplot(x=x, y=y, linestyle="--", linewidth=4, color="red")
plt.text(0.063, 0.28, "Reddit", fontsize=20, fontweight="bold", bbox=dict(facecolor='white', edgecolor='red', boxstyle='round, pad=0.5', linewidth=3))
plt.text(0.054, 0.20, "Twitter", fontsize=20, fontweight="bold", bbox=dict(facecolor='white', edgecolor='red', boxstyle='round, pad=0.5',linewidth=3))

# Collect every annotation in `_` so adjust_text can de-overlap them together:
# date labels for the range points, event labels for the key-date points.
_ = [plt.text(x, y, s.strftime('%Y-%m-%d'), va='center') for x, y ,s in zip(rangedays['pessimism'].tolist(), rangedays['toxicity'].tolist(), rangedays['date'].tolist())]
_ += [plt.text(x, y, "January 6 United States Capitol attack", fontsize=14, fontweight='bold',  va='center') for x, y ,topic in zip(labeldays['pessimism'].tolist(), labeldays['toxicity'].tolist(), labeldays['topic'].tolist()) if topic.startswith("QAnon") == True]
# topic[:4] is the election year at the start of the topic name.
_ += [plt.text(x, y, f"{topic[:4]} Election Day", fontsize=14, fontweight='bold', va='center') for x, y ,topic in zip(labeldays['pessimism'].tolist(), labeldays['toxicity'].tolist(), labeldays['topic'].tolist()) if topic.startswith("QAnon") != True]
_ = adjust_text(_, x=a['pessimism'], y=a['toxicity'], expand=(1.2, 1.4), arrowprops=dict(arrowstyle='->', color='black'), prevent_crossings=True, min_arrow_len=7)


plt.xlabel('Pessimism', fontsize=16, fontweight='bold')
plt.ylabel('Language Toxicity' , fontsize=16, fontweight='bold')

plt.savefig('fig6.pdf')
plt.savefig('fig6.png')

# The labels in the published figure were positioned by hand; this code places
# them automatically, so the layout may look less tidy.

## 4 The phenomenon of polarization in Entropy

### 4.1 Minimal entropy interval for 50% of users

In [None]:
def ComputeProp(dataset, inf, sup):
    """Return the fraction of values in ``dataset`` lying strictly between ``inf`` and ``sup``."""
    above_lower = dataset > inf
    below_upper = dataset < sup
    hits = (above_lower & below_upper).sum()
    return hits / len(dataset)

def FindIntervals(dataset, length):
    """List every interval of width ``length`` that holds more than half of ``dataset``.

    Candidate intervals start at 0 and advance in steps of 0.1 while the start
    stays below ``dataset.max() - length``. Each hit is a dict with the rounded
    ``begin``, ``end`` and ``length`` plus ``occ``, the share of values strictly
    inside the interval.
    """
    upper = dataset.max()
    found = []
    for start in np.arange(0, upper - length, 0.1):
        stop = start + length
        share = ComputeProp(dataset, start, stop)
        if not share > 0.5:
            continue
        found.append({
            "begin": np.round(start, 1),
            "end": np.round(stop, 1),
            "length": np.round(length, 1),
            "occ": np.round(share, 4),
        })
    return found

def FindMinimumInterval(dataset):
    """Return the intervals of the smallest width that hold more than half of ``dataset``.

    Widths are tried from 0 upwards in steps of 0.1 (below ``dataset.max()``);
    the search stops at the first width for which ``FindIntervals`` finds
    anything. An empty list is returned when no width qualifies.
    """
    for width in np.arange(0, dataset.max(), 0.1):
        candidates = FindIntervals(dataset, width)
        if candidates:
            return candidates
    return []

In [None]:
# For every (Theme, Platform) group keep the first interval returned by
# FindMinimumInterval on that group's entropy values.
# NOTE(review): `[0]` raises IndexError if FindMinimumInterval returns an empty list.
ret = dataset.groupby(["Theme","Platform"]).apply(lambda x: pd.Series({
    "x" : FindMinimumInterval(x['entropy'])[0]
}))
ret = ret.reset_index()
# Expand the interval dict stored in column 'x' into one column per key.
ret[["begin", "end", "length", "occ"]] = pd.json_normalize(ret['x'])
ret = ret.drop(columns=["x"])
# The row index is written as an extra unnamed first column.
ret.to_csv("Entropy minimal interval.csv") # Entropy minimal interval for 50% of users

### 4.2 Entropy polarization across different platforms.

In [None]:
# Focusing on this 50% of users as the main subject, explore the phenomenon of entropy polarization.
# Reload the interval table written by the previous cell.
rangeList = pd.read_csv("Entropy minimal interval.csv")
# Nested dict: column name -> {(Theme, Platform): value}.
range_dict = rangeList.set_index(["Theme","Platform"]).to_dict()
def filter_entropy_inver(x):
    """Tell whether the entropy of row ``x`` lies strictly inside its group's interval.

    The interval bounds are read from the module-level ``range_dict`` under the
    ``(Theme, Platform)`` key of the row.
    """
    key = (x['Theme'], x['Platform'])
    lower = range_dict['begin'][key]
    upper = range_dict['end'][key]
    return (x['entropy'] > lower) & (x['entropy'] < upper)
# Flag every user whose entropy lies inside the interval of their (Theme, Platform) group.
dataset['isInEntropy'] = dataset.apply(filter_entropy_inver, axis=1)
inDataset = dataset[dataset['isInEntropy'] == True]

# Median toxicity / pessimism / entropy of the in-interval users per
# (Theme, Platform, party). The aggregation runs on the already-filtered rows:
# filter_entropy_inver works on a single row and cannot be called on a whole
# group frame (its dict lookup needs scalar Theme/Platform values), and it
# returns a boolean rather than a frame that could be indexed by column.
# The column order (toxicity, pessimism, entropy) is relied on by `labels_data`.
subd = inDataset.groupby(["Theme", 'Platform', "party"])[['toxicity', 'pessimism', 'entropy']].median()

# Rows ordered Twitter before Reddit, themes ascending, Republicans before
# Democrats; taken before the 'Size' column is added, so each row holds
# exactly the three medians.
labels_data = subd.sort_values(['Platform', "Theme", "party"], ascending=[False, True, False]).values

# Marker size grows as 2**entropy, scaled so that the smallest marker is 500.
subd['Size'] = np.exp2(subd['entropy'])
subd['Size'] = subd['Size'] / subd['Size'].min() * 500
subd.reset_index()

In [None]:
# Draw Fig7

globalMinSize = subd['Size'].min()
globalMaxSize = subd['Size'].max()

# One panel per (platform, theme); one bubble per party, sized by the 'Size' column.
g = sns.FacetGrid(subd.reset_index(), row='Platform', col='Theme', hue='party', row_order=['Twitter','Reddit'], palette=scatter_color, height=3, aspect=1.8, sharey=True, sharex=True, margin_titles=True)
g.set_titles(col_template="{col_name}\n", row_template="{row_name}", size=20, fontweight='bold')
g.map_dataframe( plt.scatter,
                x='toxicity',
                y='pessimism',
                s="Size",
                alpha=0.8,
                )

# `figure` is the attribute used elsewhere in this notebook; `fig` is its deprecated alias.
g.figure.subplots_adjust(wspace=0.18, hspace=0.32)

for index, ax in enumerate(g.axes.flat):
    # Show ticks and tick labels on every panel even though the axes are shared.
    ax.tick_params(axis="x", which="both", bottom=True, labelbottom=True)
    ax.tick_params(axis="y", which="both", left=True, labelleft=True)

    ax.set_ylim((0.04,0.09))
    ax.set_xlim((0.10,0.4))

    # labels_data holds two consecutive rows per panel (one per party). Each bubble
    # is labelled with its median entropy. The loop names are distinct from the
    # module-level `entropy` function so that it is not overwritten.
    value_labels = []
    for tox_median, pes_median, ent_median in (labels_data[2*index], labels_data[2*index+1]):
        value_labels.append(
            ax.text(x=tox_median, y=pes_median, s=f"{ent_median:.1f}", fontsize=16, fontweight='bold',
                    horizontalalignment='center',  # horizontal alignment: 'left', 'center' or 'right'
                    verticalalignment='center')
        )

    # Panel letter just above the top-left corner of the axes.
    ax.text(0, 1.15, letters[index], transform=ax.transAxes,
            fontsize=20, fontweight='bold', va='top', ha='right')

    adjust_text(value_labels, expand=(2, 2.5), arrowprops=dict(arrowstyle='->', color='black'), ax=ax)

legend_data = {
    'Democrats' : g._legend_data['Democrats'],
    'Republicans' : g._legend_data['Republicans'],
}
g.set_axis_labels('Language toxicity','Pessimism', fontsize=20, fontweight='bold')

g.add_legend(legend_data=legend_data, label_order=['Republicans','Democrats'], loc='upper center', title="", bbox_to_anchor=(0.35, 1.11), ncol=2, fontsize=20, markerscale=0.5)
plt.savefig('Fig7.png', bbox_inches='tight')
plt.savefig('Fig7.pdf', bbox_inches='tight')

### 4.3 Toxicity with entropy

In [None]:
# Bin users by entropy rounded to one decimal.
dataset['entropyRound'] = dataset['entropy'].round(1)
# Median toxicity and pessimism (missing values ignored) for every
# (Theme, Platform, party, entropy bin) combination.
binKeys = ['Theme', "Platform", "party", "entropyRound"]
mediandatatset = dataset.groupby(binKeys)[["toxicity", "pessimism"]].median()

def eval_deg1(x, y) :
    """Fit ``y`` against ``x`` with a straight line and score the fit.

    Returns a pair ``(r, r2)``: the correlation coefficient between ``y`` and
    the fitted values, and the ``r2_score`` of the fitted values against ``y``.
    """
    slope, intercept = np.polyfit(x, y, deg=1)
    fitted = slope * x + intercept
    correlation = np.corrcoef(y, fitted)[0, 1]
    return correlation, r2_score(y, fitted)

# Figure 8
# Quality of the linear fit per (Theme, Platform, party); toxicity is passed as
# x and the entropy bin as y. eval_deg1 is evaluated once per group and its
# (r, r2) pair is unpacked into the two columns.
reval = mediandatatset.reset_index().dropna().groupby(['Theme',"Platform","party"]).apply(
    lambda x: pd.Series(eval_deg1(x['toxicity'], x['entropyRound']), index=["r", "r2"])
)
# Order the rows like the panels of the figure: Twitter before Reddit, themes
# ascending, Republicans before Democrats; the final reset_index makes the
# position of each row its label.
resorted = reval.reset_index().sort_values(['Platform','Theme','party'], ascending=[False, True, False]).reset_index()
r = resorted['r'].round(2)
r2 = resorted['r2'].round(2)

def custom_regplot(x, y, **kwargs):
    """Draw a first-order ``sns.regplot`` coloured according to the ``label`` keyword.

    The ``label`` entry of ``kwargs`` selects the scatter colour from
    ``scatter_color`` and the line colour from ``line_color``; all keyword
    arguments are forwarded to ``sns.regplot`` unchanged.
    """
    party = kwargs.get('label')
    point_style = {'color': scatter_color[party], "alpha": 0.5}
    fit_style = {'color': line_color[party]}
    sns.regplot(x=x, y=y, order=1, scatter_kws=point_style, line_kws=fit_style, **kwargs)

g = sns.FacetGrid(mediandatatset.reset_index(), row='Platform',
                  row_order=['Twitter','Reddit'],
                    col='Theme', hue="party", sharex=True, sharey=False, margin_titles=True,
                    aspect=1.9)
g.set_titles(col_template="{col_name}\n\n", row_template="{row_name}", size=20, fontweight='bold')

g.map_dataframe(custom_regplot, y="entropyRound", x="toxicity", ci=95)

g.figure.subplots_adjust(wspace=0.25, hspace=0.27, top=1.0)


for text_obj in g._margin_titles_texts:
    original_title = text_obj.get_text()  # 获取原始标题
    text_obj.set_text(original_title.split()[0])  

for index, ax in enumerate(g.axes.flat):
    ax.tick_params(axis="x", which="both", bottom=True, labelbottom=True)
    # ax.tick_params(axis="y", which="both", left=True, labelleft=True)
    handles, labels = ax.get_legend_handles_labels()
    ax.set_ylabel('')
    if index in [0, 3]:
        ax.set_ylabel('Entropy', fontweight='bold')
    # if index in [6, 9]:
    #     ax.set_ylabel('Median Pessimism')
    ax.text(0, 1.15, letters[index], transform=ax.transAxes, 
        fontsize=20, fontweight='bold', va='top', ha='right')
    
    ax.text(0.8, 1.03, fr"$\mathbf{{R^2}}$={r2[index*2]}",transform=ax.transAxes, fontsize=14, fontweight='bold', va='top', ha='left', color='red')
    ax.text(0.8, 1.13, fr"$\mathbf{{R^2}}$={r2[index*2+1]}",transform=ax.transAxes, fontsize=14, fontweight='bold', va='top', ha='left', color='blue')
      
g.set_xlabels('Median language toxicity', fontweight='bold')

g.add_legend(loc='upper center', label_order=['Republicans','Democrats'], title="", bbox_to_anchor=(0.35, 1.32), ncol=2, fontsize=20, markerscale=2.5)
g.savefig('Fig8.pdf', bbox_inches='tight')
g.savefig('Fig8.png', bbox_inches='tight')

## 5 Test

In [None]:
def cohensD(x1, x2):
    """Return Cohen's d of ``x1`` versus ``x2``.

    The difference of the sample means is divided by the pooled standard
    deviation, built from the two sample standard deviations (``ddof=1``)
    weighted by their degrees of freedom.
    """
    size_a, size_b = len(x1), len(x2)
    sd_a = np.std(x1, ddof=1)
    sd_b = np.std(x2, ddof=1)
    pooled_variance = ((size_a - 1) * sd_a ** 2 + (size_b - 1) * sd_b ** 2) / (size_a + size_b - 2)
    return (np.mean(x1) - np.mean(x2)) / np.sqrt(pooled_variance)

def dipt(x):
    """Run ``diptest.diptest`` on ``x`` and return its two results as a tuple."""
    statistic, pvalue = diptest.diptest(x)
    return statistic, pvalue

def test(dataset, groupby, x1, y, values, alternative="greater"):
    return pd.concat(
        [
            dataset.groupby(groupby).apply(
                lambda x: pd.Series(
                    {
                        "Statistics": 
                        stats.mannwhitneyu(x[x[x1] == y[0]][value], x[x[x1] == y[1]][value], alternative=alternative)[0],
                        "P-value": 
                        stats.mannwhitneyu(x[x[x1] == y[0]][value], x[x[x1] == y[1]][value], alternative=alternative)[1],
                    }
                )
            )
        for value in values],
        keys=values,
    )

# Complete-case subset used by the tests below: only the listed columns are
# kept, and rows with a missing value in any of them are dropped.
lmDataset = dataset[['toxicity','pessimism','entropy','party','core_status','topic','Theme',"Platform"]].dropna()

In [None]:
# Table S7 S9
def _s7_tests(first, second):
    """Compare two toxicity samples, evaluating each test once.

    Returns a Series with the Mann-Whitney U statistic and p-value
    (alternative "less"), the two values returned by ``cliffs_delta`` and the
    Fligner-Killeen statistic and p-value.
    """
    mw = stats.mannwhitneyu(first, second, alternative="less")
    delta = cliffs_delta(first, second)
    fk = stats.fligner(first, second)
    return pd.Series(
        {
            "MW-u test(stat)": mw[0],
            "MW-u test(p)": mw[1],
            "cliff's Delta(stat)": delta[0],
            "cliff's Delta": delta[1],
            "Fligner-Killeen(stat)": fk[0],
            "Fligner-Killeen(p)": fk[1],
        }
    )

# lmDataset is built with .dropna(), so the toxicity samples contain no missing values.
ret = pd.concat(
    [
        # Democrats vs Republicans within every (Theme, Platform, core_status).
        lmDataset.groupby(['Theme','Platform',"core_status"]).apply(
            lambda x: _s7_tests(
                x[x['party'] == 'Democrats']['toxicity'],
                x[x['party'] == "Republicans"]['toxicity'],
            )
        ),
        # '1 degree' vs '2 core' users within every (Theme, Platform, party).
        # NOTE(review): the Fig4 cell orders core_status as '1-degree' / '2-core'
        # (hyphenated); confirm which spelling the data uses.
        lmDataset.groupby(['Theme','Platform','party']).apply(
            lambda x: _s7_tests(
                x[x['core_status'] == '1 degree']['toxicity'],
                x[x['core_status'] == '2 core']['toxicity'],
            )
        ),
    ],
    keys=['Party','Core Stauts']
)
ret.to_csv("S7.csv")
ret

In [None]:
# Table S8 S10
def _s8_tests(first, second):
    """Compare two pessimism samples, evaluating each test once.

    Returns a Series with the Mann-Whitney U statistic and p-value
    (alternative "greater"), the two values returned by ``cliffs_delta`` and
    the Fligner-Killeen statistic and p-value.
    """
    mw = stats.mannwhitneyu(first, second, alternative="greater")
    delta = cliffs_delta(first, second)
    fk = stats.fligner(first, second)
    return pd.Series(
        {
            "MW-u test(stat)": mw[0],
            "MW-u test(p)": mw[1],
            "cliff's Delta(stat)": delta[0],
            "cliff's Delta": delta[1],
            "Fligner-Killeen(stat)": fk[0],
            "Fligner-Killeen(p)": fk[1],
        }
    )

# lmDataset is built with .dropna(), so the pessimism samples contain no missing values.
ret = pd.concat(
    [
        # Democrats vs Republicans within every (Theme, Platform, core_status).
        lmDataset.groupby(['Theme','Platform',"core_status"]).apply(
            lambda x: _s8_tests(
                x[x['party'] == 'Democrats']['pessimism'],
                x[x['party'] == "Republicans"]['pessimism'],
            )
        ),
        # '1 degree' vs '2 core' users within every (Theme, Platform, party).
        # NOTE(review): the Fig4 cell orders core_status as '1-degree' / '2-core'
        # (hyphenated); confirm which spelling the data uses.
        lmDataset.groupby(['Theme','Platform','party']).apply(
            lambda x: _s8_tests(
                x[x['core_status'] == '1 degree']['pessimism'],
                x[x['core_status'] == '2 core']['pessimism'],
            )
        ),
    ],
    keys=['Party','Core Stauts']
)
ret.to_csv("S8.csv")
ret

In [None]:
# Table S12
# Mann-Whitney U test on toxicity per (Platform, Theme): Republicans are the first
# sample and Democrats the second, with the default alternative of `test` ("greater").
ret = test(lmDataset, ["Platform", "Theme"], 'party', ["Republicans", "Democrats"], ["toxicity"])
ret.to_csv("S12.csv")
ret

In [None]:
# Table S13
def _s13_tests(reddit, twitter):
    """Compare a Reddit sample with a Twitter sample, evaluating each test once.

    Returns a Series with the Mann-Whitney U statistic and p-value
    (alternative "greater"), the two values returned by ``cliffs_delta`` and
    the two-sample Kolmogorov-Smirnov statistic and p-value.
    """
    mw = stats.mannwhitneyu(reddit, twitter, alternative="greater")
    delta = cliffs_delta(reddit, twitter)
    ks = stats.kstest(reddit, twitter)
    return pd.Series(
        {
            "MW-u test(stat)": mw[0],
            "MW-u test(p)": mw[1],
            "cliff's Delta(stat)": delta[0],
            "cliff's Delta": delta[1],
            "kstest": ks[0],
            "ks(p)": ks[1],
        }
    )


def _s13_by_platform(frame, value):
    """Apply ``_s13_tests`` to column ``value`` within every (Theme, party) group of ``frame``."""
    return frame.groupby(['Theme', 'party']).apply(
        lambda x: _s13_tests(
            x[x['Platform'] == 'Reddit'][value],
            x[x['Platform'] == 'Twitter'][value],
        )
    )


ret = pd.concat(
    [
        # Toxicity is tested on the users inside the minimal entropy interval.
        # NOTE(review): inDataset is taken from `dataset`, not from the
        # NaN-filtered lmDataset; confirm that toxicity has no missing values there.
        _s13_by_platform(inDataset, 'toxicity'),
        # Pessimism and entropy are tested on lmDataset, which is built with
        # .dropna() and therefore holds no missing values in these columns.
        _s13_by_platform(lmDataset, 'pessimism'),
        _s13_by_platform(lmDataset, 'entropy'),
    ],
    keys=['Toxicity','Pessimism', "Entropy"]
)
ret.to_csv("S13.csv")
ret