In [None]:
import re
from functools import partial

from tqdm.notebook import tqdm

import pandas as pd
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import levene
import matplotlib.pyplot as plt
import seaborn as sns

# Display names for the one-letter topic codes used in the survey column names.
topics = dict(
    a='Advice',
    e='Entertainment',
    g='Gossip',
    i='Informational',
    r='Recommendation',
    s='Social',
)

In [None]:
# Only columns whose full header matches this pattern are loaded, e.g. "a1+ tone":
# one of the letters aegirs, a digit 1-3, one of + = -, a space, then the question name.
rating_column = re.compile(r'[aegirs][123][+=-] (tone|clarity|intent)')

# The first line of the file supplies the column names; file lines 1 and 2
# (0-indexed) are skipped.
df = pd.read_csv(
    'data.csv',
    header=0,
    skiprows=[1, 2],
    usecols=rating_column.fullmatch,
)

# Keep the flat column names around for later cells.
cols = df.columns

display(df)

In [None]:
# Split every flat column name into four pieces and use them as column levels.
# Level order: name[4:] (question), name[2] (label), name[0] (topic), name[1] (message).
name_parts = (slice(4, None), 2, 0, 1)
df.columns = pd.MultiIndex.from_arrays(
    [cols.map(lambda name: name[part]) for part in name_parts]
)

display(df)

In [None]:
# Reshape to long form: column levels 1-3 (label, topic, message) move into the
# row index, the original row index level is dropped, and the three remaining
# index levels become ordinary columns next to one column per question.
stacked = (
    df.stack(level=[1, 2, 3])
    .droplevel(0)
    .rename_axis(index=['label', 'topic', 'message'])
    .reset_index()
)

display(stacked)

In [None]:
# Tukey HSD pairwise comparison between labels, run once per question over all
# rows of `stacked` at a 0.05 significance level.
tukey_tone, tukey_clarity, tukey_intent = (
    pairwise_tukeyhsd(endog=stacked[question], groups=stacked['label'], alpha=0.05)
    for question in ('tone', 'clarity', 'intent')
)

In [None]:
# Print the three overall Tukey HSD result tables, each under its own heading.
for heading, result in (
    ('Tone:\n', tukey_tone),
    ('\nClarity:\n', tukey_clarity),
    ('\nIntent:\n', tukey_intent),
):
    print(heading, result)

In [None]:
# Tukey HSD results per topic: topic code -> question name -> result object.
topic_analyses = {}

for topic in tqdm(stacked['topic'].unique()):
    # Rows belonging to the current topic only.
    subset = stacked.loc[stacked['topic'] == topic]

    # Compare labels pairwise within this topic, once per question.
    topic_analyses[topic] = {
        question: pairwise_tukeyhsd(
            endog=subset[question],
            groups=subset['label'],
            alpha=0.05
        )
        for question in ('tone', 'clarity', 'intent')
    }
    results = topic_analyses[topic]

    print(f'==={topics[topic]}===')
    print('Tone:\n', results['tone'])
    print('\nClarity:\n', results['clarity'])
    print('\nIntent:\n', results['intent'])
    print()

In [None]:
# For each question over all topics: run Levene's test across the label groups
# and draw a violin plot of the ratings per label, with the test result shown
# in the x-axis label.
questions = ('tone', 'clarity', 'intent')
labels = stacked['label'].unique()

fig, axes = plt.subplots(nrows=3, figsize=(5, 10))

for ax, question in zip(axes, questions):
    # One array of ratings per label, in order of first appearance.
    samples = [
        stacked.loc[stacked['label'] == label, question].values
        for label in labels
    ]
    statistic, p_value = levene(*samples)

    sns.violinplot(
        data=stacked,
        x=question,
        y='label',
        hue='label',
        inner='box',
        cut=0,
        density_norm='count',
        alpha=0.5,
        ax=ax,
    )
    ax.set_ylabel('Label')
    ax.set_xlabel(f'{question.capitalize()}, (Levene\'s Test: statistic={statistic:.3f}, p={p_value:.3f})')

plt.tight_layout()
plt.show()

In [None]:
# One figure per topic. For each question: Levene's test across the label groups
# within that topic, plus a violin plot of the ratings per label with the test
# result shown in the x-axis label.
questions = ['tone', 'clarity', 'intent']
labels = stacked['label'].unique()

for t_ind, topic in topics.items():
    # Filter once per topic rather than once per label and question.
    topic_rows = stacked[stacked['topic'] == t_ind]
    if topic_rows.empty:
        # This topic code has no rows in the data, so there is nothing to test
        # or plot (the per-topic Tukey cell likewise only visits topics present).
        continue

    fig, axs = plt.subplots(nrows=len(questions), figsize=(5,10))

    for ax, question in zip(axs, questions):
        groups = [topic_rows.loc[topic_rows['label'] == label, question].values for label in labels]
        statistic, p_value = levene(*groups)

        sns.violinplot(x=question, y='label', data=topic_rows, hue='label', inner='box', cut=0, density_norm='count', alpha=0.5, ax=ax)
        ax.set_ylabel('Label')
        ax.set_xlabel(f'{question.capitalize()}, (Levene\'s Test: statistic={statistic:.3f}, p={p_value:.3f})')

    # Title the figure through its own handle instead of pyplot's implicit current figure.
    fig.suptitle(topic)
    fig.tight_layout()
    plt.show()
    # Release the figure explicitly; not every backend closes it on show().
    plt.close(fig)