In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import sys
from pathlib import Path
# Put the parent of the current working directory on the import path so that
# the `src` package imported below can be resolved
# (assumes the notebook is run from a direct sub-folder of the project root — TODO confirm).
sys.path.append(str(Path('.').absolute().parent))

from src.helpers import generate_wordcloud
from src.templates import neg_templates

In [13]:
# Name of the model whose results are analysed; it selects the result folder
# read here and the plot folder written to by later cells.
# Alternatives that were previously toggled here: "sleng-bert", "sloberta-sleng".
model = "sloberta"
csv_path = f"../{model}/atributes_emotions.csv"
df = pd.read_csv(csv_path)

In [5]:
# Keep only rows whose template is NOT listed in `neg_templates`.
# `.copy()` turns the filtered result into an independent frame: a later cell
# adds a column to `df`, and doing that on a slice of the original frame
# raises pandas' SettingWithCopyWarning.
df = df[~df.template.isin(neg_templates)].copy()

In [None]:
# Distinct attributes that were not classified, i.e. rows with no value in
# the `Positive` column.
missing_positive = df["Positive"].isna()
df.loc[missing_positive, "atribute"].unique()

Sentiment and emotion distribution over all attributes

In [None]:
# Share (in %) of rows per sentiment category. Every percentage is relative
# to the total number of rows `n`, which the emotion plot below reuses.
n = len(df)


def _percent_of_rows(mask):
    """Return the percentage of all `n` rows for which the boolean `mask` is True."""
    return int(mask.sum()) / n * 100


pos = _percent_of_rows(df["Positive"] == 1)
neg = _percent_of_rows(df["Negative"] == 1)
mis = _percent_of_rows(df["Negative"].isna())  # no lexicon entry
neu = _percent_of_rows((df["Negative"] == 0) & (df["Positive"] == 0))

# Bar label -> percentage, in plotting order.
sentiment_shares = {
    "Nevtralno": neu,
    "Pozitivno": pos,
    "Ni v leksikonu": mis,
    "Negativno": neg,
}

plt.figure(figsize=(4,3), dpi=200)
sns.set_style("whitegrid")
ax = sns.barplot(
    x=list(sentiment_shares.keys()),
    y=list(sentiment_shares.values()),
    color="tab:blue",
)
ax.set_ylabel("%")
sentiment_plot_path = f"../{model}/plots/sentiment_distribution_.png"
plt.savefig(sentiment_plot_path, bbox_inches="tight", dpi=200)

In [None]:
# Emotion column -> bar label. Some labels carry padding spaces that are part
# of the text handed to the plot, so they are kept exactly as they are.
emotion_labels = {
    'Trust': 'Zaupanje',
    'Joy': 'Sreča',
    'Fear': 'Strah',
    'Sadness': 'Žalost  ',
    'Anticipation': '  Pričakovanje   ',
    'Disgust': 'Gnus',
    'Anger': 'Jeza  ',
    'Surprise': '  Presenečenje',
}
emotions = list(emotion_labels.keys())
labels = list(emotion_labels.values())

# Percentage of all rows (`n` comes from the sentiment cell above) in which
# each emotion column equals 1.
y = [int((df[column] == 1).sum()) / n * 100 for column in emotions]

plt.figure(figsize=(6.5, 3), dpi=200)
ax = sns.barplot(x=labels, y=y, color="tab:blue")
ax.set_ylabel("%")
emotion_plot_path = f"../{model}/plots/emotion_distribution_.png"
plt.savefig(emotion_plot_path, bbox_inches="tight", dpi=200)

Statistical testing
- Test whether any of the groups differ significantly in the share of negative/positive attributes.
- We use the Friedman test, followed by the Bonferroni-Dunn post-hoc test.

In [10]:
from scipy.stats import friedmanchisquare
import scipy.stats as st

In [None]:
# Friedman test over templates (blocks) and targets (treatments), followed by
# the average rank of every target.
emotion = "Negative"

# Flag rows that have a value in `Positive`; the per-target sum of this flag
# is the denominator of the percentages computed below.
df["classified"] = df["Positive"].notnull()

# One Series per template: percentage of classified rows with `emotion` == 1,
# indexed by target label.
all_measurements = []
for t in df.template.unique():
    df_t = df[df.template == t]
    # Aggregate only the two columns that are needed. A bare `.sum()` also
    # tries to add up every other column, and pandas >= 2.0 no longer drops
    # non-numeric columns silently.
    df_grouped = (
        df_t.groupby(["group", "target"])[[emotion, "classified"]]
        .sum()
        .reset_index()
    )
    percent = df_grouped[emotion] / df_grouped["classified"] * 100
    all_measurements.append(
        pd.Series(percent.to_numpy(), index=df_grouped["target"].to_numpy(), name=t)
    )

# Rows = templates, columns = targets. Building the frame from labelled Series
# aligns the values by target label instead of by position, so a template with
# a missing or differently ordered target yields NaN rather than a silently
# shifted column.
scores = pd.DataFrame(all_measurements).reset_index(drop=True)
groups = scores.columns.tolist()
group_scores = [scores[g].values for g in groups]
stat, p = friedmanchisquare(*group_scores)
print(f"p-value: {p}")

# Rank the targets within each template, then average each target's rank over
# all templates and order the targets by that average.
group_ranks = scores.T.rank().mean(axis=1)
group_ranks = group_ranks.sort_values()

In [None]:
# Critical distance (CD) of the Bonferroni-Dunn post-hoc test:
#     CD = z * sqrt(k * (k + 1) / (6 * N))
# Two average ranks that differ by more than CD are significantly different.
k = len(group_ranks)            # number of compared groups
N = len(df.template.unique())   # number of templates (blocks of the Friedman test)
alpha = 0.05

# Bonferroni correction for the k - 1 comparisons against the control, then
# the two-sided normal quantile for the corrected level.
alpha_c = alpha/(k-1)
area = 1-alpha_c/2
z = st.norm.ppf(area)
# The divisor is 6*N as a whole. The previous `(k*(k+1))/6*N` multiplied by N
# because of operator precedence, which inflated CD by a factor of N.
cd = z * np.sqrt(k * (k + 1) / (6 * N))
print(z)
print(cd)

In [None]:
# Horizontal bar chart of the average ranks. The "<mask>" entry is the
# reference bar; every other bar is coloured by whether its average rank lies
# more than `cd` away from the reference.
sns.set_style("whitegrid")
plt.figure(figsize=(3,14))

upper_lim = group_ranks["<mask>"] + cd
lower_lim = group_ranks["<mask>"] - cd


def _bar_colour(label):
    """Return the bar colour for the entry `label` of `group_ranks`."""
    if label == "<mask>":
        return 'tab:orange'
    if group_ranks[label] > upper_lim or group_ranks[label] < lower_lim:
        # Outside the [lower_lim, upper_lim] band around the reference.
        return 'tab:blue'
    return 'gray'


clrs = [_bar_colour(label) for label in group_ranks.index]

ax = sns.barplot(
    y=group_ranks.index,
    x=group_ranks.values,
    orient="horizontal",
    palette=clrs,
)

ax.set_xlabel("Povprečni rang")
ax.set_ylabel("")
ax.set_title("")
ax.tick_params(axis='both', which='major', labelsize=10)
rank_plot_path = f"../{model}/plots/{emotion}_.png"
plt.savefig(rank_plot_path, facecolor="white", bbox_inches="tight", dpi=200)

Plot a word cloud of the attributes, colored by sentiment

In [None]:
# Render and save the word cloud for a single group. The image itself is built
# by the project helper `generate_wordcloud` (per the section heading it
# colours words by sentiment — the helper is not visible here).
group = "Slovenci"
wordcloud = generate_wordcloud(df, group)

fig = plt.figure(figsize=(10,5))
plt.imshow(wordcloud)
# Remove the padding first, then hide the axes (order kept from the original).
plt.tight_layout(pad=0)
plt.axis("off")
wordcloud_path = f"../{model}/plots/wordcloud_{group}.png"
plt.savefig(wordcloud_path)