In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
import sys
from pathlib import Path
# Make the directory above the current working directory importable so the
# project-local `src` package can be found (assumes the kernel was started in
# the notebook's own folder — confirm if imports fail).
sys.path.append(str(Path.cwd().parent))

from src.templates import neg_templates

In [15]:
# Display names of the compared models; the lower-cased name is also used
# as the directory each model's outputs are read from.
model_names = [
    "SloBERTa",
    "SloBERTa-SlEng",
    "SlEng-BERT",
]

In [16]:
# Read each model's attribute/emotion table and tag every row with the model
# it came from, so the tables can be stacked and compared.
df_models = []
for model_name in model_names:
    df_model = pd.read_csv(f"../{model_name.lower()}/atributes_emotions.csv")
    df_model["model"] = model_name
    df_models.append(df_model)

# ignore_index=True gives the stacked frame a fresh, unique RangeIndex.
# Without it every per-model frame contributes its own 0..n-1 labels, and the
# repeated labels make later index-aligned assignments and .loc lookups
# ambiguous or error-prone.
df_all = pd.concat(df_models, ignore_index=True)

In [17]:
# Keep only rows whose template is not listed in neg_templates
# (presumably the negated prompt templates — confirm in src.templates).
from_neg_template = df_all["template"].isin(neg_templates)
df_all = df_all[~from_neg_template]

Compare sentiment and emotion distributions for different models

In [18]:
def get_sentiment(row):
    """Translate a row's Positive/Negative flags into a Slovene sentiment label.

    Args:
        row: Any object exposing ``Positive`` and ``Negative`` attributes
            (here: a DataFrame row passed by ``apply(..., axis=1)``).

    Returns:
        "Pozitiven" if ``Positive`` is 1 (checked first, so it wins when both
        flags are 1), "Negativen" if ``Negative`` is 1, "Nevtralen" if both
        flags are 0, and "Ni v leksikonu" for anything else (e.g. missing
        values — presumably words absent from the lexicon).
    """
    if row.Positive == 1:
        label = "Pozitiven"
    elif row.Negative == 1:
        label = "Negativen"
    elif row.Positive == 0 and row.Negative == 0:
        label = "Nevtralen"
    else:
        label = "Ni v leksikonu"
    return label

# Label every row; get_sentiment already accepts a row, so it is handed to
# apply directly instead of through a wrapper.
df_all["Sentiment"] = df_all.apply(get_sentiment, axis="columns")

In [19]:
# Number of rows per model; attached to every row as column "N" so it can
# serve as the denominator when computing percentages.
size = df_all.groupby("model").size().reset_index(name="N")

# Drop an "N" column left over from a previous execution of this cell first:
# joining onto a frame that already has "N" raises a column-overlap ValueError,
# which made the cell fail when re-run. errors="ignore" keeps the first run a
# plain join.
df_all = df_all.drop(columns="N", errors="ignore").join(size.set_index("model"), on="model")

In [None]:
# Rows per (model, sentiment) and their share of that model's total N, in percent.
grouped = (
    df_all.groupby(["model", "Sentiment", "N"])
    .size()
    .reset_index(name="n_sent")
    .assign(percent=lambda counts: counts["n_sent"] / counts["N"] * 100)
)
grouped

In [None]:
# Grouped bar chart: share of each sentiment label per model, saved to disk.
plt.figure(figsize=(6, 3), dpi=200)
sns.set_style("whitegrid")

sentiment_order = ["Nevtralen", "Pozitiven", "Ni v leksikonu", "Negativen"]
model_colors = ["tab:blue", "tab:green", "tab:orange"]

ax = sns.barplot(
    data=grouped,
    x="Sentiment",
    y="percent",
    hue="model",
    order=sentiment_order,
    hue_order=model_names,
    palette=model_colors,
)
ax.set_ylabel("%")
# Place the legend outside the axes, to the right, with a transparent frame.
ax.legend(loc="center left", bbox_to_anchor=(1, 0.5), framealpha=0)
# To shrink the legend text: plt.setp(ax.get_legend().get_texts(), fontsize='8')
plt.savefig("../plots/sentiment_models.png", bbox_inches="tight", dpi=200)

In [None]:
emotions = ['Trust','Joy','Fear', 'Sadness', 'Anticipation', 'Disgust', 'Anger', 'Surprise']
# Slovene display labels in the same order as `emotions`; not used in this cell.
labels = ['Zaupanje', 'Sreča','Strah', 'Žalost  ', '  Pričakovanje   ', 'Gnus', 'Jeza  ', '  Presenečenje']

# One (name, rows) pair per model, in model_names order.
rows_by_model = [(m, df_all[df_all.model == m]) for m in model_names]

# For every model/emotion pair: percentage of that model's rows flagged with the emotion.
y = [
    {
        "model": m,
        "emotion": e,
        "percent": len(df_m[df_m[e] == 1]) / len(df_m) * 100,
    }
    for m, df_m in rows_by_model
    for e in emotions
]

df_emo = pd.DataFrame(y)
df_emo

In [None]:
# Grouped bar chart: share of rows per emotion, one bar per model.
plt.figure()
sns.set_style("whitegrid")

emotion_axes = dict(x="emotion", y="percent", hue="model", hue_order=model_names)
ax = sns.barplot(data=df_emo, **emotion_axes)
ax.set_ylabel("%")

Compare attributes obtained with different models

In [None]:
# Grid of pre-rendered word-cloud images: one row per social group, one column
# per model. Column titles are the model names, row labels are the groups.
groups = ["Slovenci", "muslimani", "Arabci", "cigani", "migranti",  "priseljeneci", "bogataši", "brezposelni"]

# Derive the grid shape from the data instead of hard-coding the column count,
# so the subplot index stays valid if model_names changes length.
n_rows, n_cols = len(groups), len(model_names)

fig = plt.figure(figsize=(17, 3 * n_rows))

for i, g in enumerate(groups):
    for j, model in enumerate(model_names):
        img = plt.imread(f"../{model.lower()}/plots/wordcloud_{g}_tfidf.png")
        # Subplot positions are 1-based and counted row by row.
        ax = fig.add_subplot(n_rows, n_cols, i * n_cols + j + 1)
        ax.imshow(img)
        if i == 0:
            # Only the top row carries the model name.
            ax.set_title(model, fontsize=22, pad=16)
        if j == 0:
            # Only the left-most column carries the group name.
            ax.set_ylabel(g, fontsize=22, labelpad=1)
        # Strip grid, frame and ticks so only the image (and labels) remain.
        ax.grid(False)
        ax.set_frame_on(False)
        ax.set_xticks([])
        ax.set_yticks([])

# Lay out once all axes exist, then pull the panels together.
fig.tight_layout()
fig.subplots_adjust(wspace=-0.02, hspace=0)
fig.savefig("../plots/wordcloud_sleng_models.png", bbox_inches="tight", dpi=200)