In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from lemmagen3 import Lemmatizer

In [3]:
import sys
from pathlib import Path
# Make the parent of the notebook's working directory importable so that the
# `src.*` imports below resolve. The membership check keeps this cell
# idempotent: re-running it no longer appends the same entry a second time.
repo_root = str(Path('.').absolute().parent)
if repo_root not in sys.path:
    sys.path.append(repo_root)

from src.helpers import generate_wordcloud, clean_data
from src.templates import neg_templates, groups_dict

In [4]:
# Name of the model whose result folder is analysed. Alternatives left by the
# author: "sleng-bert", "sloberta-sleng".
model = "sloberta"

# Read that model's atributes_emotions.csv (path is relative to the notebook).
attributes_csv = f"../{model}/atributes_emotions.csv"
df = pd.read_csv(attributes_csv)

In [14]:
# Add column `atribute_l`: each value of `atribute` passed through the 'sl'
# lemmatizer and then lower-cased.
lemmatizer = Lemmatizer('sl')
df["atribute_l"] = (
    df["atribute"]
    .apply(lambda word: lemmatizer.lemmatize(word))
    .str.lower()
)

In [None]:
# LaTeX example table for one group/template pair. Three rankings of the
# predicted attributes are shown side by side:
#   "Prvotne"       - top-n of the raw rows by `prob`
#   "Prečiščene"    - top-n by `prob` after clean_data
#   "Prerazvrščene" - top-n by `score` after clean_data
n = 10
group = "Slovenci"
t = "[TARGET] so zelo [MASK]."
#t = "[TARGET] so ponavadi zelo [MASK]."
#t = "Vsi [TARGET] so tako [MASK]."
#t = "Vsi [TARGET] so [MASK]."

df_all = pd.read_csv(f"../{model}/top_atributes_all.csv")
df_sub = df_all[(df_all.target == group) & (df_all.template == t)]
df_clean = clean_data(df_sub)

# Build the table from Series rather than assigning raw arrays column by
# column: if the cleaned frame has fewer than n rows, the shorter columns are
# padded with NaN instead of raising a length-mismatch ValueError.
df_example = pd.DataFrame({
    "Prvotne": pd.Series(
        df_sub.sort_values("prob", ascending=False).head(n).atribute.values
    ),
    "Prečiščene": pd.Series(
        df_clean.sort_values("prob", ascending=False).head(n).atribute.values
    ),
    "Prerazvrščene": pd.Series(
        df_clean.sort_values("score", ascending=False).head(n).atribute.values
    ),
})
print(df_example.to_latex(index=False))

In [None]:
# Top-20 distinct attributes (ranked by `score`) for each selected group,
# using a single template.
t = "Zakaj so [TARGET] tako [MASK]?"
#t = "[TARGET] so vedno [MASK]."
groups = ["Slovenci", "Hrvati", "Nemci", "moški", "ženske", "muslimani", "Romi"]

df_s = df[df.template == t].sort_values(by="score", ascending=False)

# One Series per group: a group with fewer than 20 distinct attributes yields
# a NaN-padded column instead of a length-mismatch ValueError.
df_top = pd.DataFrame({
    g: pd.Series(
        df_s[df_s.target == g]
        .drop_duplicates(subset="atribute")
        .head(20)
        .atribute.values
    )
    for g in groups
})

df_top
#print(df_top.to_latex(index=False))

In [None]:
# Attributes for different sentences (templates) and one group: top-20 by
# `score` for each listed template.

group = "Slovenci"
sentences = [
    "Zakaj so [TARGET] tako [MASK]?",
    "[TARGET] so [MASK].",
    "[TARGET] so vedno tako [MASK].",
    "[TARGET] niso [MASK].",
    "[TARGET] so zelo [MASK].",
    "[TARGET] so ponavadi zelo [MASK].",
]

# Rows of the chosen group, best score first, reduced to distinct
# (template, atribute) pairs.
df_s = (
    df[df.target == group]
    .sort_values(by="score", ascending=False)[["template", "atribute"]]
    .drop_duplicates()
)

# One Series per template: a template with fewer than 20 rows yields a
# NaN-padded column instead of a length-mismatch ValueError.
df_top = pd.DataFrame({
    s: pd.Series(df_s[df_s.template == s].head(20).atribute.values)
    for s in sentences
})

df_top
#print(df_top.to_latex(index=False))


# atrs=[]
# for s in sentences:
#     atrs += df_s[df_s.template==s].head(20).atribute.values.tolist()
# pd.DataFrame(atrs).value_counts().head(10)


Similarity between sets of attributes obtained with different sentences

In [10]:
from src.helpers import jaccard
from src.templates import templates

In [None]:
# Pairwise template comparison: for every target, jaccard() is evaluated on
# the lemmatised attributes of each pair of templates; the per-target
# matrices are then averaged and drawn as a clustered heatmap.
n = len(templates)
targets = df["target"].unique()

all_dist = np.zeros((n, n))

for target in targets:
    df_sub = df[df.target == target]

    # Filter and sort once per template. This is loop-invariant work that was
    # previously repeated for every (i, j) pair.
    ranked = [
        df_sub[df_sub.template == s].sort_values(by="score", ascending=False).atribute_l
        for s in templates
    ]

    dist = np.zeros((n, n))
    for i, atr1 in enumerate(ranked):
        for j, atr2 in enumerate(ranked):
            dist[i, j] = jaccard(atr1, atr2)
    all_dist = all_dist + dist

# Mean over all targets.
all_dist = all_dist/len(targets)

# Tick labels: the templates with "TARGET" replaced by "SK".
labels = [q.replace("TARGET", "SK") for q in templates]
cm = sns.clustermap(
    all_dist,
    linewidth=0.5,
    xticklabels=labels,
    yticklabels=labels,
    vmin=0, vmax=1,
    cmap="Blues",
    #cbar_kws={"use_gridspec":False, "location":"top"}
    cbar_pos=(0.07, .4, .03, .37)
)
# Keep the clustered ordering but hide the dendrograms themselves.
cm.ax_row_dendrogram.set_visible(False)
cm.ax_col_dendrogram.set_visible(False)
cm.figure.savefig(f"../{model}/plots/template_similarity_lemma.png", facecolor="white", bbox_inches="tight", dpi=200)

In [35]:
# Distinct (target, template) combinations that occur in the data.
sent = df.loc[:, ["target", "template"]].drop_duplicates()

# From here on `targets` and `templates` hold the unique values found in `df`.
targets = df["target"].unique()
templates = df["template"].unique()

# Templates containing a question mark.
questions = list(filter(lambda s: "?" in s, templates))

In [36]:
# For which template do the groups differ the most, i.e. which sentence is
# best suited for detecting differences between groups?
# sent_dist maps each template to the list of jaccard() values over all
# unordered pairs of targets (i < j).

sent_dist = {}
n_targets = len(targets)
for temp in templates:
    rows_for_template = df[df.template == temp]
    per_target = [
        rows_for_template[rows_for_template.target == tg].atribute_l
        for tg in targets
    ]
    sent_dist[temp] = [
        jaccard(per_target[i], per_target[j])
        for i in range(n_targets)
        for j in range(i + 1, n_targets)
    ]

In [None]:
# Long-format frame: one row per (template, pairwise value), obtained by
# exploding the per-template lists stored in sent_dist.
plot_dict = [{"sent": key, "dist": val} for key, val in sent_dist.items()]
plot_df = pd.DataFrame(plot_dict).explode("dist")

# Point estimate with standard-deviation bars per template, points not joined.
plt.figure(figsize=(4,8))
sns.pointplot(x=plot_df["dist"], y=plot_df["sent"], ci="sd", join=False)
#sns.boxplot(x=plot_df["dist"], y=plot_df["sent"], color="tab:blue")
#sns.boxplot(x=plot_df["dist"], y=plot_df["sent"], color="tab:blue")

In [174]:
### inter group distance vs inter template distance vs mixed
# Three dictionaries of jaccard() values, keyed by the compared sentences:
#   same_target_dist - same target, two different templates
#   same_temp_dist   - same template, two different targets
#   mixed_dist       - randomly drawn pairs differing in both

same_target_dist = {}
same_temp_dist = {}
mixed_dist = {}

# same target, different sentence
for t in targets:
    for i in range(len(templates)):
        for j in range(i+1, len(templates)):
            atr1 = df[(df.template==templates[i]) & (df.target==t)].atribute_l
            atr2 = df[(df.template==templates[j]) & (df.target==t)].atribute_l
            dist = jaccard(atr1, atr2)
            same_target_dist[(t, templates[i], templates[j])] = dist

# same template, different target
for temp in templates:
    for i in range(len(targets)):
        for j in range(i+1, len(targets)):
            atr1 = df[(df.template==temp) & (df.target==targets[i])].atribute_l
            atr2 = df[(df.template==temp) & (df.target==targets[j])].atribute_l
            dist = jaccard(atr1, atr2)
            same_temp_dist[(temp, targets[i], targets[j])] = dist


# randomly pick two sentences with different target and different template
# Seeded generator so the sampled pairs are reproducible across runs.
rng = np.random.RandomState(42)

# Number of distinct ordered pairs that differ in both template and target.
# For each row of `sent`: all rows minus those sharing its template or its
# target (the row itself is in both groups, hence +1). This assumes the rows
# of `sent` are unique (target, template) combinations.
per_template = sent.template.map(sent.template.value_counts())
per_target = sent.target.map(sent.target.value_counts())
n_feasible = int((len(sent) - per_template - per_target + 1).sum())

# Never ask for more pairs than exist, otherwise the loop below cannot end.
n_mixed = min(10000, n_feasible)

while len(mixed_dist) < n_mixed:
    pair = sent.sample(n=2, replace=False, random_state=rng)
    temp1 = pair.iloc[0].template
    temp2 = pair.iloc[1].template
    targ1 = pair.iloc[0].target
    targ2 = pair.iloc[1].target
    if temp1==temp2 or targ1==targ2:
        continue

    atr1 = df[(df.template==temp1) & (df.target==targ1)].atribute_l
    atr2 = df[(df.template==temp2) & (df.target==targ2)].atribute_l
    dist = jaccard(atr1, atr2)
    mixed_dist[(temp1, temp2, targ1, targ2)] = dist

In [None]:
# Stack the three result dictionaries into one long frame with a `dist`
# column and a `type` label per comparison kind. pd.concat is used because
# DataFrame.append was deprecated and later removed from pandas.
d_all = pd.concat([
    pd.DataFrame({"dist": same_target_dist, "type": "enaka skupina, drugačen stavek"}),
    pd.DataFrame({"dist": same_temp_dist, "type": "drugačna skupina, enak stavek"}),
    pd.DataFrame({"dist": mixed_dist, "type": "drugačna skupina, drugačen stavek"}),
])

In [None]:
# Box plot of the `dist` values, one box per comparison type, saved as PNG.
plt.figure(figsize=(7,3))
#sns.pointplot(x=d["dist"], y=d["type"], ci="sd", join=False)
ax = sns.boxplot(x=d_all["dist"], y=d_all["type"], color="tab:blue")
#sns.pointplot(x=d_ques["dist"], y=d_ques["type"], color="tab:blue")
ax.set_xlabel("Jaccardova podobnost")
ax.set_ylabel("")
plt.savefig(
    f"../{model}/plots/podobnost_stavkov_boxplot_lemma.png",
    bbox_inches="tight",
    facecolor="white",
    dpi=150,
)

TF-IDF attribute importance

In [26]:
from src.helpers import get_tf_idf_scores, generate_wordcloud_tfidf

In [20]:
# Exclude the rows whose template is listed in neg_templates, then compute
# the scores on the remaining rows.
is_negated = df["template"].isin(neg_templates)
df_sub = df[~is_negated]
df_tfidf = get_tf_idf_scores(df_sub)

In [None]:
# Display the frame returned by get_tf_idf_scores.
df_tfidf

In [None]:
# Top-20 lemmatised attributes by `tf_idf` for each selected group.
groups = ["Slovenci", "Hrvati", "Nemci", "moški", "ženske", "muslimani", "Romi"]

# One Series per group: a group with fewer than 20 rows in df_tfidf yields a
# NaN-padded column instead of a length-mismatch ValueError.
top = pd.DataFrame({
    g: pd.Series(
        df_tfidf[df_tfidf.target == g]
        .sort_values("tf_idf", ascending=False)
        .head(20)
        .atribute_l.values
    )
    for g in groups
})
top
#print(top.to_latex(index=False))

In [None]:
# Render and save one word-cloud image per group.
# The first assignment lists every target; the second overrides it so that
# only "Slovenci" is processed.
groups = df.target.unique()
groups = ["Slovenci"]

for group in groups:
    # The "<mask>" target is skipped.
    if group != "<mask>":
        w = generate_wordcloud_tfidf(df, df_tfidf, group)

        fig = plt.figure(figsize=(10,5))
        plt.imshow(w)
        #plt.title(group)
        plt.tight_layout(pad=0)
        plt.axis("off")
        plt.savefig(f"../{model}/plots/wordcloud_{group}_tfidf.png")

Similarities between social groups and clustering

In [29]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import dendrogram, linkage

In [27]:
# Target x attribute matrix of tf_idf values (missing combinations become 0),
# with four targets removed before the comparison.
df_pivot = (
    df_tfidf
    .pivot(index="target", columns="atribute_l", values="tf_idf")
    .fillna(0)
    .drop(["<mask>", "ljudje", "državljani", "domačini"])
)

# Pairwise cosine similarity between the remaining target rows.
cosine_similarities = cosine_similarity(df_pivot)

In [None]:

# Clustered heatmap of the group-by-group cosine similarities.
sns.set(font_scale=0.8)

# Tick mark appearance on both axes.
plt.rcParams.update({
    'xtick.major.size': 4,
    'xtick.major.width': 1,
    'ytick.major.size': 4,
    'ytick.major.width': 1,
})

group_names = df_pivot.index
cm = sns.clustermap(
    cosine_similarities,
    method="average",
    figsize=(13,13),
    cmap="Blues",
    vmin=0, vmax=1,
    linewidth=0.7,
    xticklabels=group_names,
    yticklabels=group_names,
    # horizontal colour bar at the given figure position
    cbar_kws=dict(orientation='horizontal'),
    cbar_pos=(0.35, 0.85, .4, .02),
)

# Keep the clustered ordering but hide both dendrograms.
for dendrogram_ax in (cm.ax_row_dendrogram, cm.ax_col_dendrogram):
    dendrogram_ax.set_visible(False)
cm.ax_heatmap.tick_params(right=True, bottom=True)


plt.savefig(
    f"../{model}/plots/group_similarity_tfidf_.png",
    bbox_inches="tight",
    facecolor="white",
    dpi=200,
)


In [None]:
# Average-linkage dendrogram of the groups, drawn left-oriented and saved.
sns.set_style("white")

# NOTE(review): linkage receives the square similarity matrix as a 2-D array.
# Per the SciPy documentation a 2-D input is treated as observation vectors
# (one row per group), not as precomputed distances - confirm this is intended.
Z = linkage(cosine_similarities, method='average')

plt.figure(figsize=(7,17))
dn = dendrogram(
    Z,
    orientation="left",
    labels=df_pivot.index,
    leaf_font_size=13,
    color_threshold=1.4,
)
#plt.show()
plt.savefig(
    f"../{model}/plots/dendrogram_average.png",
    bbox_inches="tight",
    facecolor="white",
    dpi=200,
)

In [None]:
# Print one LaTeX `\item[...]` line per target, listing its 20 lemmatised
# attributes with the highest tf_idf.
targets = df.target.unique()
for t in targets:
    a = df_tfidf[df_tfidf.target==t].sort_values("tf_idf", ascending=False).head(20).atribute_l.values
    # Raw f-string: "\i" is not a valid escape sequence in a normal string
    # literal and triggers an invalid-escape warning. The printed text is
    # unchanged.
    print(rf"\item[{t}:] {', '.join(a)}")