# Vizualizacija podatkov

## Okolje



In [None]:
!pip install bertopic
!pip install --upgrade joblib==1.1.0 
!pip install matplotlib --upgrade
!pip install classla
!pip install tweet-preprocessor
!pip install flair

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import string
import copy
import json
import time
import random
import nltk
import classla
import preprocessor as tpre
nltk.download('stopwords')
#classla.download('sl')
import re
from collections import Counter
from nltk.corpus import stopwords
from wordcloud import WordCloud

import matplotlib.patheffects as pe

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, pipeline

from bertopic import BERTopic
from flair.embeddings import TransformerDocumentEmbeddings
from transformers.pipelines import pipeline
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.vectorizers import ClassTfidfTransformer

import plotly.express as px
import plotly.graph_objects as go

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

from google.colab import drive
drive.mount('/content/drive/')

from google.colab import output
output.disable_custom_widget_manager()

In [None]:
# Setting constants

# Switch between a local data directory and the mounted Google Drive one.
LOCAL = False

google_data_dir = "/content/drive/MyDrive/Diploma/Data"
local_data_dir = "/data"

# Every data path built later in the notebook is rooted here.
root_dir = local_data_dir if LOCAL else google_data_dir

In [None]:
# Matplotlib settings

# Global font defaults applied to every figure created afterwards.
font = dict(weight='normal', size=14)

matplotlib.rc('font', **font)

## Funkcije in razredi

In [None]:
def load_data(file_name):
  """Read a UTF-8 encoded JSON file and return its parsed content.

  Args:
    file_name: Path of the JSON file to read.

  Returns:
    The object produced by `json.load` for that file.
  """
  with open(file_name, mode='r', encoding='utf8') as json_file:
    return json.load(json_file)

def load_labelled_tweets(dir, topic_names, shuffle_arrays=True, random_state=77):
  """Load labelled tweets of several topics as parallel lemma/label lists.

  For every name in `topic_names` the file
  `{dir}/labelled_topics/topic_{name}.json` is read. Each entry contributes
  its lemma text and, as its label, the position of the topic name in
  `topic_names`.

  Args:
    dir: Root data directory containing the `labelled_topics` folder.
    topic_names: Topic names; their list index is used as the numeric label.
    shuffle_arrays: When true, both lists are shuffled consistently.
    random_state: Seed forwarded to `sklearn.utils.shuffle`.

  Returns:
    Tuple `(topic_lemmas, topic_labels)` of equal length.
  """
  topic_lemmas = []
  topic_labels = []

  for t in topic_names:
    with open(f'{dir}/labelled_topics/topic_{t}.json', 'r', encoding='utf8') as topic_data:
      data = json.load(topic_data)

    # The file name already identifies the topic, so the label does not have
    # to be read from the individual entries.
    label = topic_names.index(t)
    for tweet in data:
      # assumes every stored entry carries 'lemma_text', as the tweets written
      # by overwrite_labelled_topics do -- TODO confirm for older files
      topic_lemmas.append(tweet['lemma_text'])
      topic_labels.append(label)

  # shuffle() returns shuffled copies instead of working in place, so its
  # result has to be kept; empty input is left alone.
  if shuffle_arrays and topic_lemmas:
    topic_lemmas, topic_labels = shuffle(topic_lemmas, topic_labels, random_state=random_state)
  return topic_lemmas, topic_labels

def preprocess_tweets(preprocess_pipeline, tweets, tweet_stop_words=[], tweet_upos=[], min_words=4, verbose=False, debug=False):
  """Clean, lemmatise and filter raw tweets.

  Tweets whose text starts with "RT" are skipped. The remaining text is
  cleaned with `clean_tweet_text`, run through `preprocess_pipeline`, and
  reduced to lower-cased lemmas that are neither stop words nor tagged with
  one of the UPOS tags in `tweet_upos`. Only tweets with more than
  `min_words` remaining lemmas are kept.

  Args:
    preprocess_pipeline: Callable returning a document with `.sentences`,
      each having `.words` with `.lemma` and `.upos` attributes.
    tweets: Sequence of tweet dicts with the keys 'full_text', 'id',
      'hashtags', 'mentions', 'created_at' and 'user'.
    tweet_stop_words: Extra stop words added to the NLTK Slovene list.
    tweet_upos: UPOS tags whose words are discarded.
    min_words: A tweet is kept only if it has MORE than this many lemmas.
    verbose: Print progress every 1000 tweets.
    debug: Print per-tweet and per-word details.

  Returns:
    List of dicts with the keys 'id', 'created_at', 'raw_text' (the cleaned
    text), 'lemma_text', 'hashtags', 'mentions' and 'user'.
  """
  # A set removes duplicates and keeps the per-lemma membership test cheap.
  # The default argument lists are only read, never modified.
  stop_words = set(stopwords.words('slovene'))
  stop_words.update(tweet_stop_words)

  data = []
  start_time = 0

  for index, tweet in enumerate(tweets):

    if index%1000 == 0 and verbose:
      print(f'-- Progress: {index}/{len(tweets)}')
      if index >= 1000:
        # Time spent on the previous batch of 1000 tweets
        print(f'-- Time elapsed: {time.time() - start_time}s')
        print(f'-- Tweets preprocessed: {len(data)}')
      start_time = time.time()

    # Take attributes
    tweet_full_text = tweet['full_text']

    # Skip if retweet
    # NOTE(review): this also skips any tweet that merely begins with the
    # letters "RT" (e.g. an abbreviation) -- confirm this is acceptable.
    if tweet_full_text.startswith("RT"):
      continue

    tweet_id = tweet['id']
    tweet_hashtags = tweet['hashtags']
    tweet_mentions = tweet['mentions']
    tweet_created_at = tweet['created_at']

    tweet_user_name = tweet['user']['name']
    tweet_user_screen_name = tweet['user']['screen_name']
    tweet_user_description = clean_tweet_text(tweet['user']['description']).lower()

    # Clean the text before it is handed to the pipeline
    tweet_full_text = clean_tweet_text(tweet_full_text)

    if debug:
      print(f'Raw text:\n{tweet_full_text}')

    # Preprocess with preprocessing pipeline
    tweet_lemma_text = preprocess_pipeline(tweet_full_text)

    tweet_processed_lemmas = []
    for sentence in tweet_lemma_text.sentences:

      sentence_words = []

      for i, word in enumerate(sentence.words):
        lemma = word.lemma.lower()

        # A sentence that opens with "rt" is treated as a retweet and dropped
        if lemma == 'rt' and i == 0:
          break

        if debug:
          print(f'Lemma: {lemma} -------------- Upos: {word.upos}')

        # Keep the lemma unless its UPOS tag is excluded or it is a stop word
        if word.upos not in tweet_upos and lemma not in stop_words:
          sentence_words.append(lemma)
        elif debug:
          print(f'Discarded word: {word.lemma}')

      tweet_processed_lemmas.extend(sentence_words)

    if debug:
      print(f'Preprocessed lemmas:\n{tweet_processed_lemmas}')

    # Keep only tweets with more than min_words lemmas
    if len(tweet_processed_lemmas) > min_words:
      tweet_data = {
          "id": tweet_id,
          "created_at": tweet_created_at,
          "raw_text": tweet_full_text,
          "lemma_text": ' '.join(tweet_processed_lemmas),
          "hashtags": tweet_hashtags,
          "mentions": tweet_mentions,
          "user": {
              "name": tweet_user_name,
              "screen_name": tweet_user_screen_name,
              "description": tweet_user_description
          }
      }

      data.append(tweet_data)

  return data

def clean_tweet_text(tweet_text):
  """Return the tweet text after three cleaning passes.

  The text goes through `tpre.clean` (tweet-preprocessor; presumably strips
  URLs, mentions, hashtags and similar with its default options -- verify
  against the library configuration), then the HTML entities `&gt;`, `&lt;`
  and `&amp;` are deleted, and finally `remove_emojis` is applied.
  """
  cleaned = tpre.clean(tweet_text)
  without_entities = re.sub("&gt;|&lt;|&amp;", "", cleaned)
  return remove_emojis(without_entities)

def remove_emojis(data):
    """Delete emoji and other symbol code points from a string.

    Note that the character class is very broad: the U+24C2..U+1F251 range
    alone also covers CJK, Hangul and many other scripts, and the
    U+10000..U+10FFFF range covers all supplementary planes. Characters below
    U+200D (Latin letters with diacritics included) are left untouched.

    Args:
        data: Text to clean.

    Returns:
        The text with every matching run of characters removed.
    """
    char_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
        "\U00002500-\U00002BEF"  # box drawing through misc. symbols and arrows
        "\U00002702-\U000027B0"  # dingbats
        "\U00002702-\U000027B0"  # (repeated in the original pattern)
        "\U000024C2-\U0001F251"  # very wide range, see docstring
        "\U0001f926-\U0001f937"
        "\U00010000-\U0010ffff"  # all supplementary planes
        "\u2640-\u2642"
        "\u2600-\u2B55"
        "\u200d"  # zero width joiner
        "\u23cf"
        "\u23e9"
        "\u231a"
        "\ufe0f"  # variation selector-16
        "\u3030"
    )
    pattern = re.compile("[" + char_ranges + "]+", re.UNICODE)
    return pattern.sub('', data)

def overwrite_labelled_topics(file_topic, topic_tweets, data_dir):
    """Append tweets to a topic's labelled JSON file and write it back.

    Despite the name, existing entries are kept: the file
    `{data_dir}/labelled_topics/topic_{file_topic}.json` is read, extended
    with `topic_tweets` and rewritten. The file must already exist.
    """
    topic_path = f'{data_dir}/labelled_topics/topic_{file_topic}.json'

    with open(topic_path, 'r', encoding='utf8') as existing_file:
        stored = json.load(existing_file)
        stored.extend(topic_tweets)

    with open(topic_path, 'w', encoding='utf8') as target_file:
        json.dump(stored, target_file, ensure_ascii=False)

def overwrite_labelled_tweets(topic, tweets, data_dir):
  """Append tweets to `{data_dir}/process/tweets_{topic}.json`.

  The existing JSON list is read, extended with `tweets` and the file is
  rewritten with non-ASCII characters preserved. The file must already exist.
  """
  tweets_path = f'{data_dir}/process/tweets_{topic}.json'

  with open(tweets_path, 'r', encoding='utf8') as existing_file:
    stored = json.load(existing_file)
    stored.extend(tweets)

  with open(tweets_path, 'w', encoding='utf8') as target_file:
    json.dump(stored, target_file, ensure_ascii=False)


"""
Bertopic model for modeling topics

"""
class BertopicModel:
  """Wrapper around a BERTopic instance plus the data and results it uses.

  Attributes created by the methods below:
    data: dict holding "docs" and, depending on the loader used, "tweets"
      (the tweet dicts the docs came from) or "labels".
    result: dict holding "topic_ids" and "topic_probs" from the last
      fit_transform/transform/reduce call.
  """

  def __init__(self, model_name, embed_model, config):
    """Store the settings and build the BERTopic model.

    Args:
      model_name: Name appended to the directory when saving/loading.
      embed_model: Embedding model handed to BERTopic.
      config: Dict with "umap_conf", "hdbscan_conf" and "bertopic_conf",
        each a dict of keyword arguments for the respective constructor.
    """
    self.model_name = model_name
    self.embed_model = embed_model
    self.config = config

    # Create the Bertopic model with config
    self.make_model()

  def make_model(self):
    """Instantiate UMAP, HDBSCAN and BERTopic from the stored config."""
    self.umap_model = UMAP(**self.config["umap_conf"])
    self.hdbscan_model = HDBSCAN(**self.config["hdbscan_conf"])
    self.bertopic = BERTopic(embedding_model=self.embed_model, umap_model=self.umap_model, hdbscan_model=self.hdbscan_model, **self.config["bertopic_conf"])

  def load_tweet_data(self, tweet_data):
    """Replace the stored data with tweets and their lemma texts as docs."""
    doc_tweet_lemmas = [ t['lemma_text'] for t in tweet_data ]

    # Starts from a fresh dict, so previously loaded labels are dropped.
    self.data = {}
    self.data["tweets"] = tweet_data
    self.data["docs"] = doc_tweet_lemmas

  def load_topic_data(self, topic_docs, topic_labels):
    """Store documents and their labels, keeping other data entries."""
    if not hasattr(self, 'data'):
      self.data = {}

    self.data["docs"] = topic_docs
    self.data["labels"] = topic_labels

  def train_model(self, only_fit):
    """Train BERTopic on the loaded data.

    With `only_fit` false, `fit_transform` is run on the docs and the topic
    ids/probabilities are stored in `self.result`. With `only_fit` true, the
    model is fitted on docs and labels and no result is stored.
    """
    data_keys = self.data.keys()

    if "docs" in data_keys and not only_fit:
      topics, probs = self.bertopic.fit_transform(self.data["docs"])
      self.result = {}
      self.result["topic_ids"] = topics
      self.result["topic_probs"] = probs
    elif "docs" in data_keys and "labels" in data_keys and only_fit:
      self.bertopic = self.bertopic.fit(self.data["docs"], y=self.data["labels"])
    else:
      print("Error: Missing data!")

  def predict(self):
    """Run `transform` on the loaded docs and store the outcome in `result`."""
    data_keys = self.data.keys()

    if "docs" in data_keys:
      topics, probs = self.bertopic.transform(self.data["docs"])
      self.result = {}
      self.result["topic_ids"] = topics
      self.result["topic_probs"] = probs

  def reduce(self, nr):
    """Reduce the number of topics to `nr` and store the new assignment."""
    if hasattr(self, 'data') and hasattr(self, 'result'):
      # NOTE(review): the signature and return value of reduce_topics differ
      # between BERTopic releases -- confirm against the installed version.
      topics, probs = self.bertopic.reduce_topics(self.data["docs"], self.data["labels"], nr_topics=nr)
      self.result["topic_ids"] = topics
      self.result["topic_probs"] = probs

  def merge_topics(self, indexes):
    """Merge the topics listed in `indexes` inside the BERTopic model."""
    if hasattr(self, 'data'):
      # NOTE(review): argument list follows an older BERTopic API -- confirm
      # against the installed version.
      self.bertopic.merge_topics(self.data["docs"], self.data["labels"], indexes)

  def tweets_from_topic(self, ntopic):
    """Return the docs whose predicted topic id equals `ntopic`.

    Prints an error and returns an empty list when no result is available.
    """
    # `result` only exists after training/prediction, and it is a dict, so it
    # has to be looked up defensively and indexed by key.
    result = getattr(self, 'result', None)
    if not result:
      print("Error: Missing data!")
      return []

    return [
        self.data["docs"][i]
        for i, x in enumerate(result["topic_ids"])
        if x == ntopic
    ]

  def collect_topic_indices(self, ntopic, tweet_prob=0.5):
    """Return indices of docs in topic `ntopic` with probability above `tweet_prob`."""
    if hasattr(self, 'result'):

      tweet_ids = []
      for i, x in enumerate(self.result["topic_ids"]):

        # Check if topic id and probability higher
        if ntopic == x and self.result["topic_probs"][i] > tweet_prob:
          tweet_ids.append(i)

      return tweet_ids
    else:
      print("Error: Missing data!")
      return []

  def find_politic_topics(self, keywords, topn=3, sim_threshold=0.5, tweet_prob=0.5, include_prob=False):
    """Collect tweets from topics that are similar to any of the keywords.

    Args:
      keywords: Search terms passed to `BERTopic.find_topics`.
      topn: Number of similar topics requested per keyword.
      sim_threshold: Minimum similarity for a topic to be considered.
      tweet_prob: Minimum topic probability for a tweet to be collected.
      include_prob: When true, the tweet dict is annotated in place with
        'topic_probability'.

    Returns:
      List of tweet dicts from `self.data["tweets"]`.
    """
    if hasattr(self, 'bertopic'):
      indices = set()

      # Find relating topics
      for keyword in keywords:
        sim_ids, sim_probs = self.bertopic.find_topics(keyword, top_n=topn)

        # Filter based on similarity
        sim_topics = [ sim_ids[i] for i, x in enumerate(sim_probs) if x > sim_threshold ]

        for topic in sim_topics:
          indices.update(self.collect_topic_indices(topic, tweet_prob=tweet_prob))

      tweet_docs = []
      for i in indices:
        tdoc = self.data["tweets"][i]
        if include_prob:
          tdoc["topic_probability"] = self.result["topic_probs"][i]

        tweet_docs.append(tdoc)

      return tweet_docs
    else:
      print("Error: Missing data!")
      return []

  def visualize(self, t='distance_map'):
    """Return a BERTopic figure.

    `t` may be 'barchart', 'hierarchy', 'heatmap' or 'term_rank'; any other
    value yields the topic distance map.
    """
    if hasattr(self, 'bertopic'):
      if t == 'barchart':
        return self.bertopic.visualize_barchart()
      elif t == 'hierarchy':
        return self.bertopic.visualize_hierarchy()
      elif t == 'heatmap':
        return self.bertopic.visualize_heatmap()
      elif t == 'term_rank':
        return self.bertopic.visualize_term_rank()
      else:
        return self.bertopic.visualize_topics()
    else:
      print("Error: Model not yet initiated!")

  def save_model(self, model_dir):
    """Save the BERTopic model to `model_dir + model_name`."""
    self.bertopic.save(str(model_dir + self.model_name))

  def load_model(self, model_dir):
    """Load the BERTopic model stored at `model_dir + model_name`."""
    # load() returns the restored model instead of updating the instance it
    # is called on, so the result has to replace the current model.
    self.bertopic = BERTopic.load(str(model_dir + self.model_name), embedding_model=self.embed_model)

"""

Tweetiment Model

"""

class TweetimentModel:
  """Derives a political-bias label from tweet sentiment.

  The sentiment of a tweet is combined with the orientation of the party it
  mentions (`party_bias`) and with the orientation assigned to its topic
  (`topic_bias`). Labels are "levo" (left), "desno" (right) and "nevtralno"
  (neutral).
  """

  def __init__(self, name, model, tokenizer, topic_bias, party_bias, device=0):
    """Store the configuration and create the sentiment pipeline.

    Args:
      name: Model name.
      model: Model passed to the transformers pipeline.
      tokenizer: Tokenizer passed to the transformers pipeline.
      topic_bias: Mapping topic -> {lower-cased sentiment: index into labels}.
      party_bias: Iterable of dicts with 'kratica_stranke' (abbreviation),
        'clani' (accounts counted as members) and 'usmerjenost' (index into
        labels; used as 0 or 1).
      device: Device argument for the pipeline.
    """
    self.model_name = name

    self.topic_bias = topic_bias
    self.party_bias = party_bias

    self.labels = ["levo", "desno", "nevtralno"]

    self.tokenizer = tokenizer
    self.model = model

    # Create the pipeline
    self.make_model(device)

  def make_model(self, device):
    """Build the transformers "sentiment-analysis" pipeline."""
    self.tweetiment = pipeline("sentiment-analysis", model=self.model, tokenizer=self.tokenizer, device=device)

  def predict_text(self, txt):
    """Return the pipeline output for `txt` (None if no pipeline exists)."""
    if hasattr(self, 'tweetiment'):
      return self.tweetiment(txt)

  def classify(self, bias_party, bias_topic):
    """Combine party and topic bias; the topic bias wins when both exist."""
    if bias_party is None and bias_topic is None:
      return self.labels[2]
    elif bias_party is None:
      return bias_topic
    elif bias_topic is None:
      return bias_party

    return bias_topic

  def calculate_biases(self, tweet, explain=False):
    """Compute sentiment, party bias, topic bias and the final label.

    Args:
      tweet: Dict with 'raw_text', 'lemma_text', 'mentions', 'topic' and
        'topic_probability'.
      explain: When true, `make_explanation` is called (its result is
        currently not part of the returned dict).

    Returns:
      Dict describing the prediction, or None when 'raw_text' is empty.
    """
    if tweet['raw_text']:
      prediction = self.predict_text(tweet['raw_text'])[0]

      bias_party, party = self.bias_sentiment_party(prediction, tweet)
      bias_topic, topic = self.bias_sentiment_topic(prediction, tweet)

      if explain:
        self.make_explanation(prediction['label'], bias_party, party, bias_topic, topic)

      label = self.classify(bias_party, bias_topic)

      return {
          'label': label,
          'sentiment': prediction['label'].lower(),
          'sentiment_score': prediction['score'],
          'topic_bias': bias_topic,
          'topic_mentioned': topic,
          'topic_score': tweet['topic_probability'],
          'party_bias': bias_party,
          'party_mentioned': party,
      }
    return None

  # Bias based on negativity/positivity towards a party mentioned in a tweet
  def bias_sentiment_party(self, prediction, tweet, single=True):
    """Derive a bias from the sentiment towards the party in the tweet.

    A party counts as mentioned when one of the tweet's mentions is listed
    in its 'clani' or when its lower-cased abbreviation is one of the words
    of the lemma text. A bias is produced only if exactly one party is
    mentioned.

    Args:
      prediction: Pipeline output dict with a 'label' of "Positive",
        "Negative" or "Neutral".
      tweet: Dict with 'mentions' and 'lemma_text'.
      single (bool): Detect only a single party in tweet; when false no
        bias is returned.

    Returns:
      Tuple `(bias, party_abbreviation)`, or `(None, None)`.
    """
    bias = None
    party_detected = None
    parties_mentioned = 0

    lemma_words = tweet['lemma_text'].split(" ")
    mentions = tweet['mentions']

    for party in self.party_bias:
      # The lemma check must not depend on the tweet having mentions at all,
      # so both conditions are evaluated once per party.
      by_mention = any(mention in party['clani'] for mention in mentions)
      if by_mention or party['kratica_stranke'].lower() in lemma_words:
        parties_mentioned = parties_mentioned+1
        if parties_mentioned == 1:
          party_detected = party

    if single and parties_mentioned == 1 and party_detected is not None:

      # If text is neutral
      if prediction['label'] == "Neutral":
        bias = self.labels[2]
      # Supports the party
      elif prediction['label'] == "Positive":
        bias = self.labels[party_detected['usmerjenost']]
      # Opposes the party: flip between index 0 and 1
      elif prediction['label'] == "Negative":
        bias = self.labels[int(not party_detected['usmerjenost'])]

      return bias, party_detected['kratica_stranke']

    # No party, or more than one party, mentioned in the tweet
    return None, None

  # Bias based on negativity/positivity towards a certain topic of the tweet
  def bias_sentiment_topic(self, prediction, tweet):
    """Derive a bias from the sentiment towards the tweet's topic.

    Args:
      prediction: Pipeline output dict with a 'label'.
      tweet: Dict with a 'topic' entry.

    Returns:
      Tuple `(bias, topic)`; both None for neutral sentiment or when the
      topic has no entry in `topic_bias`.
    """
    bias = None
    topic_detected = None

    for topic in self.topic_bias:
      if tweet['topic'] == topic and prediction['label'] != 'Neutral':

        bias = self.labels[self.topic_bias[topic][prediction['label'].lower()]]
        topic_detected = topic
        break

    return bias, topic_detected

  # User a known member of a party?
  def is_user_in_party(self):
    """Placeholder; not implemented and always returns None."""
    return

  # Bias based on the user profile
  def bias_user(self, prediction, tweet):
    """Placeholder; not implemented and always returns None."""
    return

  def make_explanation(self, sentiment, bias_party, party, bias_topic, topic):
    """Placeholder for a textual explanation; always returns None."""
    # TODO
    return

"""

Politic bias model

"""

class PoliticBiasModel:
  """Pipeline that combines topic modelling and sentiment-based bias.

  It owns a `BertopicModel` (attribute `bertopic_SL`) and a `TweetimentModel`
  (attribute `tweetiment`) built from the supplied configuration.
  """

  def __init__(self,
               name,
               working_dir,
               preprocess_pipeline,
               topic_model,
               sentiment_model,
               config
               ):
    """Store the settings and create both sub-models.

    Args:
      name: Name of the model.
      working_dir: Working directory (stored, not used in this class).
      preprocess_pipeline: Pipeline passed on to `preprocess_tweets`.
      topic_model: Embedding model for the topic model.
      sentiment_model: Dict with 'model' and 'tokenizer'.
      config: Dict with 'bertopic_SL_config', 'tweetiment_config',
        'preprocess_config', 'topic_info', 'verbose' and 'debug'.
    """
    self.name = name
    self.working_dir = working_dir
    self.preprocess_pipeline = preprocess_pipeline
    self.config = config

    # Make models
    self.make_models(topic_model, sentiment_model)

  def make_models(self, topic_model, sentiment_model):
    """Instantiate the topic model and the sentiment model."""
    # Create Bertopic SL
    self.bertopic_SL = BertopicModel("Bertopic_SL", embed_model=topic_model, config=self.config['bertopic_SL_config'])

    # Create Tweetiment
    tweetiment_config = self.config['tweetiment_config']
    self.tweetiment = TweetimentModel(
        "Tweetiment",
        model=sentiment_model['model'],
        tokenizer=sentiment_model['tokenizer'],
        topic_bias=tweetiment_config['topic_bias'],
        party_bias=tweetiment_config['party_bias'],
        device=tweetiment_config['device'])

  def train_models(self, X_train, y_train, optimization=False):
    """Fit the topic model on labelled documents.

    Args:
      X_train: Training documents.
      y_train: Topic labels of the documents.
      optimization: When true, `optimize_models` is run afterwards.
    """
    # Load training data
    self.bertopic_SL.load_topic_data(X_train, y_train)

    # Train the model
    self.bertopic_SL.train_model(only_fit=True)

    # Optimize
    # NOTE(review): optimize_models reads bertopic_SL.result, which a
    # fit-only training run does not create -- confirm predict() has been
    # called before enabling this.
    if optimization:
      self.optimize_models(X_train)

  def optimize_topics(self, topn=3, n_sim_subtopics=3):
    """Merge sub-topics that are near-identical matches for the same topic.

    For every configured topic, the sub-topics with a normalized similarity
    above 0.9 are grouped; groups with more than one member are merged.
    """
    sim_topics = similar_topics(self.bertopic_SL, self.config['topic_info'], topn=topn, n_sim_subtopics=n_sim_subtopics)

    to_elim = []
    for st in sim_topics:
      lbl = [ t for t,p in sim_topics[st] if p > 0.9]
      if len(lbl) > 1:
        to_elim.append(lbl)

    print(to_elim)
    if to_elim:
      self.bertopic_SL.merge_topics(to_elim)

  def optimize_models(self, docs):
    """Recompute the topic representations with a unigram CountVectorizer."""
    vectorizer_model = CountVectorizer(ngram_range=(1, 1))
    self.bertopic_SL.bertopic.update_topics(docs, self.bertopic_SL.result['topic_ids'], vectorizer_model=vectorizer_model)

  def bias_pipeline(self, tweets, topic_mapper, topn=3, n_sim_subtopics=4, do_preprocess=True, do_topic_predict=True):
    """
    ## Automated Slovenian Political bias pipeline

    Args:
      tweets: Raw tweets (or already preprocessed/labelled ones when the
        corresponding step is disabled).
      topic_mapper: Maps predicted topic ids to topic names; ids that are
        not contained get the topic -1.
      topn: Unused here; kept for interface compatibility.
      n_sim_subtopics: Unused here; kept for interface compatibility.
      do_preprocess: Run `preprocess_tweets` first.
      do_topic_predict: Predict topics and annotate each tweet with 'topic'
        and 'topic_probability'.

    Returns:
      Tuple `(labelled_tweets, predictions)`; `predictions[i]` belongs to
      the i-th tweet after preprocessing (None if no prediction matches).
    """
    if do_preprocess:
      preprocess_config = self.config['preprocess_config']
      tweets = preprocess_tweets(
          self.preprocess_pipeline,
          tweets,
          preprocess_config['tweet_stop_words'],
          # The stop-word list used to be passed here as well. Presumably the
          # config provides 'tweet_upos' -- TODO confirm; without it no UPOS
          # filtering is applied.
          preprocess_config.get('tweet_upos', []),
          verbose=self.config['verbose'],
          debug=self.config['debug'])

    # Without topic prediction the tweets are used as they are, whether or
    # not they were preprocessed above.
    labelled_tweets = tweets

    if do_topic_predict:
      # Load the tweet data
      self.bertopic_SL.load_tweet_data(tweets)

      # Predict the instances
      self.bertopic_SL.predict()

      labelled_tweets = []
      # Label tweets
      for ix, pred_t in enumerate(self.bertopic_SL.result['topic_ids']):
        if pred_t in topic_mapper:
          predicted_topic = topic_mapper[pred_t]
        else:
          predicted_topic = -1

        tweet = tweets[ix]
        tweet['topic'] = predicted_topic
        tweet['topic_probability'] = self.bertopic_SL.result['topic_probs'][ix]
        labelled_tweets.append(tweet)

    bias_predictions = [self.tweetiment.calculate_biases(t) for t in labelled_tweets]

    # Index of the first labelled tweet per id, so each tweet is matched in
    # constant time instead of scanning the whole list.
    first_index = {}
    for ix, lbt in enumerate(labelled_tweets):
      first_index.setdefault(lbt['id'], ix)

    predictions = []
    for twt in tweets:
      match = first_index.get(twt['id'])
      predictions.append(None if match is None else bias_predictions[match])

    return labelled_tweets, predictions

def label_politic_tweets(model : BertopicModel, topic_info, data_dir, topn=3, n_sim_subtopics=4, save_tweets=False, verbose=True):
  """Annotate the model's tweets with a topic and its probability.

  Each tweet gets the configured topic whose similar sub-topics contain the
  tweet's predicted topic id with the highest similarity. For topics marked
  'strict', the label is only kept if one of the topic's keywords or search
  terms occurs in the lemma text. Tweets whose id was already seen are left
  untouched.

  Args:
    model: Model providing `data['tweets']`, `result['topic_ids']` and
      `result['topic_probs']`.
    topic_info: Mapping topic -> dict with 'strict', 'keywords' and
      'search_term'.
    data_dir: Data root used when saving.
    topn: Forwarded to `similar_topics`.
    n_sim_subtopics: Forwarded to `similar_topics`.
    save_tweets: Append the tweets of every topic to its labelled file.
    verbose: Print a summary header and, when saving, per-topic counts.

  Returns:
    The (in place annotated) list `model.data['tweets']`.
  """
  if verbose:
    print(f'-- Collected batch topic distribution summary:')

  sim_topics = similar_topics(model, topic_info, topn=topn, n_sim_subtopics=n_sim_subtopics)

  print(sim_topics)

  tweets = model.data['tweets']
  labels = model.result['topic_ids']
  probs = model.result['topic_probs']

  ids = set()

  for tweet, label, prob in zip(tweets, labels, probs):

    if tweet['id'] in ids:
      continue
    ids.add(tweet['id'])

    most_likely_topic = None
    most_likely_sim = 0

    # The similarity gets its own name so that it cannot overwrite `prob`,
    # the tweet's topic probability that is stored below.
    for st in sim_topics:
      for sbt, sim in sim_topics[st]:
        if sbt == label and sim > most_likely_sim:
          most_likely_topic = st
          most_likely_sim = sim

    if most_likely_topic is not None and topic_info[most_likely_topic]['strict']:
      kw = []
      kw.extend(topic_info[most_likely_topic]['keywords'])
      kw.extend(topic_info[most_likely_topic]['search_term'])

      # Strict topics additionally need a keyword hit (substring match)
      if any(key in tweet['lemma_text'] for key in kw):
        tweet['topic'] = most_likely_topic
        tweet['topic_probability'] = prob
      else:
        tweet['topic'] = None
        tweet['topic_probability'] = 0
    else:
      tweet['topic'] = most_likely_topic
      tweet['topic_probability'] = prob

  if save_tweets:
    for tp in topic_info:
      # Skipped duplicates may carry no 'topic' entry, hence .get()
      t = [x for x in tweets if x.get('topic') == tp]
      if verbose:
        print(f'-- {tp} : {len(t)}')
      overwrite_labelled_topics(tp, t, data_dir)

  return model.data['tweets']

def similar_topics(model : "BertopicModel", topic_info, topn=3, n_sim_subtopics=4) -> dict:
  """Find, per configured topic, the model topics closest to its search terms.

  The similarities returned by `find_topics` for every search term are
  summed per model topic and divided by the largest sum. Of the
  `n_sim_subtopics` best entries, the outlier topic -1 is dropped.

  Args:
    model: Object whose `bertopic.find_topics(keyword, top_n=...)` returns
      a pair `(topic_ids, similarities)`.
    topic_info: Mapping topic -> dict with a 'search_term' list.
    topn: Number of topics requested per search term.
    n_sim_subtopics: Number of best-scoring entries considered per topic.

  Returns:
    Dict topic -> list of `(topic_id, normalized_similarity)` tuples sorted
    by descending similarity; empty when nothing was found.
  """
  sim_topics = {}

  for topic in topic_info:
    scores = {}
    for keyword in topic_info[topic]['search_term']:
      found_ids, found_sims = model.bertopic.find_topics(keyword, top_n=topn)
      for topic_id, sim in zip(found_ids, found_sims):
        scores[topic_id] = scores.get(topic_id, 0) + sim

    # Normalize; no search terms/results or an all-zero maximum would
    # otherwise raise, so those cases are left unnormalized.
    top_score = max(scores.values(), default=0)
    if top_score:
      scores = {key: float(value / top_score) for key, value in scores.items()}

    # -1 still occupies one of the n_sim_subtopics slots before it is dropped
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    best = ranked[:max(n_sim_subtopics, 0)]

    sim_topics[topic] = [(key, value) for key, value in best if key != -1]

  return sim_topics

## Vizualizacija podatkovne množice

In [None]:
# Configurations
# Year component of the per-epoch data paths built in the next cell
YEAR = 2021

# Not referenced by the code visible in this part of the notebook
BATCHES = (1,10)

# Epoch identifiers; each one selects a `{YEAR}-{epoch}` data folder
EPOCHES = [5, 18, 26, 32, 41]

In [None]:
# Collect data

# One entry per epoch: number of tweets and number of distinct lemmas
viz_data = []

for e in EPOCHES:
  raw_stpt_data = load_data(f'{root_dir}/stpt/{YEAR}-{e}/{YEAR}_{e}_ALL.json')

  # Distinct space-separated tokens over all lemma texts of the epoch
  words = {w for t in raw_stpt_data for w in t['lemma_text'].split(" ")}

  e_data = {
      'e_len': len(raw_stpt_data),
      'e_words_len': len(words),
  }
  viz_data.append(e_data)


In [None]:
# Visualize data

# Colors and legend labels ("number of tweets", "number of unique words");
# neither is used by the code visible in this cell.
colors = ['red', 'lime']
labels = ['Število tvitov', 'Število unikatnih besed']

# Dump the per-epoch statistics collected in the previous cell
print(viz_data)

## Vizualizacija rezultatov

In [None]:
names = ['splav', 'lgbtq', 'begunci', 'religija', 'levo', 'krscanstvo', 'militarizem', 'varnost', 'denacionalizacija', 'desno']

# Number of tweets stored in the final file of each topic, in `names` order
viz_data = []

for n in names:
  raw_t_data = load_data(f'{root_dir}/final/tweets_{n}.json')
  tweet_count = len(raw_t_data)
  viz_data.append(tweet_count)

print(viz_data)

In [None]:
# Bar chart: number of tweets per topic
fig, ax = plt.subplots(figsize=(30,15))

# Display names and bar colors, in the same order as the counts in viz_data
nms = ['Splav', 'LGBTQ+', 'Migracije', 'Tuje religije', 'Levi govor', 'Krščanstvo', 'Militarizem', 'Nacionalna varnost', 'Denacionalizacija', 'Desni govor']
clrs = ['indigo', 'hotpink', 'green', 'orange', 'firebrick', 'seagreen', 'darkgray', 'chocolate', 'darkkhaki', 'royalblue']

c = ax.bar(nms, viz_data, width=0.8, edgecolor="white", color=clrs)
ax.set(
    xlabel='Tema',
    ylabel='Število tvitov',
    title='Razporeditev tvitov po temah',
)
# Print the count above every bar
ax.bar_label(c)
fig.show()


In [None]:
names = ['splav', 'lgbtq','begunci','religija', 'levo', 'krscanstvo', 'militarizem', 'varnost', 'denacionalizacija', 'desno',]

# Per topic: (left, neutral, right) counts of the predicted labels
viz_data = []

for n in names:
  raw_t_data = load_data(f'{root_dir}/final/tweets_{n}.json')

  # Tally the predicted label of every tweet; other labels are ignored below
  label_counts = Counter(t['prediction']['label'] for t in raw_t_data)

  viz_data.append((label_counts['levo'], label_counts['nevtralno'], label_counts['desno']))

print(viz_data)

In [None]:
# Grouped bar chart: left vs. right tweets per topic (neutral is not drawn)
fig, ax = plt.subplots(figsize=(30,15))

nms = ['Splav', 'LGBTQ+', 'Migracije', 'Tuje religije', 'Levi govor', 'Krščanstvo', 'Militarizem', 'Nacionalna varnost', 'Denacionalizacija', 'Desni govor']

x = np.arange(len(nms))
width = 0.35  # the width of the bars

# viz_data rows are (left, neutral, right)
left_counts = [row[0] for row in viz_data]
right_counts = [row[2] for row in viz_data]

rc1 = ax.bar(x - width/2, left_counts, width, color='indianred', label="Levo")
rc2 = ax.bar(x + width/2, right_counts, width, color='cornflowerblue', label="Desno")
ax.set(
    xlabel='Tema',
    ylabel='Število tvitov',
    title='Razporeditev politične usmerjenosti tvitov po temah',
)
ax.set_xticks(x, nms)

ax.legend()

for bars in (rc1, rc2):
  ax.bar_label(bars, padding=3)

fig.show()

In [None]:
names = ['splav', 'lgbtq','begunci','religija', 'desno', 'krscanstvo', 'militarizem', 'varnost', 'denacionalizacija', 'levo']

# Tweets grouped by the party detected in them
viz_data = {}

for n in names:
  raw_t_data = load_data(f'{root_dir}/final/tweets_{n}.json')

  for t in raw_t_data:
    party = t['prediction']['party_mentioned']
    if party is not None:
      viz_data.setdefault(party, []).append(t)

# For 'PS' keep only tweets that mention the party account itself.
# `t['mentions']` is a collection of mentions, so the account has to be
# looked up inside it; comparing the whole collection against a list of
# names never matches. Presumably the mentions are plain screen names --
# TODO confirm against the stored tweets.
if 'PS' in viz_data:
  viz_data['PS'] = [ t for t in viz_data['PS'] if 'piratskastranka' in t['mentions'] ]
# The same filter could be applied to 'GS' with the 'Gibanje_Svoboda' account.

# Per party with at least one tweet: (positive, neutral, negative) counts,
# in the iteration order of viz_data
stranka_data = []

for v in viz_data:
  tviti = viz_data[v]
  if len(tviti) == 0:
    continue

  sentiments = Counter(t['prediction']['sentiment'] for t in tviti)
  stranka_data.append((sentiments['positive'], sentiments['neutral'], sentiments['negative']))

In [None]:
# Show the (positive, neutral, negative) counts collected per party
print(stranka_data)

In [None]:
# Grouped bar chart: sentiment of tweets per mentioned party
fig, ax = plt.subplots(figsize=(10,10))

# Parties with at least one tweet, in the same order as stranka_data
nms = [ z for z in viz_data if len(viz_data[z]) > 0]

x = np.arange(len(nms))
width = 0.25  # the width of the bars

# stranka_data rows are (positive, neutral, negative)
positive_counts = [row[0] for row in stranka_data]
neutral_counts = [row[1] for row in stranka_data]
negative_counts = [row[2] for row in stranka_data]

rc1 = ax.bar(x - width, positive_counts, width, color='springgreen', label="Pozitivno")
rc2 = ax.bar(x, neutral_counts, width, color='slategrey', label="Nevtralno")
rc3 = ax.bar(x + width, negative_counts, width, color='orangered', label="Negativno")
ax.set(
    xlabel='Stranka',
    ylabel='Število tvitov',
    title='Razporeditev sentimentov tvitov po strankah',
)
ax.set_xticks(x, nms)

ax.legend()

fig.show()

In [None]:
# Translation table that deletes every ASCII punctuation character
punct = str.maketrans('', '', string.punctuation)

def clean_str(test_str):
  """Remove ASCII punctuation and collapse whitespace runs to single spaces."""
  without_punctuation = test_str.translate(punct)
  tokens = without_punctuation.split()
  return " ".join(tokens)

def make_wordcloud(wrds):
  """Generate an 800x800 white-background word cloud from the text *wrds* and show it.

  At most 80 words are drawn; the words in the stop list below are excluded.
  """
  ignored_words = ['2x', '3x', 'stopnja', '4x', '5x', '6x', 'omeniti', 'obsedenost', 'imeti', 'lahko']

  fig, ax = plt.subplots(figsize = (7, 7), facecolor = None)
  builder = WordCloud(width = 800, height = 800, max_font_size=120, min_font_size = 10, max_words=80,
                      stopwords = ignored_words,
                      background_color="white")
  rendered = builder.generate(wrds)

  ax.imshow(rendered)
  ax.axis("off")
  fig.tight_layout(pad = 0)
  fig.show()

In [None]:
# Wordclouds for themes
# Only the first entry of `names` is rendered because of the `break` below.
# Other topic file suffixes used in this notebook: 'splav', 'lgbtq', 'begunci',
# 'religija', 'desno', 'krscanstvo', 'militarizem', 'varnost', 'levo'.
names = ['denacionalizacija']

for n in names:
  raw_t_data = load_data(f'{root_dir}/final/tweets_{n}.json')

  # Join once instead of growing a string tweet by tweet.
  words = " ".join(t['lemma_text'] for t in raw_t_data)

  make_wordcloud(words)
  break

In [None]:
names = ['splav', 'lgbtq', 'begunci', 'religija', 'levo', 'krscanstvo', 'militarizem', 'varnost', 'denacionalizacija', 'desno']

# Tweets that mention a party, split by the predicted bias ('levo' / 'desno').
party_left = []
party_right = []

for n in names:
  raw_t_data = load_data(f'{root_dir}/final/tweets_{n}.json')
  for t in raw_t_data:
    prediction = t['prediction']
    # Only tweets with both a party and a party bias are of interest.
    if prediction['party_mentioned'] is None or prediction['party_bias'] is None:
      continue
    if prediction['party_bias'] == 'levo':
      party_left.append(t)
    elif prediction['party_bias'] == 'desno':
      party_right.append(t)

print(party_right)

In [None]:
# Number of tweets drawn from each side for manual inspection.
num_samples = 10

# random.sample raises ValueError if a list holds fewer than num_samples tweets.
party_left_sample = random.sample(party_left, num_samples)
party_right_sample = random.sample(party_right, num_samples)

In [None]:
# Print the sampled tweets in alternating LEFT / RIGHT order.
for idx in range(num_samples):
  left_tweet = party_left_sample[idx]
  print("LEFT:" + left_tweet['raw_text'])
  right_tweet = party_right_sample[idx]
  print("RIGHT:" + right_tweet['raw_text'])

## Evalvacija modela tematik

In [None]:
# Configurations for automated bias pipeline

# Flags copied into pbm_config further down (VERBOSE, DEBUG); SAVING is only
# defined here in the visible part of the notebook.
VERBOSE = True
SAVING = False
DEBUG = False

## Tweetiment configuration
# Bias definitions are read from JSON files under <root_dir>/configs.
topic_bias_path = root_dir + '/configs/topic-bias.json'
party_bias_path = root_dir + '/configs/party-bias.json'

tweetiment_config = {
    'topic_bias': load_data(topic_bias_path),
    'party_bias': load_data(party_bias_path),
    'device': "cpu"
}

## Bertopic second layer configuration
# Three groups of settings, keyed by the component they are meant for
# (presumably forwarded to BERTopic, UMAP and HDBSCAN by PoliticBiasModel —
# verify against that class), plus two top-level integers.
tweetiment_bertopic_SL_config = {
    'bertopic_conf': {
        "top_n_words": 10,
        "min_topic_size": 20,
        "n_gram_range": (1,2),
        "nr_topics": 10,
        "diversity": 0.1,
        "verbose": VERBOSE
    },
    'umap_conf': {
        "n_neighbors": 20,
        "n_components": 15,
        "metric": 'cosine'
    },
    'hdbscan_conf': {
        "min_cluster_size": 15,
        "metric": 'euclidean',
        "prediction_data": True
    },
    'topn': 3,
    'n_sim_subtopics': 3,
}

## Topic information
# Per-topic lemma keywords and word-prefix regexes.
# NOTE(review): the key 'lgbt' differs from the 'lgbtq' name used in
# topic_names and in the data file names — confirm which one consumers expect.
tweetiment_topics_info = {
    'begunci': {
        'keywords': ["migrant", "migriranje", "beg", "begunec", "meja", "begunci", "migracija"],
        'regexes': [r'\bmigr\w+', r'\bbeg\w+']
    },
    'lgbt': {
        'keywords': ["lgbtq", "lgbt", "lgbtqia", "istospolen", "spol", "gej", "lezbijka", "lezbijski", "trans", "seksualnost"],
        'regexes': [r'\btrans\w+', r'\bseks\w+', r'\blgbt\w+', r'\bistospol\w+', r'\bgej\w+', r'\blezb\w+', r'\bspol\w+']
    },
    'religija': {
        'keywords': ["musliman", "islam", "radikalen", "islamski", "muslimanski", "jud", "izrael", "izraelski", "vera"],
        # NOTE(review): r'\bislam\w+' is listed twice in this pattern list.
        'regexes': [r'\bver\w+', r'\bislam\w+', r'\bžid\w+', r'\bzid\w+', r'\bislam\w+', r'\bjud\w+', r'\bmusli\w+']
    },
    'splav': {
        'keywords': ["splav", "zarodek", "kontracepcija", "vazektomija", "sterilizacija", "diafragma", "kondom", "maternica", "fetus"],
        'regexes': [r'\bkontracep\w+', r'\bsplav\w+', r'\bnoseč\w+', r'\bnosec\w+', r'\bsteril\w+', r'\bkastri\w+']
    },
    'levo': {
        'keywords': ["levica", "levicar", "lev", "levičar", "mesec", "levi"],
        'regexes': [r'\blev\w+']
    },
    'krscanstvo': {
        'keywords': ["cerkev", "župnik", "verouk", "vera", "bog", "mučenik", "vernik", "verniki", "otrok", "papež"],
        'regexes': [r'\bkrscan\w+', r'\bkrščan\w+', r'\bcerkv\w+', r'\bkatoli\w+', r'\bdruzi\w+', r'\bdruži\w+', r'\bteolo\w+']
    },
    'militarizem': {
      # NOTE(review): "zasčita" mixes 's' and 'č'; check whether "zaščita" / "zascita" was intended.
      'keywords': ["nato", "vojska", "vojak", "meja", "obramba", "zasčita", "sila", "varnost", "orožje", "orozje"],
      'regexes': [r'\bmilitar\w+', r'\bvoj\w+', r'\bnaborni\w+', r'\bpovelj\w+', r'\bzavez\w+', r'\bpatri\w+']
    },
    'varnost': {
      'keywords': ["represiven", "policija", "protest", "varda", "nadzor", "varovanje", "varnost", "varen", "red", "mir"],
      'regexes': [r'\bpolici\w+', r'\bprotest\w+', r'\bshod\w+', r'\bteror\w+']
    },
    'denacionalizacija': {
      'keywords': ["denacionalizacija", "privat", "last", "premoženje", "kapital"],
      'regexes': [r'\bprivat\w+', r'\bzaseb\w+', r'\blast\w+', r'\bpremož\w+', r'\bpodrža\w+', r'\bdenacional\w+', r'\bkapital\w+']
    },
    'desno': {
        'keywords': ["desnica", "desno", "jj", "sds", "desničar", "janša", "jansa", "nsi", "janez"],
        'regexes': [r'\bdesni\w+', r'\bjan\w+']
    }
}

# Assemble the single configuration dict handed to PoliticBiasModel.
pbm_config = {}

# Stored by reference: the ctfidf_model assignment below therefore also
# mutates tweetiment_bertopic_SL_config.
pbm_config['bertopic_SL_config'] = tweetiment_bertopic_SL_config

# c-TF-IDF transformer with BM25 weighting and frequent-word reduction enabled.
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)
pbm_config['bertopic_SL_config']['bertopic_conf']['ctfidf_model'] = ctfidf_model

pbm_config['preprocess_config'] = None
pbm_config['tweetiment_config'] = tweetiment_config
pbm_config['topic_info'] = tweetiment_topics_info
pbm_config['verbose'] = VERBOSE
pbm_config['debug'] = DEBUG

In [None]:
# Loading models

# Embedding model
# Feature-extraction pipeline for 'EMBEDDIA/sloberta', requested on device 0.
topic_model = pipeline('feature-extraction', model='EMBEDDIA/sloberta', device=0)#

# Tokenizer and sequence-classification model of 'EMBEDDIA/sloberta-tweetsentiment'.
# NOTE(review): device=0 is passed to the tokenizer loader but not to the
# model loader, while tweetiment_config sets 'device' to "cpu" — confirm the
# intended placement.
sent_model = {}
sent_model['tokenizer'] = AutoTokenizer.from_pretrained("EMBEDDIA/sloberta-tweetsentiment", device=0)
sent_model['model'] = AutoModelForSequenceClassification.from_pretrained("EMBEDDIA/sloberta-tweetsentiment")

In [None]:
# Collect and split test data

RANDOM_STATE = 992

topic_names = ['splav', 'lgbtq', 'begunci','religija', 'desno', 'krscanstvo', 'militarizem', 'varnost', 'denacionalizacija', 'levo']

data = []

X_train = []
y_train = []

X_test = []
y_test = []

# Split per topic file so each topic contributes 67 % / 33 % of its own tweets.
for t in topic_names:
  raw_result_data = load_data(f'{root_dir}/labelled_topics/topic_{t}.json')
  raw_result_data.extend(load_data(f'{root_dir}/final/tweets_{t}.json'))

  docs = []
  labels = []

  for d in raw_result_data:
    if 'id' not in d:
      continue
    # The label is the index into topic_names, taken from the manual 'topic'
    # field when present, otherwise from the stored prediction.
    if 'topic' in d:
      label = topic_names.index(d['topic'])
    elif 'prediction' in d and d['prediction']['topic_mentioned'] is not None:
      label = topic_names.index(d['prediction']['topic_mentioned'])
    else:
      # No label available: skip the tweet so docs and labels stay aligned
      # (train_test_split rejects inputs of different length).
      continue
    labels.append(label)
    # Blank the topic on the document itself.
    # NOTE(review): presumably so the model cannot read the label from the
    # input — confirm against PoliticBiasModel.
    d['topic'] = None
    docs.append(d)

  iX_train, iX_test, iy_train, iy_test = train_test_split(docs, labels, test_size=0.33, random_state=RANDOM_STATE)

  X_train.extend(iX_train)
  y_train.extend(iy_train)
  X_test.extend(iX_test)
  y_test.extend(iy_test)


# Shuffle for good measure

X_train, y_train = shuffle(X_train, y_train, random_state=(RANDOM_STATE+13))

In [None]:
# Build the bias model wrapper for the topic evaluation run; only its
# second-layer BERTopic component (bertopic_SL) is used in this cell.
pt_model = PoliticBiasModel("TopicEvaluation",
                            root_dir,
                            None,
                            topic_model,
                            sent_model,
                            pbm_config)

# Manually train the topic model
# Training input: lemmatised tweet texts with their topic indices.
pt_model.bertopic_SL.load_topic_data([d['lemma_text'] for d in X_train], y_train)

# Train model
print('Training the model with training data...')
pt_model.bertopic_SL.train_model(only_fit=True)

# Load data
pt_model.bertopic_SL.load_tweet_data(X_test)

# Predict on test data
print('Predicting on test data...')
pt_model.bertopic_SL.predict()

# Call the model's visualize(); the return value is not kept here.
pt_model.bertopic_SL.visualize()

In [None]:
# Display the fitted topics; used to fill in the hand-written `mapper` in the next cell.
pt_model.bertopic_SL.bertopic.get_topics()

In [None]:
# Model topic id -> index into topic_names. The list position is the model's
# topic id; -1 maps to itself.
cluster_order = ['varnost', 'begunci', 'levo', 'splav', 'denacionalizacija',
                 'desno', 'lgbtq', 'militarizem', 'krscanstvo', 'religija']

mapper = {-1: -1}
mapper.update({cluster_id: topic_names.index(topic) for cluster_id, topic in enumerate(cluster_order)})

In [None]:
# When True, tweets predicted as -1 or with a topic id above 9 are dropped
# from the evaluation; when False they are kept and scored as class -1.
filter_non_topics = True

y_test_f = []
y_pred = []

for idx, topic_id in enumerate(pt_model.bertopic_SL.result['topic_ids']):
  is_known_topic = topic_id != -1 and topic_id < 10
  if filter_non_topics:
    if not is_known_topic:
      continue
  elif topic_id > 9:
    topic_id = -1
  y_pred.append(mapper[topic_id])
  y_test_f.append(y_test[idx])

print(y_pred)
print(y_test_f)

In [None]:
# Checking distribution of classified topics
# Keys are indices into topic_names (as produced by `mapper`).

Counter(y_pred)

In [None]:
# Classification scores
# F1, precision and recall are averaged over classes weighted by support.

weighted = dict(average='weighted')

accscore = accuracy_score(y_test_f, y_pred)
f1score = f1_score(y_test_f, y_pred, **weighted)
precisionscore = precision_score(y_test_f, y_pred, **weighted)
recallscore = recall_score(y_test_f, y_pred, **weighted)

print(f'Accuracy score: {accscore}')
print(f'F1 score: {f1score}')
print(f'Precision score: {precisionscore}')
print(f'Recall score: {recallscore}')

In [None]:
# Heat map for topics

# Display names, in the same order as topic_names.
nms = ['Splav', 'LGBTQ+', 'Migracije', 'Tuje religije', 'Desni govor', 'Krščanstvo', 'Militarizem', 'Nacionalna varnost', 'Denacionalizacija', 'Levi govor']

# Pin the label set to 0..len(nms)-1 so the matrix is always
# len(nms) x len(nms) and row/column i belongs to nms[i], even when a class
# is absent from the data or a -1 ("no topic") prediction is present.
cf_m = confusion_matrix(y_test_f, y_pred, labels=list(range(len(nms))))

# Mask empty cells so they are left uncoloured.
data_masked = np.ma.masked_where(cf_m == 0, cf_m)

fig, ax = plt.subplots(figsize=(15,15))
im = ax.imshow(data_masked, cmap='viridis_r', interpolation = 'none', vmin = 0, aspect='auto')


plt.xticks(np.arange(len(nms)), labels=nms, rotation=45)
plt.yticks(np.arange(len(nms)), labels=nms)
ax.xaxis.tick_top()

# Loop over data dimensions and create text annotations.
for i in range(len(nms)):
    for j in range(len(nms)):
        ax.text(j, i, cf_m[i, j],
                ha="center", va="center", color="white", fontsize="x-large", path_effects=[pe.withStroke(linewidth=3, foreground="black")])

ax.set_xlabel('Klasificirana tema')
ax.set_ylabel('Prava tema')
ax.set_title('Matrika zamenjav za teme', pad=20)
fig.tight_layout()
fig.show()

## Evalvacija modela za določanje usmerjenosti

In [None]:
# Collect and split test data

RANDOM_STATE = 466

topic_names = ['splav', 'lgbtq', 'begunci','religija', 'desno', 'krscanstvo', 'militarizem', 'varnost', 'denacionalizacija', 'levo']

bias_labels = ['levo', 'nevtralno', 'desno']

data = []

# X_train holds (lemma_text, raw_text) tuples, X_test holds the full tweet
# dicts; every label is a (topic index, bias index) tuple.
X_train = []
y_train = []

X_test = []
y_test = []

# Collect labelled topics as training data (bias slot of the label is -1).

for t in topic_names:
  raw_result_data = load_data(f'{root_dir}/labelled_topics/topic_{t}.json')

  for d in raw_result_data:
    # Entries without an id or a topic are skipped entirely so that X_train
    # and y_train stay aligned.
    if 'id' not in d or 'topic' not in d:
      continue
    label = (topic_names.index(d['topic']), -1)
    X_train.append((d['lemma_text'], d['raw_text']))
    y_train.append(label)

# Collect final tweets as training + test data

for t in topic_names:
  raw_result_data = load_data(f'{root_dir}/final/tweets_{t}.json')

  docs = []
  labels = []

  for d in raw_result_data:
    if 'id' not in d:
      continue
    if 'prediction' not in d or d['prediction']['topic_mentioned'] is None:
      # No label available: skip so docs and labels stay the same length
      # (train_test_split rejects inputs of different length).
      continue
    labels.append((topic_names.index(d['prediction']['topic_mentioned']), bias_labels.index(d['prediction']['label'])))
    docs.append(d)

  iX_train, iX_test, iy_train, iy_test = train_test_split(docs, labels, test_size=0.5, random_state=RANDOM_STATE)

  X_train.extend([(z['lemma_text'], z['raw_text']) for z in iX_train])
  y_train.extend(iy_train)
  X_test.extend(iX_test)
  y_test.extend(iy_test)

# Shuffle for good measure
X_train, y_train = shuffle(X_train, y_train, random_state=(RANDOM_STATE+13))

In [None]:
# Build the bias model wrapper for the political-bias evaluation run.
pt_model = PoliticBiasModel("PoliticalBiasEvaluation",
                            root_dir,
                            None,
                            topic_model,
                            sent_model,
                            pbm_config)

# Manually train the topic model
# d[0] is the lemma text of each (lemma_text, raw_text) tuple and l[0] the
# topic index of each (topic, bias) label.
pt_model.bertopic_SL.load_topic_data([d[0] for d in X_train], [l[0] for l in y_train])

# Train model
print('Training the model with training data...')
pt_model.bertopic_SL.train_model(only_fit=True)

# Load data
pt_model.bertopic_SL.load_tweet_data(X_test)

# Predict on test data
print('Predicting on test data...')
pt_model.bertopic_SL.predict()

# Call the model's visualize(); the return value is not kept here.
pt_model.bertopic_SL.visualize()

In [None]:
# Display the fitted topics; used to fill in the hand-written `mapper` in the next cell.
pt_model.bertopic_SL.bertopic.get_topics()

In [None]:
# Model topic id -> index into topic_names. The list position is the model's
# topic id; -1 maps to itself.
cluster_order = ['desno', 'varnost', 'begunci', 'denacionalizacija', 'levo',
                 'splav', 'militarizem', 'lgbtq', 'religija', 'krscanstvo']

mapper = {-1: -1}
mapper.update({cluster_id: topic_names.index(topic) for cluster_id, topic in enumerate(cluster_order)})

In [None]:
# When True, tweets predicted as -1 or with a topic id above 9 are dropped
# from the evaluation; when False they are kept and scored as class -1.
filter_non_topics = False

y_test_f = []
y_pred_topic = []

for idx, topic_id in enumerate(pt_model.bertopic_SL.result['topic_ids']):
  is_known_topic = topic_id != -1 and topic_id < 10
  if filter_non_topics:
    if not is_known_topic:
      continue
  elif topic_id > 9:
    topic_id = -1
  y_pred_topic.append(mapper[topic_id])
  # y_test entries are (topic, bias) tuples; only the topic is compared here.
  y_test_f.append(y_test[idx][0])

print(y_pred_topic)
print(y_test_f)

In [None]:
# Accuracy of the topic step alone (true topic vs. mapped predicted topic).
accscore = accuracy_score(y_test_f, y_pred_topic)

print(accscore)

In [None]:
# Positions of test tweets whose predicted topic maps to the true topic.
# Ids missing from `mapper` (e.g. ids above 9, which the filtering cell
# treats as non-topics) count as incorrect instead of raising KeyError.
correct_indices = [
  i
  for i, d in enumerate(pt_model.bertopic_SL.result['topic_ids'])
  if d != -1 and d in mapper and mapper[d] == y_test[i][0]
]

# Bias evaluation only uses tweets whose topic was classified correctly.
X_test_bias = [ X_test[c] for c in correct_indices ]
y_test_bias = [ y_test[c][1] for c in correct_indices ]

In [None]:
# Evaluate bias on a random subset of at most 1295 tweets. The size is
# clamped so the cell also runs when fewer tweets are available.
pairs = list(zip(X_test_bias, y_test_bias))
xy = random.sample(pairs, min(1295, len(pairs)))

X_test_bias = [ pair[0] for pair in xy ]
y_test_bias = [ pair[1] for pair in xy ]

In [None]:
# Predicted bias index (position in bias_labels) for every sampled tweet.
y_pred_bias = []

for tweet in X_test_bias:
  # Work on a deep copy so the stored tweet keeps its 'prediction' entry.
  candidate = copy.deepcopy(tweet)
  del candidate['prediction']
  # The topic comes from the stored prediction, not from the topic model above.
  candidate['topic'] = tweet['prediction']['topic_mentioned']
  candidate['topic_probability'] = None

  predicted_label = pt_model.tweetiment.calculate_biases(candidate)['label']

  y_pred_bias.append(bias_labels.index(predicted_label))

In [None]:
# Classification scores for political bias
# F1, precision and recall are averaged over classes weighted by support.

weighted = dict(average='weighted')

accscore = accuracy_score(y_test_bias, y_pred_bias)
f1score = f1_score(y_test_bias, y_pred_bias, **weighted)
precisionscore = precision_score(y_test_bias, y_pred_bias, **weighted)
recallscore = recall_score(y_test_bias, y_pred_bias, **weighted)

print(f'Accuracy score: {accscore}')
print(f'F1 score: {f1score}')
print(f'Precision score: {precisionscore}')
print(f'Recall score: {recallscore}')

In [None]:
# Heat map for political bias

nms = ['Levo', 'Nevtralno', 'Desno']

# The matrix values are hard-coded here rather than computed from
# y_test_bias / y_pred_bias.
cf_m = np.array([[393, 92, 10],
                 [0, 226, 0],
                 [3, 68, 475]])

# Mask empty cells so they are left uncoloured.
data_masked = np.ma.masked_where(cf_m == 0, cf_m)

fig, ax = plt.subplots(figsize=(8,8))
im = ax.imshow(data_masked, cmap='viridis_r', interpolation = 'none', vmin = 0, aspect='auto')

tick_positions = np.arange(len(nms))
plt.xticks(tick_positions, labels=nms, rotation=45)
plt.yticks(np.arange(len(nms)), labels=nms)
ax.xaxis.tick_top()

# Write every count into its cell, white text with a black outline.
outline = [pe.withStroke(linewidth=5, foreground="black")]
for row in range(len(nms)):
    for col in range(len(nms)):
        ax.text(col, row, cf_m[row, col],
                ha="center", va="center", color="white", fontsize="xx-large", path_effects=outline)

ax.set_xlabel('Klasificirana usmerjenost')
ax.set_ylabel('Prava usmerjenost')
fig.tight_layout()
fig.show()

## More visualization

In [None]:
# Collect the documents and topic labels used for the visualizations

# 0 - initial, 1 - final, 2 - both
n_dataset = 2

RANDOM_STATE = 200

topic_names = ['splav', 'lgbtq', 'begunci','religija', 'desno', 'krscanstvo', 'militarizem', 'varnost', 'denacionalizacija', 'levo']

# X: lemmatised tweet texts, y: index of each tweet's topic in topic_names.
X = []
y = []

for t in topic_names:
  labelled_path = f'{root_dir}/labelled_topics/topic_{t}.json'
  final_path = f'{root_dir}/final/tweets_{t}.json'

  if n_dataset == 2:
    raw_result_data = load_data(labelled_path)
    raw_result_data.extend(load_data(final_path))
  elif n_dataset == 1:
    raw_result_data = load_data(final_path)
  else:
    raw_result_data = load_data(labelled_path)

  for d in raw_result_data:
    if 'id' not in d:
      continue
    if 'topic' in d:
      label = topic_names.index(d['topic'])
    elif 'prediction' in d and d['prediction']['topic_mentioned'] is not None:
      label = topic_names.index(d['prediction']['topic_mentioned'])
    else:
      # No label available: skip the tweet so X and y stay aligned.
      continue
    y.append(label)
    X.append(d['lemma_text'])

# Shuffle for good measure
X, y = shuffle(X, y, random_state=(RANDOM_STATE+13))

In [None]:
# Prepare Politic bias model
pt_model = PoliticBiasModel("PoliticBiasVisualization",
                            root_dir,
                            None,
                            topic_model,
                            sent_model,
                            pbm_config)

# Train the models
# X holds lemmatised texts, y the matching indices into topic_names.
pt_model.train_models(X, y)

In [None]:
# Keep the figure returned by visualize(); later cells restyle it through
# fig['data'][0]. The bare `fig` displays it as the cell output.
fig = pt_model.bertopic_SL.visualize()
fig

In [None]:
# Display the fitted topics; used to fill in the hand-written `teme` labels in the next cell.
pt_model.bertopic_SL.bertopic.get_topics()

In [None]:
# Display label for every model topic id; the list position is the topic id.
teme = dict(enumerate([
  "Levi govor",
  "Desni govor",
  "Nacionalna varnost",
  "Migracije",
  "Tuje religije",
  "Krščanstvo",
  "LGBTQ+",
  "Militarizem",
  "Splav",
  "Denacionalizacija",
  "Neznano",
]))

# Register the labels on the fitted BERTopic model and echo what it stored.
bertopic_model = pt_model.bertopic_SL.bertopic
bertopic_model.set_topic_labels(teme)
print(bertopic_model.custom_labels_)

In [None]:
# Document scatter plot on a 60 % sample of X, with annotations shown,
# per-document hover hidden and custom labels turned off.
figd = pt_model.bertopic_SL.bertopic.visualize_documents(docs=X, sample=0.6, hide_annotations=False, hide_document_hover=True, custom_labels=False, width=1200, height=800)
figd.show()

In [None]:
# Marker colour for traces 1..10 of the document figure; the list position
# is the key into `teme` that names the trace (trace k is named teme[k-1]).
# NOTE(review): 'palegreen' / 'orange' for "Tuje religije" / "Krščanstvo"
# differ from the 'orange' / 'seagreen' pairing used in the bar-chart cell —
# confirm the intended palette.
trace_colors = ['firebrick', 'royalblue', 'chocolate', 'green', 'palegreen',
                'orange', 'hotpink', 'darkgray', 'indigo', 'darkkhaki']

for topic_id, color in enumerate(trace_colors):
  trace = figd['data'][topic_id + 1]
  trace['name'] = teme[topic_id]
  trace['marker']['size'] = 15
  trace['marker']['color'] = color
  trace['marker']['opacity'] = 0.8
  # Blank the last text entry of the trace. The text is copied into a list
  # and assigned back so this works whether plotly returns the property as
  # a tuple, a read-only array or a list.
  ars = list(trace['text'])
  ars[len(ars)-1] = ''
  trace['text'] = ars

print(figd['data'][1])

In [None]:
# Re-render the document figure after restyling its traces.
figd.show()

In [None]:
# Change to fit colors

# NOTE(review): clrs is not read anywhere in the visible part of the notebook.
clrs = ['indigo', 'hotpink', 'green', 'orange', 'royalblue', 'firebrick']

# One colour per marker of the first trace of `fig`.
fig['data'][0]['marker']['color'] = np.array(['royalblue', 'firebrick', 'orange', 'green', 'darkkhaki', 'seagreen', 'chocolate', 'darkgray', 'indigo', 'hotpink'])

In [None]:
#nms = ['Splav', 'LGBTQ+', 'Migracije', 'Tuje religije', 'Levi govor', 'Krščanstvo', 'Militarizem', 'Nacionalna varnost', 'Denacionalizacija', 'Desni govor']
#clrs = ['indigo', 'hotpink', 'green', 'orange', 'firebrick', 'seagreen', 'darkgray', 'chocolate', 'darkkhaki', 'royalblue']

# Label every marker of the first trace and place each label individually;
# entries are listed in the same order as the colours assigned in the previous cell.
fig['data'][0]['text'] = np.array(["Desni govor", "Levi govor", "Tuje religije", "Migracije", "Denacionalizacija", " Krščanstvo", "Nacionalna varnost", "Militarizem", "Splav", "LGBTQ+ "])
fig['data'][0]['textposition'] = np.array(['middle right', 'middle left', 'middle right', 'top center', 'top center', 'bottom center', 'middle right','bottom left', 'middle left', 'bottom left'])
fig['data'][0]['mode'] = "markers+text"

In [None]:
# Re-render the topic figure with the new colours and labels.
fig.show()

In [None]:
# NOTE(review): figm is not assigned anywhere in the visible part of the
# notebook — confirm the cell that creates it still exists.
# Custom hover text for the first trace: x, y and the z value.
figm['data'][0]['hovertemplate'] = 'x: %{x}<br>y: %{y}<br>Legenda podobnosti: %{z}<extra></extra>'

In [None]:
# Axis categories of the first figm trace (six topics, same order on both axes).
# NOTE(review): only six labels here, whereas the rest of the notebook uses ten topics — confirm.
figm['data'][0]['x'] = np.array(['Levi govor', 'Migracije', 'Religije', 'LGBTQ+', 'Desni govor', 'Splav'])
figm['data'][0]['y'] = np.array(['Levi govor', 'Migracije', 'Religije', 'LGBTQ+', 'Desni govor', 'Splav'])

In [None]:
# Hard-coded 6x6 z values for the first figm trace: symmetric, with 1.0 on the diagonal.
figm['data'][0]['z'] = np.array([[1.        , 0.98312585, 0.97783003, 0.97631899, 0.97725498,
                  0.97912109],
                 [0.98312585, 1.        , 0.98378234, 0.98044662, 0.97803244,
                  0.98592705],
                 [0.97783003, 0.98378234, 1.        , 0.97651127, 0.97041834,
                  0.98143742],
                 [0.97631899, 0.98044662, 0.97651127, 1.        , 0.97815931,
                  0.98451635],
                 [0.97725498, 0.97803244, 0.97041834, 0.97815931, 1.        ,
                  0.97417223],
                 [0.97912109, 0.98592705, 0.98143742, 0.98451635, 0.97417223,
                  1.        ]])

In [None]:
# Put the x axis on top and the y axis on the left.
figm.update_layout(
    xaxis={'side': 'top'}, 
    yaxis={'side': 'left'}  
)

In [None]:
# Render figm with the overrides applied in the cells above.
figm.show()

In [None]:
# Word bar charts for the listed topic ids, in this order. This rebinds
# `fig`, replacing the topic figure from the earlier cell.
fig = pt_model.bertopic_SL.bertopic.visualize_barchart([6,7,3,4,5,8,2,9,0,1], width=370, height=350, title="Distribucija besed v temah")
fig.show()

In [None]:
# (bar colour, subplot title) for each bar-chart trace, by position in the figure.
# NOTE(review): titles are assigned purely by position; confirm they match
# the topic ids passed to visualize_barchart.
subplot_styles = [
  ('indigo', "Splav"),
  ('hotpink', "LGBTQ+"),
  ('green', "Migracije"),
  ('orange', "Religije"),
  ('seagreen', "Krščanstvo"),
  ('darkgray', "Militarizem"),
  ('chocolate', "Nacionalna varnost"),
  ('darkkhaki', "Denacionalizacija"),
  ('firebrick', "Levi govor"),
  ('royalblue', "Desni govor"),
]

for position, (bar_color, title) in enumerate(subplot_styles):
  fig['data'][position]['marker']['color'] = bar_color
  fig.layout.annotations[position].update(text=title)

In [None]:
# Re-render the bar charts with the new colours and titles.
fig.show()

## Evalvacija koherence tematik

In [None]:
# Hand-picked representative words per topic, scored against the corpus X.
topics = [
    ['splav', 'kontracepcija', 'zarodek', 'nosečnost'],
    ['lgbt', 'homoseksualnost', 'spol', 'parada'],
    ['prehod', 'begunec', 'meja', 'migrant'],
    ['musliman', 'jud', 'islam', 'žid'],
    ['cerkev', 'krščanstvo', 'kristjan', 'verouk'],
    ['vojska', 'orožje', 'vojak', 'poveljnik'],
    ['policija', 'zaščita', 'red', 'varnost'],
    ['privaten', 'denacionalizacija', 'lastnina', 'premoženje'],
    ['lev', 'levičar', 'levica', 'lmš', 'sd'],
    ['desen', 'desničar', 'desnica', 'sds', 'nsi']
]

# Tokenise every document on single spaces.
X_tokens = [ document.split(" ") for document in X ]

word2id = Dictionary(X_tokens)

# Bag-of-words form of every document.
corpus = [ word2id.doc2bow(tokens) for tokens in X_tokens ]

# C_v coherence of each word list, computed from the tokenised texts.
cm = CoherenceModel(
    topics=topics,
    texts=X_tokens,
    coherence='c_v',
    dictionary=word2id,
)

coherence_per_topic = cm.get_coherence_per_topic()

In [None]:
# One row per topic: its words (one per line) as the index, its coherence as the value.
topics_str = [ '\n '.join(words) for words in topics ]
score_rows = list(zip(topics_str, coherence_per_topic))
data_topic_score = pd.DataFrame(data=score_rows, columns=['Tematika', 'Koherenca']).set_index('Tematika')

# Single-column annotated heat map of the coherence scores.
fig, ax = plt.subplots( figsize=(20,20) )
ax.set_title("Koherenca tematik\n $C_v$")
sns.heatmap(
    data=data_topic_score,
    annot=True,
    square=True,
    cmap='Reds',
    fmt='.2f',
    linecolor='black',
    ax=ax,
)
plt.yticks( rotation=0 )
ax.set_xlabel('')
ax.set_ylabel('')
fig.show()