# Obdelava podatkov

## Okolje

Vzpostavitev okolja

In [None]:
!pip install tweet-preprocessor
!pip install classla
!pip install bertopic

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import json
import classla
import re
import random
import time
import os.path
classla.download('sl')

import numpy as np
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, pipeline
from bertopic import BERTopic
import preprocessor as tpre
from umap import UMAP
from hdbscan import HDBSCAN

from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Setting constants

# Switch between a local data folder and the mounted Google Drive folder.
LOCAL = False

google_data_dir = "/content/drive/MyDrive/Diploma/Data"
local_data_dir = "/data"

# Directory every data path in the notebook is built from.
root_dir = local_data_dir if LOCAL else google_data_dir

## Funkcije in razredi

In [None]:
def preprocess_tweets(preprocess_pipeline, tweets, tweet_stop_words=None, tweet_upos=None, min_words=4, verbose=False, debug=False):
  """Clean, lemmatise and filter raw tweets.

  Tweets whose text starts with "RT" are skipped. Every other tweet is cleaned
  with `clean_tweet_text`, run through `preprocess_pipeline`, and reduced to
  lower-cased lemmas that are neither stop words nor tagged with one of
  `tweet_upos`. A tweet is kept only when strictly more than `min_words`
  lemmas remain.

  Args:
    preprocess_pipeline: Callable applied to the cleaned text. Its result must
      expose `.sentences`, each with `.words` carrying `.lemma` and `.upos`
      (presumably a classla pipeline - confirm against the caller).
    tweets: Sized sequence of tweet dicts with the keys 'full_text', 'id',
      'hashtags', 'mentions', 'created_at' and 'user' (the latter with
      'name', 'screen_name' and 'description').
    tweet_stop_words: Extra stop words added to NLTK's Slovene list.
    tweet_upos: UPOS tags whose words are discarded.
    min_words: Tweets with this many lemmas or fewer are dropped.
    verbose: Print progress and timing every 1000 tweets.
    debug: Print the cleaned text, every lemma and every discarded word.

  Returns:
    A list of dicts with the keys 'id', 'created_at', 'raw_text' (the cleaned
    text), 'lemma_text' (space-joined lemmas), 'hashtags', 'mentions' and
    'user'.
  """
  # A set gives constant-time membership tests in the per-word loop below and
  # removes duplicates between the NLTK list and the caller's additions.
  stop_words = set(stopwords.words('slovene'))
  stop_words.update(tweet_stop_words or ())

  tweet_upos = () if tweet_upos is None else tweet_upos

  data = []
  start_time = 0

  for index, tweet in enumerate(tweets):

    if index % 1000 == 0 and verbose:
      print(f'-- Progress: {index}/{len(tweets)}')
      if index >= 1000:
        # Time spent on the previous 1000 tweets.
        print(f'-- Time elapsed: {time.time() - start_time}s')
        print(f'-- Tweets preprocessed: {len(data)}')
      start_time = time.time()

    tweet_full_text = tweet['full_text']

    # Skip retweets
    if tweet_full_text.startswith("RT"):
      continue

    tweet_id = tweet['id']
    tweet_hashtags = tweet['hashtags']
    tweet_mentions = tweet['mentions']
    tweet_created_at = tweet['created_at']

    tweet_user_name = tweet['user']['name']
    tweet_user_screen_name = tweet['user']['screen_name']
    tweet_user_description = clean_tweet_text(tweet['user']['description']).lower()

    # Strip Twitter artefacts, HTML entities and emojis
    tweet_full_text = clean_tweet_text(tweet_full_text)

    if debug:
      print(f'Raw text:\n{tweet_full_text}')

    # Lemmatise and tag with the preprocessing pipeline
    tweet_lemma_text = preprocess_pipeline(tweet_full_text)

    tweet_processed_lemmas = []
    for sentence in tweet_lemma_text.sentences:

      sentence_words = []

      for i, word in enumerate(sentence.words):
        lemma = word.lemma.lower()

        # A sentence that opens with "rt" is treated as retweet text and dropped
        if lemma == 'rt' and i == 0:
          break

        if debug:
          print(f'Lemma: {lemma} -------------- Upos: {word.upos}')

        # Keep the lemma unless its tag is filtered out or it is a stop word
        if word.upos not in tweet_upos and lemma not in stop_words:
          sentence_words.append(lemma)
        elif debug:
          print(f'Discarded word: {word.lemma}')

      tweet_processed_lemmas.extend(sentence_words)

    if debug:
      print(f'Preprocessed lemmas:\n{tweet_processed_lemmas}')

    # Keep only tweets with more than min_words lemmas
    if len(tweet_processed_lemmas) > min_words:
      data.append({
          "id": tweet_id,
          "created_at": tweet_created_at,
          "raw_text": tweet_full_text,
          "lemma_text": ' '.join(tweet_processed_lemmas),
          "hashtags": tweet_hashtags,
          "mentions": tweet_mentions,
          "user": {
              "name": tweet_user_name,
              "screen_name": tweet_user_screen_name,
              "description": tweet_user_description
          }
      })

  return data

def clean_tweet_text(tweet_text):
  """Return `tweet_text` with Twitter artefacts, HTML entities and emojis removed.

  The text is passed through `tpre.clean`, then the literal entities
  `&gt;`, `&lt;` and `&amp;` are deleted, and finally `remove_emojis` is
  applied.
  """
  without_twitter_markup = tpre.clean(tweet_text)
  without_entities = re.sub("&gt;|&lt;|&amp;", "", without_twitter_markup)
  return remove_emojis(without_entities)

# Compiled once at import time instead of on every call.
# NOTE(review): the range U+24C2-U+1F251 is very broad; besides emoji it also
# covers CJK, Hangul and other scripts above U+24C2, so such text is removed
# as well. Latin letters (including č, š, ž) are below the range and are kept.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing through miscellaneous symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # enclosed characters and everything up to U+1F251
    "\U0001f926-\U0001f937"  # gesture / person emoji
    "\U00010000-\U0010ffff"  # all supplementary planes
    "\u2640-\u2642"          # gender signs
    "\u2600-\u2B55"          # miscellaneous symbols
    "\u200d"                 # zero width joiner
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward symbol
    "\u231a"                 # watch
    "\ufe0f"                 # variation selector-16
    "\u3030"                 # wavy dash
    "]+",
    re.UNICODE,
)


def remove_emojis(data):
    """Return `data` with every run of characters matched by the emoji pattern deleted.

    Args:
      data (str): Text to clean.

    Returns:
      str: The text without the matched characters; nothing is inserted in
      their place, so surrounding whitespace is left untouched.
    """
    return _EMOJI_PATTERN.sub('', data)

def load_tweets(file_name):
  """Parse the UTF-8 encoded JSON file at `file_name` and return its content."""
  with open(file_name, 'r', encoding='utf8') as handle:
    return json.load(handle)

def save_tweets(data, dir, file_name):
  """Write `data` as JSON to `<dir>/<file_name>.json`.

  The file is UTF-8 encoded and non-ASCII characters are written verbatim
  rather than as escape sequences.
  """
  target_path = f'{dir}/{file_name}.json'
  with open(target_path, 'w+', encoding='utf8') as target:
    json.dump(data, target, ensure_ascii=False)

def load_and_preprocess(cpipeline, data_dir, only_load, tweet_stop_words=[], tweet_upos=[], min_words=4, verbose=False, debug=False):
  """Load tweets from `data_dir` and optionally preprocess them.

  Args:
    cpipeline: Preprocessing pipeline forwarded to `preprocess_tweets`.
    data_dir: Path of the JSON file handed to `load_tweets`.
    only_load: When true, the loaded content is returned unchanged.
    tweet_stop_words, tweet_upos, min_words, verbose, debug: Forwarded
      unchanged to `preprocess_tweets`.

  Returns:
    The loaded content, or the result of `preprocess_tweets` on it.
  """
  loaded = load_tweets(data_dir)
  if only_load:
    return loaded

  return preprocess_tweets(
      cpipeline,
      loaded,
      tweet_stop_words=tweet_stop_words,
      tweet_upos=tweet_upos,
      min_words=min_words,
      verbose=verbose,
      debug=debug,
  )

def load_labelled_tweets(dir, topic_names, shuffle_arrays=True, random_state=77):
  """Load labelled tweets for the given topics as parallel document/label lists.

  For every name in `topic_names` the file
  `<dir>/labelled_topics/topic_<name>.json` is read; each entry must carry the
  keys 'lemma_text' and 'topic'.

  Args:
    dir: Directory that contains the `labelled_topics` folder.
    topic_names: List of topic names; a tweet's label is the position of its
      'topic' value in this list.
    shuffle_arrays: Shuffle documents and labels together before returning.
    random_state: Seed passed to the shuffle.

  Returns:
    A pair `(topic_lemmas, topic_labels)` of equal length.

  Raises:
    ValueError: If a tweet's 'topic' is not in `topic_names`.
  """
  topics = []
  for name in topic_names:
    with open(f'{dir}/labelled_topics/topic_{name}.json', 'r', encoding='utf8') as topic_data:
      topics.extend(json.load(topic_data))

  topic_lemmas = [t['lemma_text'] for t in topics]
  topic_labels = [topic_names.index(t['topic']) for t in topics]

  # sklearn's shuffle returns shuffled copies and leaves its inputs untouched,
  # so the result has to be assigned back. Nothing to shuffle when empty.
  if shuffle_arrays and topic_lemmas:
    topic_lemmas, topic_labels = shuffle(topic_lemmas, topic_labels, random_state=random_state)
  return topic_lemmas, topic_labels


"""
Bertopic model for modeling topics

"""
class BertopicModel:
  """Wrapper around a BERTopic model together with its data and results.

  Set by the constructor:
    model_name: Name appended to the directory when saving or loading.
    embed_model: Embedding model handed to BERTopic.
    config: Dict with the keys "umap_conf", "hdbscan_conf" and
      "bertopic_conf"; each value is expanded as keyword arguments.

  Set later by the methods:
    data: Dict that may hold "tweets", "docs" and "labels".
    result: Dict with "topic_ids" and "topic_probs".
  """

  def __init__(self, model_name, embed_model, config):
    self.model_name = model_name
    self.embed_model = embed_model
    self.config = config

    # Create the Bertopic model with config
    self.make_model()

  def make_model(self):
    """Build the UMAP, HDBSCAN and BERTopic objects from `self.config`."""
    self.umap_model = UMAP(**self.config["umap_conf"])
    self.hdbscan_model = HDBSCAN(**self.config["hdbscan_conf"])
    self.bertopic = BERTopic(embedding_model=self.embed_model, umap_model=self.umap_model, hdbscan_model=self.hdbscan_model, **self.config["bertopic_conf"])

  def load_tweet_data(self, tweet_data):
    """Replace `self.data` with the tweets and their 'lemma_text' documents."""
    self.data = {
        "tweets": tweet_data,
        "docs": [t['lemma_text'] for t in tweet_data],
    }

  def load_topic_data(self, topic_docs, topic_labels):
    """Store labelled documents, keeping any other entries already in `self.data`."""
    if not hasattr(self, 'data'):
      self.data = {}

    self.data["docs"] = topic_docs
    self.data["labels"] = topic_labels

  def train_model(self, only_fit):
    """Train on the loaded data.

    With `only_fit` false the model is fitted and the documents are
    transformed, filling `self.result`. With `only_fit` true the model is
    fitted with the stored labels as `y` and no result is produced.
    Prints an error when the required data is missing.
    """
    data = getattr(self, 'data', {})

    if "docs" in data and not only_fit:
      topics, probs = self.bertopic.fit_transform(data["docs"])
      self.result = {"topic_ids": topics, "topic_probs": probs}
    elif "docs" in data and "labels" in data and only_fit:
      self.bertopic = self.bertopic.fit(data["docs"], y=data["labels"])
    else:
      print("Error: Missing data!")

  def predict(self):
    """Assign topics to the loaded documents and store them in `self.result`."""
    data = getattr(self, 'data', {})

    if "docs" in data:
      topics, probs = self.bertopic.transform(data["docs"])
      self.result = {"topic_ids": topics, "topic_probs": probs}
    else:
      print("Error: Missing data!")

  def reduce(self, nr):
    """Reduce the number of topics to `nr` and update `self.result`.

    NOTE(review): the positional (docs, topics) call and the returned pair
    match the older BERTopic `reduce_topics` signature - confirm against the
    installed version.
    """
    if hasattr(self, 'data') and hasattr(self, 'result'):
      topics, probs = self.bertopic.reduce_topics(self.data["docs"], self.data["labels"], nr_topics=nr)
      self.result["topic_ids"] = topics
      self.result["topic_probs"] = probs

  def merge_topics(self, indexes):
    """Merge the topic groups in `indexes`.

    NOTE(review): the (docs, topics, topics_to_merge) call matches the older
    BERTopic `merge_topics` signature - confirm against the installed version.
    """
    if hasattr(self, 'data'):
      self.bertopic.merge_topics(self.data["docs"], self.data["labels"], indexes)

  def tweets_from_topic(self, ntopic):
    """Return the documents whose predicted topic id equals `ntopic`.

    Prints an error and returns an empty list when no result is available.
    """
    result = getattr(self, 'result', None)
    if not result:
      print("Error: Missing data!")
      return []

    docs = self.data["docs"]
    # `result` is a dict, so the ids are read by key.
    return [docs[i] for i, topic_id in enumerate(result["topic_ids"]) if topic_id == ntopic]

  def collect_topic_indices(self, ntopic, tweet_prob=0.5):
    """Return positions of documents in topic `ntopic` with probability above `tweet_prob`."""
    if not hasattr(self, 'result'):
      print("Error: Missing data!")
      return []

    probs = self.result["topic_probs"]
    return [
        i for i, topic_id in enumerate(self.result["topic_ids"])
        if ntopic == topic_id and probs[i] > tweet_prob
    ]

  def find_politic_topics(self, keywords, topn=3, sim_threshold=0.5, tweet_prob=0.5, include_prob=False):
    """Collect tweets that belong to topics similar to any of `keywords`.

    Args:
      keywords: Search terms passed to BERTopic's `find_topics`.
      topn: Number of similar topics requested per keyword.
      sim_threshold: Minimum similarity for a topic to be considered.
      tweet_prob: Minimum topic probability for a tweet to be collected.
      include_prob: Also store the probability on the tweet under
        'topic_probability' (the tweet dict is modified in place).

    Returns:
      List of tweet dicts taken from `self.data["tweets"]`.
    """
    if not hasattr(self, 'bertopic'):
      print("Error: Missing data!")
      return []

    indices = set()

    # Find related topics for every keyword
    for keyword in keywords:
      sim_ids, sim_probs = self.bertopic.find_topics(keyword, top_n=topn)

      # Keep only topics that are similar enough
      for topic, similarity in zip(sim_ids, sim_probs):
        if similarity > sim_threshold:
          indices.update(self.collect_topic_indices(topic, tweet_prob=tweet_prob))

    tweet_docs = []
    for i in indices:
      tdoc = self.data["tweets"][i]
      if include_prob:
        tdoc["topic_probability"] = self.result["topic_probs"][i]

      tweet_docs.append(tdoc)

    return tweet_docs

  def visualize(self, t='distance_map'):
    """Return the BERTopic figure selected by `t`.

    Recognised values are 'barchart', 'hierarchy', 'heatmap' and 'term_rank';
    anything else yields the topic distance map.
    """
    if not hasattr(self, 'bertopic'):
      print("Error: Model not yet initiated!")
      return None

    if t == 'barchart':
      return self.bertopic.visualize_barchart()
    elif t == 'hierarchy':
      return self.bertopic.visualize_hierarchy()
    elif t == 'heatmap':
      return self.bertopic.visualize_heatmap()
    elif t == 'term_rank':
      return self.bertopic.visualize_term_rank()
    return self.bertopic.visualize_topics()

  def save_model(self, model_dir):
    """Save the model to `model_dir + model_name` (plain string concatenation)."""
    self.bertopic.save(str(model_dir + self.model_name))

  def load_model(self, model_dir):
    """Load the model stored at `model_dir + model_name` into `self.bertopic`."""
    # BERTopic.load returns the loaded model instead of updating the instance
    # it is called on, so the return value must be kept.
    self.bertopic = BERTopic.load(str(model_dir + self.model_name), embedding_model=self.embed_model)

"""

Tweetiment Model

"""

class TweetimentModel:
  """Derives a political-bias label from tweet sentiment, party and topic.

  NOTE: this class is defined a second time further down in the file; once
  both definitions have been executed the later one is in effect.

  Attributes:
    model_name: Name of the model.
    topic_bias: Mapping topic -> {lower-cased sentiment label -> index into
      `labels`}.
    party_bias: Iterable of party dicts with the keys 'clani',
      'kratica_stranke' and 'usmerjenost' (an index into `labels`).
    labels: Output labels; index 2 is the neutral one.
    tokenizer, model: Passed to the transformers sentiment pipeline.
  """

  def __init__(self, name, model, tokenizer, topic_bias, party_bias):
    self.model_name = name

    self.topic_bias = topic_bias
    self.party_bias = party_bias

    self.labels = ["levo", "desno", "nevtralno"]

    self.tokenizer = tokenizer
    self.model = model

    # Create the pipeline
    self.make_model()

  def make_model(self):
    """Create the sentiment-analysis pipeline from the model and tokenizer."""
    self.tweetiment = pipeline("sentiment-analysis", model=self.model, tokenizer=self.tokenizer)

  def predict_text(self, txt):
    """Run the sentiment pipeline on `txt`; returns None if it was never built."""
    if hasattr(self, 'tweetiment'):
      return self.tweetiment(txt)

  def classify(self, bias_party, bias_topic):
    """Combine party and topic bias into one label.

    The topic bias wins when both are present; with neither, the neutral
    label is returned.
    """
    if bias_party is None and bias_topic is None:
      return self.labels[2]
    if bias_topic is None:
      return bias_party
    return bias_topic

  def calculate_biases(self, tweet, explain=False):
    """Compute sentiment and bias information for one tweet.

    Args:
      tweet: Dict with 'raw_text', 'mentions', 'lemma_text', 'topic' and
        'topic_probability'.
      explain: Also add the result of `make_explanation` under 'explanation'.

    Returns:
      Dict with the label, sentiment, scores and detected party/topic, or
      None when the tweet has no raw text.
    """
    if not tweet['raw_text']:
      return None

    prediction = self.predict_text(tweet['raw_text'])[0]

    bias_party, party = self.bias_sentiment_party(prediction, tweet)
    bias_topic, topic = self.bias_sentiment_topic(prediction, tweet)

    result = {
        'label': self.classify(bias_party, bias_topic),
        'sentiment': prediction['label'].lower(),
        'sentiment_score': prediction['score'],
        'topic_bias': bias_topic,
        'topic_mentioned': topic,
        'topic_score': tweet['topic_probability'],
        'party_bias': bias_party,
        'party_mentioned': party,
    }

    # The explanation used to be computed and then thrown away.
    if explain:
      result['explanation'] = self.make_explanation(prediction['label'], bias_party, party, bias_topic, topic)

    return result

  # Bias based on negativity/positivity towards a party mentioned in a tweet
  def bias_sentiment_party(self, prediction, tweet, single=True):
    """Derive a bias from the sentiment towards the single party in a tweet.

    A party counts as present when one of the tweet's mentions is among its
    members ('clani') or when its lower-cased abbreviation is one of the
    space-separated tokens of 'lemma_text'.

    Args:
      prediction: Dict with the sentiment 'label'.
      tweet: Dict with 'mentions' and 'lemma_text'.
      single (bool): Detect only a single party in tweet. With False the
        method always returns (None, None).

    Returns:
      (bias, party abbreviation), or (None, None) unless exactly one party
      is present.
    """
    lemma_tokens = tweet['lemma_text'].split(" ")
    mentions = tweet['mentions']

    # The text check must not depend on the tweet having mentions, so it is
    # evaluated per party rather than inside a loop over the mentions.
    matched_parties = []
    for party in self.party_bias:
      members = party['clani']
      mentioned_member = any(mention in members for mention in mentions)
      named_in_text = party['kratica_stranke'].lower() in lemma_tokens
      if mentioned_member or named_in_text:
        matched_parties.append(party)

    if not single or len(matched_parties) != 1:
      return None, None

    party_detected = matched_parties[0]
    sentiment = prediction['label']

    bias = None
    if sentiment == "Neutral":
      bias = self.labels[2]
    elif sentiment == "Positive":
      # Supports the party
      bias = self.labels[party_detected['usmerjenost']]
    elif sentiment == "Negative":
      # Opposes the party: flip between index 0 and 1
      bias = self.labels[int(not party_detected['usmerjenost'])]

    return bias, party_detected['kratica_stranke']

  # Bias based on negativity/positivity towards a certain topic of the tweet
  def bias_sentiment_topic(self, prediction, tweet):
    """Derive a bias from the sentiment towards the tweet's topic.

    Returns:
      (bias, topic) when the tweet's topic is configured in `topic_bias` and
      the sentiment is not neutral, otherwise (None, None).
    """
    bias = None
    topic_detected = None

    for topic in self.topic_bias:
      if tweet['topic'] == topic and prediction['label'] != 'Neutral':
        bias = self.labels[self.topic_bias[topic][prediction['label'].lower()]]
        topic_detected = topic
        break

    return bias, topic_detected

  # User a known member of a party?
  def is_user_in_party(self):
    """Not implemented yet; returns None."""
    # TODO
    return

  # Bias based on the user profile
  def bias_user(self, prediction, tweet):
    """Not implemented yet; returns None."""
    # TODO
    return

  def make_explanation(self, sentiment, bias_party, party, bias_topic, topic):
    """Not implemented yet; returns None."""
    # TODO
    return

"""

Politic bias model

"""

class PoliticBiasModel:
  """Combines the second-layer topic model and the sentiment model into one bias pipeline.

  Attributes:
    name: Name of the model.
    working_dir: Directory that holds the labelled topic files.
    preprocess_pipeline: Pipeline forwarded to `preprocess_tweets`.
    config: Dict read for the keys 'bertopic_SL_config', 'tweetiment_config',
      'topic_info', 'preprocess_config', 'verbose' and 'debug'.
    bertopic_SL: The `BertopicModel` used for topic prediction.
    tweetiment: The `TweetimentModel` used for bias prediction.
  """

  def __init__(self,
                name,
                working_dir,
                preprocess_pipeline,
                topic_model,
                sentiment_model,
                config
               ):

    self.name = name
    self.working_dir = working_dir
    self.preprocess_pipeline = preprocess_pipeline
    self.config = config

    # Make models
    self.make_models(topic_model, sentiment_model)

  def make_models(self, topic_model, sentiment_model):
    """Create the topic and sentiment models.

    Args:
      topic_model: Embedding model for the topic model.
      sentiment_model: Dict with the keys 'model' and 'tokenizer'.
    """
    # Create Bertopic SL
    self.bertopic_SL = BertopicModel("Bertopic_SL", embed_model=topic_model, config=self.config['bertopic_SL_config'])

    # Create Tweetiment
    tweetiment_config = self.config['tweetiment_config']
    self.tweetiment = TweetimentModel(
        "Tweetiment",
        model=sentiment_model['model'],
        tokenizer=sentiment_model['tokenizer'],
        topic_bias=tweetiment_config['topic_bias'],
        party_bias=tweetiment_config['party_bias'],
    )

  def train_models(self):
    """Fit the topic model on the labelled tweets found in `working_dir`."""
    topic_names = list(self.config['topic_info'])
    sl_config = self.config['bertopic_SL_config']

    # Loading labelled tweets to train
    X_train, y_train = load_labelled_tweets(self.working_dir, topic_names, shuffle_arrays=sl_config['shuffle_arrays'], random_state=sl_config['random_state'])

    # Load training data
    self.bertopic_SL.load_topic_data(X_train, y_train)

    # Train the model
    self.bertopic_SL.train_model(only_fit=True)

  def optimize_models(self, topn=3, n_sim_subtopics=3, merge_threshold=0.9):
    """Merge model topics that are near-duplicates for the same configured topic.

    For every configured topic, the sub-topics whose similarity exceeds
    `merge_threshold` are grouped; groups with more than one member are
    merged in the topic model.

    Args:
      topn: Number of similar topics requested per search term.
      n_sim_subtopics: Number of sub-topics considered per configured topic.
      merge_threshold: Similarity above which sub-topics are merged.
    """
    sim_topics = similar_topics(self.bertopic_SL, self.config['topic_info'], topn=topn, n_sim_subtopics=n_sim_subtopics)

    to_elim = []
    for candidates in sim_topics.values():
      close = [t for t, p in candidates if p > merge_threshold]
      if len(close) > 1:
        to_elim.append(close)

    print(to_elim)
    if to_elim:
      self.bertopic_SL.merge_topics(to_elim)

  def bias_pipeline(self, tweets, do_preprocess=True):
    """
    ## Automated Slovenian Political bias pipeline

    Args:
      tweets: Raw tweets, or already labelled tweets when `do_preprocess`
        is False.
      do_preprocess: Preprocess the tweets, predict their topics and label
        them before the bias prediction.

    Returns:
      One entry per input tweet, in input order: the bias prediction of the
      first labelled tweet with the same 'id', or None when there is none.
    """

    if do_preprocess:
      # NOTE(review): the preprocess config is passed positionally and so
      # arrives as `tweet_stop_words`; if it is a dict of keyword settings it
      # should probably be expanded with ** instead - confirm its schema.
      p_tweets = preprocess_tweets(self.preprocess_pipeline, tweets, self.config['preprocess_config'], verbose=self.config['verbose'], debug=self.config['debug'])

      # Load the tweet data
      self.bertopic_SL.load_tweet_data(p_tweets)

      # Predict the instances
      self.bertopic_SL.predict()

      # Topic labelled tweets
      labelled_tweets = label_politic_tweets(self.bertopic_SL, self.config['topic_info'], self.working_dir, save_tweets=True, verbose=self.config['verbose'])
    else:
      labelled_tweets = tweets

    # Make bias predictions for labelled tweets
    bias_predictions = [self.tweetiment.calculate_biases(t) for t in labelled_tweets]

    # Index predictions by tweet id for constant-time lookup; setdefault keeps
    # the first occurrence of an id, as a linear search would.
    prediction_by_id = {}
    for labelled, prediction in zip(labelled_tweets, bias_predictions):
      prediction_by_id.setdefault(labelled['id'], prediction)

    return [prediction_by_id.get(twt['id']) for twt in tweets]

def label_politic_tweets(model : BertopicModel, topic_info, data_dir, topn=3, n_sim_subtopics=4, save_tweets=False, verbose=True):
  """Label the model's tweets with the configured topic they most likely belong to.

  Every tweet in `model.data['tweets']` gets the keys 'topic' and
  'topic_probability' (the tweet dicts are modified in place). The topic is
  the configured topic whose similar sub-topics contain the tweet's predicted
  topic id with the highest similarity. For topics marked 'strict' the label
  is kept only if one of the topic's keywords or search terms occurs in the
  tweet's 'lemma_text'. Only the first tweet with a given 'id' is labelled.

  Args:
    model: Topic model with loaded tweets and prediction results.
    topic_info: Mapping topic name -> dict with 'search_term', 'keywords'
      and 'strict'.
    data_dir: Directory with the `labelled_topics` folder used when saving.
    topn: Number of similar topics requested per search term.
    n_sim_subtopics: Number of sub-topics kept per configured topic.
    save_tweets: Append the tweets of every topic to its labelled-topics file.
    verbose: Print a summary header and, when saving, the count per topic.

  Returns:
    `model.data['tweets']`.
  """
  if verbose:
    print(f'-- Collected batch topic distribution summary:')

  sim_topics = similar_topics(model, topic_info, topn=topn, n_sim_subtopics=n_sim_subtopics)

  tweets = model.data['tweets']
  labels = model.result['topic_ids']
  probs = model.result['topic_probs']

  seen_ids = set()

  for tweet, label, prob in zip(tweets, labels, probs):

    if tweet['id'] in seen_ids:
      # Repeated id: leave it unlabelled so it is not saved twice, but make
      # sure the keys read further down exist.
      tweet.setdefault('topic', None)
      tweet.setdefault('topic_probability', 0)
      continue
    seen_ids.add(tweet['id'])

    most_likely_topic = None
    best_similarity = 0

    # The similarity gets its own name so it cannot overwrite `prob`, the
    # tweet's topic probability stored below.
    for st, subtopics in sim_topics.items():
      for sbt, similarity in subtopics:
        if sbt == label and similarity > best_similarity:
          most_likely_topic = st
          best_similarity = similarity

    if most_likely_topic is not None and topic_info[most_likely_topic]['strict']:
      info = topic_info[most_likely_topic]
      terms = [*info['keywords'], *info['search_term']]

      # Strict topics need one of their terms to occur in the text
      if any(term in tweet['lemma_text'] for term in terms):
        tweet['topic'] = most_likely_topic
        tweet['topic_probability'] = prob
      else:
        tweet['topic'] = None
        tweet['topic_probability'] = 0
    else:
      tweet['topic'] = most_likely_topic
      tweet['topic_probability'] = prob

  if save_tweets:
    for tp in topic_info:
      matching = [x for x in tweets if x['topic'] == tp]
      if verbose:
        print(f'-- {tp} : {len(matching)}')
      overwrite_labelled_topics(tp, matching, data_dir)

  return model.data['tweets']

def similar_topics(model : 'BertopicModel', topic_info, topn=3, n_sim_subtopics=3) -> dict:
  """Find, for every configured topic, the model topics most similar to its search terms.

  The similarities returned by `find_topics` for each search term are summed
  per model topic and divided by the largest sum. The `n_sim_subtopics`
  best-scoring model topics are then taken and the outlier topic -1 is
  dropped from them (it still uses up one of the slots).

  Args:
    model: Object whose `bertopic.find_topics(keyword, top_n=...)` returns a
      pair (topic ids, similarities).
    topic_info: Mapping topic name -> dict with the key 'search_term'.
    topn: Number of similar topics requested per search term.
    n_sim_subtopics: Number of best-scoring model topics considered.

  Returns:
    Dict topic name -> list of (model topic id, normalised score) pairs in
    descending score order. The list is empty when nothing was found.
  """
  sim_topics = {}

  for topic, info in topic_info.items():
    scores = {}
    for keyword in info['search_term']:
      topic_ids, similarities = model.bertopic.find_topics(keyword, top_n=topn)
      for topic_id, similarity in dict(zip(topic_ids, similarities)).items():
        scores[topic_id] = scores.get(topic_id, 0) + similarity

    # Normalise; skipped when there are no scores or the maximum is zero,
    # which would otherwise raise.
    if scores:
      top_score = max(scores.values())
      if top_score:
        scores = {topic_id: float(score / top_score) for topic_id, score in scores.items()}

    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    best = ranked[:max(n_sim_subtopics, 0)]

    sim_topics[topic] = [(topic_id, score) for topic_id, score in best if topic_id != -1]

  return sim_topics


def overwrite_labelled_topics(file_topic, topic_tweets, data_dir):
    """Append `topic_tweets` to a topic's stored tweets and rewrite its file.

    The file `<data_dir>/labelled_topics/topic_<file_topic>.json` must already
    exist and contain a JSON list.
    """
    path = f'{data_dir}/labelled_topics/topic_{file_topic}.json'

    with open(path, 'r', encoding='utf8') as existing:
        merged = json.load(existing)
        merged.extend(topic_tweets)

    with open(path, 'w', encoding='utf8') as target:
        json.dump(merged, target, ensure_ascii=False)

def overwrite_labelled_tweets(topic, tweets, data_dir):
  """Append `tweets` to a topic's processed tweets and rewrite its file.

  The file `<data_dir>/process/tweets_<topic>.json` must already exist and
  contain a JSON list.
  """
  path = f'{data_dir}/process/tweets_{topic}.json'

  with open(path, 'r', encoding='utf8') as existing:
    merged = json.load(existing)
    merged.extend(tweets)

  with open(path, 'w', encoding='utf8') as target:
    json.dump(merged, target, ensure_ascii=False)

"""
  Tweetiment model
"""

class TweetimentModel:
  """Derives a political-bias label from tweet sentiment, party and topic.

  NOTE: an earlier definition of this class exists further up in the file;
  once both have been executed, this later definition is the one in effect.

  Attributes:
    model_name: Name of the model.
    topic_bias: Mapping topic -> {lower-cased sentiment label -> index into
      `labels`}.
    party_bias: Iterable of party dicts with the keys 'clani',
      'kratica_stranke' and 'usmerjenost' (an index into `labels`).
    labels: Output labels; index 2 is the neutral one.
    tokenizer, model: Passed to the transformers sentiment pipeline.
  """

  def __init__(self, name, model, tokenizer, topic_bias, party_bias):
    self.model_name = name

    self.topic_bias = topic_bias
    self.party_bias = party_bias

    self.labels = ["levo", "desno", "nevtralno"]

    self.tokenizer = tokenizer
    self.model = model

    # Create the pipeline
    self.make_model()

  def make_model(self):
    """Create the sentiment-analysis pipeline from the model and tokenizer."""
    self.tweetiment = pipeline("sentiment-analysis", model=self.model, tokenizer=self.tokenizer)

  def predict_text(self, txt):
    """Run the sentiment pipeline on `txt`; returns None if it was never built."""
    if hasattr(self, 'tweetiment'):
      return self.tweetiment(txt)

  def classify(self, bias_party, bias_topic):
    """Combine party and topic bias into one label.

    The topic bias wins when both are present; with neither, the neutral
    label is returned.
    """
    if bias_party is None and bias_topic is None:
      return self.labels[2]
    if bias_topic is None:
      return bias_party
    return bias_topic

  def calculate_biases(self, tweet, explain=False):
    """Compute sentiment and bias information for one tweet.

    Args:
      tweet: Dict with 'raw_text', 'mentions', 'lemma_text', 'topic' and
        'topic_probability'.
      explain: Also add the result of `make_explanation` under 'explanation'.

    Returns:
      Dict with the label, sentiment, scores and detected party/topic, or
      None when the tweet has no raw text.
    """
    if not tweet['raw_text']:
      return None

    prediction = self.predict_text(tweet['raw_text'])[0]

    bias_party, party = self.bias_sentiment_party(prediction, tweet)
    bias_topic, topic = self.bias_sentiment_topic(prediction, tweet)

    result = {
        'label': self.classify(bias_party, bias_topic),
        'sentiment': prediction['label'].lower(),
        'sentiment_score': prediction['score'],
        'topic_bias': bias_topic,
        'topic_mentioned': topic,
        'topic_score': tweet['topic_probability'],
        'party_bias': bias_party,
        'party_mentioned': party,
    }

    # The explanation used to be computed and then thrown away.
    if explain:
      result['explanation'] = self.make_explanation(prediction['label'], bias_party, party, bias_topic, topic)

    return result

  # Bias based on negativity/positivity towards a party mentioned in a tweet
  def bias_sentiment_party(self, prediction, tweet, single=True):
    """Derive a bias from the sentiment towards the single party in a tweet.

    A party counts as present when one of the tweet's mentions is among its
    members ('clani') or when its lower-cased abbreviation is one of the
    space-separated tokens of 'lemma_text'.

    Args:
      prediction: Dict with the sentiment 'label'.
      tweet: Dict with 'mentions' and 'lemma_text'.
      single (bool): Detect only a single party in tweet. With False the
        method always returns (None, None).

    Returns:
      (bias, party abbreviation), or (None, None) unless exactly one party
      is present.
    """
    lemma_tokens = tweet['lemma_text'].split(" ")
    mentions = tweet['mentions']

    # The text check must not depend on the tweet having mentions, so it is
    # evaluated per party rather than inside a loop over the mentions.
    matched_parties = []
    for party in self.party_bias:
      members = party['clani']
      mentioned_member = any(mention in members for mention in mentions)
      named_in_text = party['kratica_stranke'].lower() in lemma_tokens
      if mentioned_member or named_in_text:
        matched_parties.append(party)

    if not single or len(matched_parties) != 1:
      return None, None

    party_detected = matched_parties[0]
    sentiment = prediction['label']

    bias = None
    if sentiment == "Neutral":
      bias = self.labels[2]
    elif sentiment == "Positive":
      # Supports the party
      bias = self.labels[party_detected['usmerjenost']]
    elif sentiment == "Negative":
      # Opposes the party: flip between index 0 and 1
      bias = self.labels[int(not party_detected['usmerjenost'])]

    return bias, party_detected['kratica_stranke']

  # Bias based on negativity/positivity towards a certain topic of the tweet
  def bias_sentiment_topic(self, prediction, tweet):
    """Derive a bias from the sentiment towards the tweet's topic.

    Returns:
      (bias, topic) when the tweet's topic is configured in `topic_bias` and
      the sentiment is not neutral, otherwise (None, None).
    """
    bias = None
    topic_detected = None

    for topic in self.topic_bias:
      if tweet['topic'] == topic and prediction['label'] != 'Neutral':
        bias = self.labels[self.topic_bias[topic][prediction['label'].lower()]]
        topic_detected = topic
        break

    return bias, topic_detected

  # User a known member of a party?
  def is_user_in_party(self):
    """Not implemented yet; returns None."""
    # TODO
    return

  # Bias based on the user profile
  def bias_user(self, prediction, tweet):
    """Not implemented yet; returns None."""
    # TODO
    return

  def make_explanation(self, sentiment, bias_party, party, bias_topic, topic):
    """Not implemented yet; returns None."""
    # TODO
    return

def bertopic_pipeline(data_dir,
                      preprocess_pipeline,
                      preprocess_config,
                      bertopic_FL_config,
                      bertopic_SL_config,
                      embed_model,
                      topic_info,
                      year=2021,
                      epoch_number=1,
                      batch_index=1,
                      iterations=10,
                      saving=True,
                      verbose=True,
                      debug=False):
  """
    ## Automated Bertopic pipeline (one batch)

    1. Load and preprocess the raw batch of tweets
    2. Train the first layer topic model on the batch (optionally save it)
    3. Extract the general politics tweets (optionally save them)
    4. Train the second layer topic model on the labelled tweets (optionally save it)
    5. Predict topics of the extracted tweets with the second layer
    6. Label the extracted tweets

    ## Args:

      data_dir (str): directory of tweet data and working directory
        raw data is read from: `'{data_dir}/unpreprocess/{year}-{epoch_number}/{year}_{epoch_number}_{batch_index}.json'`
        extracted tweets are saved under: `'{data_dir}/stpt/{year}-{epoch_number}'`
        models are saved under: `'{data_dir}/models'`
      preprocess_pipeline (Pipeline): preprocessor pipeline, forwarded to `load_and_preprocess`
      preprocess_config (dict): configuration for preprocessing; only `'min_words'` is read here
      bertopic_FL_config (dict): configuration for first layer topic modeling; besides the model
        configuration, `['bertopic_conf']['seed_topic_list']`, `'sim_threshold'` and `'tweet_prob'`
        are read to extract the politics tweets
      bertopic_SL_config (dict): configuration for second layer topic modeling
      embed_model: embedding model handed to both `BertopicModel` instances
      topic_info (dict): information of topics to extract and label; its keys are the topic names
      year (int): year of data tweets
      epoch_number (int): index of epoch
      batch_index (int): index of the batch to process
      iterations (int): currently unused
      saving (bool): save both models and the extracted tweets; also forwarded to
        `label_politic_tweets` as `save_tweets`
      verbose (bool): print progress
      debug (bool): forwarded to `load_and_preprocess`

    ## Returns:

      The value returned by `label_politic_tweets` for the extracted tweets
      (presumably the labelled tweets - confirm against that function).
  """

  if verbose:
    print('Configuring...')

  # Directories & paths
  unpreprocessed_dir = 'unpreprocess' # Raw Twitter data
  stpt_dir = 'stpt' # Slovenian Twitter politics Tweets
  models_dir = f'{data_dir}/models'
  batch_path = f'{data_dir}/{unpreprocessed_dir}/{year}-{epoch_number}/{year}_{epoch_number}_{batch_index}.json'
  stpt_path = f'{data_dir}/{stpt_dir}/{year}-{epoch_number}'

  # Seed topic groups flattened into a single list of terms
  topics_to_extract = [item for sublist in bertopic_FL_config['bertopic_conf']['seed_topic_list'] for item in sublist]
  topic_names = list(topic_info)

  if verbose:
    print('--------------------------------------------')
    print(f'Processing batch: {batch_index} in {batch_path}')
    print(f'- Preprocessing batch #{batch_index}...')

  # Load and preprocess the batch
  # NOTE(review): the meaning of the positional `False` is defined by load_and_preprocess - confirm
  preprocessed_tweet_data = load_and_preprocess(preprocess_pipeline, batch_path, False, min_words=preprocess_config['min_words'], verbose=verbose, debug=debug)

  if verbose:
    print('- Batch summary:')
    print(f'-- Batch length: {len(preprocessed_tweet_data)}')
    print(f'- First layer of topic modeling (batch #{batch_index})...')

  # First layer: the model is created exactly once, right before it is used.
  # NOTE: reloading a previously saved model from the models directory is not done here.
  bt_fl_model = BertopicModel('Bertopic_FL', embed_model=embed_model, config=bertopic_FL_config)
  bt_fl_model.load_tweet_data(preprocessed_tweet_data)

  if verbose:
    print('- Training 1st layer of Bertopic model...')
  bt_fl_model.train_model(only_fit=False)

  # Topic visualization is disabled: bt_fl_model.visualize()

  if saving:
    if verbose:
      print('- Saving 1st layer of Bertopic model...')
    bt_fl_model.save_model(models_dir)

  # Extract general politic topics
  extracted_tweets = bt_fl_model.find_politic_topics(topics_to_extract, sim_threshold=bertopic_FL_config['sim_threshold'], tweet_prob=bertopic_FL_config['tweet_prob'])

  if verbose:
    print('- Batch of general politics summary:')
    print(f'-- Batch length: {len(extracted_tweets)}')

  # Save STP tweets
  if saving:
    if verbose:
      print(f'- Saving STPT in file {stpt_path}...')
    save_tweets(extracted_tweets, dir=stpt_path, file_name=f'{year}_{epoch_number}_{batch_index}')

  if verbose:
    print(f'- Second layer of topic modeling (batch #{batch_index})...')

  # Second layer: trained on the already labelled tweets
  X, y = load_labelled_tweets(data_dir, topic_names)
  bt_sl_model = BertopicModel('Bertopic_SL', embed_model=embed_model, config=bertopic_SL_config)
  bt_sl_model.load_topic_data(X, y)

  if verbose:
    print('- Training 2nd layer of Bertopic model...')
  bt_sl_model.train_model(only_fit=True)

  if saving:
    if verbose:
      print('- Saving 2nd layer of Bertopic model...')
    bt_sl_model.save_model(models_dir)

  # The extracted tweets are the test data for the second layer
  bt_sl_model.load_tweet_data(extracted_tweets)

  if verbose:
    print('- Predicting new instances on second layer topic modeling...')
  bt_sl_model.predict()

  # Topic visualization is disabled: bt_sl_model.visualize()

  # Label new instances & overwrite
  if verbose:
    print('- Labelling and saving topic tweets...')
  labelled_tweets = label_politic_tweets(bt_sl_model, topic_info, data_dir, save_tweets=saving, verbose=verbose)

  # Drop the references to the large intermediates and to both models
  del extracted_tweets
  del preprocessed_tweet_data
  del bt_fl_model
  del bt_sl_model

  # TODO: Calculate accuracy, data collection and verbosing
  # TODO: Extracting topic keywords
  # TODO: Iterations?
  # TODO: Time verbosing?
  return labelled_tweets

## Cevovod za učenje

In [None]:
# Configurations for automated bertopic pipeline

VERBOSE = True
SAVING = True
DEBUG = False

# Seed word groups for the first layer of topic modeling (one group per line).
# Fixed: "kiminalec" -> "kriminalec"; removed the duplicated "islamist".
politics_seed_topic = [
    ["politik", "politika", "političen"],
    ["vlada", "vladati", "država"],
    ["komunist", "komunističen", "komunizem"],
    ["socializem", "socialen", "sociala"],
    ["fašisti", "fašističen", "fašizem"],
    ["levičar", "levičarski", "levica"],
    ["desničar", "desničarski", "desnica", "janez", "jj", "sds"],
    ["nosečnost", "nosečnica", "splav", "ženska", "kontracepcija"],
    ["migrant", "migriranje", "beg", "begunec"],
    ["musliman", "islam", "islamist", "ekstremist"],
    ["kriminal", "kriminalec", "zločinec"],
    ["lgbtq", "istospolni", "lgbt", "spol"],
]


def _topic_entry(search_term, keywords=(), strict=True, sim_threshold=0.2, tweet_prob=0.5):
  """Build one `topics_info` entry; the defaults are the values shared by all topics."""
  return {
      'search_term': list(search_term),
      'keywords': list(keywords),
      'strict': strict,
      'sim_threshold': sim_threshold,
      'tweet_prob': tweet_prob
  }


# Topics to extract and label (topic name -> search terms, keywords and thresholds)
topics_info = {
    'begunci': _topic_entry(["begunec", "migrant", "migrantski"]),
    'lgbtq': _topic_entry(["lgbtq", "lgbt", "istospolno"]),
    'religija': _topic_entry(["islam", "musliman", "vera"], ["religija", "dzihadist"]),
    'splav': _topic_entry(["splav"], ["kontracepcija"]),
    'desno': _topic_entry(["desnica", "desno", "jj", "sds"], ["jansa", "desnicar", "janša"]),
    'levo': _topic_entry(["levica", "levicar", "lev"], ["levicarski", "levičar", "mesec"]),
    'politika': _topic_entry(["politika", "politicen", "vlada"], ["politik", "minister", "predsednik"])
}

# Preprocessing configuration
preprocess_config = {
    'min_words': 4,
    'verbose': VERBOSE,
    'debug': DEBUG,
    'tweet_upos': ['PUNCT', 'NUM', 'SYM', 'CCONJ', 'INTJ'],
    'tweet_stop_words': ['http', 'https', 'rt', 'oz']
}

# Bertopic first layer configuration
bertopic_FL_config = {
    'bertopic_conf': {
        "top_n_words": 10,
        "min_topic_size": 20,
        "seed_topic_list": politics_seed_topic
    },
    'umap_conf': {
        "n_neighbors": 15,
        "n_components": 10,
        "metric": 'cosine'
    },
    'hdbscan_conf': {
        "min_cluster_size": 10,
        "metric": 'euclidean',
        "prediction_data": True
    },
    'sim_threshold': 0.5,
    'tweet_prob': 0.5
}

# Bertopic second layer configuration
bertopic_SL_config = {
    'bertopic_conf': {
        "top_n_words": 10,
        "min_topic_size": 20,
        #"nr_topics": 8
    },
    'umap_conf': {
        "n_neighbors": 20,
        "n_components": 10,
        "metric": 'cosine'
    },
    'hdbscan_conf': {
        "min_cluster_size": 15,
        "metric": 'euclidean',
        "prediction_data": True
    },
    'shuffle_arrays': True,
    'random_state': 77
}

# Classla configuration
classla_conf = {
  #'processors': 'tokenize, lemma',
  'lang': 'sl',
  'pos_lemma_pretag' : True,
  'use_gpu': True
}

In [None]:
# Configurations for automated bias pipeline

## Tweetiment configuration (both bias tables are read from the configs directory)
topic_bias_path = f'{root_dir}/configs/topic-bias.json'
party_bias_path = f'{root_dir}/configs/party-bias.json'

tweetiment_config = dict(
    topic_bias=load_tweets(topic_bias_path),
    party_bias=load_tweets(party_bias_path)
)

## Bertopic second layer configuration
tweetiment_bertopic_SL_config = {
    # (disabled) "nr_topics": 8
    'bertopic_conf': dict(top_n_words=10, min_topic_size=20),
    'umap_conf': dict(n_neighbors=20, n_components=10, metric='cosine'),
    'hdbscan_conf': dict(min_cluster_size=15, metric='euclidean', prediction_data=True),
    'shuffle_arrays': True,
    'random_state': 77
}

## Topic information
# Every topic shares strict=False and the same thresholds; only the terms differ.
tweetiment_topics_info = {
    topic_name: {
        'search_term': search_term,
        'keywords': keywords,
        'strict': False,
        'sim_threshold': 0.2,
        'tweet_prob': 0.5
    }
    for topic_name, search_term, keywords in [
        ('begunci', ["begunec", "migrant", "migrantski"], []),
        ('lgbtq', ["lgbtq", "lgbt", "istospolno"], []),
        ('religija', ["islam", "musliman", "vera"], ["religija", "dzihadist"]),
        ('splav', ["splav"], ["kontracepcija"]),
        ('desno', ["desnica", "desno", "jj", "sds"], ["jansa", "desnicar", "janša"]),
        ('levo', ["levica", "levicar", "lev"], ["levicarski", "levičar", "mesec"]),
        ('politika', ["politika", "politicen", "vlada"], []),
    ]
}

# Everything the politic bias model needs, bundled into one dictionary
pbm_config = {
    'bertopic_SL_config': tweetiment_bertopic_SL_config,
    'preprocess_config': preprocess_config,
    'tweetiment_config': tweetiment_config,
    'topic_info': tweetiment_topics_info,
    'verbose': VERBOSE,
    'debug': DEBUG
}

# Parameters

YEAR, EPOCH = 2021, 21
BATCHES = (10, 10)
ITERATIONS = 10

In [None]:
# Classla preprocessor, built from the classla configuration
classla_pipeline = classla.Pipeline(**classla_conf)

# Tweet preprocessor: enable only the URL, mention and hashtag options
tpre.set_options(
    tpre.OPT.URL,
    tpre.OPT.MENTION,
    tpre.OPT.HASHTAG,
)

# Embedding model
topic_model = AutoModelForMaskedLM.from_pretrained("EMBEDDIA/sloberta")

# Sentiment model: tokenizer and classifier loaded from the same checkpoint
sent_model = {
    'tokenizer': AutoTokenizer.from_pretrained("EMBEDDIA/sloberta-tweetsentiment"),
    'model': AutoModelForSequenceClassification.from_pretrained("EMBEDDIA/sloberta-tweetsentiment")
}

In [None]:
# MAIN CODE - Bertopic pipeline

# Run only the Bertopic part (training, testing and predicting) for the first
# batch in BATCHES. Kept as the last expression of the cell so that the
# notebook still shows the returned value.
bertopic_pipeline(data_dir=root_dir,
                  preprocess_pipeline=classla_pipeline,
                  preprocess_config=preprocess_config,
                  bertopic_FL_config=bertopic_FL_config,
                  bertopic_SL_config=bertopic_SL_config,
                  embed_model=topic_model,
                  topic_info=topics_info,
                  year=YEAR,
                  epoch_number=EPOCH,
                  batch_index=BATCHES[0],
                  iterations=ITERATIONS,
                  saving=SAVING,
                  verbose=VERBOSE,
                  debug=DEBUG)

In [None]:
# Bertopic + Bias pipeline

def bt(b):
  """
    Run `bertopic_pipeline` for a single batch with the notebook's global configuration.

    The configuration names are looked up when the function is called, not when
    it is defined.

    Args:
      b (int): index of the batch to process

    Returns:
      whatever `bertopic_pipeline` returns for that batch
  """
  return bertopic_pipeline(root_dir,
                           classla_pipeline,
                           preprocess_config,
                           bertopic_FL_config,
                           bertopic_SL_config,
                           topic_model,
                           topics_info,
                           year=YEAR,
                           epoch_number=EPOCH,
                           batch_index=b,
                           iterations=ITERATIONS,
                           saving=SAVING,
                           verbose=VERBOSE,
                           debug=DEBUG)


def bertbias_pipeline(bt_pipeline, bs_pipeline, data_dir, topic_info, batches, saving, verbose):
  """
    ## Combined Bertopic + bias pipeline

    For every batch index in the inclusive range `batches[0]..batches[1]`:

    1. Get the tweets of the batch from `bt_pipeline`
    2. Get the predictions for those tweets from `bs_pipeline`
    3. Replace each tweet's `'topic'` / `'topic_probability'` fields with `'prediction'`
    4. If `saving`, write the tweets grouped by `prediction['topic_mentioned']`

    ## Args:

      bt_pipeline (callable): called with a batch index, returns the tweets (dicts) of that batch
      bs_pipeline (callable): called as `bs_pipeline(tweets, False)`; its result is indexed by
        tweet position, so it must hold one prediction per tweet
      data_dir (str): forwarded to `overwrite_labelled_tweets`
      topic_info (dict): its keys are the topics the tweets are grouped by when saving
      batches (int, int): first and last batch index (both inclusive)
      saving (bool): save the tweets per topic; tweets without a topic are saved as `'null'`
      verbose (bool): print progress

    ## Returns:

      None; the tweets returned by `bt_pipeline` are modified in place.
  """
  first_batch, last_batch = batches[0], batches[1]

  for batch_index in range(first_batch, last_batch + 1):
    tweets = bt_pipeline(batch_index)
    if verbose:
      print("- Calculating biases...")
    predictions = bs_pipeline(tweets, False)

    for i, tweet in enumerate(tweets):
      # The topic-model fields are dropped (if present) in favour of the prediction
      tweet.pop('topic', None)
      tweet.pop('topic_probability', None)
      tweet['prediction'] = predictions[i]

    if not saving:
      continue

    if verbose:
      print("- Saving labelled tweets...")
    for topic in topic_info:
      topic_tweets = [t for t in tweets if t['prediction']['topic_mentioned'] == topic]
      overwrite_labelled_tweets(topic, topic_tweets, data_dir)

    # Null topics
    null_tweets = [t for t in tweets if t['prediction']['topic_mentioned'] is None]
    overwrite_labelled_tweets('null', null_tweets, data_dir)

In [None]:
# MAIN CODE - Bertopic + Bias pipeline

# Build the politic bias model and train its models
pt_model = PoliticBiasModel("PoliticBias", root_dir, classla_pipeline, topic_model, sent_model, pbm_config)
pt_model.train_models()

# Run the whole pipeline over every batch in BATCHES
bertbias_pipeline(bt_pipeline=bt,
                  bs_pipeline=pt_model.bias_pipeline,
                  data_dir=root_dir,
                  topic_info=topics_info,
                  batches=BATCHES,
                  saving=SAVING,
                  verbose=VERBOSE)