# Modeliranje tem

## Okolje

Vzpostavitev okolja

In [None]:
!pip install bertopic

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
import json
import re
import random
import time
import os.path
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import Counter

import numpy as np
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, pipeline
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from umap import UMAP
from hdbscan import HDBSCAN

from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Runtime constants: decide where the project data lives.

# Set to True to read from the local data directory instead of Google Drive.
LOCAL = False

local_data_dir = "/data"
google_data_dir = "/content/drive/MyDrive/Diploma/Data"

# Base directory from which the data, config and model paths are built.
root_dir = local_data_dir if LOCAL else google_data_dir

## Funkcije in razredi

In [None]:
def load_tweets(file_name):
  """Read a UTF-8 encoded JSON file and return its parsed content.

  Args:
    file_name: Path of the JSON file to read.

  Returns:
    Whatever `json.load` produces for the file content.
  """
  with open(file_name, mode='r', encoding='utf8') as json_file:
    return json.load(json_file)

def save_tweets(data, dir, file_name):
  """Serialize `data` as JSON into `<dir>/<file_name>.json`.

  Args:
    data: JSON-serializable object to store.
    dir: Target directory; it is not created by this function.
    file_name: File name without the '.json' extension.
  """
  target_path = f'{dir}/{file_name}.json'
  with open(target_path, 'w+', encoding='utf8') as outdata:
    # ensure_ascii=False writes non-ASCII characters as they are instead of
    # \uXXXX escapes.
    json.dump(data, outdata, ensure_ascii=False)

def load_and_preprocess(cpipeline, data_dir, only_load, tweet_stop_words=[], tweet_upos=[], min_words=4, verbose=False, debug=False):
  """Load already preprocessed tweets from a JSON file.

  Only the loading path is active: the preprocessing call is disabled, so an
  empty list is returned whenever `only_load` is falsy.

  Args:
    cpipeline: Not used by the active code path.
    data_dir: Path of the JSON file handed to `load_tweets`.
    only_load: If truthy, return the content of the file.
    tweet_stop_words: Not used by the active code path.
    tweet_upos: Not used by the active code path.
    min_words: Not used by the active code path.
    verbose: Not used by the active code path.
    debug: Not used by the active code path.

  Returns:
    The loaded data, or an empty list when `only_load` is falsy.
  """
  if not only_load:
    # The preprocess_tweets(...) call that used the remaining arguments is
    # switched off; nothing is produced on this path.
    return []
  return load_tweets(data_dir)

def load_labelled_tweets(dir, topic_names, shuffle_arrays=True, random_state=77):
  """Load the labelled tweets of the given topics as parallel training lists.

  For every name in `topic_names` the file
  `<dir>/labelled_topics/topic_<name>.json` is read; each entry has to provide
  the keys 'lemma_text' and 'topic'.

  Args:
    dir: Data root directory that contains the `labelled_topics` folder.
    topic_names: Topic names; the position of a name is its numeric label.
    shuffle_arrays: If True, documents and labels are shuffled together.
    random_state: Seed passed to `sklearn.utils.shuffle`.

  Returns:
    Tuple `(topic_lemmas, topic_labels)` of equal length.

  Raises:
    ValueError: If the 'topic' of an entry is not contained in `topic_names`.
  """
  entries = []
  for name in topic_names:
    with open(f'{dir}/labelled_topics/topic_{name}.json', 'r', encoding='utf8') as topic_data:
      entries.extend(json.load(topic_data))

  topic_lemmas = [entry['lemma_text'] for entry in entries]
  topic_labels = [topic_names.index(entry['topic']) for entry in entries]

  if shuffle_arrays and topic_lemmas:
    # sklearn's shuffle returns shuffled copies and leaves its inputs
    # untouched, so the result has to be assigned back.
    topic_lemmas, topic_labels = shuffle(topic_lemmas, topic_labels, random_state=random_state)
  return topic_lemmas, topic_labels

"""
Bertopic model for modeling topics

"""
class BertopicModel:
  """Wrapper around a BERTopic model together with the data it works on.

  The wrapper keeps three pieces of state:
    * `bertopic` - the BERTopic instance (built from `config` in make_model),
    * `data`     - dict that may hold 'tweets', 'docs' and 'labels',
    * `result`   - dict with 'topic_ids' and 'topic_probs' of the last
                   training/prediction run.

  Most methods print an "Error: ..." message and return an empty value instead
  of raising when the state they need is missing.
  """

  def __init__(self, model_name, embed_model, config):
    """Store the settings and build the underlying models.

    Args:
      model_name: Name appended to the directory in save_model/load_model.
      embed_model: Embedding model passed to BERTopic.
      config: Dict with the keys 'umap_conf', 'hdbscan_conf' and
        'bertopic_conf'; each value holds the keyword arguments of the
        respective constructor.
    """
    self.model_name = model_name
    self.embed_model = embed_model
    self.config = config

    # Create the Bertopic model with config
    self.make_model()

  def make_model(self):
    """(Re)build the UMAP, HDBSCAN and BERTopic objects from `self.config`."""
    self.umap_model = UMAP(**self.config["umap_conf"])
    self.hdbscan_model = HDBSCAN(**self.config["hdbscan_conf"])
    self.bertopic = BERTopic(embedding_model=self.embed_model, umap_model=self.umap_model, hdbscan_model=self.hdbscan_model, **self.config["bertopic_conf"])

  def load_tweet_data(self, tweet_data):
    """Use tweet dicts as input; their 'lemma_text' values become the docs.

    Replaces everything loaded before, including labels.
    """
    self.data = {
      "tweets": tweet_data,
      "docs": [t['lemma_text'] for t in tweet_data],
    }

  def load_topic_data(self, topic_docs, topic_labels):
    """Set labelled training documents; previously loaded 'tweets' are kept."""
    if not hasattr(self, 'data'):
      self.data = {}

    self.data["docs"] = topic_docs
    self.data["labels"] = topic_labels

  def train_model(self, only_fit):
    """Train the BERTopic model on the loaded documents.

    Args:
      only_fit: If falsy, run fit_transform on the docs and store the topic
        assignment in `self.result`. If truthy, fit on docs and labels
        without storing a result.
    """
    # An instance without loaded data is reported like any other missing input.
    data = getattr(self, 'data', {})

    if "docs" in data and not only_fit:
      topics, probs = self.bertopic.fit_transform(data["docs"])
      self.result = {"topic_ids": topics, "topic_probs": probs}
    elif "docs" in data and "labels" in data and only_fit:
      self.bertopic = self.bertopic.fit(data["docs"], y=data["labels"])
    else:
      print("Error: Missing data!")

  def predict(self):
    """Assign topics to the loaded documents and store them in `self.result`."""
    data = getattr(self, 'data', {})

    if "docs" in data:
      topics, probs = self.bertopic.transform(data["docs"])
      self.result = {"topic_ids": topics, "topic_probs": probs}
    else:
      print("Error: Missing data!")

  def reduce(self, nr):
    """Reduce the number of topics to `nr` and store the new assignment.

    Does nothing unless both data and a result are present.
    NOTE(review): the (docs, topics, nr_topics) call and the tuple return value
    follow the BERTopic API this notebook was written against - confirm
    against the installed version.
    """
    if hasattr(self, 'data') and hasattr(self, 'result'):
      topics, probs = self.bertopic.reduce_topics(self.data["docs"], self.data["labels"], nr_topics=nr)
      self.result["topic_ids"] = topics
      self.result["topic_probs"] = probs

  def merge_topics(self, indexes):
    """Merge the topics listed in `indexes`; does nothing without data.

    NOTE(review): argument order follows the BERTopic API this notebook was
    written against - confirm against the installed version.
    """
    if hasattr(self, 'data'):
      self.bertopic.merge_topics(self.data["docs"], self.data["labels"], indexes)

  def tweets_from_topic(self, ntopic):
    """Return the documents that were assigned to topic `ntopic`.

    Returns an empty list (after printing an error) when no result exists.
    """
    # `result` is a dict and may not exist yet, so it is looked up defensively
    # and read by key.
    result = getattr(self, 'result', None)
    if not result:
      print("Error: Missing data!")
      return []

    docs = self.data["docs"]
    return [docs[i] for i, topic_id in enumerate(result["topic_ids"]) if topic_id == ntopic]

  @staticmethod
  def _matches_keywords(topic_words, topic_keywords, similar_kw):
    """Tell whether a topic shares at least `similar_kw` words with the keyword groups.

    Hits are counted across all keyword groups; a word that appears in
    several groups is counted once per group.
    """
    hits = 0
    for group in topic_keywords:
      for word, _ in topic_words:
        if word in group:
          hits += 1
          if hits >= similar_kw:
            return True
    return False

  def extract_topics_by_keywords(self, topic_keywords, similar_kw=2):
    """Return the tweets of all topics whose top words match the keywords.

    Args:
      topic_keywords: Iterable of keyword groups (each a container of words).
      similar_kw: Number of keyword hits a topic needs to be selected.

    Returns:
      List of entries from `self.data["tweets"]`; empty (after printing an
      error) when no result exists.
    """
    if not hasattr(self, 'result'):
      print("Error: Missing data!")
      return []

    # A set makes the per-tweet membership test below O(1).
    matching_ids = {
      topic_id
      for topic_id, words in self.bertopic.get_topics().items()
      if self._matches_keywords(words, topic_keywords, similar_kw)
    }

    tweets = self.data["tweets"]
    return [tweets[i] for i, topic_id in enumerate(self.result["topic_ids"]) if topic_id in matching_ids]

  def collect_topic_indices(self, ntopic, tweet_prob=0.5):
    """Return the positions of tweets assigned to `ntopic` with probability above `tweet_prob`."""
    if not hasattr(self, 'result'):
      print("Error: Missing data!")
      return []

    probs = self.result["topic_probs"]
    return [
      i
      for i, topic_id in enumerate(self.result["topic_ids"])
      if ntopic == topic_id and probs[i] > tweet_prob
    ]

  def find_politic_topics(self, keywords, topn=3, sim_threshold=0.5, tweet_prob=0.5, include_prob=False):
    """Collect the tweets of all topics that are similar to one of the keywords.

    Args:
      keywords: Search terms handed to BERTopic's find_topics.
      topn: Number of similar topics requested per keyword.
      sim_threshold: A topic is used only if its similarity exceeds this value.
      tweet_prob: A tweet is used only if its topic probability exceeds this value.
      include_prob: If True, store the probability in the tweet dict under
        'topic_probability' (the tweet dict is modified in place).

    Returns:
      List of tweet dicts ordered by their position in the loaded data.
    """
    if not hasattr(self, 'bertopic'):
      print("Error: Missing data!")
      return []

    indices = set()

    # Find relating topics
    for keyword in keywords:
      sim_ids, sim_probs = self.bertopic.find_topics(keyword, top_n=topn)

      # Filter based on similarity
      for topic, similarity in zip(sim_ids, sim_probs):
        if similarity > sim_threshold:
          indices.update(self.collect_topic_indices(topic, tweet_prob=tweet_prob))

    tweet_docs = []
    # Sorted so that the output order does not depend on set iteration order.
    for i in sorted(indices):
      tdoc = self.data["tweets"][i]
      if include_prob:
        tdoc["topic_probability"] = self.result["topic_probs"][i]
      tweet_docs.append(tdoc)

    return tweet_docs

  def visualize(self, t='distance_map'):
    """Return a BERTopic figure.

    Args:
      t: 'barchart', 'hierarchy', 'heatmap' or 'term_rank'; any other value
        selects the default topic distance map.
    """
    if not hasattr(self, 'bertopic'):
      print("Error: Model not yet initiated!")
      return None

    if t == 'barchart':
      return self.bertopic.visualize_barchart()
    elif t == 'hierarchy':
      return self.bertopic.visualize_hierarchy()
    elif t == 'heatmap':
      return self.bertopic.visualize_heatmap()
    elif t == 'term_rank':
      return self.bertopic.visualize_term_rank()
    return self.bertopic.visualize_topics()

  def save_model(self, model_dir):
    """Save the BERTopic model to `model_dir + model_name`.

    NOTE(review): the two parts are concatenated without a path separator, so
    `model_dir` needs a trailing '/' to address a file inside a directory.
    """
    self.bertopic.save(str(model_dir + self.model_name))

  def load_model(self, model_dir):
    """Load the BERTopic model stored at `model_dir + model_name`."""
    # BERTopic.load is a classmethod that returns the loaded model; the
    # returned model has to replace the current one to take effect.
    self.bertopic = BERTopic.load(str(model_dir + self.model_name), embedding_model=self.embed_model)

def label_politic_tweets(model: "BertopicModel", topic_info, data_dir, topn=3, n_sim_subtopics=3, save_tweets=False, verbose=True):
  """Label every loaded tweet of `model` with the best matching named topic.

  For each named topic the most similar model topics are looked up with
  `similar_topics`. A tweet gets the named topic whose similar model topic
  equals the tweet's predicted topic id (highest similarity wins). For topics
  flagged as 'strict' the tweet additionally has to contain one of the topic's
  keywords or search terms in its 'lemma_text', otherwise it stays unlabelled.

  The tweet dicts are modified in place: 'topic' and 'topic_probability' are
  set on every tweet.

  Args:
    model: Wrapper whose `data['tweets']` and `result` are populated.
    topic_info: Dict `topic name -> settings`; the settings are read with the
      keys 'strict', 'keywords' and 'search_term'.
    data_dir: Data root directory passed to `overwrite_labelled_topics`.
    topn: Passed on to `similar_topics`.
    n_sim_subtopics: Passed on to `similar_topics`.
    save_tweets: If True, append the labelled tweets to the per-topic files.
      (Inside this function the name hides the module-level `save_tweets`.)
    verbose: If True, print a summary.

  Returns:
    The list of tweet dicts of the model (same objects, now labelled).
  """
  if verbose:
    print(f'-- Collected batch topic distribution summary:')

  sim_topics = similar_topics(model, topic_info, topn=topn, n_sim_subtopics=n_sim_subtopics)

  tweets = model.data['tweets']
  labels = model.result['topic_ids']
  probs = model.result['topic_probs']

  for tweet, label, prob in zip(tweets, labels, probs):
    most_likely_topic = None
    most_likely_sim = 0

    # The similarity gets its own name so that it cannot overwrite `prob`,
    # the model's probability for this tweet that is stored below.
    for st, subtopics in sim_topics.items():
      for sbt, sim in subtopics:
        if sbt == label and sim > most_likely_sim:
          most_likely_topic = st
          most_likely_sim = sim

    if most_likely_topic is not None and topic_info[most_likely_topic]['strict']:
      kw = []
      kw.extend(topic_info[most_likely_topic]['keywords'])
      kw.extend(topic_info[most_likely_topic]['search_term'])

      # Strict topics require a textual hit (substring match) in the lemmas.
      if any(key in tweet['lemma_text'] for key in kw):
        tweet['topic'] = most_likely_topic
        tweet['topic_probability'] = prob
      else:
        tweet['topic'] = None
        tweet['topic_probability'] = 0
    else:
      tweet['topic'] = most_likely_topic
      tweet['topic_probability'] = prob

  if save_tweets:
    for tp in topic_info:
      t = [x for x in tweets if x['topic'] == tp]
      if verbose:
        print(f'-- {tp} : {len(t)}')
      overwrite_labelled_topics(tp, t, data_dir)

  return tweets

def similar_topics(model: "BertopicModel", topic_info, topn=3, n_sim_subtopics=3) -> dict:
  """Find, for every named topic, the model topics most similar to its search terms.

  The similarities returned by BERTopic's find_topics are summed per model
  topic over all search terms of a named topic and then divided by the largest
  sum. The `n_sim_subtopics` best model topics are taken; the outlier topic -1
  is dropped after that ranking, so fewer entries may be returned.

  Args:
    model: Wrapper that provides `model.bertopic.find_topics`.
    topic_info: Dict `topic name -> settings`; 'search_term' is read from the
      settings.
    topn: Number of similar topics requested per search term.
    n_sim_subtopics: Number of best-ranked model topics considered per topic.

  Returns:
    Dict `topic name -> list of (model topic id, normalized similarity)`,
    best match first. The list is empty when nothing was found.
  """
  sim_topics = {}

  for topic, settings in topic_info.items():
    scores = {}
    for keyword in settings['search_term']:
      topic_ids, similarities = model.bertopic.find_topics(keyword, top_n=topn)
      for topic_id, similarity in zip(topic_ids, similarities):
        scores[topic_id] = scores.get(topic_id, 0) + similarity

    # Normalize; `default` covers topics without any hit and the truth test
    # avoids a division by zero.
    peak = max(scores.values(), default=0)
    if peak:
      scores = {topic_id: float(score / peak) for topic_id, score in scores.items()}

    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    best = ranked[:max(n_sim_subtopics, 0)]
    sim_topics[topic] = [(topic_id, score) for topic_id, score in best if topic_id != -1]

  return sim_topics

def overwrite_labelled_topics(file_topic, topic_tweets, data_dir):
    """Append `topic_tweets` to the stored labelled tweets of one topic.

    The JSON content of `<data_dir>/labelled_topics/topic_<file_topic>.json`
    is read, extended with `topic_tweets` and written back to the same file.
    """
    topic_file = f'{data_dir}/labelled_topics/topic_{file_topic}.json'

    with open(topic_file, 'r', encoding='utf8') as existing:
      stored = json.load(existing)
    stored.extend(topic_tweets)

    with open(topic_file, 'w', encoding='utf8') as updated:
      json.dump(stored, updated, ensure_ascii=False)

In [None]:
# Setting configuration

# Selection of the preprocessed tweet data. The first layer reads the files
# '{root_dir}/preprocess/{YEAR}-{EPOCH}/{YEAR}_{EPOCH}_{b}.json'; BATCHES is
# unpacked into range(), so (1, 11) selects the batches 1 to 10.

YEAR = 2021
EPOCH = 41
BATCHES = (1,11)

# Names of the labelled-topic files used as training data of the second
# layer; the position of a name in the list becomes its numeric label.
# NOTE(review): this list says 'lgbtq' while topic_info and topic_names use
# 'lgbt' - confirm which spelling the files on disk use.
topics = ['splav', 'lgbtq', 'begunci', 'religija', 'krscanstvo', 'militarizem', 'varnost', 'denacionalizacija', 'levo', 'desno']

# SAVING switches the saving of models and tweets on or off;
# tweet_save_path is the folder (relative to root_dir) for the tweets
# extracted by the first layer.
SAVING = True
tweet_save_path = f'stpt/{YEAR}-{EPOCH}'

# Print progress messages and summaries
VERBOSE = True

# Imported configurations: the content of topics.json is used as the
# 'seed_topic_list' of the first layer and as its keyword groups.
topic_config_path = f'{root_dir}/configs/topics.json'
politics_seed_topic = load_tweets(topic_config_path)

# Per-topic keyword lists and regular expressions.
# The final postprocessing step keeps a tweet when one of the 'keywords' is
# equal to one of its space-separated lemmas or when one of the 'regexes'
# matches its lemma text.
# NOTE(review): label_politic_tweets and similar_topics additionally read the
# keys 'strict' and 'search_term', which are not defined in this dict.
topic_info = {
    'begunci': {
        'keywords': ["migrant", "migriranje", "beg", "begunec", "meja", "begunci", "migracija"],
        'regexes': [r'\bmigr\w+', r'\bbeg\w+']
    },
    'lgbt': {
        'keywords': ["lgbtq", "lgbt", "lgbtqia", "istospolen", "spol", "gej", "lezbijka", "lezbijski", "trans", "seksualnost"],
        'regexes': [r'\btrans\w+', r'\bseks\w+', r'\blgbt\w+', r'\bistospol\w+', r'\bgej\w+', r'\blezb\w+', r'\bspol\w+']
    },
    # NOTE(review): r'\bislam\w+' is listed twice below; the duplicate has no effect.
    'religija': {
        'keywords': ["musliman", "islam", "radikalen", "islamski", "muslimanski", "jud", "izrael", "izraelski", "vera"],
        'regexes': [r'\bver\w+', r'\bislam\w+', r'\bžid\w+', r'\bzid\w+', r'\bislam\w+', r'\bjud\w+', r'\bmusli\w+']
    },
    'splav': {
        'keywords': ["splav", "zarodek", "kontracepcija", "vazektomija", "sterilizacija", "diafragma", "kondom", "maternica", "fetus"],
        'regexes': [r'\bkontracep\w+', r'\bsplav\w+', r'\bnoseč\w+', r'\bnosec\w+', r'\bsteril\w+', r'\bkastri\w+']
    },
    'levo': {
        'keywords': ["levica", "levicar", "lev", "levičar", "mesec", "levi"],
        'regexes': [r'\blev\w+']
    },
    'krscanstvo': {
        'keywords': ["cerkev", "župnik", "verouk", "vera", "bog", "mučenik", "vernik", "verniki", "otrok", "papež"],
        'regexes': [r'\bkrscan\w+', r'\bkrščan\w+', r'\bcerkv\w+', r'\bkatoli\w+', r'\bdruzi\w+', r'\bdruži\w+', r'\bteolo\w+']
    },
    # NOTE(review): "zasčita" below mixes 's' and 'č' - presumably a typo of
    # "zaščita"; confirm before changing, it is matched against lemmas.
    'militarizem': {
      'keywords': ["nato", "vojska", "vojak", "meja", "obramba", "zasčita", "sila", "varnost", "orožje", "orozje"],
      'regexes': [r'\bmilitar\w+', r'\bvoj\w+', r'\bnaborni\w+', r'\bpovelj\w+', r'\bzavez\w+', r'\bpatri\w+']
    },
    'varnost': {
      'keywords': ["represiven", "policija", "protest", "varda", "nadzor", "varovanje", "varnost", "varen", "red", "mir"],
      'regexes': [r'\bpolici\w+', r'\bprotest\w+', r'\bshod\w+', r'\bteror\w+']
    },
    'denacionalizacija': {
      'keywords': ["denacionalizacija", "privat", "last", "premoženje", "kapital"],
      'regexes': [r'\bprivat\w+', r'\bzaseb\w+', r'\blast\w+', r'\bpremož\w+', r'\bpodrža\w+', r'\bdenacional\w+', r'\bkapital\w+']
    },
    'desno': {
        'keywords': ["desnica", "desno", "jj", "sds", "desničar", "janša", "jansa", "nsi", "janez"],
        'regexes': [r'\bdesni\w+', r'\bjan\w+']
    }
}

# Settings of the tweet preprocessing step; the keys carry the same names as
# the keyword arguments of load_and_preprocess.
preprocess_config = dict(
    min_words=4,
    verbose=True,
    debug=False,
    tweet_upos=['PUNCT', 'NUM', 'SYM', 'CCONJ', 'INTJ'],
    tweet_stop_words=['http', 'https', 'rt', 'oz'],
)

# First layer: settings of the model that separates general-politics tweets.
# 'bertopic_conf', 'umap_conf' and 'hdbscan_conf' are unpacked into the
# BERTopic, UMAP and HDBSCAN constructors by BertopicModel; 'similar_kw' is
# the number of keyword hits handed to extract_topics_by_keywords.
bertopic_FL_config = dict(
    bertopic_conf=dict(
        top_n_words=5,
        min_topic_size=15,
        seed_topic_list=politics_seed_topic,
    ),
    umap_conf=dict(
        n_neighbors=15,
        n_components=15,
        metric='cosine',
    ),
    hdbscan_conf=dict(
        min_cluster_size=15,
        metric='euclidean',
        prediction_data=True,
    ),
    similar_kw=1,
)

# Second layer: settings of the model trained on the labelled topic tweets.
# 'bertopic_conf', 'umap_conf' and 'hdbscan_conf' are unpacked into the
# BERTopic, UMAP and HDBSCAN constructors by BertopicModel.
bertopic_SL_config = dict(
    bertopic_conf=dict(
        top_n_words=10,
        min_topic_size=20,
        n_gram_range=(1, 2),
        nr_topics=10,
        diversity=0.1,
    ),
    umap_conf=dict(
        n_neighbors=20,
        n_components=15,
        metric='cosine',
    ),
    hdbscan_conf=dict(
        min_cluster_size=15,
        metric='euclidean',
        prediction_data=True,
    ),
    topn=3,
    n_sim_subtopics=3,
)

# Third layer: settings of the models used for postprocessing a single topic.
# 'bertopic_conf', 'umap_conf' and 'hdbscan_conf' are unpacked into the
# BERTopic, UMAP and HDBSCAN constructors by BertopicModel.
bertopic_TL_config = dict(
    bertopic_conf=dict(
        top_n_words=5,
        min_topic_size=10,
        n_gram_range=(1, 2),
        diversity=0.1,
    ),
    umap_conf=dict(
        n_neighbors=10,
        n_components=10,
        metric='cosine',
    ),
    hdbscan_conf=dict(
        min_cluster_size=5,
        metric='euclidean',
        prediction_data=True,
    ),
)


In [None]:
# Loading models

# Feature-extraction pipeline for 'EMBEDDIA/sloberta'; it is handed to every
# BertopicModel in this notebook as the embedding model.
# Alternative kept for reference:
#slobert_model = AutoModelForMaskedLM.from_pretrained("EMBEDDIA/sloberta")
slobert_model = pipeline('feature-extraction', model='EMBEDDIA/sloberta')#, device=0)

## Prvi sloj modeliranja tem

In [None]:
# MAIN CODE

# First layer: extract only the tweets that belong to general-politics topics.

# Load the preprocessed tweets of all selected batches
preprocessed_tweet_data = []
for b in range(*BATCHES):
  preprocessed_tweet_data.extend(load_and_preprocess(None, f'{root_dir}/preprocess/{YEAR}-{EPOCH}/{YEAR}_{EPOCH}_{b}.json', True))

# Print summary if verbose
if VERBOSE:
  print(f'- Batch "{YEAR}_{EPOCH}" summary:')
  print(f'-- Batch length: {len(preprocessed_tweet_data)}')

# First layer of topic modeling
print(f'- First layer of topic modeling...')

# Create Bertopic model (1st layer)
bt_fl_model = BertopicModel('Bertopic_FL', embed_model=slobert_model, config=bertopic_FL_config)

# Load twitter data
bt_fl_model.load_tweet_data(preprocessed_tweet_data)

if VERBOSE:
  print(f'- Training 1st layer of Bertopic model...')

# Train the model
bt_fl_model.train_model(only_fit=False)

if SAVING:
  if VERBOSE:
    print(f'- Saving 1st layer of Bertopic model...')
  # Save model
  bt_fl_model.save_model(f'{root_dir}/models')

# Visualize
bt_fl_model.visualize()

# Extract general politic by topic keywords
extracted_tweets = bt_fl_model.extract_topics_by_keywords(bertopic_FL_config['bertopic_conf']['seed_topic_list'], similar_kw=bertopic_FL_config['similar_kw'])

if VERBOSE:
  total_tweets = len(preprocessed_tweet_data)
  # An empty input batch must not end the run with a division by zero.
  extracted_share = int((len(extracted_tweets) / total_tweets) * 100) if total_tweets else 0
  print(f'- Batch of general politics summary:')
  print(f'-- Extracted general politics tweets: {len(extracted_tweets)}/{total_tweets} ({extracted_share} %)')

# Save STP tweets
if SAVING:
  # All batches are stored in one file, so the file name does not depend on
  # the batch loop variable.
  stpt_file_name = f'{YEAR}_{EPOCH}_ALL'
  if VERBOSE:
    print(f'- Saving STPT in file {tweet_save_path}/{stpt_file_name}...')
  save_tweets(extracted_tweets, dir=f'{root_dir}/{tweet_save_path}', file_name=stpt_file_name)


## Drugi sloj modeliranja tem

In [None]:
# MAIN CODE

# Second layer of topic modeling: train a model on the labelled topic tweets.
if VERBOSE:
  print(f'- Second layer of topic modeling...')

# Get training data: X are the lemma texts, y the positions of their topic
# names in `topics`
X, y = load_labelled_tweets(root_dir, topics)

# Modifications of the 2nd layer config. They have to be made before the
# model is created, because BertopicModel builds BERTopic in its constructor.

# Disabled alternative: custom vectorizer with Slovene stop words
#vectorizer_model = CountVectorizer(stop_words = stopwords.words('slovene'), ngram_range=(1, 2), max_features=10)
#bertopic_SL_config['bertopic_conf']['vectorizer_model'] = vectorizer_model

ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)
bertopic_SL_config['bertopic_conf']['ctfidf_model'] = ctfidf_model

# Create Bertopic model (2nd layer)
bt_sl_model = BertopicModel('Bertopic_SL', embed_model=slobert_model, config=bertopic_SL_config)

# Load training data
bt_sl_model.load_topic_data(X, y)

# Train model with training data (only_fit=True: fit on docs and labels, no
# result is stored)
if VERBOSE:
  print(f'- Training 2nd layer of Bertopic model...')
bt_sl_model.train_model(only_fit=True)

# Saving the SL model
# NOTE(review): save_model concatenates directory and model name without a
# separator, so this writes to '{root_dir}/modelsBertopic_SL'.
if SAVING:
  if VERBOSE:
    print(f'- Saving 2nd layer of Bertopic model...')
  bt_sl_model.save_model(f'{root_dir}/models')

# Topic distance map of the trained model
bt_sl_model.visualize()

In [None]:
# Inspect the topics of the 2nd layer model; the topic ids shown here are the
# ones that topic_names in the prediction cell is indexed by.
bt_sl_model.bertopic.get_topics()

In [None]:
# Predict new instances

# Name of each topic of the 2nd layer model; the position in the list is the
# topic id predicted by the model.
topic_names = ['desno', 'levo', 'begunci', 'splav', 'krscanstvo', 'religija', 'militarizem', 'lgbt', 'denacionalizacija', 'varnost']

# Load extracted tweets (test data)
tweets_to_predict = load_and_preprocess(None, f'{root_dir}/stpt/{YEAR}-{EPOCH}/{YEAR}_{EPOCH}_ALL.json', True)

bt_sl_model.load_tweet_data(tweets_to_predict)

if VERBOSE:
  print(f'- Predicting new instances on second layer topic modeling...')

# Predict new instances on test data
bt_sl_model.predict()

# Label new instances & append them to the per-topic files.
# Saving depends on SAVING only; VERBOSE just controls the message.
if SAVING:
  if VERBOSE:
    print(f'- Labelling and saving topic tweets...')

  # Minimum topic probability a prediction needs to be kept
  threshold = 0.4

  # (tweet position, topic id, probability) of all confident predictions
  predicted_topics = [
    (idx, ti, p)
    for idx, (ti, p) in enumerate(zip(bt_sl_model.result['topic_ids'], bt_sl_model.result['topic_probs']))
    if p > threshold
  ]

  # Every topic is labelled and stored, not only the last one of the list.
  for i, topic_name in enumerate(topic_names):
    topic_tweets = []
    for idx, ti, p in predicted_topics:
      if ti != i:
        continue
      tw = bt_sl_model.data['tweets'][idx]
      tw['topic'] = topic_name
      tw['topic_probability'] = p
      topic_tweets.append(tw)

    # Nothing new for this topic: leave its file untouched.
    if not topic_tweets:
      continue

    topic_file = f'{root_dir}/process/tweets_{topic_name}.json'

    # Start from the stored tweets when the topic file already exists.
    data = []
    if os.path.exists(topic_file):
      with open(topic_file, 'r', encoding='utf8') as topic_data:
        data = json.load(topic_data)
    data.extend(topic_tweets)

    with open(topic_file, 'w', encoding='utf8') as topic_data_n:
      json.dump(data, topic_data_n, ensure_ascii=False)

## Postprocessing

In [None]:
# Postprocessing of one topic: cluster the tweets labelled with that topic
# once more (3rd layer), so that single clusters can be selected by hand.
processed_tweets = []

# Topic whose file '{root_dir}/process/tweets_<topic>.json' is postprocessed
topic_to_postprocess = 'levo'
# Disabled alternative: guide the model with one seed topic
#topic_seed_list = [politics_seed_topic[9]]
#bertopic_TL_config['bertopic_conf']['seed_topic_list'] = topic_seed_list

processed_tweets.extend(load_and_preprocess(None, f'{root_dir}/process/tweets_{topic_to_postprocess}.json', True))

# Print summary if verbose
if VERBOSE:
  print(f'- Tweets "{topic_to_postprocess}" summary:')
  print(f'-- Batch length: {len(processed_tweets)}')
  
# Postprocessing
print(f'- Postprocess start...')

# Modifications of the 3rd layer config; made before the model is created,
# because BertopicModel builds BERTopic in its constructor.
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)
bertopic_TL_config['bertopic_conf']['ctfidf_model'] = ctfidf_model

# Create Bertopic model (3rd layer)
bt_tl_model = BertopicModel('Bertopic_TL', embed_model=slobert_model, config=bertopic_TL_config)

# Load twitter data
bt_tl_model.load_tweet_data(processed_tweets)

# Train (only_fit=False: fit_transform, the topic assignment is stored in
# bt_tl_model.result)
if VERBOSE:
  print(f'- Training 3rd layer of Bertopic model...')
bt_tl_model.train_model(only_fit=False)

# Visualize
bt_tl_model.visualize()

In [None]:
# Show the topic overview of the 3rd layer model; used to pick the topic ids
# for the manual selection in the next cell.
bt_tl_model.bertopic.get_topic_info()

In [None]:
# Manual topic selection (3rd layer).
# NOTE(review): despite its name, the ids listed in topics_to_eliminate are
# the ones whose tweets are collected in topic_tweets - confirm the intended
# direction.

topics_to_eliminate = [-1]

tl_topic_ids = bt_tl_model.result['topic_ids']
tl_topic_probs = bt_tl_model.result['topic_probs']

# (tweet position, topic id, probability) of every tweet with a listed topic id
predicted_topics = [
  (idx, ti, p)
  for idx, (ti, p) in enumerate(zip(tl_topic_ids, tl_topic_probs))
  if ti in topics_to_eliminate
]

topic_tweets = []
for idx, ti, p in predicted_topics:
  selected_tweet = bt_tl_model.data['tweets'][idx]
  print(selected_tweet['raw_text'])
  topic_tweets.append(selected_tweet)

In [None]:
# Append the selected tweets to the stored postprocessed data of the topic

print(f'Batch length after postprocessing: {len(topic_tweets)}/{len(processed_tweets)}')

postprocess_file = f'{root_dir}/postprocess/tweets_{topic_to_postprocess}.json'

data = []
with open(postprocess_file, 'r', encoding='utf8') as topic_data:
  data = json.load(topic_data)
data.extend(topic_tweets)

# The file is rewritten with the old and the new tweets
with open(postprocess_file, 'w', encoding='utf8') as topic_data_n:
  json.dump(data, topic_data_n, ensure_ascii=False)

In [None]:
# Another level of postprocessing if needed: cluster the tweets selected in
# the previous step (topic_tweets) once more.

# Print summary if verbose
if VERBOSE:
  print(f'- Tweets "{topic_to_postprocess}" (extra) summary:')
  print(f'-- Batch length: {len(topic_tweets)}')

# Postprocessing
print(f'- Postprocess start...')

# Modifications
# The c-TF-IDF model has to be placed in the config BEFORE the wrapper is
# created: BertopicModel builds its BERTopic instance in the constructor, so a
# later change of the config is ignored and the new model would share the
# transformer instance that is still stored in the config from an earlier model.
ctfidf_model = ClassTfidfTransformer(bm25_weighting=True, reduce_frequent_words=True)
bertopic_TL_config['bertopic_conf']['ctfidf_model'] = ctfidf_model

# Create Bertopic model (3rd layer extra)
bt_tlx_model = BertopicModel('Bertopic_TLx', embed_model=slobert_model, config=bertopic_TL_config)

# Load twitter data
bt_tlx_model.load_tweet_data(topic_tweets)

# Train
if VERBOSE:
  print(f'- Training 3rd (extra) layer of Bertopic model...')
bt_tlx_model.train_model(only_fit=False)

# Visualize
bt_tlx_model.visualize()

In [None]:
# Show the topic overview of the extra 3rd layer model; used to pick the
# topic ids for the manual selection in the next cell.
bt_tlx_model.bertopic.get_topic_info()

In [None]:
# Manual topic selection (extra 3rd layer).
# NOTE(review): despite its name, the ids listed in topics_to_eliminate are
# the ones whose tweets are collected in topic_tweets - confirm the intended
# direction.

topics_to_eliminate = [0,1,2,3,4,6,7,8,9,10,11,15,16]

tlx_topic_ids = bt_tlx_model.result['topic_ids']
tlx_topic_probs = bt_tlx_model.result['topic_probs']

# (tweet position, topic id, probability) of every tweet with a listed topic id
predicted_topics_x = [
  (idx, ti, p)
  for idx, (ti, p) in enumerate(zip(tlx_topic_ids, tlx_topic_probs))
  if ti in topics_to_eliminate
]

topic_tweets = []
for idx, ti, p in predicted_topics_x:
  selected_tweet = bt_tlx_model.data['tweets'][idx]
  print(selected_tweet['raw_text'])
  topic_tweets.append(selected_tweet)

In [None]:
# Append the selected tweets to the stored postprocessed data of the topic

print(f'Batch length after postprocessing: {len(topic_tweets)}/{len(processed_tweets)}')

postprocess_file = f'{root_dir}/postprocess/tweets_{topic_to_postprocess}.json'

data = []
with open(postprocess_file, 'r', encoding='utf8') as topic_data:
  data = json.load(topic_data)
data.extend(topic_tweets)

# The file is rewritten with the old and the new tweets
with open(postprocess_file, 'w', encoding='utf8') as topic_data_n:
  json.dump(data, topic_data_n, ensure_ascii=False)

In [None]:
# Final step (optional): among the tweets of the listed topic ids, keep those
# that contain a topic keyword or match a topic regex.

topics_to_eliminate = [-1]

tlx_result = bt_tlx_model.result

# (tweet position, topic id, probability) of every tweet with a listed topic id
predicted_topics_x = [
  (idx, ti, p)
  for idx, (ti, p) in enumerate(zip(tlx_result['topic_ids'], tlx_result['topic_probs']))
  if ti in topics_to_eliminate
]

topic_tweets_d = [bt_tlx_model.data['tweets'][idx] for idx, ti, p in predicted_topics_x]

topic_tweets = []
for tw in topic_tweets_d:
  lemmas = tw['lemma_text'].split(" ")

  # A keyword has to equal a whole lemma; the regexes are only tried when no
  # keyword matched and are searched in the complete lemma text.
  isvalid = (
    any(k in lemmas for k in topic_info[topic_to_postprocess]['keywords'])
    or any(re.search(r, tw['lemma_text']) for r in topic_info[topic_to_postprocess]['regexes'])
  )

  if isvalid:
    topic_tweets.append(tw)
    print(tw['raw_text'])

In [None]:
# Append the selected tweets to the stored postprocessed data of the topic
print(f'Batch length after postprocessing: {len(topic_tweets)}/{len(processed_tweets)}')

postprocess_file = f'{root_dir}/postprocess/tweets_{topic_to_postprocess}.json'

data = []
with open(postprocess_file, 'r', encoding='utf8') as topic_data:
  data = json.load(topic_data)
data.extend(topic_tweets)

# The file is rewritten with the old and the new tweets
with open(postprocess_file, 'w', encoding='utf8') as topic_data_n:
  json.dump(data, topic_data_n, ensure_ascii=False)