# Modeliranje tem

## Okolje

Vzpostavitev okolja

In [1]:
!pip install bertopic

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bertopic
  Downloading bertopic-0.11.0-py2.py3-none-any.whl (76 kB)
[K     |████████████████████████████████| 76 kB 3.0 MB/s 
[?25hCollecting hdbscan>=0.8.28
  Downloading hdbscan-0.8.28.tar.gz (5.2 MB)
[K     |████████████████████████████████| 5.2 MB 10.8 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting umap-learn>=0.5.0
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 7.0 MB/s 
Collecting sentence-transformers>=0.4.1
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 5.0 MB/s 
Collecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 43.5 MB/s 
Collect

In [4]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import json
import re
import random
import time
import os.path

import numpy as np
from sklearn.utils import shuffle

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, pipeline
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN

from google.colab import drive
drive.mount('/content/drive/')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Mounted at /content/drive/


In [7]:
# Setting constants

LOCAL = False

google_data_dir = "/content/drive/MyDrive/Diploma/Data"
local_data_dir = "/data"

root_dir = ""
if LOCAL:
    root_dir = local_data_dir
else:
    root_dir = google_data_dir

## Funkcije in razredi

In [26]:
def load_tweets(file_name):

  # Load data
  data = []

  with open(file_name, 'r', encoding='utf8') as sample_data:
    data = json.load(sample_data)

  return data

def save_tweets(data, dir, file_name):
  with open(f'{dir}/{file_name}.json', 'w+', encoding='utf8') as outdata:
    json.dump(data, outdata, ensure_ascii=False)

def load_and_preprocess(cpipeline, data_dir, only_load, tweet_stop_words=[], tweet_upos=[], min_words=4, verbose=False, debug=False):
  d = []
  if only_load:
    d = load_tweets(data_dir)
  else:
    d = []#preprocess_tweets(cpipeline, load_tweets(data_dir), tweet_stop_words=tweet_stop_words, tweet_upos=tweet_upos, min_words=min_words, verbose=verbose, debug=debug)
  return d

def load_labelled_tweets(dir, topic_names, shuffle_arrays=True, random_state=77):
  topics = []
  for t in topic_names:
    with open(f'{dir}/labelled_topics/topic_{t}.json', 'r', encoding='utf8') as topic_data:
      data = json.load(topic_data)
      topics.extend(data)
  
  topic_lemmas = []
  topic_labels = []

  for t in topics:
    topic_lemmas.append(t['lemma_text'])
    topic_labels.append(t['topic'])

  topic_labels = [ topic_names.index(x) for x in topic_labels]

  if shuffle_arrays:
    shuffle(topic_lemmas, topic_labels, random_state=random_state)
  return topic_lemmas, topic_labels

"""
Bertopic model for modeling topics

"""
class BertopicModel:

  def __init__(self, model_name, embed_model, config):
    self.model_name = model_name
    self.embed_model = embed_model
    self.config = config

    # Create the Bertopic model with config
    self.make_model()

  def make_model(self):
    self.umap_model = UMAP(**self.config["umap_conf"])
    self.hdbscan_model = HDBSCAN(**self.config["hdbscan_conf"])
    self.bertopic = BERTopic(embedding_model=self.embed_model, umap_model=self.umap_model, hdbscan_model=self.hdbscan_model, **self.config["bertopic_conf"])

  def load_tweet_data(self, tweet_data):
    doc_tweet_lemmas = [ t['lemma_text'] for t in tweet_data ]
    
    self.data = {}
    self.data["tweets"] = tweet_data
    self.data["docs"] = doc_tweet_lemmas

  def load_topic_data(self, topic_docs, topic_labels):
    if not hasattr(self, 'data'):
      self.data = {}
      
    self.data["docs"] = topic_docs
    self.data["labels"] = topic_labels

  def train_model(self, only_fit):
    data_keys = self.data.keys()

    if "docs" in data_keys and not only_fit:
      topics, probs = self.bertopic.fit_transform(self.data["docs"])
      self.result = {}
      self.result["topic_ids"] = topics
      self.result["topic_probs"] = probs
    elif "docs" in data_keys and "labels" in data_keys and only_fit:
      self.bertopic = self.bertopic.fit(self.data["docs"], y=self.data["labels"])
    else:
      print("Error: Missing data!")
  
  def predict(self):
    data_keys = self.data.keys()

    if "docs" in data_keys:
      topics, probs = self.bertopic.transform(self.data["docs"])
      self.result = {}
      self.result["topic_ids"] = topics
      self.result["topic_probs"] = probs

  def reduce(self, nr):
    if hasattr(self, 'data') and hasattr(self, 'result'):
      topics, probs = self.bertopic.reduce_topics(self.data["docs"], self.data["labels"], nr_topics=nr)
      self.result["topic_ids"] = topics
      self.result["topic_probs"] = probs

  def merge_topics(self, indexes):
    if hasattr(self, 'data'):
      self.bertopic.merge_topics(self.data["docs"], self.data["labels"], indexes)

  def tweets_from_topic(self, ntopic):
    if self.result:
      tw = []

      for i, x in enumerate(self.result.topic_ids):
        if x == ntopic:
          tw.append(self.data["docs"][i])

      return tw
    else:
      print("Error: Missing data!")
      return []

  def collect_topic_indices(self, ntopic, tweet_prob=0.5):
    if hasattr(self, 'result'):

      tweet_ids = []
      for i, x in enumerate(self.result["topic_ids"]):

        # Check if topic id and probability higher
        if ntopic == x and self.result["topic_probs"][i] > tweet_prob:
          tweet_ids.append(i)

      return tweet_ids
    else:
      print("Error: Missing data!")
      return []

  def find_politic_topics(self, keywords, topn=3, sim_threshold=0.5, tweet_prob=0.5, include_prob=False):
    if hasattr(self, 'bertopic'):
      indices = set()

      # Find relating topics
      for keyword in keywords:
        sim_ids, sim_probs = self.bertopic.find_topics(keyword, top_n=topn)

        # Filter based on similarity
        sim_topics = [ sim_ids[i] for i, x in enumerate(sim_probs) if x > sim_threshold ]

        if len(sim_topics) > 0:
          for topic in sim_topics:
            indices.update(self.collect_topic_indices(topic, tweet_prob=tweet_prob))
      
      tweet_docs = []
      for i in indices:
        tdoc = self.data["tweets"][i]
        if include_prob:
          tdoc["topic_probability"] = self.result["topic_probs"][i]

        tweet_docs.append(tdoc)

      return tweet_docs
    else:
      print("Error: Missing data!")
      return []

  def visualize(self, t='distance_map'):
    if hasattr(self, 'bertopic'):
      #return self.bertopic.visualize_topics()
      if t == 'barchart':
        return self.bertopic.visualize_barchart()
      elif t == 'hierarchy':
        return self.bertopic.visualize_hierarchy()
      elif t == 'heatmap':
        return self.bertopic.visualize_heatmap()
      elif t == 'term_rank':
        return self.bertopic.visualize_term_rank()
      else:
        return self.bertopic.visualize_topics()
      #elif t == 'documents':
      #  self.bertopic.visualize_documents()
    else:
      print("Error: Model not yet initiated!")

  def save_model(self, model_dir):
    self.bertopic.save(str(model_dir + self.model_name))
  
  def load_model(self, model_dir):
    self.bertopic.load(str(model_dir + self.model_name), embedding_model=self.embed_model)

def label_politic_tweets(model : BertopicModel, topic_info, data_dir, topn=3, n_sim_subtopics=3, save_tweets=False, verbose=True):

  if verbose:
    print(f'-- Collected batch topic distribution summary:')

  sim_topics = similar_topics(model, topic_info, topn=topn, n_sim_subtopics=n_sim_subtopics)

  tweets = model.data['tweets']
  labels = model.result['topic_ids']
  probs = model.result['topic_probs']

  for i in range(len(tweets)):
    tweet = tweets[i]
    label = labels[i]
    prob = probs[i]

    most_likely_topic = None
    most_likely_prob = 0

    for st in sim_topics:
      for sbt, prob in sim_topics[st]:
        if sbt == label and prob > most_likely_prob:
          most_likely_topic = st
          most_likely_prob = prob


    if most_likely_topic is not None and topic_info[most_likely_topic]['strict']:
      kw = []
      kw.extend(topic_info[most_likely_topic]['keywords'])
      kw.extend(topic_info[most_likely_topic]['search_term'])

      founds = False
      for key in kw:
        if key in tweet['lemma_text']:
          tweet['topic'] = most_likely_topic
          tweet['topic_probability'] = prob
          founds = True
          break
      
      if not founds:
        tweet['topic'] = None
        tweet['topic_probability'] = 0

    else:
      tweet['topic'] = most_likely_topic
      tweet['topic_probability'] = prob

  if save_tweets:
    for tp in topic_info:
      t = list(filter(lambda x: x['topic'] == tp, tweets))
      if verbose:
        print(f'-- {tp} : {len(t)}')
      overwrite_labelled_topics(tp, t, data_dir)

  return model.data['tweets']

def similar_topics(model : BertopicModel, topic_info, topn=3, n_sim_subtopics=3) -> dict:
    
  sim_topics = {}

  for topic in topic_info:
    tt = topic_info[topic]

    tpcs1 = {}
    for keyword in tt['search_term']:
      sims = model.bertopic.find_topics(keyword, top_n=topn)

      sims = tuple(zip(sims[0], sims[1]))

      tpcs2 = dict((x, y) for x, y in sims)

      tpcs1 = {
        key: tpcs1.get(key, 0) + tpcs2.get(key, 0) for key in set(tpcs1) | set(tpcs2)
      }
    
    # Normalize
    mv = max(tpcs1.values())
    for kj in tpcs1:
      tpcs1[kj] = float(tpcs1[kj] / mv)
      
    subtopics = []
    for i in range(n_sim_subtopics):
      if tpcs1:
        k1 = max(tpcs1, key=tpcs1.get)
        if k1 != -1:
          subtopics.append((k1, tpcs1[k1]))
        tpcs1.pop(k1)
    
    sim_topics[topic] = subtopics
  
  return sim_topics

def overwrite_labelled_topics(file_topic, topic_tweets, data_dir):
    data=[]
    with open(f'{data_dir}/labelled_topics/topic_{file_topic}.json', 'r', encoding='utf8') as topic_data:
      data = json.load(topic_data)
      data.extend(topic_tweets)
    with open(f'{data_dir}/labelled_topics/topic_{file_topic}.json', 'w', encoding='utf8') as topic_data_n:
      json.dump(data, topic_data_n, ensure_ascii=False)

In [18]:
# Setting configuration

# Path to preprocessed tweet data

YEAR = 2021
EPOCH = 2
BATCH = 1

tweet_data_path = f'preprocess/{YEAR}-{EPOCH}/{YEAR}_{EPOCH}_{BATCH}.json'

# Path to save labelled tweet data
SAVING = True
tweet_save_path = f'stpt/{YEAR}-{EPOCH}'

# Verbose
VERBOSE = True

# Imported configurations
politics_seed_topic = [["politik", "politika", "političen"],["vlada", "vladati", "država"],["komunist", "komunističen", "komunizem"],["socializem", "socialen", "sociala"],["fašisti", "fašističen", "fašizem"],["levičar", "levičarski", "levica"],["desničar", "desničarski", "desnica", "janez", "jj", "sds"],["nosečnost", "nosečnica", "splav", "ženska"],["migrant", "migriranje", "beg", "begunec"],["musliman", "islam", "islamist", "islamist", "ekstremist"],["kriminal", "kiminalec", "zločinec"],["lgbtq", "istospolni", "lgbt", "spol"]]

topic_info = {
    'begunci': {
        'search_term': ["begunec", "migrant", "migrantski"],
        'keywords': [],
        'strict': True,
        'sim_threshold': 0.2,
        'tweet_prob': 0.5
    },
    'lgbtq': {
        'search_term': ["lgbtq", "lgbt", "istospolno"],
        'keywords': [],
        'strict': True,
        'sim_threshold': 0.2,
        'tweet_prob': 0.5
    },
    'religija': {
        'search_term': ["islam", "musliman", "vera"],
        'keywords': ["religija", "dzihadist"],
        'strict': True,
        'sim_threshold': 0.2,
        'tweet_prob': 0.5
    },
    'splav': {
        'search_term': ["splav"],
        'keywords': ["kontracepcija"],
        'strict': True,
        'sim_threshold': 0.2,
        'tweet_prob': 0.5
    },
    'desno': {
        'search_term': ["desnica", "desno", "jj", "sds"],
        'keywords': ["jansa", "desnicar", "janša"],
        'strict': True,
        'sim_threshold': 0.2,
        'tweet_prob': 0.5
    },
    'levo': {
        'search_term': ["levica", "levicar", "lev"],
        'keywords': ["levicarski", "levičar", "mesec"],
        'strict': True,
        'sim_threshold': 0.2,
        'tweet_prob': 0.5
    },
    'politika': {
        'search_term': ["politika", "politicen", "vlada"],
        'keywords': [],
        'strict': False,
        'sim_threshold': 0.2,
        'tweet_prob': 0.5
    }
}

# Preprocessing configuration
preprocess_config = {
    'min_words': 4,
    'verbose': True,
    'debug': False,
    'tweet_upos': ['PUNCT', 'NUM', 'SYM', 'CCONJ', 'INTJ'],
    'tweet_stop_words': ['http', 'https', 'rt', 'oz']
}

# Bertopic first layer configuration
bertopic_FL_config = {
    'bertopic_conf': {
        "top_n_words": 10,
        "min_topic_size": 20,
        "seed_topic_list": politics_seed_topic
    },
    'umap_conf': {
        "n_neighbors": 15,
        "n_components": 10,
        "metric": 'cosine'
    },
    'hdbscan_conf': {
        "min_cluster_size": 10,
        "metric": 'euclidean',
        "prediction_data": True
    },
    'sim_threshold': 0.5,
    'tweet_prob': 0.5
}

# Bertopic second layer configuration
bertopic_SL_config = {
    'bertopic_conf': {
        "top_n_words": 10,
        "min_topic_size": 20,
        #"nr_topics": 8
    },
    'umap_conf': {
        "n_neighbors": 20,
        "n_components": 10,
        "metric": 'cosine'
    },
    'hdbscan_conf': {
        "min_cluster_size": 15,
        "metric": 'euclidean',
        "prediction_data": True
    },
    'topn': 3,
    'n_sim_subtopics': 3,
}

In [9]:
# Loading models

slobert_model = AutoModelForMaskedLM.from_pretrained("EMBEDDIA/sloberta")

Downloading config.json:   0%|          | 0.00/520 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/422M [00:00<?, ?B/s]

In [27]:
# MAIN CODE

# Load preprocessed tweets
preprocessed_tweet_data = load_and_preprocess(None, f'{root_dir}/{tweet_data_path}', True)

# Print summary if verbose
if VERBOSE:
  print(f'- Batch summary:')
  print(f'-- Batch length: {len(preprocessed_tweet_data)}')
  
# First layer of topic modeling
print(f'- First layer of topic modeling...')

# Create Bertopic model (1st layer)
bt_fl_model = BertopicModel('Bertopic_FL', embed_model=slobert_model, config=bertopic_FL_config)

# Load twitter data
bt_fl_model.load_tweet_data(preprocessed_tweet_data)

if VERBOSE:
  print(f'- Training 1st layer of Bertopic model...')
# Train the model
bt_fl_model.train_model(only_fit=False)

if SAVING:
  if VERBOSE:
    print(f'- Saving 1st layer of Bertopic model...')
  # Save model
  bt_fl_model.save_model(f'{root_dir}/models')

# Extract general politic topics
topics_to_extract = [item for sublist in bertopic_FL_config['bertopic_conf']['seed_topic_list'] for item in sublist]
extracted_tweets = bt_fl_model.find_politic_topics(topics_to_extract, sim_threshold=bertopic_FL_config['sim_threshold'], tweet_prob=bertopic_FL_config['tweet_prob'])

if VERBOSE:
  print(f'- Batch of general politics summary:')
  print(f'-- Batch length: {len(extracted_tweets)}')

# Save STP tweets
if SAVING:
  if VERBOSE:
    print(f'- Saving STPT in file {tweet_save_path}...')
  save_tweets(extracted_tweets, dir=f'{root_dir}/{tweet_save_path}', file_name=f'{YEAR}_{EPOCH}_{BATCH}')


# Second layer of topic modeling
if VERBOSE:
  print(f'- Second layer of topic modeling...')

# Get training data
topic_names = [item for item in topic_info]
X, y = load_labelled_tweets(root_dir, topic_names)

# Create Bertopic model (2nd layer)
bt_sl_model = BertopicModel('Bertopic_SL', embed_model=slobert_model, config=bertopic_SL_config)

# Load training data
bt_sl_model.load_topic_data(X, y)

# Train model with training data
if VERBOSE:
  print(f'- Training 2nd layer of Bertopic model...')
bt_sl_model.train_model(only_fit=True)

# Saving the SL model
if SAVING:
  if VERBOSE:
    print(f'- Saving 2nd layer of Bertopic model...')
  bt_sl_model.save_model(f'{root_dir}/models')

# Load extracted tweets (test data)
bt_sl_model.load_tweet_data(extracted_tweets)

if VERBOSE:
  print(f'- Predicting new instances on second layer topic modeling...')

# Predict new instances on test data
bt_sl_model.predict()

# Label new instances & overwrite
if VERBOSE:
  print(f'- Labelling and saving topic tweets...')
labelled_tweets = label_politic_tweets(bt_sl_model, topic_info, root_dir, topn=bertopic_SL_config['topn'], n_sim_subtopics=bertopic_SL_config['n_sim_subtopics'], save_tweets=SAVING, verbose=VERBOSE)

- Batch summary:
-- Batch length: 3542
- First layer of topic modeling...
- Training 1st layer of Bertopic model...
- Saving 1st layer of Bertopic model...
- Batch of general politics summary:
-- Batch length: 932
- Saving STPT in file stpt/2021-2...
- Second layer of topic modeling...
- Training 2nd layer of Bertopic model...
- Saving 2nd layer of Bertopic model...
- Predicting new instances on second layer topic modeling...
- Labelling and saving topic tweets...
-- Collected batch topic distribution summary:
-- begunci : 15
-- lgbtq : 4
-- religija : 1
-- splav : 0
-- desno : 13
-- levo : 27
-- politika : 253
