# Obdelava podatkov

## Okolje

Vzpostavitev okolja

In [None]:
!pip install bertopic
!pip install tweet-preprocessor
!pip install classla

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import json
import classla
import re
import random
import time
import os.path
classla.download('sl')

import numpy as np
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, pipeline
from bertopic import BERTopic
import preprocessor as tpre
from umap import UMAP
from hdbscan import HDBSCAN

from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Notebook-wide constants

# Toggle between a local checkout and the mounted Google Drive folder.
LOCAL = False

google_data_dir = "/content/drive/MyDrive/Diploma/Data"
local_data_dir = "/data"

# Every data/config path used further down is built on top of root_dir.
root_dir = local_data_dir if LOCAL else google_data_dir

## Funkcije in razredi

In [None]:
def load_tweets(file_name):
  """Parse a UTF-8 encoded JSON file and return its content.

  Args:
    file_name: Path of the JSON file to read.

  Returns:
    The deserialised JSON value (whatever ``json.load`` yields for the file).
  """
  with open(file_name, mode='r', encoding='utf8') as source:
    return json.load(source)

def save_tweets(data, dir, file_name):
  """Serialise ``data`` as JSON into ``<dir>/<file_name>.json``.

  Non-ASCII characters are written verbatim (not ``\\u``-escaped) and the
  file is UTF-8 encoded. An existing file at that path is overwritten.

  Args:
    data: JSON-serialisable object to store.
    dir: Directory that receives the file.
    file_name: File name without the ``.json`` extension.
  """
  target_path = f'{dir}/{file_name}.json'
  with open(target_path, 'w+', encoding='utf8') as sink:
    json.dump(data, fp=sink, ensure_ascii=False)


def load_labelled_tweets(dir, topic_names, shuffle_arrays=True, random_state=77):
  """Load hand-labelled tweets for the given topics.

  For every name in ``topic_names`` the file
  ``<dir>/labelled_topics/topic_<name>.json`` is read; each file must hold a
  JSON list of objects with at least the keys ``lemma_text`` and ``topic``.

  Args:
    dir: Directory that contains the ``labelled_topics`` folder.
    topic_names: Sequence of topic names. A tweet's label is the position of
      its ``topic`` value in this sequence.
    shuffle_arrays: When true, lemmas and labels are shuffled consistently
      (the i-th lemma still belongs to the i-th label).
    random_state: Seed forwarded to ``sklearn.utils.shuffle``.

  Returns:
    A pair ``(topic_lemmas, topic_labels)`` of equally long sequences.

  Raises:
    ValueError: If a tweet's ``topic`` is not listed in ``topic_names``.
  """
  topics = []
  for name in topic_names:
    with open(f'{dir}/labelled_topics/topic_{name}.json', 'r', encoding='utf8') as topic_data:
      topics.extend(json.load(topic_data))

  topic_lemmas = [tweet['lemma_text'] for tweet in topics]
  topic_labels = [topic_names.index(tweet['topic']) for tweet in topics]

  if shuffle_arrays:
    # sklearn's shuffle does NOT work in place: it returns shuffled copies,
    # so the result has to be bound back to the names that are returned.
    topic_lemmas, topic_labels = shuffle(topic_lemmas, topic_labels, random_state=random_state)
  return topic_lemmas, topic_labels


"""

Tweetiment Model

"""
class TweetimentModel:
  """Rule-based political-bias labeller on top of a tweet sentiment classifier.

  The sentiment of a tweet (``Positive`` / ``Negative`` / ``Neutral``) is
  combined with two configuration tables to produce one of the labels in
  ``self.labels`` (``"levo"``, ``"desno"``, ``"nevtralno"``):

  * ``topic_bias``: mapping ``topic -> {lower-cased sentiment label -> index
    into self.labels}``.
  * ``party_bias``: iterable of party records, each with the keys ``clani``
    (compared against tweet mentions), ``kratica_stranke`` (party
    abbreviation) and ``usmerjenost`` (used as an index into ``self.labels``;
    presumably 0 or 1 -- verify against the config file).
  """

  def __init__(self, name, model, tokenizer, topic_bias, party_bias):
    """Store the configuration and build the sentiment pipeline.

    Args:
      name: Free-form model name, kept as ``self.model_name``.
      model: Sequence-classification model handed to the pipeline.
      tokenizer: Tokenizer matching ``model``.
      topic_bias: Topic bias table (see class docstring).
      party_bias: Party bias table (see class docstring).
    """
    self.model_name = name

    self.topic_bias = topic_bias
    self.party_bias = party_bias

    # Index 0 / 1 are the two political sides, index 2 is the neutral label.
    self.labels = ["levo", "desno", "nevtralno"]

    self.tokenizer = tokenizer
    self.model = model

    # Create the pipeline
    self.make_model()

  def make_model(self):
    """Create the sentiment pipeline and keep it as ``self.tweetiment``."""
    self.tweetiment = pipeline("sentiment-analysis", model=self.model, tokenizer=self.tokenizer)

  def predict_text(self, txt):
    """Run the sentiment pipeline on ``txt``.

    Returns:
      The pipeline output, or ``None`` when no pipeline has been created.
    """
    if hasattr(self, 'tweetiment'):
      return self.tweetiment(txt)
    return None

  def classify(self, bias_party, bias_topic):
    """Merge the party-based and topic-based bias into one label.

    The topic bias takes precedence; the party bias is the fallback and the
    neutral label is used when neither is available.
    """
    if bias_topic is not None:
      return bias_topic
    if bias_party is not None:
      return bias_party
    return self.labels[2]

  def calculate_biases(self, tweet, explain=False):
    """Compute sentiment, party bias, topic bias and the final label.

    Args:
      tweet: Dict with at least ``raw_text`` and ``topic_probability``, plus
        the keys read by ``bias_sentiment_party`` and ``bias_sentiment_topic``.
      explain: When true, the value returned by ``make_explanation`` is added
        to the result under the key ``explanation``.

    Returns:
      A dict with the keys ``label``, ``sentiment``, ``sentiment_score``,
      ``topic_bias``, ``topic_mentioned``, ``topic_score``, ``party_bias`` and
      ``party_mentioned`` (plus ``explanation`` when requested), or ``None``
      when the tweet has no raw text.
    """
    if not tweet['raw_text']:
      return None

    # The pipeline returns one result per input text; a single text is passed.
    prediction = self.predict_text(tweet['raw_text'])[0]

    bias_party, party = self.bias_sentiment_party(prediction, tweet)
    #bias_user = self.bias_user(prediction)
    bias_topic, topic = self.bias_sentiment_topic(prediction, tweet)

    result = {
        'label': self.classify(bias_party, bias_topic),
        'sentiment': prediction['label'].lower(),
        'sentiment_score': prediction['score'],
        'topic_bias': bias_topic,
        'topic_mentioned': topic,
        'topic_score': tweet['topic_probability'],
        'party_bias': bias_party,
        'party_mentioned': party,
    }

    if explain:
      # Previously computed and then discarded; expose it to the caller.
      result['explanation'] = self.make_explanation(
          prediction['label'], bias_party, party, bias_topic, topic)

    return result

  # Bias based on negativity/positivity towards a party mentioned in a tweet
  def bias_sentiment_party(self, prediction, tweet, single=True):
    """Derive a bias from the sentiment towards the party referenced in a tweet.

    A party counts as referenced when one of the tweet's mentions is listed
    in the party's ``clani`` or when the lower-cased ``kratica_stranke``
    occurs as a token of the tweet's ``lemma_text``. The lemma check is
    independent of the mentions, so tweets without any mention are covered.

    Args:
      prediction: Sentiment result with a ``label`` key.
      tweet: Dict providing ``mentions`` and ``lemma_text``.
      single (bool): Detect only a single party in tweet. NOTE: with
        ``single=False`` nothing is detected at all and ``(None, None)`` is
        returned -- multi-party handling is not implemented.

    Returns:
      ``(bias, party_abbreviation)`` when exactly one party is referenced,
      otherwise ``(None, None)``. ``bias`` is ``None`` for a sentiment label
      other than ``Neutral`` / ``Positive`` / ``Negative``.
    """
    mentions = tweet.get('mentions') or []
    # Tokenise once instead of once per party/mention pair.
    lemma_tokens = set((tweet.get('lemma_text') or '').split(" "))

    parties_detected = [
        party for party in self.party_bias
        if any(mention in party['clani'] for mention in mentions)
        or party['kratica_stranke'].lower() in lemma_tokens
    ]

    # Zero parties gives nothing to judge; several parties are ambiguous.
    if not single or len(parties_detected) != 1:
      return None, None

    party_detected = parties_detected[0]
    sentiment = prediction['label']
    bias = None

    # If text is neutral
    if sentiment == "Neutral":
      bias = self.labels[2]
    # Supports the party
    elif sentiment == "Positive":
      bias = self.labels[party_detected['usmerjenost']]
    # Opposes the party: flip the orientation index (0 <-> 1)
    elif sentiment == "Negative":
      bias = self.labels[int(not party_detected['usmerjenost'])]

    return bias, party_detected['kratica_stranke']

  # Bias based on negativity/positivity towards a certain topic of the tweet
  def bias_sentiment_topic(self, prediction, tweet):
    """Derive a bias from the sentiment towards the tweet's topic.

    Args:
      prediction: Sentiment result with a ``label`` key.
      tweet: Dict providing the ``topic`` key.

    Returns:
      ``(bias, topic)`` when the sentiment is not neutral and the topic is
      configured in ``self.topic_bias``, otherwise ``(None, None)``.
    """
    if prediction['label'] == 'Neutral':
      return None, None

    topic = tweet['topic']
    if topic not in self.topic_bias:
      return None, None

    # The config maps the lower-cased sentiment to an index into self.labels.
    bias = self.labels[self.topic_bias[topic][prediction['label'].lower()]]
    return bias, topic

  # User a known member of a party?
  def is_user_in_party(self):
    """Placeholder: not implemented yet, always returns ``None``."""
    return

  # Bias based on the user profile
  def bias_user(self, prediction, tweet):
    """Placeholder for a user-profile based bias; always returns ``None``."""
    return

  def make_explanation(self, sentiment, bias_party, party, bias_topic, topic):
    """Placeholder for a human-readable explanation; always returns ``None``."""
    # TODO
    return

## Analiza sentimenta

In [None]:
# Locate the topic and party bias configs below the active data directory
topic_bias_path = f'{root_dir}/configs/topic-bias.json'
party_bias_path = f'{root_dir}/configs/party-bias.json'

# Both configs are plain JSON files, so the generic JSON loader is reused
topic_bias, party_bias = (
    load_tweets(config_path)
    for config_path in (topic_bias_path, party_bias_path)
)

# Tokenizer and classification model are taken from the same checkpoint
tweet_tokenizer = AutoTokenizer.from_pretrained("EMBEDDIA/sloberta-tweetsentiment")
tweet_model = AutoModelForSequenceClassification.from_pretrained("EMBEDDIA/sloberta-tweetsentiment")

In [None]:
# Bias model for political tweets, wired to the sentiment checkpoint and both bias configs
pt = TweetimentModel(
    name="Politics",
    model=tweet_model,
    tokenizer=tweet_tokenizer,
    topic_bias=topic_bias,
    party_bias=party_bias,
)

In [None]:
# Example input: one preprocessed tweet in the format calculate_biases reads

t = {
    "id": 1381256945009618948,
    "created_at": "Sun Apr 11 14:44:52 +0000 2021",
    "raw_text": "Ob več kot 1 milijonskem proračunu njegovega Zagovorništva načel LGBTIQ, pa res ne moreš biti šlank !",
    "lemma_text": "več milijonski proračun zagovorništvo načelo lgbtiq šlank",
    "hashtags": [],
    "mentions": ["NortzDr", "KovacecSrecko", "Nova24TV"],
    "user": {
        "name": "Mirna",
        "screen_name": "neukrotljiva",
        "description": "",
    },
    "topic": "lgbtq",
    "topic_probability": 0.5817090388648133,
}

# Left as the last expression so the notebook displays the resulting dict
pt.calculate_biases(t)