# Predobdelava podatkov

## Okolje

Vzpostavitev okolja

In [None]:
!pip install tweet-preprocessor
!pip install classla

In [None]:
# Imports and one-time resource downloads for this notebook.
# The NLTK 'stopwords' corpus is downloaded before the corpus reader is imported.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import json
import classla
import re
import time

# Download the CLASSLA resources for the 'sl' language code.
classla.download('sl')

# tweet-preprocessor package, imported under the alias used by the cleaning helpers.
import preprocessor as tpre

# Mount Google Drive at /content/drive/.
# NOTE(review): the import and the mount run unconditionally, so this cell
# presumably only works inside Google Colab — confirm before running locally.
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Constants selecting where the data lives

# True: use the local data folder; False: use the mounted Google Drive folder.
LOCAL = False

google_data_dir = "/content/drive/MyDrive/Diploma/Data"
local_data_dir = "/data"

# Base directory that the tweet data paths are resolved against.
root_dir = local_data_dir if LOCAL else google_data_dir

## Funkcije in razredi

In [None]:
def _is_retweet(tweet_text):
  """Return True when the text opens with the standalone retweet marker "RT"."""
  # A bare prefix test would also match ordinary words that merely begin with
  # the letters "RT" (for example "RTV"), so the character that follows the
  # marker must not be a letter or a digit.
  return tweet_text.startswith("RT") and not tweet_text[2:3].isalnum()

def _collect_lemmas(document, stop_words, tweet_upos, debug=False):
  """Return the lower-cased lemmas of a pipeline document that pass the filters.

  A word is kept when its UPOS tag is not in `tweet_upos` and its lower-cased
  lemma is not in `stop_words`.
  """
  lemmas = []

  for sentence in document.sentences:
    for position, word in enumerate(sentence.words):
      lemma = word.lemma.lower()

      # A sentence whose first lemma is "rt" is retweet residue: skip it entirely.
      if position == 0 and lemma == 'rt':
        break

      if debug:
        print(f'Lemma: {lemma} -------------- Upos: {word.upos}')

      if word.upos not in tweet_upos and lemma not in stop_words:
        lemmas.append(lemma)
      elif debug:
        print(f'Discarded word: {word.lemma}')

  return lemmas

def preprocess_tweets(preprocess_pipeline, tweets, tweet_stop_words=None, tweet_upos=None, min_words=4, verbose=False, debug=False):
  """Clean, lemmatize and filter a collection of tweets.

  Args:
    preprocess_pipeline: Callable applied to the cleaned tweet text. Its result
      must expose `.sentences`, each with `.words` carrying `.lemma` and `.upos`.
    tweets: Sized iterable of tweet dicts with the keys 'full_text', 'id',
      'hashtags', 'mentions', 'created_at' and 'user' (the latter holding
      'name', 'screen_name' and 'description').
    tweet_stop_words: Extra stop words added to the NLTK 'slovene' list.
      None means no extra words.
    tweet_upos: UPOS tags whose words are discarded. None means no tag filter.
    min_words: A tweet is kept only when it has strictly MORE than this many
      remaining lemmas.
    verbose: Print a progress report every 1000 input tweets.
    debug: Print the cleaned text, every lemma decision and the final lemmas.

  Returns:
    A list of dicts with the keys 'id', 'created_at', 'raw_text' (the cleaned
    text), 'lemma_text' (space-joined lemmas), 'hashtags', 'mentions' and
    'user'. Retweets and tweets that are too short are left out.
  """
  # Avoid mutable default arguments; None stands for "nothing extra".
  if tweet_upos is None:
    tweet_upos = ()

  # A set both de-duplicates the stop words and gives O(1) membership tests
  # in the per-word loop.
  stop_words = set(stopwords.words('slovene'))
  stop_words.update(tweet_stop_words or ())

  data = []
  start_time = 0

  for index, tweet in enumerate(tweets):

    if verbose and index % 1000 == 0:
      print(f'-- Progress: {index}/{len(tweets)}')
      if index >= 1000:
        # Elapsed time is measured since the previous progress report.
        print(f'-- Time elapsed: {time.time() - start_time}s')
        print(f'-- Tweets preprocessed: {len(data)}')
      start_time = time.time()

    tweet_full_text = tweet['full_text']

    # Skip retweets
    if _is_retweet(tweet_full_text):
      continue

    tweet_id = tweet['id']
    tweet_hashtags = tweet['hashtags']
    tweet_mentions = tweet['mentions']
    tweet_created_at = tweet['created_at']

    tweet_user = tweet['user']
    tweet_user_name = tweet_user['name']
    tweet_user_screen_name = tweet_user['screen_name']
    # A None description is treated as an empty one instead of crashing the cleaner.
    tweet_user_description = clean_tweet_text(tweet_user['description'] or '').lower()

    # Strip the configured tweet tokens, HTML entities and emojis
    tweet_full_text = clean_tweet_text(tweet_full_text)

    if debug:
      print(f'Raw text:\n{tweet_full_text}')

    # Tokenize / tag / lemmatize with the supplied pipeline
    tweet_processed_lemmas = _collect_lemmas(
        preprocess_pipeline(tweet_full_text), stop_words, tweet_upos, debug=debug)

    if debug:
      print(f'Preprocessed lemmas:\n{tweet_processed_lemmas}')

    # Drop tweets that do not have more than min_words lemmas left
    if len(tweet_processed_lemmas) <= min_words:
      continue

    data.append({
        "id": tweet_id,
        "created_at": tweet_created_at,
        "raw_text": tweet_full_text,
        "lemma_text": ' '.join(tweet_processed_lemmas),
        "hashtags": tweet_hashtags,
        "mentions": tweet_mentions,
        "user": {
            "name": tweet_user_name,
            "screen_name": tweet_user_screen_name,
            "description": tweet_user_description
        }
    })

  return data

def clean_tweet_text(tweet_text):
  """Return the tweet text after three cleaning passes.

  The text is first run through tweet-preprocessor's `clean`, then the HTML
  entities `&gt;`, `&lt;` and `&amp;` are deleted, and finally emojis are
  removed with `remove_emojis`.
  """
  without_entities = re.sub("&gt;|&lt;|&amp;", "", tpre.clean(tweet_text))
  return remove_emojis(without_entities)

# Character class of code points treated as emoji / pictographic noise.
# NOTE: two of the ranges are very wide: 24C2-1F251 and 10000-10FFFF together
# cover every code point from U+24C2 upwards, so CJK and other scripts in that
# span are stripped as well, not only emojis.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U00002500-\U00002BEF"  # box drawing up to miscellaneous symbols and arrows
    "\U00002702-\U000027B0"  # dingbats
    "\U00002702-\U000027B0"  # same range again (redundant, kept as in the original pattern)
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # all supplementary planes
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u200d"  # zero-width joiner
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\ufe0f"  # variation selector-16
    "\u3030"
    "]+",
    re.UNICODE,
)

def remove_emojis(data):
    """Return `data` with every run of characters matched by the emoji pattern removed."""
    return _EMOJI_PATTERN.sub('', data)

def load_tweets(file_name):
  """Open `file_name` as UTF-8 text and return its parsed JSON content."""
  with open(file_name, 'r', encoding='utf8') as tweet_file:
    return json.load(tweet_file)

def save_tweets(data, dir, file_name):
  """Write `data` as UTF-8 JSON to ``<dir>/<file_name>.json``.

  Args:
    data: JSON-serializable object to store.
    dir: Target folder; it is created (with parents) when it does not exist.
    file_name: File name without the ``.json`` extension.

  Non-ASCII characters are written as-is (``ensure_ascii=False``). An existing
  file with the same name is overwritten.
  """
  # Local import: the notebook's import cell does not bring in `os`.
  import os

  # Create the output folder up front so a missing directory cannot discard
  # an already computed batch.
  if dir:
    os.makedirs(dir, exist_ok=True)

  # Plain write mode is enough: the file is never read back here.
  with open(f'{dir}/{file_name}.json', 'w', encoding='utf8') as outdata:
    json.dump(data, outdata, ensure_ascii=False)

def load_and_preprocess(cpipeline, data_dir, only_load, tweet_stop_words=[], tweet_upos=[], min_words=4, verbose=False, debug=False):
  """Load tweets from a JSON file and, unless `only_load` is set, preprocess them.

  `data_dir` is handed to `load_tweets` as the file to read. When `only_load`
  is true the loaded content is returned untouched; otherwise it is passed,
  together with the remaining arguments, to `preprocess_tweets` and that
  result is returned.
  """
  loaded_tweets = load_tweets(data_dir)

  if only_load:
    return loaded_tweets

  return preprocess_tweets(
      cpipeline,
      loaded_tweets,
      tweet_stop_words=tweet_stop_words,
      tweet_upos=tweet_upos,
      min_words=min_words,
      verbose=verbose,
      debug=debug,
  )

In [None]:
# Run configuration

# Identifies the batch of raw tweets to process.
YEAR, EPOCH, BATCH = 2021, 2, 1

# When True, the preprocessed batch is written to disk.
SAVING = True

# Raw input file and output folder for the processed tweets.
tweet_data_path = f'unpreprocess/{YEAR}-{EPOCH}/{YEAR}_{EPOCH}_{BATCH}.json'
tweet_save_path = f'preprocess/{YEAR}-{EPOCH}'

# UPOS tags whose words are discarded, and stop words added to the default list.
discarded_upos = ['PUNCT', 'NUM', 'SYM', 'CCONJ', 'INTJ']
extra_stop_words = ['http', 'https', 'rt', 'oz']

# Keyword arguments for the preprocessing run.
preprocess_config = {
    'only_load': False,
    'min_words': 4,
    'verbose': True,
    'debug': False,
    'tweet_upos': discarded_upos,
    'tweet_stop_words': extra_stop_words,
}

# Keyword arguments for the CLASSLA pipeline.
classla_conf = {
    #'processors': 'tokenize, lemma',
    'lang': 'sl',
    'pos_lemma_pretag': True,
    'use_gpu': True,
}


In [None]:
# Loading preprocessors

# Build the CLASSLA pipeline from the keyword arguments in classla_conf.
classla_pipeline = classla.Pipeline(**classla_conf)

# Enable the URL, MENTION and HASHTAG options of tweet-preprocessor
# (presumably the token types that tpre.clean strips — confirm against the library docs).
tpre.set_options(tpre.OPT.URL, tpre.OPT.MENTION, tpre.OPT.HASHTAG)

In [None]:
# MAIN CODE

# Resolve the input file and the output folder against the data root.
source_path = f'{root_dir}/{tweet_data_path}'
target_dir = f'{root_dir}/{tweet_save_path}'

# Load the batch and preprocess it according to preprocess_config.
preprocessed_tweet_data = load_and_preprocess(classla_pipeline, source_path, **preprocess_config)

# Optionally store the result under the batch's own file name.
if SAVING:
    print(f'Saving preprocessed tweets to {root_dir}/{tweet_save_path}...')
    save_tweets(preprocessed_tweet_data, target_dir, f'{YEAR}_{EPOCH}_{BATCH}')

# Report how many tweets the batch contains after preprocessing.
print(f'- Batch summary:')
print(f'-- Batch length: {len(preprocessed_tweet_data)}')