# Ideias
Procurar palavras que aparecem mais em comum com outras (e.g., Bolsonaro, Lula, presidente).

In [None]:
from nltk.corpus import stopwords
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from tmtoolkit.topicmod.evaluate import metric_coherence_gensim
from wordcloud import WordCloud

import bitermplus as btm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tmplot as tmp # topic modelling visualizer lib

# Vectorizers

In [None]:
# Read the preprocessed CSV into `data` and preview its first rows.
dataset_path = '../data/processed/preprocessed_lemma_dataset.csv'
data = pd.read_csv(dataset_path)
data.head()

## Removing accents

In [None]:
# Strip accents in three steps: NFKD-normalize the text, encode to ASCII
# while ignoring every character that cannot be encoded, then decode the
# resulting bytes back into strings.
lemma_column = data['full_text_lemmas']
decomposed = lemma_column.str.normalize('NFKD')
ascii_bytes = decomposed.str.encode('ascii', errors='ignore')
data['full_text_lemmas'] = ascii_bytes.str.decode('utf-8')

# Visualizing

## Word cloud

In [None]:
# Build one corpus string from every row's `full_text_lemmas` value.
# str.join assembles the string in a single pass; the previous loop
# re-copied the growing string on every iteration.
text = ' '.join(data['full_text_lemmas'])

# Render the word cloud for the whole corpus.
plt.figure(figsize=(17, 10))
word_cloud = WordCloud(collocations=False, background_color='white').generate(text)
plt.imshow(word_cloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# BTM

In [None]:
def train_btm(topic_numbers, *, iterations=50, seed=42):
    """Fit a bitermplus BTM model with ``topic_numbers`` topics.

    Relies on the notebook-level ``X``, ``vocabulary`` and ``biterms``
    objects, so the vectorization cell must have been executed before this
    function is called.

    Args:
        topic_numbers: Number of topics (passed as ``T``). Must be non-zero,
            because ``alpha`` is set to ``50 / topic_numbers``.
        iterations: Number of iterations passed to ``model.fit``.
        seed: Random seed passed to ``btm.BTM``.

    Returns:
        The fitted ``btm.BTM`` instance.
    """
    model = btm.BTM(
        X,
        vocabulary,
        seed=seed,
        T=topic_numbers,
        M=20,
        alpha=50 / topic_numbers,
        beta=10,
    )
    model.fit(biterms, iterations=iterations, verbose=False)
    return model

In [None]:
# Whitespace-trimmed documents, as a plain Python list.
stripped_lemmas = data['full_text_lemmas'].str.strip()
texts = stripped_lemmas.tolist()

# Term-frequency sparse matrix and corpus vocabulary; the keyword arguments
# are forwarded to btm.get_words_freqs.
# NOTE(review): the vocabulary includes bigrams (ngram_range=(1, 2)), but
# btm.get_vectorized_docs presumably matches whitespace-split tokens only,
# in which case bigram entries would never appear in docs_vec -- confirm
# against the bitermplus source.
count_vector_kwargs = {
    'ngram_range': (1, 2),
    'min_df': 30,
    'max_features': 1000,
}
X, vocabulary, vocab_dict = btm.get_words_freqs(texts, **count_vector_kwargs)

# Documents encoded with the vocabulary, and the length of each encoding.
docs_vec = btm.get_vectorized_docs(texts, vocabulary)
docs_lens = [len(doc) for doc in docs_vec]

# Biterms generated from the vectorized documents (consumed by train_btm).
biterms = btm.get_biterms(docs_vec)

In [None]:
# Sweep the number of topics from 10 to 30 and record summary statistics of
# the u_mass coherence returned for each fitted model.
cluster_number = []
coherences_std = []
coherences_avg = []
coherences_median = []
for k in tqdm(range(10, 31)):
    model = train_btm(k)
    coherence = metric_coherence_gensim(
        'u_mass',
        top_n=6,
        topic_word_distrib=model.matrix_topics_words_,
        dtm=X,
        vocab=vocabulary,
        texts=data['full_text_lemmas'].values,
    )

    # Add metrics
    cluster_number.append(k)
    coherences_std.append(np.std(coherence))
    coherences_avg.append(np.mean(coherence))
    coherences_median.append(np.median(coherence))

# Results
# Build the frame from named columns so each column keeps its own dtype;
# the previous transposed list-of-lists construction cast the integer
# `cluster_number` column to float.
cluster_optimization_df = pd.DataFrame({
    'cluster_number': cluster_number,
    'coherence_std': coherences_std,
    'coherence_median': coherences_median,
    'coherence_avg': coherences_avg,
})
cluster_optimization_df[['cluster_number', 'coherence_median', 'coherence_avg', 'coherence_std']].sort_values('coherence_avg', ascending=False)

### BTM Inference

In [None]:
NUMBER_OF_TOPICS = 12

# Fit the final model, then label every document with the index of its
# highest-scoring column in the matrix returned by model.transform.
model = train_btm(NUMBER_OF_TOPICS)
doc_topic_scores = model.transform(docs_vec)
data['topic_btm'] = np.argmax(doc_topic_scores, axis=1)

# Number of documents assigned to each topic index.
data['topic_btm'].value_counts()

# Mapping topics

In [None]:
# Human-readable label for each BTM topic index.
btm_topic_maps = {
    0:  'Lula',
    1:  'Economy',
    2:  'Social Issues',
    3:  'Religion',
    4:  'Political Economic',
    5:  'Social Classes',
    6:  'Humor',
    7:  'Corruption',
    8:  'Bolsonaro',
    9:  'Elections',
    10: 'Covid',
    11: 'Science'
}

# Map into a temporary Series and validate it BEFORE overwriting the column.
# Otherwise an unmapped value (for example when this cell is re-run on
# already-labelled data) would replace the topic ids with NaN before the
# check could fire.
mapped_topics = data['topic_btm'].map(btm_topic_maps)
unmapped_mask = mapped_topics.isnull()
if unmapped_mask.any():
    unmapped_values = data.loc[unmapped_mask, 'topic_btm'].unique().tolist()
    raise ValueError(
        'No label in btm_topic_maps for topic value(s): {}'.format(unmapped_values)
    )
data['topic_btm'] = mapped_topics

In [None]:
# Share of rows per topic label, expressed as a percentage.
data['topic_btm'].value_counts(normalize=True).mul(100)

In [None]:
# Interactive tmplot report for the fitted topic model over `texts`.
# NOTE: expects `model` to still be the BTM model returned by train_btm;
# the name is rebound to the toxicity classifier further down the notebook.
tmp.report(model=model, docs=texts, width=200)

## Word cloud

In [None]:
# One word cloud per topic label, laid out on a three-column grid.
topic_labels = np.sort(data['topic_btm'].unique())

n_cols = 3
# Keep the original five-row layout, and grow it only when there are more
# topics than 5 * 3 cells (ceiling division without importing math).
n_rows = max(5, -(-len(topic_labels) // n_cols))

# 3 inches per row gives the previous 17 x 15 figure for the five-row grid.
fig = plt.figure(figsize=(17, 3 * n_rows))
for position, cluster in enumerate(topic_labels, start=1):
    cluster_data = data.loc[data['topic_btm'] == cluster]

    # Join in one pass instead of quadratic repeated concatenation.
    text = ' '.join(cluster_data['full_text_lemmas'])
    word_cloud = WordCloud(collocations=False,
                           background_color='white').generate(text)

    # Draw on the explicit axes rather than the implicit current axes.
    ax = fig.add_subplot(n_rows, n_cols, position)
    ax.imshow(word_cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title('{} ({} videos)'.format(cluster, len(cluster_data)))

# Toxicity analysis

## Toxicity analysis with ToLD-Br

In [None]:
import os
from simpletransformers.classification import ClassificationModel

# Export the model archive path as an environment variable so that the
# (currently commented-out) `!unzip -o "$modelpath"` shell command can use it.
os.environ['modelpath'] = "../model/toxic_bert_model.zip"

In [None]:
#!unzip -o "$modelpath" -d .

In [None]:
# Load the "distilbert" classifier from "toxic_bert_model" (presumably the
# directory extracted from the zip archive -- confirm), with CUDA disabled.
# NOTE: this rebinds `model`, which previously held the fitted BTM model;
# cells that use the BTM model cannot be re-run after this one without
# refitting it.
model = ClassificationModel("distilbert", "toxic_bert_model", use_cuda=False)

In [None]:
# Run the classifier on the `full_text` column, passed as a plain list.
full_texts = data['full_text'].tolist()
predictions, outputs = model.predict(full_texts)

In [None]:
# Store the predicted label, plus the first and second entry of each item
# in `outputs`, as separate columns.
data['is_toxic'] = predictions
for column_name, score_index in (('toxicity_min', 0), ('toxicity_max', 1)):
    data[column_name] = [scores[score_index] for scores in outputs]

### Saving dataset

In [None]:
# Write the dataframe, including the topic and toxicity columns added
# above, to CSV without the index column.
data.to_csv('../data/processed/predictions_btm_whisper.csv', index=False)