In [1]:
import pandas as pd
import random
import numpy as np
import hdbscan
from umap import UMAP
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from hyperopt import hp
from hyperopt import fmin, tpe, STATUS_OK, space_eval, Trials
from functools import partial

In [2]:
#docs =  pd.read_csv('./datasets/antivaxxers/antivaxxers_processed.csv')
docs =  pd.read_csv('./datasets/provaxxers/provaxxers_processed.csv')

In [3]:
docs

Unnamed: 0,created_at,id,author_id,in_reply_to_user_id,text
0,2020-03-11T01:40:07.000Z,1237553858320031744,2276964030,,attention people if we believe the is as bad a...
1,2020-03-11T01:42:55.000Z,1237554565580382209,1129184099070074880,,lest we forget except when they don t and here...
2,2020-03-11T02:00:12.000Z,1237558915090374656,16846937,,in global citizens convinced to help protect c...
3,2020-03-11T02:48:00.000Z,1237570942097907712,88914175,,just published maike morrison and coworkers st...
4,2020-03-11T02:50:44.000Z,1237571629313437696,1187401190,8.108359e+17,so many irl who aren t on twitter currently mu...
...,...,...,...,...,...
673801,2022-04-05T22:59:33.000Z,1511478678093541385,376817911,,s typhoid conjugate vaccine tcv campaign kicks...
673802,2022-04-05T23:01:17.000Z,1511479115496439812,310403500,,when it came to vaccinating our son against co...
673803,2022-04-05T23:54:47.000Z,1511492578272854016,1389743442738229251,,vax misinformation debunk by immunology prof d...
673804,2022-04-05T23:54:47.000Z,1511492578272854016,1389743442738229251,,vax misinformation debunk by immunology prof d...


In [4]:
import torch

if torch.cuda.is_available():        
    device = torch.device("cuda")    
    print('There are %d GPU(s) available.' % torch.cuda.device_count())    
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 2060 SUPER


In [5]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-mpnet-base-v2", device="cuda")

In [6]:
embeddings = model.encode(docs.text, show_progress_bar=True,convert_to_tensor=False)

Batches:   0%|          | 0/21057 [00:00<?, ?it/s]

In [7]:
def generate_clusters(embeddings,
                      n_neighbors,
                      n_components, 
                      min_cluster_size,
                      random_state = None):
    """
    Generate HDBSCAN cluster object after reducing embedding dimensionality with UMAP
    """
    
    umap_embeddings = (UMAP(n_neighbors=n_neighbors, 
                                n_components=n_components, 
                                metric='cosine', 
                                random_state=random_state)
                                .fit_transform(embeddings))

    clusters = hdbscan.HDBSCAN(min_cluster_size = min_cluster_size,
                               metric='euclidean', 
                               cluster_selection_method='eom').fit(umap_embeddings)

    return clusters

In [8]:
def score_clusters(clusters, prob_threshold = 0.05):
    """
    Returns the label count and cost of a given cluster supplied from running hdbscan
    """
    
    cluster_labels = clusters.labels_
    label_count = len(np.unique(cluster_labels))
    total_num = len(clusters.labels_)
    cost = (np.count_nonzero(clusters.probabilities_ < prob_threshold)/total_num)
    
    return label_count, cost

In [9]:
def objective(params, embeddings, label_lower, label_upper):
    """
    Objective function for hyperopt to minimize, which incorporates constraints
    on the number of clusters we want to identify
    """
    
    clusters = generate_clusters(embeddings, 
                                 n_neighbors = params['n_neighbors'], 
                                 n_components = params['n_components'], 
                                 min_cluster_size = params['min_cluster_size'],
                                 random_state = params['random_state'])
    
    label_count, cost = score_clusters(clusters, prob_threshold = 0.05)
    
    #15% penalty on the cost function if outside the desired range of groups
    if (label_count < label_lower) | (label_count > label_upper):
        penalty = 0.15 
    else:
        penalty = 0
    
    loss = cost + penalty
    
    return {'loss': loss, 'label_count': label_count, 'status': STATUS_OK}

In [10]:
def bayesian_search(embeddings, space, label_lower, label_upper, max_evals=100):
    """
    Perform bayseian search on hyperopt hyperparameter space to minimize objective function
    """
    
    trials = Trials()
    fmin_objective = partial(objective, embeddings=embeddings, label_lower=label_lower, label_upper=label_upper)
    best = fmin(fmin_objective,  
                space = space, 
                algo=tpe.suggest,
                max_evals=max_evals, 
                trials=trials)

    best_params = space_eval(space, best)
    print ('best:')
    print (best_params)
    print (f"label count: {trials.best_trial['result']['label_count']}")
    
    best_clusters = generate_clusters(embeddings, 
                                      n_neighbors = best_params['n_neighbors'], 
                                      n_components = best_params['n_components'], 
                                      min_cluster_size = best_params['min_cluster_size'],
                                      random_state = best_params['random_state'])
    
    return best_params, best_clusters, trials

In [11]:
hspace = {
    'n_neighbors': hp.choice('n_neighbors',range(12,16)),
    'n_components': hp.choice('n_components',range(3,8)),
    'min_cluster_size': hp.choice('min_cluster_size',range(35,100)),
    'random_state':42
}

label_lower=500
label_upper=1000
max_evals = 100

In [None]:
best_params, best_clusters, trials = bayesian_search(embeddings,
                                                     space=hspace,
                                                     label_lower=label_lower,
                                                     label_upper=label_upper,
                                                     max_evals=max_evals)

  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
  warn(



  3%|▎         | 3/100 [8:27:04<277:17:43, 10291.37s/trial, best loss: 0.6165201853352449]

In [None]:
len(best_clusters.probabilities_[best_clusters.probabilities_ == 0])


In [None]:
def clustering_documents(docs,cluster_labels):
    docs_df = pd.DataFrame(docs, columns=["text"])
    docs_df['cluster'] = cluster_labels
    docs_df['doc_id'] = range(len(docs_df))
    docs_per_topic = docs_df.groupby(['cluster'], as_index = False).agg({'text': ' '.join})
    return docs_df, docs_per_topic

In [187]:
docs_df, docs_per_topic = clustering_documents(docs,best_clusters.labels_)

In [223]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import nltk

stop_words = nltk.corpus.stopwords.words("portuguese")   

def c_tf_idf(documents, m, ngram_range=(3, 3)):
    count = CountVectorizer(ngram_range=ngram_range,stop_words=stop_words ).fit(documents)
    t = count.transform(documents).toarray()
    w = t.sum(axis=1)
    tf = np.divide(t.T, w)
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count
  
c_tf_idf, count = c_tf_idf(docs_per_topic.text.values, m=len(docs))

def extract_top_n_words_per_topic(c_tf_idf, count, docs_per_topic, n=10):
    words = count.get_feature_names()
    labels = list(docs_per_topic.cluster)
    tf_idf_transposed = c_tf_idf.T
    indices = tf_idf_transposed.argsort()[:, -n:]
    top_n_words = {label: [(words[j], tf_idf_transposed[i][j]) for j in indices[i]][::-1] for i, label in enumerate(labels)}
    return words, top_n_words

def extract_topic_sizes(df):
    topic_sizes = (df.groupby(['cluster'])
                     .text
                     .count()
                     .reset_index()
                     .rename({"cluster": "topic", "text": "size"}, axis='columns')
                     .sort_values("size", ascending=False))
    return topic_sizes

words,top_n_words = extract_top_n_words_per_topic(c_tf_idf, count, docs_per_topic, n=5)
topic_sizes = extract_topic_sizes(docs_df); 

In [224]:
topic_sizes

Unnamed: 0,topic,size
0,-1,7252
39,38,3615
46,45,3048
17,16,958
38,37,618
28,27,390
36,35,333
47,46,331
12,11,322
42,41,312


In [119]:
import collections


def  most_common(lst, n_words):
        """
        Return most common n words in list of words
        Arguments:
            lst: list of words
            n_words: int, number of top words by frequency to return
        Returns:
            counter.most_common(n_words): a list of the n most common elements
                                          and their counts from the most
                                          common to the least
        """

        counter = collections.Counter(lst)

        return counter.most_common(n_words)

In [124]:
import spacy


try:
     #nlp = spacy.load("en_core_web_sm")
    nlp = spacy.load("pt_core_news_sm")
except OSError:
    print("Downloading language model for the spaCy dependency parser\n"
                  "(only required the first time this is run)\n")
    from spacy.cli import download
    #download("en_core_web_sm")
    download("pt_core_news_sm")
    #nlp = spacy.load("en_core_web_sm")
    nlp =spacy.load("pt_core_news_sm")

def extract_labels(category_docs):
    """
    Extract labels from documents in the same cluster by concatenating
    most common verbs, ojects, and nouns
    """

    verbs = []
    dobjs = []
    nouns = []
    adjs = []
    
    verb = ''
    dobj = ''
    noun1 = ''
    noun2 = ''

    # for each document, append verbs, dobs, nouns, and adjectives to 
    # running lists for whole cluster
    for i in range(len(category_docs)):
        doc = nlp(category_docs[i])
        for token in doc:
            if token.is_stop==False:
                if token.dep_ == 'ROOT':
                    verbs.append(token.text.lower())

                elif token.dep_=='dobj':
                    dobjs.append(token.lemma_.lower())

                elif token.pos_=='NOUN':
                    nouns.append(token.lemma_.lower())
                    
                elif token.pos_=='ADJ':
                    adjs.append(token.lemma_.lower())
    
    # take most common words of each form
    if len(verbs) > 0:
        verb = most_common(verbs, 1)[0][0]
    
    if len(dobjs) > 0:
        dobj = most_common(dobjs, 1)[0][0]
    
    if len(nouns) > 0:
        noun1 = most_common(nouns, 1)[0][0]
    
    if len(set(nouns)) > 1:
        noun2 = most_common(nouns, 2)[1][0]
    
    # concatenate the most common verb-dobj-noun1-noun2 (if they exist)
    label_words = [verb, dobj]
    
    for word in [noun1, noun2]:
        if word not in label_words:
            label_words.append(word)
    
    if '' in label_words:
        label_words.remove('')
    
    label = '_'.join(label_words)
    
    return label

Downloading language model for the spaCy dependency parser
(only required the first time this is run)

✔ Download and installation successful
You can now load the package via spacy.load('pt_core_news_sm')


In [210]:
cluster_labels = np.unique(best_clusters.labels_)

label_dict = {}
for label in cluster_labels:
    cluster =  pd.DataFrame(docs_df[docs_df.cluster==label].text)
    cluster = cluster.reset_index().drop(columns=["index"])
    label_dict[label] = extract_labels(cluster.text)

KeyboardInterrupt: 

In [None]:
import numpy as np
import pandas as pd
from umap import UMAP
from typing import List
from sklearn.preprocessing import MinMaxScaler

import plotly.express as px
import plotly.graph_objects as go


def visualize_topics(topic_model,
                     topics: List[int] = None,
                     top_n_topics: int = None,
                     width: int = 650,
                     height: int = 650) -> go.Figure:
    """ Visualize topics, their sizes, and their corresponding words
    This visualization is highly inspired by LDAvis, a great visualization
    technique typically reserved for LDA.
    Arguments:
        topic_model: A fitted BERTopic instance.
        topics: A selection of topics to visualize
        top_n_topics: Only select the top n most frequent topics
        width: The width of the figure.
        height: The height of the figure.
    Usage:
    To visualize the topics simply run:
    ```python
    topic_model.visualize_topics()
    ```
    Or if you want to save the resulting figure:
    ```python
    fig = topic_model.visualize_topics()
    fig.write_html("path/to/file.html")
    ```
    <iframe src="../../getting_started/visualization/viz.html"
    style="width:1000px; height: 680px; border: 0px;""></iframe>
    """
    # Select topics based on top_n and topics args
    if topics is not None:
        topics = list(topics)
    elif top_n_topics is not None:
        topics = sorted(topic_model.get_topic_freq().Topic.to_list()[1:top_n_topics + 1])
    else:
        topics = sorted(list(topic_model.get_topics().keys()))

    # Extract topic words and their frequencies
    topic_list = sorted(topics)
    frequencies = [topic_model.topic_sizes[topic] for topic in topic_list]
    words = [" | ".join([word[0] for word in topic_model.get_topic(topic)[:5]]) for topic in topic_list]

    # Embed c-TF-IDF into 2D
    all_topics = sorted(list(topic_model.get_topics().keys()))
    indices = np.array([all_topics.index(topic) for topic in topics])
    embeddings = topic_model.c_tf_idf.toarray()[indices]
    embeddings = MinMaxScaler().fit_transform(embeddings)
    embeddings = UMAP(n_neighbors=2, n_components=2, metric='hellinger').fit_transform(embeddings)

    # Visualize with plotly
    df = pd.DataFrame({"x": embeddings[1:, 0], "y": embeddings[1:, 1],
                       "Topic": topic_list[1:], "Words": words[1:], "Size": frequencies[1:]})
    return _plotly_topic_visualization(df, topic_list, width, height)


def _plotly_topic_visualization(df: pd.DataFrame,
                                topic_list: List[str],
                                width: int,
                                height: int):
    """ Create plotly-based visualization of topics with a slider for topic selection """

    def get_color(topic_selected):
        if topic_selected == -1:
            marker_color = ["#B0BEC5" for _ in topic_list[1:]]
        else:
            marker_color = ["red" if topic == topic_selected else "#B0BEC5" for topic in topic_list[1:]]
        return [{'marker.color': [marker_color]}]

    # Prepare figure range
    x_range = (df.x.min() - abs((df.x.min()) * .15), df.x.max() + abs((df.x.max()) * .15))
    y_range = (df.y.min() - abs((df.y.min()) * .15), df.y.max() + abs((df.y.max()) * .15))

    # Plot topics
    fig = px.scatter(df, x="x", y="y", size="Size", size_max=40, template="simple_white", labels={"x": "", "y": ""},
                     hover_data={"Topic": True, "Words": True, "Size": True, "x": False, "y": False})
    fig.update_traces(marker=dict(color="#B0BEC5", line=dict(width=2, color='DarkSlateGrey')))

    # Update hover order
    fig.update_traces(hovertemplate="<br>".join(["<b>Topic %{customdata[0]}</b>",
                                                 "Words: %{customdata[1]}",
                                                 "Size: %{customdata[2]}"]))

    # Create a slider for topic selection
    steps = [dict(label=f"Topic {topic}", method="update", args=get_color(topic)) for topic in topic_list[1:]]
    sliders = [dict(active=0, pad={"t": 50}, steps=steps)]

    # Stylize layout
    fig.update_layout(
        title={
            'text': "<b>Intertopic Distance Map",
            'y': .95,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(
                size=22,
                color="Black")
        },
        width=width,
        height=height,
        hoverlabel=dict(
            bgcolor="white",
            font_size=16,
            font_family="Rockwell"
        ),
        xaxis={"visible": False},
        yaxis={"visible": False},
        sliders=sliders
    )

    # Update axes ranges
    fig.update_xaxes(range=x_range)
    fig.update_yaxes(range=y_range)

    # Add grid in a 'plus' shape
    fig.add_shape(type="line",
                  x0=sum(x_range) / 2, y0=y_range[0], x1=sum(x_range) / 2, y1=y_range[1],
                  line=dict(color="#CFD8DC", width=2))
    fig.add_shape(type="line",
                  x0=x_range[0], y0=sum(y_range) / 2, x1=x_range[1], y1=sum(y_range) / 2,
                  line=dict(color="#9E9E9E", width=2))
    fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10)
    fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10)
    fig.data = fig.data[::-1]

    return fig