In [1]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
import umap
from umap import UMAP
import hdbscan
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import TfidfTransformer
import optuna
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
import shap
import numpy as np
import pandas as pd
import pickle

In [2]:
dftweets = pd.read_json('../luna_tweets.json')

In [3]:
tweets = dftweets["text"]

In [4]:
dftweets

Unnamed: 0,uid,tid,text,lang,created_at
0,0,1092017,Let’s be real. Most young people in Kenya are ...,en,2021-05-31 08:09:44
1,0,18925442,"#RETWEET!!\n\nA friend sent me this, someone o...",en,2019-08-06 17:30:22
2,0,18745080,Wuhan throws a pool party concert after 3 mont...,en,2020-08-19 00:31:47
3,0,18719812,"Hey guys, post your business/hustle on the com...",en,2019-03-22 07:13:13
4,0,18719462,The UK’s first socially distanced gig. In Ken...,en,2020-08-12 18:17:15
...,...,...,...,...,...
2183556,19185129,19185667,ICFAI Business School | IBS Online Selection P...,en,2022-02-23 10:22:40
2183557,19185129,19185659,ICFAI Business School | IBS Online Selection P...,en,2022-02-23 05:09:28
2183558,19185129,19185558,Congratulations to Ms. Arti For Securing Rs.20...,en,2022-04-06 08:47:29
2183559,19185129,19185180,Last few days left to Register for IBSAT Natio...,en,2022-09-23 12:31:20


Embeddings

In [14]:
#Separar los tweets en tandas para un procesado más rápido
tweets1 = tweets[0:436712]

tweets2 = tweets[436712:873424]
tweets2 = tweets2.reset_index(drop=True)

tweets3 = tweets[873424:1310136]
tweets3 = tweets3.reset_index(drop=True)

tweets4 = tweets[1310136:1746848]
tweets4 = tweets4.reset_index(drop=True)

tweets5 = tweets[1746848:2182355]
tweets5 = tweets5.reset_index(drop=True)

In [None]:
embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
embeddings1 = embedding_model.encode(tweets1, show_progress_bar=True)

In [None]:
embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
embeddings2 = embedding_model.encode(tweets2, show_progress_bar=True)

In [None]:
embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
embeddings3 = embedding_model.encode(tweets3, show_progress_bar=True)

In [None]:
embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
embeddings4 = embedding_model.encode(tweets4, show_progress_bar=True)

In [None]:
embedding_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
embeddings5 = embedding_model.encode(tweets5, show_progress_bar=True)

In [None]:
#Concatenación de los embeddings
embeddings12 = np.concatenate((embeddings1, embeddings2), axis=0)
embeddings123 = np.concatenate((embeddings12, embeddings3), axis=0)
embeddings1234 = np.concatenate((embeddings123, embeddings4), axis=0)
embeddings = np.concatenate((embeddings1234, embeddings5), axis=0)

In [None]:
# Use pickle.load to load the embeddings from the file
file_path = 'embeddings.pkl'

#to load
#with open(file_path, 'rb') as file:
    #embeddings = pickle.load(file)
    
#to create
with open(file_path, 'wb') as file:
    pickle.dump(embeddings, file)


Clustering de tweets

In [12]:
#Hiperparámetros de UMAP y HDBScan para introducir en BERTopic
umap_model = UMAP(n_neighbors=15, n_components=25, min_dist=0.0, metric='cosine', n_jobs=1, random_state=42)
hdbscan_model = HDBSCAN(min_cluster_size=150, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
topic_model = BERTopic(umap_model=umap_model, hdbscan_model=hdbscan_model)

In [None]:
topics, probs = topic_model.fit_transform(tweets, embeddings)

1057 tópicos. 1167621 tweets de ruido (tópico -1). 
Tarda unos cuatro días en ejecutarse.

In [None]:
topic_info=topic_model.get_topic_info()
topic_info

In [None]:
# Count vectorizer 
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(max_df=0.9, min_df=0.05)
topic_model.update_topics(tweets, vectorizer_model=vectorizer_model)

In [None]:
topic_model.get_topic_info()

Clustering de usuarios

In [None]:
dftweets["topic"] = topics

In [None]:
dftweetsSR = dftweets[dftweets['topic'] != -1]
dftweetsSR.reset_index(drop=True)

In [None]:
# Pivotar el dataframe para contar el número de tweets por usuario por tópico
pivot_df = dftweetsSR.pivot_table(index='uid', columns='topic', aggfunc='size', fill_value=0)

# Renombrar columnas
pivot_df.columns = ['topic_' + str(col) for col in pivot_df.columns]

# Uid es columna en vez de índice
pivot_df.reset_index(inplace=True)

In [None]:
# Comprobar que no hay usuarios que no hayan publicado ningún tweet en ningún tópico
uids_with_zero_tweets = pivot_df.index[pivot_df.sum(axis=1) == 0]

if len(uids_with_zero_tweets) > 0:
    print("Hay estos usuarios con cero tweets para cada tópico:")
    print(uids_with_zero_tweets)
else:
    print("No hay usuarios con cero tweets para cada tópico")

In [None]:
#Comprobar si hay tópicos sin ningún tweet
no_tweets_columns = pivot_df.columns[(pivot_df == 0).all()]

if no_tweets_columns.empty:
    print("Todos los tópicos tienen al menos un tweet")
else:
    print("Tópicos sin tweets:", list(no_tweets_columns))

In [None]:
# Quitar y guardar la columna de uid para el procesado
uid = pivot_df.pop('uid')

TF-IDF

In [None]:
tfidf_transformer = TfidfTransformer()
tfidf_matrix = tfidf_transformer.fit_transform(pivot_df)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=pivot_df.columns)

In [None]:
# Crear un array con los valores del dataframe
pivot_array = pivot_df.values
pivot_array

In [None]:
# Crear un array con los valores del dataframe
tfidf_array = tfidf_df.values
tfidf_array

Optuna

In [None]:
# Se define la optimización de parámetros
def objective(trial):
    # UMAP
    n_neighbors = trial.suggest_int("n_neighbors", 5, 50)
    n_components = trial.suggest_int("n_components", 5, 30)
    min_dist = trial.suggest_float("min_dist", 0.00, 0.5)
    metricUMAP = trial.suggest_categorical("metricUMAP", ["cosine", "euclidean"])
    
    #hdbscan
    min_cluster_size = trial.suggest_int("min_cluster_size", 10, 500, step=5)
    min_samples = trial.suggest_int("min_samples", 1, 100, step=5)
    metricHDB = trial.suggest_categorical("metricHDB", ["l2", "euclidean"])
    cluster_selection_method = trial.suggest_categorical("cluster_selection_method", ["eom", "leaf"])

    umap_model = UMAP(n_neighbors=n_neighbors, n_components=n_components, min_dist=min_dist, metric=metricUMAP)

    hdbscan_model = HDBSCAN(min_cluster_size=min_cluster_size, min_samples=min_samples, metric=metricHDB, cluster_selection_method=cluster_selection_method)

    embeddings_umap = umap_model.fit_transform(tfidf_array)
    clusters = hdbscan_model.fit_predict(embeddings_umap)

    calinski_harabasz_score = metrics.calinski_harabasz_score(tfidf_array, hdbscan_model.labels_)
    
    return calinski_harabasz_score

In [None]:
# calinski_harabasz se maximiza
study = optuna.create_study(direction="maximize")  

In [None]:
study.optimize(objective, n_trials=100)

In [None]:
optuna.visualization.plot_param_importances(study)

In [None]:
# Mejores hiperparámetros
best_params = study.best_params
best_value = study.best_value
result_text = f"Mejores hiperparámetros con calinski_harabasz_score {best_value}: {best_params}"

"Best Hyperparameters with value calinski_harabasz_score 197.88977056021812: {'n_neighbors': 46, 'n_components': 24, 'min_dist': 0.26992862633318315, 'metricUMAP': 'cosine', 'min_cluster_size': 320, 'min_samples': 86, 'metricHDB': 'euclidean', 'cluster_selection_method': 'eom'}"

Counter({-1: 5669, 6: 3852, 3: 1257, 4: 1052, 1: 702, 0: 681, 5: 560, 2: 495})

Clustering de usuarios

In [None]:
umap_modelU = UMAP(n_neighbors=46, n_components=24, min_dist=0.26992862633318315, metric='euclidean', n_jobs=1, random_state=42)
embeddings_umapU = umap_model.fit_transform(tfidf_array)

In [None]:
hdbscan_modelU = HDBSCAN(min_cluster_size=320, min_samples=86, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
clusters = hdbscan_modelU.fit_predict(embeddings_umapU)

In [None]:
clusters_counter=Counter(hdbscan_modelU.labels_)
clusters_counter

In [None]:
# Se vuelve a introducir la columna uid
pivot_df.insert(0, 'uid', uid)

In [None]:
# Se añade la columna clusters para poder hacer consultas
pivot_df.insert(1, 'cluster', clusters)

In [None]:
pivot_df.query("cluster==1")

In [None]:
# Asignar el nombre correspondiente a cada tópico

name_values = topic_info['Name'].tolist()
new_column_names = pivot_df_clusters.columns[:2].tolist() + [f"topic{name}" for name in name_values]

pivot_df.columns = new_column_names
pivot_df

Comunidades

In [None]:
# Lista de tópicos más populares por cluster
cluster_topic_sums = pivot_df.groupby('cluster').sum()

most_popular_topics = {}
for cluster, group in cluster_topic_sums.iterrows():
    top_topics = group.nlargest(5).dropna()
    most_popular_topics[cluster] = top_topics

# Display the results
for cluster, topics in most_popular_topics.items():
    print(f"Cluster {cluster}:")
    for topic, count in topics.items():
        print(f"  {topic}: {count} tweets")
         print("\n")