In [None]:
import pandas as pd
import numpy as np

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import nltk

In [None]:
# Read the raw CSV export into a DataFrame.
raw_csv_path = '../data/raw/crime_news.csv'
dataset = pd.read_csv(raw_csv_path)

In [None]:
# Select the 'text' column; the later cells work on this Series only.
articles = dataset.loc[:, 'text']

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download("punkt_tab")

# Split every article into Italian sentences, keeping one (possibly empty) list
# per article so that list positions still line up with the rows of `articles`.
# Missing values map to an empty list: str(NaN) would otherwise produce the
# literal sentence "nan", which would then be fed to the topic model.
sentences_per_doc = [
    [] if pd.isna(article) else sent_tokenize(str(article), language='italian')
    for article in articles
]

# Flatten to one sentence per entry and, in lockstep, track the document ID of
# every sentence. The ID is the 0-based position of the article in `articles`
# (enumerate order), not its DataFrame index label.
sentences = []
doc_ids = []
for idx, doc_sentences in enumerate(sentences_per_doc):
    sentences.extend(doc_sentences)
    doc_ids.extend([idx] * len(doc_sentences))

In [None]:
# Sentence-embedding model used for the topic pipeline.
# Alternative checkpoint, currently disabled:
#embedding_model = SentenceTransformer('nickprock/sentence-bert-base-italian-uncased')
embedding_model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
embedding_model = SentenceTransformer(embedding_model_name)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk

# 4 - VECTORIZER
# Download the NLTK stop-word corpus, then load its Italian list.
nltk.download('stopwords')
stop_words_it = stopwords.words('italian')

# Bag-of-words vectorizer: unigrams and bigrams, Italian stop words removed,
# terms kept only when they reach min_df=2.
vectorizer_model = CountVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    stop_words=stop_words_it,
)

In [None]:
from bertopic.vectorizers import ClassTfidfTransformer

# 5 - c-TF-IDF
# Class-based TF-IDF transformer with both optional weighting flags enabled.
ctfidf_model = ClassTfidfTransformer(
    reduce_frequent_words=True,
    bm25_weighting=True,
)

In [None]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, TextGeneration

# Representation models, each built once and registered under its own label.
keybert = KeyBERTInspired()                    # KeyBERT-inspired representation
mmr = MaximalMarginalRelevance(diversity=0.3)  # MMR with diversity set to 0.3

# NOTE: no text-generation (e.g. Llama 3) representation is configured in this cell.

# All representation models, keyed by label.
representation_model = dict(KeyBERT=keybert, MMR=mmr)

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Fixed seed so that a fresh "Restart & Run All" reproduces the same clusters
# (and therefore the same topics); KMeans initialisation is random otherwise.
RANDOM_STATE = 42

# PCA down to 5 components, then KMeans with 50 clusters. Both are handed to
# BERTopic later as its dimensionality-reduction and clustering models.
dim_model = PCA(n_components=5, random_state=RANDOM_STATE)
cluster_model = KMeans(n_clusters=50, random_state=RANDOM_STATE)

In [None]:
# Create BERTopic model
topic_model2 = BERTopic(
    # Pipeline models
    embedding_model=embedding_model,
    umap_model=dim_model,         # dimensionality-reduction slot
    hdbscan_model=cluster_model,  # clustering slot
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    # Hyperparameters
    top_n_words=10,
    verbose=True,
)

# Pre-compute the sentence embeddings. `embeddings` was previously passed to
# fit_transform without ever being defined, which raises NameError on a fresh
# kernel. Keeping the array in a variable also lets later cells reuse it
# without re-encoding the corpus.
embeddings = embedding_model.encode(sentences, show_progress_bar=True)

# Train model
# NOTE(review): `probs` may be None when the clustering model does not expose
# probabilities (as with KMeans) - confirm before relying on it downstream.
topics, probs = topic_model2.fit_transform(sentences, embeddings)

In [None]:
# Topic overview table; the bare name on the last line renders it as cell output.
topic_info = topic_model2.get_topic_info()
topic_info

In [None]:
# Visualize topics.
# NOTE(review): called with default arguments only - no custom-labels option is passed here.
topics_fig = topic_model2.visualize_topics()
topics_fig

In [None]:
# Visualize the topic hierarchy.
# NOTE(review): called with default arguments only - no custom-labels option is passed here.
hierarchy_fig = topic_model2.visualize_hierarchy()
hierarchy_fig

In [None]:
# Topic heatmap, rendered as the cell output.
heatmap_fig = topic_model2.visualize_heatmap()
heatmap_fig

In [None]:
# Bar charts limited to 15 topics, with 5 words shown per topic.
barchart_fig = topic_model2.visualize_barchart(n_words=5, top_n_topics=15)
barchart_fig