In [None]:
import pandas as pd
import numpy as np
import sys
import os
from pathlib import Path
import yaml
from datetime import timedelta
from nltk.corpus import stopwords 
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
import plotly.io as pio
pio.renderers.default='iframe'
import re
from util import *
from hdbscan import HDBSCAN
from ctransformers import AutoModelForCausalLM
from transformers import AutoTokenizer, pipeline
from bertopic.representation import TextGeneration
import random

In [None]:
# Load the raw Deghi reviews from CSV.
# The "§" delimiter is one character but two bytes in UTF-8; pandas' C parser
# does not support such separators and would emit a ParserWarning before
# falling back to the Python parser, so request that engine explicitly.
df_deghi = pd.read_csv("deghi_dataset.csv", sep="§", engine="python")

In [None]:
# Preview the first 10 rows of the freshly loaded frame.
df_deghi.head(10)

In [None]:
def puliscidataset(df):
    """Return a cleaned copy of ``df`` with a normalised ``Text`` column.

    Cleaning steps, applied in this order:
      1. lowercase the text;
      2. replace every character of ``symbols`` (which includes the
         apostrophe) with a space -- sentence punctuation such as "." and ","
         is not in the set and is therefore kept;
      3. replace "€" with "euro ";
      4. drop rows whose text is missing;
      5. remove every run of digits (also inside tokens, e.g. "pe435st200"
         becomes "pest");
      6. drop any remaining all-digit token and collapse repeated whitespace.

    Args:
        df: DataFrame that has a ``Text`` column.

    Returns:
        A new DataFrame containing only the rows with non-null text. The
        input frame is not modified.
    """
    text = df['Text'].str.lower()

    # NOTE(review): the letter "x" is part of this set, so every "x" in the
    # text becomes a space ("box" -> "bo "). Kept as-is to preserve existing
    # behaviour -- confirm whether it is intentional.
    symbols = "£x!\"#$%&()*+-/<=>?@[\\]^_`{|}~\n'"
    for ch in symbols:
        text = text.str.replace(ch, ' ', regex=False)

    text = text.str.replace('€', 'euro ', regex=False)

    # Filter only after the .str steps so that non-string cells (which the
    # .str accessor turns into NaN) are dropped together with missing ones.
    keep = text.notna()
    text = text[keep]

    # The pattern has to be evaluated as a regex: with regex=False pandas
    # looks for the literal characters "\d+" and no digit is ever removed.
    text = text.str.replace(r'\d+', '', regex=True)

    # Drop any remaining token made only of digit characters; split/join also
    # collapses the runs of spaces left behind by the replacements above.
    text = text.apply(lambda s: ' '.join(tok for tok in s.split() if not tok.isdigit()))

    # Build the result on an explicit copy: avoids mutating the caller's frame
    # and the SettingWithCopyWarning raised when assigning onto a slice.
    cleaned = df[keep].copy()
    cleaned['Text'] = text
    return cleaned

In [None]:
# Replace the raw frame with its cleaned version; .copy() makes sure later
# column assignments operate on an independent frame.
df_deghi=puliscidataset(df_deghi).copy()

In [None]:
# Per-review length features: whitespace-separated word count and raw character count.
testi_recensioni = df_deghi["Text"].astype(str)
df_deghi["n_parole"] = testi_recensioni.map(lambda testo: len(testo.split()))
df_deghi["n_caratteri"] = testi_recensioni.map(len)

In [None]:
# Summary statistics of the two length columns.
lunghezze_recensioni = df_deghi[["n_parole", "n_caratteri"]]
print(lunghezze_recensioni.describe())

# How many reviews exist for each word count, shortest first (first 20 distinct lengths).
conteggio_per_lunghezza = df_deghi["n_parole"].value_counts().sort_index()
print(conteggio_per_lunghezza.head(20))

In [None]:
import matplotlib.pyplot as plt

# Histogram of words per review, labelled through the Axes object returned by pandas.
ax = df_deghi["n_parole"].hist(bins=50, figsize=(10, 5))
ax.set_xlabel("Numero di parole per recensione")
ax.set_ylabel("Frequenza")
ax.set_title("Distribuzione lunghezza recensioni Deghi")
plt.show()

In [None]:
import pandas as pd
import spacy

# Load the Italian spaCy pipeline; chunk_text_by_sentences uses it to split text into sentences.
nlp = spacy.load("it_core_news_sm")


def chunk_text_by_sentences(text, max_words=40, min_words=20):
    """Group the sentences of ``text`` into chunks of whitespace-separated words.

    Sentences come from the module-level spaCy pipeline ``nlp``. A chunk is
    closed only when adding the next sentence would push it past ``max_words``
    AND it already holds at least ``min_words`` words; a sentence is never
    split, so a chunk can end up longer than ``max_words``.

    Args:
        text: Value to chunk; it is converted with ``str()`` first.
        max_words: Word count above which the current chunk may be closed.
        min_words: Minimum words a chunk must hold before it can be closed.

    Returns:
        List of chunk strings, each with its words joined by single spaces.
    """
    finished = []
    buffer = []

    for sentence in nlp(str(text)).sents:
        tokens = sentence.text.split()
        would_overflow = len(buffer) + len(tokens) > max_words
        if would_overflow and len(buffer) >= min_words:
            finished.append(" ".join(buffer))
            buffer = []
        buffer += tokens

    # Flush whatever is left after the last sentence.
    if buffer:
        finished.append(" ".join(buffer))
    return finished

# funzione principale
def make_chunked_dataframe(df, text_col="Text", id_col="IdReview", max_words=120, min_words=200):
    """Build a new DataFrame in which long reviews are split into chunks.

    - A review with at most ``min_words`` words is kept as a single row.
    - A longer review is passed to ``chunk_text_by_sentences`` with
      ``max_words`` (only ``max_words`` is forwarded; that function's own
      ``min_words`` default applies).

    Args:
        df: Source DataFrame.
        text_col: Column holding the text; values are converted with ``str()``.
        id_col: Column holding the review identifier, repeated on every chunk.
        max_words: Forwarded to ``chunk_text_by_sentences``.
        min_words: Word-count threshold above which a review is chunked.

    Returns:
        DataFrame with two columns: ``id_col`` and ``"Chunk"``.
    """
    records = []

    for _, review in df.iterrows():
        body = str(review[text_col])
        if len(body.split()) > min_words:
            pieces = chunk_text_by_sentences(body, max_words=max_words)
        else:
            pieces = [body]
        records.extend((review[id_col], piece) for piece in pieces)

    return pd.DataFrame({
        id_col: [review_id for review_id, _ in records],
        "Chunk": [piece for _, piece in records],
    })

# Example run: reviews longer than 10 words are split into sentence-based chunks.
df_chunked = make_chunked_dataframe(
    df_deghi,
    text_col="Text",
    id_col="IdReview",
    max_words=20,
    min_words=10,
)

print(df_chunked.head())
# Compare the size of the frame before and after chunking.
for descrizione, tabella in (
    ("Dimensioni dataframe originale:", df_deghi),
    ("Dimensioni dataframe chunked:", df_chunked),
):
    print(descrizione, tabella.shape)

In [None]:
# Per-chunk length features: whitespace-separated word count and raw character count.
testi_chunk = df_chunked["Chunk"].astype(str)
df_chunked["n_parole"] = testi_chunk.map(lambda testo: len(testo.split()))
df_chunked["n_caratteri"] = testi_chunk.map(len)

In [None]:
# Optional: histogram of the chunk lengths (in words).
import matplotlib.pyplot as plt

df_chunked["n_parole"].hist(bins=50, figsize=(10,5))
# This plot shows chunks, not whole reviews: the labels say so explicitly so
# the figure cannot be confused with the per-review histogram.
plt.xlabel("Numero di parole per chunk")
plt.ylabel("Frequenza")
plt.title("Distribuzione lunghezza chunk recensioni Deghi")
plt.show()

In [None]:
# Summary statistics of the chunk length columns.
statistiche_chunk = df_chunked[["n_parole", "n_caratteri"]].describe()
print(statistiche_chunk)

In [None]:
# Collect the non-null values of the "Chunk" column as a plain list of documents.
chunk_non_nulli = df_chunked["Chunk"].dropna()
docs = chunk_non_nulli.tolist()

In [None]:
#docs

# Traduce le singole note in vettori embeddings

In [None]:
from sentence_transformers import SentenceTransformer
# Multilingual sentence-embedding model; it is later passed to BERTopic and used to encode `docs`.
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
#embeddings = embedding_model.encode(df_chunked["Chunk"].tolist(), show_progress_bar=True)
# Alternative: Italian-only model
#embedding_model = SentenceTransformer("nickprock/sentence-bert-base-italian-uncased")

In [None]:
from bertopic.representation import KeyBERTInspired
# Labelling prompt for the LLM; [DOCUMENTS] and [KEYWORDS] are the placeholder
# tags of BERTopic's TextGeneration prompt format.
# NOTE(review): the chat markers used here (<|system|>, <|user|>, <|assistant|>,
# </s>) follow the Zephyr-style template, while the model loaded below is
# Mistral-Nemo-Instruct -- confirm the template matches the model's own chat format.
prompt = """<|system|>Sei un assistete che analizza documenti da un crm di una azienda di mobili per fornire ad un dirigente aspetti critici e negativi che appaiono nei documenti</s>
<|user|>
I seguenti documenti sono presi da un CRM di assistenza clienti di una azienda che vende mobili d'arredamento:
[DOCUMENTS]

Il tema è descritto dalle seguenti parole chiave:  '[KEYWORDS]'.

Sulla base delle informazioni sopra, puoi fornire una breve etichetta del topic di massimo 5 parole?</s>
<|assistant|>"""


# This import deliberately rebinds AutoModelForCausalLM to the Hugging Face
# transformers class (the first cell imports the same name from ctransformers).
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import pipeline


# Single source of truth for the checkpoint so model and tokenizer cannot drift apart.
LLM_MODEL_ID = "mistralai/Mistral-Nemo-Instruct-2407"

# 8-bit quantisation is requested through BitsAndBytesConfig: passing
# load_in_8bit=True directly to from_pretrained is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_ID,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto"
)

tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)

generator = pipeline(
    model=model, tokenizer=tokenizer,
    task='text-generation',
    max_new_tokens=350,
    repetition_penalty=1.1
)

# Text generation with Mistral-Nemo-Instruct
minstral = TextGeneration(generator, prompt=prompt)
# Two topic representations computed side by side: KeyBERT-style keywords and the LLM label.
representation_model = {"KeyBERT": KeyBERTInspired(),
                        "LLM": minstral}


In [None]:
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction
from sklearn.linear_model import LogisticRegression

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

# Dimensionality reduction applied to the embeddings before clustering.
umap_model = UMAP(n_neighbors=30, n_components=5, min_dist=0.2, metric="cosine", random_state=42)

# Density-based clustering of the reduced embeddings.
hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    min_samples=10,
    cluster_selection_method="eom",
    gen_min_span_tree=True,
    prediction_data=False,
)

# Italian and English stop words, merged into one de-duplicated list.
stop_it = stopwords.words("italian")
stop_en = stopwords.words("english")
stop_words = list(set(stop_it).union(stop_en))

# Bag-of-words and c-TF-IDF settings used to build the topic representations.
vectorizer_model = CountVectorizer(min_df=5, ngram_range=(1, 2), stop_words=stop_words)
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

In [None]:
# Assemble the BERTopic pipeline from the components configured in the cells above.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    verbose=True,
)

In [None]:
# Fit the topic model on the chunk texts and keep the two values it returns
# (per BERTopic's API: the topic assignment of each document and the probabilities).
topics, probs = topic_model.fit_transform(docs)

In [None]:
# Save the topic summary table to disk. The output folder is created first so
# the export does not fail with an OSError when ../result does not exist yet.
topic_info_path = Path('../result/topic_info.csv')
topic_info_path.parent.mkdir(parents=True, exist_ok=True)
topic_model.get_topic_info().to_csv(topic_info_path)

In [None]:
# Display the topic summary table inline.
topic_model.get_topic_info()

In [None]:
# Build and render BERTopic's hierarchy figure for the fitted topics.
fig = topic_model.visualize_hierarchy()
fig.show()

In [None]:
import datamapplot
import re

In [None]:
# Encode every document with the sentence-embedding model; these vectors feed the 2-D projection used for plotting.
embeddings = embedding_model.encode(docs, show_progress_bar=True)

In [None]:
# Separate 2-D UMAP projection of the document embeddings, used only for plotting.
umap_2d = UMAP(
    n_neighbors=15,
    n_components=2,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)
reduced_embeddings = umap_2d.fit_transform(embeddings)

In [None]:
def _clean_llm_label(raw_label):
    """Turn raw LLM output into a short plot label.

    Surrounding whitespace is stripped first, so a leading newline in the
    generation does not yield an empty first line. Double quotes are removed,
    only the first line is kept, and every run of non-word characters becomes
    a single space. Returns "Unlabelled" when nothing is left.
    """
    first_line = raw_label.replace('"', '').strip().split("\n")[0]
    return re.sub(r'\W+', ' ', first_line).strip() or "Unlabelled"


# Create a label for each topic, keyed by topic id. Looking labels up by id
# avoids positional indexing through the private `topic_model._outliers`
# offset, which silently mislabels documents if the ordering assumption breaks.
llm_labels_by_topic = {
    topic_id: _clean_llm_label(representation[0][0])
    for topic_id, representation in topic_model.get_topics(full=True)["LLM"].items()
}
# Same list of cleaned labels, in topic order, that this cell has always exposed.
llm_labels = list(llm_labels_by_topic.values())

# Create a label for each document; outliers (topic -1) stay "Unlabelled".
all_labels = [
    llm_labels_by_topic.get(topic, "Unlabelled") if topic != -1 else "Unlabelled"
    for topic in topics
]

# Run the visualization
datamapplot.create_plot(
    reduced_embeddings,
    all_labels,
    label_font_size=11,
    title="Deghi - BERTopic",
    sub_title="Topics generati con  mistralai/Mistral-Nemo-Instruct-2407",
    label_wrap_width=20,
    use_medoids=True,
    #logo=bertopic_logo,
    #logo_width=0.16
)