<a href="https://colab.research.google.com/github/livio-24/test-deduplication/blob/main/sbert_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# This cell only works inside Google Colab (it imports `google.colab`).
from google.colab import drive
drive.mount('/content/drive')  # Only if dataset is stored in Google Drive


Mounted at /content/drive


In [None]:
# ========================
# 🔥 1️⃣ SETUP & INSTALL DEPENDENCIES
# ========================

# Install required libraries
!pip install -q sentence-transformers pandas numpy


In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

In [None]:
# ========================
# 📂 2️⃣ LOAD DATASET
# ========================

# Load dataset (modify path if needed)
file_path = "/content/drive/MyDrive/configuration_2/dataset.csv"  # Adjust to your file location
df = pd.read_csv(file_path)

# Keep only the first 100 rows. `.copy()` makes `df` an independent frame:
# a bare slice stays linked to the full dataset, so adding a column to it
# later (e.g. `df["text_short"] = ...`) can trigger SettingWithCopyWarning.
df = df.iloc[:100].copy()

# TEST 1 - embedding title + description

In [None]:
# Show dataset preview
print("Dataset Sample:")
# A bare `df.head()` is rendered only when it is the last expression of a cell,
# so it is displayed explicitly here (`display` is provided by IPython).
display(df.head())

# Check dataset columns
print("\nDataset Columns:", df.columns)

# Ensure 'title' and 'description' columns exist
if "title" not in df.columns or "description" not in df.columns:
    raise ValueError("Dataset must contain 'title' and 'description' columns.")

# ========================
# EMBED TITLES & DESCRIPTIONS USING SBERT
# ========================

# Load the SBERT model (the multilingual checkpoint is kept commented out as an alternative)
#model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
model = SentenceTransformer('nickprock/sentence-bert-base-italian-uncased')

# Combine 'title' and 'description' for richer representation.
# Missing values are replaced by "" first: in pandas, str + NaN is NaN, so one
# missing field would otherwise discard the other field of that row as well.
df["text_short"] = df["title"].fillna("") + ", " + df["description"].fillna("")

# Convert text into embeddings
embeddings = model.encode(df["text_short"].tolist(), batch_size=32, show_progress_bar=True, convert_to_numpy=True)

# Normalize embeddings to unit length so that dot products equal cosine similarities
embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

# (N, D): N = number of articles, D = embedding size of the loaded model
print("Embedding shape:", embeddings.shape)



In [None]:
# Save embeddings to a .npy file (NumPy format).
# Saving is opt-in, and the success message is printed only when the file has
# actually been written (previously it was printed although nothing was saved).
SAVE_EMBEDDINGS = False
EMBEDDINGS_PATH = "/content/drive/MyDrive/tesi/crime_article_embeddings.npy"

if SAVE_EMBEDDINGS:
    np.save(EMBEDDINGS_PATH, embeddings)
    print("Embeddings saved successfully to 'crime_article_embeddings.npy'")
else:
    print("Embeddings not saved (set SAVE_EMBEDDINGS = True to write them to disk)")


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise similarity of all articles: matrix product of the embeddings with
# their own transpose (equals cosine similarity for unit-length rows).
cosine_sim_matrix = embeddings @ embeddings.T

print("✅ Cosine Similarity Matrix Shape:", cosine_sim_matrix.shape)  # Should be (N, N)


✅ Cosine Similarity Matrix Shape: (5067, 5067)


In [None]:
SIMILARITY_THRESHOLD = 0.75  # Adjust based on needs

# Number of articles to compare: limited to the rows present in both `df` and the
# similarity matrix, so a dataset with fewer rows than expected cannot cause an
# out-of-range index.
N = min(len(df), cosine_sim_matrix.shape[0])

# Boolean mask of pairs above the threshold, restricted to the strict upper
# triangle (i < j): this skips self-matches and mirrored (j, i) duplicates.
above_threshold = np.triu(cosine_sim_matrix[:N, :N] > SIMILARITY_THRESHOLD, k=1)

# np.nonzero returns the matching (i, j) positions in row-major order, i.e. the
# same order a nested `for i` / `for j` loop would visit them.
first_idx, second_idx = np.nonzero(above_threshold)

# Read the text column once instead of calling df.iloc for every pair
texts = df["text_short"].tolist()
near_duplicates = [
    {
        "Title 1": texts[i],
        "Title 2": texts[j],
        "Similarity Score": round(float(cosine_sim_matrix[i, j]), 3),
    }
    for i, j in zip(first_idx, second_idx)
]

# Convert to DataFrame & Show Results.
# Explicit columns keep the frame's schema stable even when no pair is found.
duplicates_df = pd.DataFrame(near_duplicates, columns=["Title 1", "Title 2", "Similarity Score"])
print("\n🔍 Sample Near-Duplicate Articles:")
#print(duplicates_df.head(10))

# Save results to CSV (optional)
#duplicates_df.to_csv("/content/near_duplicate_articles.csv", index=False)

# The CSV export above is disabled, so the message reports the pair count
# instead of claiming that a file was saved.
print(f"\n✅ Near-duplicate detection completed! Found {len(duplicates_df)} candidate pairs.")


In [None]:
# Render the near-duplicate pairs collected in `duplicates_df`
duplicates_df

Unnamed: 0,Title 1,Title 2,Similarity Score
0,"Omicidio a Santeramo vicino Bari, uomo ucciso ...",Trovato morto in casa a Padova con un colpo di...,0.763
1,"Andrea Prospero trovato morto a Perugia, i sos...","Andrea Prospero trovato morto a Perugia, stude...",0.951
2,"Omicidio di Maati Moubakir a Campi Bisenzio, l...","Omicidio del 17enne a Campi Bisenzio, tre giov...",0.804
3,Svolta nell'omicidio Francesco Marando a Boval...,"Omicidio di Marco Magagna a Bovisio Masciago, ...",0.778
4,"Neonato morto a Bari in culla termica, scopert...","Neonato trovato morto a Bari in culla, autopsi...",0.84
5,"Alex Cotoia assolto per l'omicidio del padre, ...",Alex Cotoia assolto in Appello a Torino per l'...,0.769
6,"Duplice femminicidio ma niente ergastolo, sent...","Salvatore Montefusco evita l'ergastolo, doppio...",0.805
7,Franco Dogna ucciso a coltellate in casa a Bar...,"Omicidio Franco Dogna a Bari, trovato morto ac...",0.872
8,"Cani azzannano e uccidono una donna a Latina, ...",Donna morta azzannata dai cani a Latina nel gi...,0.808
9,Ergastolo scampato per Salvatore Montefusco do...,"Salvatore Montefusco evita l'ergastolo, doppio...",0.758


In [None]:
# Download the Italian spaCy pipeline. The installer output notes that the runtime
# may need a restart before spacy.load('it_core_news_sm') can load it.
!python -m spacy download it_core_news_sm


Collecting it-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/it_core_news_sm-3.7.0/it_core_news_sm-3.7.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m78.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: it-core-news-sm
Successfully installed it-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('it_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


# TEST 2 - embedding singoli per title, description e text. Calcolo valore di similarità combinando i vari embedding

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import spacy
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the spaCy pipeline for Italian
nlp = spacy.load("it_core_news_sm")

# Load the SBERT model; the multilingual checkpoint is left commented out and the
# `sentence-bert-base-italian-uncased` checkpoint is the one actually used.
#model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
model = SentenceTransformer('nickprock/sentence-bert-base-italian-uncased')

# Function returning the mean of the sentence embeddings of a text
def get_average_sentence_embedding(text, model, nlp_pipeline=None):
    """Embed ``text`` as the element-wise mean of its sentence embeddings.

    Args:
        text: Text to embed. Non-string values (e.g. NaN coming from an empty
            CSV cell) are treated as empty text.
        model: Sentence encoder exposing ``encode`` and
            ``get_sentence_embedding_dimension`` (e.g. a ``SentenceTransformer``).
        nlp_pipeline: Callable that takes a string and returns an object whose
            ``.sents`` yields items with a ``.text`` attribute. Defaults to the
            module-level spaCy pipeline ``nlp``.

    Returns:
        A 1-D NumPy array: the mean of the sentence embeddings, or a zero
        vector of the model's embedding size when the text contains no
        non-blank sentence.
    """
    splitter = nlp if nlp_pipeline is None else nlp_pipeline

    # A missing value is not a string and cannot be sentence-split
    if not isinstance(text, str):
        text = ""

    # Split the text into sentences, skipping whitespace-only ones (they carry
    # no content and would only skew the mean)
    sentences = [sent.text for sent in splitter(text).sents if sent.text.strip()]

    # Nothing to encode: return a fixed-size zero vector so that callers can
    # still stack the per-article results into one 2-D array
    if not sentences:
        return np.zeros(model.get_sentence_embedding_dimension(), dtype=np.float32)

    # Progress bar disabled: this runs once per article and would otherwise
    # print one bar per call
    sentence_embeddings = model.encode(sentences, batch_size=32, convert_to_numpy=True, show_progress_bar=False)

    # Mean over the sentence axis
    return np.mean(sentence_embeddings, axis=0)

# Compute the embeddings for title and description.
# Missing values are replaced by "" so the encoder only ever receives strings.
title_embeddings = model.encode(df["title"].fillna("").tolist(), batch_size=32, convert_to_numpy=True, show_progress_bar=True)
description_embeddings = model.encode(df["description"].fillna("").tolist(), batch_size=32, convert_to_numpy=True, show_progress_bar=True)

# For the full text, use the mean of the sentence embeddings of each article
full_text_embeddings = np.array([get_average_sentence_embedding(text, model) for text in df["text"]])

In [None]:
#np.save("/content/drive/MyDrive/test-sbert-it/title_embeddings.npy", title_embeddings)
#np.save("/content/drive/MyDrive/test-sbert-it/description_embeddings.npy", description_embeddings)
#np.save("/content/drive/MyDrive/test-sbert-it/text_embeddings.npy", full_text_embeddings)

In [None]:
# Reload the cached embeddings from Drive when all three files are available.
# If any file is missing, the embeddings already in memory are kept, so the
# notebook still runs top to bottom without a pre-existing cache.
EMBEDDINGS_DIR = "/content/drive/MyDrive/test-sbert-it"
try:
    # Load into a temporary list first: the three arrays are replaced together
    # or not at all, never a mix of cached and freshly computed ones.
    cached_embeddings = [
        np.load(f"{EMBEDDINGS_DIR}/{file_name}")
        for file_name in ("title_embeddings.npy", "description_embeddings.npy", "text_embeddings.npy")
    ]
except FileNotFoundError as missing_file:
    print(f"Cached embeddings not loaded ({missing_file}); using the embeddings already in memory.")
else:
    title_embeddings, description_embeddings, full_text_embeddings = cached_embeddings

# Row i of each array is later matched to df.iloc[i], so a different row count
# means the cache was built from a different selection of articles.
for label, field_embeddings in (
    ("title", title_embeddings),
    ("description", description_embeddings),
    ("text", full_text_embeddings),
):
    if field_embeddings.shape[0] != len(df):
        print(f"⚠ {label} embeddings have {field_embeddings.shape[0]} rows but df has {len(df)} rows; check that they refer to the same articles.")

In [None]:
# Cosine similarity between all article pairs, computed separately per field (sklearn)
title_sim_matrix, description_sim_matrix, full_text_sim_matrix = (
    cosine_similarity(field_embeddings)
    for field_embeddings in (title_embeddings, description_embeddings, full_text_embeddings)
)

# Weights of title (x), description (y) and full text (z) in the combined score
x, y, z = 1.5, 1.25, 2

# Combine the three similarity matrices as a weighted mean
weighted_sum = x * title_sim_matrix + y * description_sim_matrix + z * full_text_sim_matrix
combined_sim_matrix = weighted_sum / (x + y + z)


In [None]:
# Threshold above which a pair of articles is considered similar
SIMILARITY_THRESHOLD = 0.68  # Tune to your data
# Maximum distance between the publication dates of a candidate pair
MAX_PUBLICATION_GAP = pd.Timedelta(days=5)

# Compare only the rows present in both `df` and the similarity matrix
N = min(len(df), combined_sim_matrix.shape[0])

# Parse each publication date once (one parse per row) instead of re-parsing
# both dates for every pair inside the loop.
publication_dates = [pd.to_datetime(value) for value in df["date_publication"].iloc[:N]]
titles = df["title"].tolist()

# Pairs above the threshold, restricted to the strict upper triangle (i < j) so
# self-matches and mirrored pairs are excluded. np.nonzero yields them in the
# same row-major order as a nested `for i` / `for j` loop.
above_threshold = np.triu(combined_sim_matrix[:N, :N] > SIMILARITY_THRESHOLD, k=1)

near_duplicates = []
for i, j in zip(*np.nonzero(above_threshold)):
    gap = publication_dates[i] - publication_dates[j]
    # Symmetric "within 5 days" check. `.days` floors negative durations
    # (-5 days 12 h has .days == -6), so abs(gap.days) gave different answers
    # depending on which article came first. Comparisons with NaT are False,
    # so pairs with a missing date are skipped.
    if -MAX_PUBLICATION_GAP <= gap <= MAX_PUBLICATION_GAP:
        near_duplicates.append({
            "Title 1": titles[i],
            "Title 2": titles[j],
            "Combined Similarity": round(float(combined_sim_matrix[i, j]), 3)
        })

# Collect the results. Explicit columns keep the schema stable when no pair is
# found, so the later groupby on 'Title 1' does not fail on a missing column.
duplicates_df = pd.DataFrame(near_duplicates, columns=["Title 1", "Title 2", "Combined Similarity"])
print("\n🔍 Sample Near-Duplicate Articles:")

# Save the results (optional)
duplicates_df.to_csv("/content/near_duplicate_combined_articles.csv", index=False)
print("\n✅ Near-duplicate detection completed! Results saved as 'near_duplicate_combined_articles.csv'")


🔍 Sample Near-Duplicate Articles:

✅ Near-duplicate detection completed! Results saved as 'near_duplicate_combined_articles.csv'


In [None]:
# Render the near-duplicate pairs collected in `duplicates_df`
duplicates_df

Unnamed: 0,Title 1,Title 2,Combined Similarity
0,"Uccisi in strada a Miano a Napoli, chi erano S...","Spari in strada a Napoli nel quartiere Miano, ...",0.782
1,"Andrea Prospero trovato morto a Perugia, i sos...","Andrea Prospero trovato morto a Perugia, stude...",0.806
2,"Omicidio Pierina Paganelli, di chi è la voce r...","Omicidio Pierina Paganelli, perché si parla di...",0.745
3,Chiesto l'ergastolo per l'omicidio della mamma...,"Omicidio di Pamela Mastropietro, ergastolo a I...",0.699
4,"Omicidio di Pamela Mastropietro, ergastolo a I...","Omicidio Sharon Verzeni, giudizio immediato pe...",0.703
5,"Omicidio di Pamela Mastropietro, ergastolo a I...",Alex Cotoia assolto in Appello a Torino per l'...,0.714
6,Proprietario dei cani che hanno sbranato Patri...,"Cani azzannano e uccidono una donna a Latina, ...",0.686
7,Caterina Pappalardo uccisa dal figlio Giosuè F...,Ergastolo scampato per Salvatore Montefusco do...,0.693
8,"Alex Cotoia assolto per l'omicidio del padre, ...",Alex Cotoia assolto in Appello a Torino per l'...,0.787
9,"Duplice femminicidio ma niente ergastolo, sent...",Ergastolo scampato per Salvatore Montefusco do...,0.703


In [None]:
# Load the validation set from Drive; later cells read its `title` and `is_dup` columns.
val_set = pd.read_csv("/content/drive/MyDrive/tesi/validation_set_duplication.csv")

In [None]:
# For every article appearing as 'Title 1', gather all of its 'Title 2' matches in one list
matches_by_first_title = duplicates_df.groupby('Title 1')['Title 2']
grouped_df = matches_by_first_title.apply(list).reset_index()
grouped_df

Unnamed: 0,Title 1,Title 2
0,"Alex Cotoia assolto per l'omicidio del padre, ...",[Alex Cotoia assolto in Appello a Torino per l...
1,"Andrea Prospero trovato morto a Perugia, i sos...","[Andrea Prospero trovato morto a Perugia, stud..."
2,"Cani azzannano e uccidono una donna a Latina, ...",[Donna morta azzannata dai cani a Latina nel g...
3,"Duplice femminicidio ma niente ergastolo, sent...","[Salvatore Montefusco evita l'ergastolo, doppi..."
4,Ergastolo scampato per Salvatore Montefusco do...,"[Salvatore Montefusco evita l'ergastolo, doppi..."
5,"Morte di Fabiana Piccioni a Giulianova, si esc...","[Corpo carbonizzato di una donna a Giulianova,..."
6,"Neonato trovato morto a Bari in culla, autopsi...","[Neonato trovato morto a Bari, tra gli indagat..."
7,"Omicidio Pierina Paganelli, di chi è la voce r...","[Omicidio Pierina Paganelli, perché si parla d..."
8,"Omicidio a Bovisio Masciago, Stella Boggio ai ...","[Omicidio di Marco Magagna a Bovisio Masciago,..."
9,"Omicidio di Pamela Mastropietro, ergastolo a I...",[Alex Cotoia assolto in Appello a Torino per l...


In [None]:
# Flag each validation article whose title appears as the first member ('Title 1')
# of at least one predicted near-duplicate pair: 1 = predicted duplicate, 0 = not.
# One vectorised membership test replaces the row-by-row loop over `grouped_df`.
# NOTE(review): articles that only appear as 'Title 2' stay at 0 — confirm this
# matches how `is_dup` was labelled in the validation set.
val_set['dup_pred'] = val_set['title'].isin(grouped_df['Title 1']).astype(int)

In [None]:
#val_set.to_csv("/content/drive/MyDrive/tesi/validation_set_duplication.csv", index=False)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [None]:
# Labels from the validation set vs. the predictions stored in `dup_pred`
y_true, y_pred = val_set["is_dup"], val_set["dup_pred"]

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

# Report every metric with two decimals
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value:.2f}")

Accuracy: 0.85
Precision: 0.91
Recall: 0.42
F1 Score: 0.57
