In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine
import itertools
import recordlinkage

# 1) Load the mediated schema and keep only the fields used for matching
schema_file = "main_outputs/final_mediated_schema.csv"
df = pd.read_csv(schema_file)

columns_of_interest = [
    'company_name',
    'industry',
    'headquarters_country',
    'headquarters_city',
    'year_founded',
]

# Records without a name cannot be blocked or compared, so they are dropped.
# The index is rebuilt as 0..n-1 because the canopy step identifies records by
# row position, and those positions are later used as index labels when the
# pairs are compared and merged back onto df. The chained form also avoids an
# in-place dropna on a column slice of the original frame.
df = (
    df[columns_of_interest]
    .dropna(subset=['company_name'])
    .reset_index(drop=True)
)

print(f"[Canopy Clustering] Numero di record iniziali: {len(df)}")

# 2) Character-bigram TF-IDF features on the company name
def preprocess(text: object) -> str:
    """Normalise a raw cell value for name comparison.

    Args:
        text: A company-name cell; may be a string, a missing value
            (None/NaN), or a non-string scalar such as a number.

    Returns:
        The value as a lower-cased string with surrounding whitespace
        removed, or "" when the value is missing.
    """
    if pd.isna(text):
        return ""
    # str() keeps non-string cells (e.g. purely numeric names) from raising
    # AttributeError on .lower().
    return str(text).lower().strip()

# Normalised copy of the name, used as the text fed to the vectorizer
df['company_name_clean'] = [preprocess(raw_name) for raw_name in df['company_name']]

# Character bigrams: ngram_range=(2, 2) with the 'char' analyzer
vectorizer = TfidfVectorizer(ngram_range=(2, 2), analyzer='char')
tfidf_matrix = vectorizer.fit_transform(df['company_name_clean'])

# 3) Canopy clustering
# Both thresholds are cosine *distances* (1 - cosine similarity), with the
# loose threshold larger than the tight one:
#   distance <= loose_threshold -> the record joins the center's canopy
#   distance <= tight_threshold -> the record can no longer seed its own canopy
# Comparing the raw similarity against these values would make the tight test
# vacuous (anything >= 0.5 is also >= 0.3), so distances are used throughout.
loose_threshold = 0.5
tight_threshold = 0.3

canopies = []
assigned_records = set()

# Row norms are computed once, so each center costs a single sparse
# matrix-vector product instead of a Python loop of dense pairwise cosines.
row_norms = np.sqrt(
    np.asarray(tfidf_matrix.multiply(tfidf_matrix).sum(axis=1)).ravel()
)

for i in range(tfidf_matrix.shape[0]):
    if i in assigned_records:
        continue

    center_row = tfidf_matrix[i:i + 1]  # slice keeps the row two-dimensional
    dot_products = (tfidf_matrix @ center_row.T).toarray().ravel()
    norm_products = row_norms * row_norms[i]

    # Rows with no bigrams (empty or one-character names) have zero norm;
    # they get similarity 0 and therefore never join a canopy.
    similarities = np.divide(
        dot_products,
        norm_products,
        out=np.zeros_like(dot_products, dtype=float),
        where=norm_products > 0,
    )
    distances = 1.0 - similarities
    distances[i] = np.inf  # the center is not listed among its own members

    canopy = np.flatnonzero(distances <= loose_threshold).tolist()
    assigned_records.update(np.flatnonzero(distances <= tight_threshold).tolist())

    canopies.append((i, canopy))

print(f"[Canopy Clustering] Numero di canopies generati: {len(canopies)}")

# 4) Candidate pairs: every unordered pair of records that share a canopy
candidate_pairs = set()

for center, canopy in canopies:
    for first, second in itertools.combinations([center] + canopy, 2):
        # Canopies may overlap; storing each pair as (smaller, larger) keeps
        # the same two records from being added in both orientations.
        candidate_pairs.add((first, second) if first < second else (second, first))

print(f"[Canopy Clustering] Numero di coppie candidate: {len(candidate_pairs)}")

# 5) Comparison rules: Jaro-Winkler string similarity on three fields,
#    each stored under its own label in the resulting matrix
compare = recordlinkage.Compare()

for field, label in (
    ('company_name', 'name_sim'),
    ('industry', 'industry_sim'),
    ('headquarters_country', 'country_sim'),
):
    compare.string(field, field, method='jarowinkler', label=label)

# 6) Similarity matrix: one row per candidate pair, one column per rule
candidate_pairs_list = list(candidate_pairs)
candidate_pairs_mi = pd.MultiIndex.from_tuples(
    candidate_pairs_list,
    names=['level_0', 'level_1'],
)

similarity_matrix = compare.compute(candidate_pairs_mi, df)
print(f"[Canopy Clustering] Dimensioni della similarity_matrix: {similarity_matrix.shape}")

# 7) Matching rules: a pair is kept when either rule holds
#   - name similarity strictly between 0.80 and 0.92, or
#   - name similarity above 0.9 together with industry and country
#     similarity both above 0.5
# NOTE(review): pairs with name similarity >= 0.92 only pass via the second,
# stricter rule, while 0.80-0.92 pass unconditionally -- confirm this is intended.
name_sim = similarity_matrix['name_sim']

name_in_band = name_sim.gt(0.80) & name_sim.lt(0.92)
name_with_context = (
    name_sim.gt(0.9)
    & similarity_matrix['industry_sim'].gt(0.5)
    & similarity_matrix['country_sim'].gt(0.5)
)

matches = similarity_matrix.loc[name_in_band | name_with_context]

print(f"[Canopy Clustering] Numero di coppie finali considerate 'match': {len(matches)}")

# 8) Attach the full record of each side to every matched pair
matches = matches.reset_index().rename(
    columns={'level_0': 'id_left', 'level_1': 'id_right'}
)

# Each side's columns are suffixed before merging, giving
# company_name_left / company_name_right and so on for every df column.
for side in ('left', 'right'):
    matches = matches.merge(
        df.add_suffix(f'_{side}'),
        left_on=f'id_{side}',
        right_index=True,
        how='left',
    )

# Drop pairs whose two company names are exactly equal
has_same_name = matches['company_name_left'] == matches['company_name_right']
matches = matches[~has_same_name]

# 9) Write the matched pairs to CSV, without the DataFrame index column
output_file = "matched_companies_canopy.csv"
matches.to_csv(path_or_buf=output_file, index=False)
print(f"[Canopy Clustering] ✅ File '{output_file}' generato!")


  df = pd.read_csv(schema_file)


[Canopy Clustering] Numero di record iniziali: 75793


ValueError: Input vector should be 1-D.