In [6]:
import pandas as pd
import numpy as np
import json
import re
from difflib import SequenceMatcher
from src.ingestion import download_from_drive
from tqdm.notebook import tqdm  # For progress bars

In [3]:
# Load the databases
df_anagrafica = download_from_drive("csv_startup_anagrafica")
df_crunchbase = download_from_drive("estrazione_crunchbase_5k")

# Display the first few rows of each dataframe to understand their structure
print("Startup Anagrafica Dataset:")
display(df_anagrafica.head())

print("\nCrunchbase Dataset:")
display(df_crunchbase.head())

Startup Anagrafica Dataset:


Unnamed: 0.1,Unnamed: 0,Ragione sociale,BvD ID number,Codice fiscale,Numero CCIAA,Partita IVA,Indirizzo sede legale,Provincia,Indirizzo sede legale - Codice postale,Indirizzo sede legale - Regione,Sede operativa,Sede operativa - Provincia,Sede operativa - Codice postale,Sede operativa - Regione - Regione,ATECO 2007\ncodice,Anno di costituzione,Fondazione,Stato giuridico,Forma giuridica
0,1.0,RAINSHOW S.R.L.,IT03595550835,3595550835,ME0248323,3595551000.0,"VIA FRANCESCO CRISPI, 4",Messina,98066.0,Sicilia,,,,,711210.0,06/12/2019,,Attiva,S.R.L.
1,2.0,CHECKSIG S.R.L. SOCIETA' BENEFIT,IT11028330964,11028330964,MI2573724,11028330000.0,"PZA DEL LIBERTY, 8",Milano,20121.0,Lombardia,PIAZZA DEL LIBERTY 8,Milano,20121.0,LOMBARDIA,620100.0,25/10/2019,,Attiva,S.R.L.
2,3.0,OTOFARMA S.P.A.,IT08431301210,8431301210,NA0957454,8431301000.0,"VLE ANTONIO GRAMSCI, 17/B",Napoli,80122.0,Campania,VIA VIC S.M. DEL PIANTO TORRE 2,Napoli,80144.0,CAMPANIA,266002.0,13/07/2016,,Attiva,S.P.A.
3,4.0,SIKELIA OIL S.R.L.,IT03577640836,3577640836,ME0247028,3577641000.0,"VIA STRETTO I COCCOMELLI, 64",Messina,98051.0,Sicilia,,,,,473000.0,15/07/2019,,Attiva,S.R.L.
4,5.0,POWER.IT S.R.L.,IT04355290166,4355290166,BG0456683,4355290000.0,"VIA ALESSANDRO NOLI, 13",Bergamo,24125.0,Lombardia,,,,,352300.0,31/01/2019,,Attiva,S.R.L.



Crunchbase Dataset:


Unnamed: 0,Organization Name,Organization Name URL,IPO Status,Company Type,Industries,Number of Employees,Headquarters Location,CB Rank (Organization),Stage,CB Rank (Company),...,Exit Date,Exit Date Precision,Founded Date,Founded Date Precision,Operating Status,Description,Estimated Revenue Range,Postal Code,Headquarters Regions,Number of Portfolio Organizations
0,Enilive,https://www.crunchbase.com/organization/enilive,Private,For Profit,"Biofuel, Car Sharing, Energy Efficiency, Trans...",1001-5000,"Roma, Lazio, Italy",1495,,1173,...,,,2023-01-01,year,Active,Enilive is Eni's mobility transformation compa...,,144.0,"European Union (EU), Europe, Middle East, and ...",
1,Electrolux Professional,https://www.crunchbase.com/organization/electr...,Public,For Profit,"Laundry and Dry-cleaning, Machinery Manufacturing",1001-5000,"Pordenone, Friuli-Venezia Giulia, Italy",3453,,2808,...,2020-05-27,day,2019-01-01,day,Active,Electrolux Professional is a global provider o...,$1B to $10B,,"European Union (EU), Europe, Middle East, and ...",
2,NanoPhoria,https://www.crunchbase.com/organization/nanoph...,Private,For Profit,"Analytics, Biotechnology, Health Care",1-10,"Milan, Lombardia, Italy",4329,,3530,...,,,2022-01-01,year,Active,"NanoPhoria develops non-viral, nano-delivery s...",$1M to $10M,20122.0,"European Union (EU), Europe, Middle East, and ...",
3,Akamas,https://www.crunchbase.com/organization/akamas,Private,For Profit,"Analytics, Application Performance Management,...",11-50,"Milan, Lombardia, Italy",4411,,3623,...,,,2019-01-01,year,Active,Akamas AI safely optimizes application workloa...,$1M to $10M,20158.0,"European Union (EU), Europe, Middle East, and ...",
4,Arsenale Bioyards,https://www.crunchbase.com/organization/arsena...,Private,For Profit,"Biotechnology, Health Care, Manufacturing",1-10,"Milan, Lombardia, Italy",4546,,3719,...,,,2023-01-01,year,Active,Arsenale is an innovative end-to-end bio-manuf...,,20122.0,"European Union (EU), Europe, Middle East, and ...",


In [4]:
import pandas as pd
from thefuzz import process, fuzz

def pulisci_stringa(s: str) -> str:
    """
    Esegue operazioni di cleaning basilari su una stringa:
    - conversione a minuscolo
    - rimozione spazi iniziali/finali
    - (eventualmente) rimozione di punteggiatura o caratteri speciali
    """
    if not isinstance(s, str):
        if s is None:
            return ""
        else:
            return str(s)
    s = s.strip().lower()
    # Remove legal entity types
    s = re.sub(r'\b(srl|s\.r\.l|s\.r\.l\.|s.r.l|spa|s\.p\.a|s\.p\.a\.|s.p.a|inc|llc|ltd|gmbh|corp|corporation)\b', '', s)
    
    # Remove special characters and extra spaces
    s = re.sub(r'[^\w\s]', ' ', s)  # Replace special chars with space
    s = re.sub(r'\s+', ' ', s)       # Replace multiple spaces with single space
    return s

def fuzzy_merge(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    key1: str,
    key2: str,
    threshold: int = 80
) -> pd.DataFrame:
    """
    Esegue un merge 'fuzzy' tra df1 e df2 in base a key1 (df1) e key2 (df2).
    - threshold definisce il valore minimo di similarità (0-100)
    - Ritorna un DataFrame con le corrispondenze migliori trovate per ogni riga di df1.
    """
    # Copia dei DataFrame per non modificare gli originali
    df1_clean = df1.copy()
    df2_clean = df2.copy()
    
    # Creiamo colonne "pulite" per il matching
    df1_clean['key_clean'] = df1_clean[key1].apply(pulisci_stringa)
    df2_clean['key_clean'] = df2_clean[key2].apply(pulisci_stringa)
    
    # Per facilitare le ricerche, creiamo un dizionario {nome_clean: indice} per df2
    unique_keys2 = df2_clean['key_clean'].unique().tolist()
    
    # Lista per salvare i risultati
    risultati = []

    for idx1, row in df1_clean.iterrows():
        name_to_match = row['key_clean']
        
    # Trova il miglior match in df2 usando thefuzz.process
        # extractOne ritorna (best_match, score, indice)
        best_match, best_score = process.extractOne(name_to_match, unique_keys2, scorer=fuzz.partial_ratio)
        
        if best_score >= threshold:
            # Recuperiamo la riga corrispondente in df2
            match_row = df2_clean[df2_clean['key_clean'] == best_match].iloc[0]
            
            # Costruiamo un dizionario con i dati combinati
            combined_data = {}
            
            # Dati di df1
            for col in df1_clean.columns:
                combined_data[f"df1_{col}"] = row[col]
            
            # Dati di df2
            for col in df2_clean.columns:
                combined_data[f"df2_{col}"] = match_row[col]
            
            combined_data['similarity_score'] = best_score
            
            risultati.append(combined_data)
        else:
            # Se non supera la soglia di similarità, si può comunque salvare la riga di df1 senza match
            # (oppure scartarla del tutto, a seconda dell’obiettivo)
            combined_data = {f"df1_{col}": row[col] for col in df1_clean.columns}
            combined_data['df2_key_clean'] = None
            combined_data['similarity_score'] = best_score
            risultati.append(combined_data)
    
    # Convertiamo la lista di dizionari in DataFrame
    df_risultati = pd.DataFrame(risultati)
    return df_risultati

# Esempio d'uso
# df_merged = fuzzy_merge(df_anagrafica, df_crunchbase, "Ragione sociale", "Organization Name", threshold=80)

ModuleNotFoundError: No module named 'thefuzz'

In [5]:
df_risultati = fuzzy_merge(df_anagrafica, df_crunchbase, "Ragione sociale", "Organization Name", threshold=90)
df_risultati

NameError: name 'fuzzy_merge' is not defined

In [None]:
df_risultati[df_risultati['similarity_score'] >= 90][['df1_Ragione sociale', 'df2_Organization Name', 'similarity_score']].sort_values(by='similarity_score', ascending=True)

Unnamed: 0,df1_Ragione sociale,df2_Organization Name,similarity_score
7336,MUSIC4MASTERS SOCIETA' A RESPONSABILITA' LIMITATA,Asters,90
11820,BE.EVO SRL,Be-E,90
11814,SUPERNOVA INDUSTRIES SOCIETA' A RESPONSABILITA...,Supernova,90
11806,SNELIX SOCIETA' A RESPONABILITA' LIMITATA,Snelix,90
11804,DYMENSION SYSTEMS SRL,EMS,90
...,...,...,...
4,POWER.IT S.R.L.,Power.it,100
10,HINOVIA SRL,Hinovia,100
13,SHOW REEL AGENCY S.R.L.,Show Reel Agency,100
21,KIMERA AUTOMOBILI S.R.L.,Kimera Automobili,100


In [32]:
df_risultati = fuzzy_merge(df_anagrafica, df_crunchbase, "Indirizzo sede legale - Codice postale", "Postal Code", threshold=10)
df_risultati

Unnamed: 0,df1_Unnamed: 0,df1_Ragione sociale,df1_BvD ID number,df1_Codice fiscale,df1_Numero CCIAA,df1_Partita IVA,df1_Indirizzo sede legale,df1_Provincia,df1_Indirizzo sede legale - Codice postale,df1_Indirizzo sede legale - Regione,...,df2_Founded Date,df2_Founded Date Precision,df2_Operating Status,df2_Description,df2_Estimated Revenue Range,df2_Postal Code,df2_Headquarters Regions,df2_Number of Portfolio Organizations,df2_key_clean,similarity_score
0,1.0,RAINSHOW S.R.L.,IT03595550835,3595550835,ME0248323,3.595551e+09,"VIA FRANCESCO CRISPI, 4",Messina,98066.0,Sicilia,...,2019-07-26,day,Active,Regensight creates incision-free theranostics ...,$1M to $10M,198,"European Union (EU), Europe, Middle East, and ...",,198,72
1,2.0,CHECKSIG S.R.L. SOCIETA' BENEFIT,IT11028330964,11028330964,MI2573724,1.102833e+10,"PZA DEL LIBERTY, 8",Milano,20121.0,Lombardia,...,2023-01-01,year,Active,"Fintech, Lending, Payment, SaaS",,20121,"European Union (EU), Europe, Middle East, and ...",,20121,95
2,3.0,OTOFARMA S.P.A.,IT08431301210,8431301210,NA0957454,8.431301e+09,"VLE ANTONIO GRAMSCI, 17/B",Napoli,80122.0,Campania,...,2020-11-01,month,Active,2Watch is a media tech company which is into g...,$1M to $10M,80122,"European Union (EU), Europe, Middle East, and ...",,80122,95
3,4.0,SIKELIA OIL S.R.L.,IT03577640836,3577640836,ME0247028,3.577641e+09,"VIA STRETTO I COCCOMELLI, 64",Messina,98051.0,Sicilia,...,2019-01-01,year,Active,"CodGo provides application development, mobile...",,98051,"European Union (EU), Europe, Middle East, and ...",,98051,95
4,5.0,POWER.IT S.R.L.,IT04355290166,4355290166,BG0456683,4.355290e+09,"VIA ALESSANDRO NOLI, 13",Bergamo,24125.0,Lombardia,...,2021-01-01,year,Active,Service on Demand is an employment agency that...,$1M to $10M,24125,"European Union (EU), Europe, Middle East, and ...",,24125,95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11962,11963.0,CIFRANDA S.R.L.,IT03106280849,3106280849,AG0227310,3.106281e+09,"VLE CARLO AZEGLIO CIAMPI, 28",Agrigento,92026.0,Sicilia,...,2021-01-01,year,Active,Wedding In Cloud helps to create a digital alb...,,7026,"European Union (EU), Europe, Middle East, and ...",,7026,73
11963,11964.0,SNEARK S.R.L.,IT03082980842,3082980842,AG0225774,3.082981e+09,"VIA RE TANCREDI, 20",Agrigento,92100.0,Sicilia,...,2020-06-08,day,Active,Medicud develops a device for treating wounds ...,,100,"European Union (EU), Europe, Middle East, and ...",,100,90
11964,11965.0,MATT LAMPEDUSA S.R.L.,IT03067220842,3067220842,AG0224714,3.067221e+09,"VIA GIOVANNI VERGA, 38",Agrigento,,Sicilia,...,2019-01-01,day,Active,Electrolux Professional is a global provider o...,$1B to $10B,,"European Union (EU), Europe, Middle East, and ...",,,100
11965,11966.0,PISTACCHIOBAFFO SOCIETA' A RESPONSABILITA' LIM...,IT03033680848,3033680848,AG0222627,3.033681e+09,"VIA MATTEO CIMARRA, 12",Agrigento,92100.0,Sicilia,...,2020-06-08,day,Active,Medicud develops a device for treating wounds ...,,100,"European Union (EU), Europe, Middle East, and ...",,100,90


In [33]:
df_risultati[df_risultati['similarity_score'] >= 90][['df1_Ragione sociale', 'df2_Organization Name', 'similarity_score']].sort_values(by='similarity_score', ascending=True)

Unnamed: 0,df1_Ragione sociale,df2_Organization Name,similarity_score
11939,NIFA SRL,Yeastime,90
11965,PISTACCHIOBAFFO SOCIETA' A RESPONSABILITA' LIM...,Medicud,90
11963,SNEARK S.R.L.,Medicud,90
11960,SNAP-MARKETING S.R.L.,NieF,90
11949,DIGITALIA S.R.L.,Annuario Cattolico,90
...,...,...,...
11964,MATT LAMPEDUSA S.R.L.,Electrolux Professional,100
10140,INNOVA GESTIONI IMMOBILIARI S.R.L. S.B. IN SIG...,Electrolux Professional,100
11023,ORIENTALENT S.R.L.,Electrolux Professional,100
11090,CIRCLE MARKET S.R.L.,Electrolux Professional,100


In [None]:
df_risultati.columns

Index(['df1_Unnamed: 0', 'df1_Ragione sociale', 'df1_BvD ID number',
       'df1_Codice fiscale', 'df1_Numero CCIAA', 'df1_Partita IVA',
       'df1_Indirizzo sede legale', 'df1_Provincia',
       'df1_Indirizzo sede legale - Codice postale',
       'df1_Indirizzo sede legale - Regione', 'df1_Sede operativa',
       'df1_Sede operativa - Provincia', 'df1_Sede operativa - Codice postale',
       'df1_Sede operativa - Regione - Regione', 'df1_ATECO 2007\ncodice',
       'df1_Anno di costituzione', 'df1_Fondazione', 'df1_Stato giuridico',
       'df1_Forma giuridica', 'df1_key_clean', 'df2_key_clean',
       'similarity_score'],
      dtype='object')