## **Record Linkage del dataset**

In [5]:
import pandas as pd
import numpy as np
import time

import recordlinkage
import warnings
from recordlinkage.index import Full

warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('aziende_merged_finale.csv', low_memory=False)
df = df.drop(['Unnamed: 0'], axis=1)
df

Unnamed: 0,name,company_website,country,number_of_employees,industry,founded_year,market_cap,revenue,rank,sector,city,valuation,stock,ceo,market_value
0,Proteus Digital Health Careers,www.proteus.com,"Redwood City, CA",201 to 500 employees,Pharmaceuticals,2004,,,,,,,,,
1,Zelis Careers,www.zelis.com,"Bedminster, NJ","1,001 to 5,000 employees",Information Technology & Services,2016,,,,,,,,,
2,UpNest Careers,www.upnest.com,"Burlingame, CA",1 to 50 employees,Real Estate,2013,,,,,,,,,
3,conferacity Careers,conferacity.com,"Menlo Park, CA",1 to 50 employees,Broadcast & Online Media,2013,,,,,,,,,
4,Zenoti Careers,www.zenoti.com,"Bellevue, WA","501 to 1,000 employees",Computer Software,2010,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120254,Wayne's Coffee,,,,Consumer services,,,,,Restaurants & bars,,,,,
120255,WESC,,,,Consumer goods,,,,,Clothing & accessories,,,,,
120256,WG Film,,,,Consumer services,,,,,Broadcasting & entertainment,,,,,
120257,WM-data,,,,Technology,,,,,Software,,,,,


In [5]:
# Indicizzazione dei dati e calcolo tempo necessario per indicizzazione
start_time = time.time()

indexer = recordlinkage.Index()
indexer.block('name') # Blocco sulla colonna "name" (blocco informazioni utilizzate per l'indicizzazione)
candidate_links = indexer.index(df)

print("Tempo di indicizzazione dei dati: ", time.time() - start_time, "secondi")
print(candidate_links)

Tempo di indicizzazione dei dati:  0.17244696617126465 secondi
MultiIndex([( 51039,    952),
            ( 25131,    953),
            ( 47097,    953),
            ( 47097,  25131),
            ( 95494,    953),
            ( 95494,  25131),
            ( 95494,  47097),
            (100561,    953),
            (100561,  25131),
            (100561,  47097),
            ...
            (118747, 117706),
            (117768, 117732),
            (118347, 117732),
            (118347, 117768),
            (119764, 117776),
            (117935, 117812),
            (119467, 117821),
            (118118, 117923),
            (118021, 117943),
            (118745, 118013)],
           length=155900)


In [6]:
# Confrontare ogni coppia di record candidate e calcolo empo necessario per effettuare il confronto
start_time = time.time()

compare_cl = recordlinkage.Compare()

compare_cl.string('name', 'name', label='name', threshold = 0.9)
compare_cl.string('country', 'country', label='country', threshold = 0.8)
compare_cl.string('company_website', 'company_website', label='comprany_website', threshold = 0.9)
compare_cl.string('number_of_employees', 'number_of_employees', label='number_of_employees', threshold = 0.6)
compare_cl.string('industry', 'industry', label='industry', threshold = 0.7)
compare_cl.string('market_cap', 'market_cap', label='market_cap')
compare_cl.string('revenue', 'revenue', label='revenue')
compare_cl.string('rank', 'rank', label='rank')
compare_cl.string('sector', 'sector', label='sector')
compare_cl.string('city', 'city', label='city')
compare_cl.string('valuation', 'valuation', label='valuation')
compare_cl.string('stock', 'stock', label='stock')
compare_cl.string('ceo', 'ceo', label='ceo')
compare_cl.string('market_value', 'market_value', label='market_value')
compare_cl.exact('founded_year', 'founded_year', label='founded_year')

features = compare_cl.compute(candidate_links, df)

print("Tempo di confronto delle coppie di record: ", time.time() - start_time, "secondi")

Tempo di confronto delle coppie di record:  9.054016590118408 secondi


In [7]:
features = features.head(len(df))
features

Unnamed: 0,Unnamed: 1,name,country,comprany_website,number_of_employees,industry,market_cap,revenue,rank,sector,city,valuation,stock,ceo,market_value,founded_year
51039,952,1.0,0.0,0.0,0.0,0.0,0.125000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0
25131,953,1.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0
47097,953,1.0,0.0,0.0,0.0,0.0,0.230769,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0
47097,25131,1.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0
95494,953,1.0,1.0,0.0,1.0,0.0,0.571429,0.142857,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81476,81474,1.0,1.0,0.0,0.0,0.0,0.857143,0.000000,0.833333,0.0,0.0,0.333333,0.0,0.0,0.0,0
81476,81475,1.0,1.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0
81483,81482,1.0,0.0,0.0,0.0,0.0,0.857143,0.000000,0.000000,0.0,0.0,0.666667,0.0,0.0,0.0,0
81484,81482,1.0,0.0,0.0,0.0,0.0,0.857143,0.000000,0.666667,0.0,0.0,0.666667,0.0,0.0,0.0,0


### **Dedupe**

In [117]:
import pandas as pd
import dedupe
import time

df = pd.read_csv('aziendeMerged_ordinato.csv', low_memory=False)

In [118]:
from unidecode import unidecode
import re

# Pulizia dei dati
def preProcess(column):
    column = unidecode(column)
    column = re.sub('  +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    # Se la colonna è vuota, imposta il valore su None
    if not column:
        column = None
    return column

In [119]:
# Prende come input il file CSV e restituisce un dizionario
def readData(filename):
    data_d = {}
    with open(filename, encoding="utf8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            clean_row = [(k, preProcess(v)) for (k, v) in row.items()]
            row_id = str(row['name'])
            data_d[row_id] = dict(clean_row)

    return data_d

In [120]:
start_time = time.time()
data_d = readData('aziendeMerged_ordinato.csv')
print("Tempo di lettura del dataset: ", time.time() - start_time, "secondi")

importing data ...
Tempo di lettura del dataset:  6.940991640090942 secondi


In [121]:
# Conta il numero di valori nulli in ciascuna colonna
num_missing = df.isnull().sum()
num_missing = num_missing.sort_values()

print(num_missing)

name                        0
market_cap              40444
country                 44790
valuation               64783
number_of_employees     73448
founded_year            78733
revenue                 88830
industry                89327
rank                    91252
company_website         94980
ceo                     97788
stock                   99104
sector                 112591
city                   114425
market_value           118250
dtype: int64


In [122]:
df = df.loc[df.isna().sum(axis=1) <= 7]
df

Unnamed: 0,name,company_website,country,number_of_employees,industry,founded_year,market_cap,revenue,rank,sector,city,valuation,stock,ceo,market_value
29,1&1 AG,https://www.mcafee.com/,Germany,6396,,2017,5.510 Billion USD,51 Million USD,,,,4.804 Billion USD,,,
31,1-800-FLOWERS.COM,https://www.farmaciasguadalajara.com/,USA,4800,,Not found,2.067 Billion USD,"1,642 Million USD",,,,1.579 Billion USD,,,
52,10X GENOMICS,https://www.halinvestments.nl/,USA,51397,,1968,19.507 Billion USD,"2,862 Million USD",,,,14.275 Billion USD,,"John R. Stuelpnagel, D.V.M",
84,1LIFE HEALTHCARE INC,https://www.itron.com,USA,6065,,1996,3.378 Billion USD,"73,816 Million USD",,,,2.940 Billion USD,,,
87,1ST SOURCE CORPORATION,https://www.1stsource.com/,USA,1130,,Not found,1.175 Billion USD,476 Million USD,,,,1.295 Billion USD,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119673,geobra Brandstätter Stiftung & Co. KG (aka Pla...,www.playmobil.de,BY,4635,,1876,,856.4,308.0,Consumer Products,,,,,
119707,huber group,www.hubergroup.de,BY,3500,,1765,,976.8,263.0,Industrial Manufacturing,,,,,
119733,iA Financial Corporation,https://www.forbes.com/companies/legrand,France,34244,Insurance,1892.0,,$13.3B,,,,,,Benoît Coquart,$24.35 B
120045,"salesforce.com, inc.",,United States,73541,Software—Application,"February 3, 1999",$193.80 B,,,Technology,,,CRM,Mr. Marc R. Benioff,


In [123]:
# Conta il numero di valori nulli in ciascuna colonna
num_missing = df.isnull().sum()
num_missing = num_missing.sort_values()

print(num_missing)

name                       0
country                    0
number_of_employees       30
founded_year             705
revenue                 1259
company_website         1860
market_cap              2512
valuation               4324
ceo                     5971
industry                7283
sector                  8784
market_value            9107
stock                   9249
rank                    9583
city                   11107
dtype: int64


In [124]:
df = df.astype(str)

In [125]:
# crea un dizionario dei campi del dataset da usare per la deduplicazione
fields = [
        {'field': 'name', 'type': 'String'},
        {'field': 'country', 'type': 'String', 'has missing': True},
        {'field': 'number_of_employees', 'type': 'String', 'has missing': True},
        {'field': 'founded_year', 'type': 'String', 'has missing': True},
        {'field': 'revenue', 'type': 'String', 'has missing': True},
    ]
deduper = dedupe.Dedupe(fields)

In [126]:
deduper.prepare_training(data_d)

In [127]:
# inizio del training dell'algoritmo di deduplicazione
dedupe.console_label(deduper)

name : tingyi (cayman islands) holding
country : taiwan
number_of_employees : 58182
founded_year : 1920
revenue : 4,601 million usd

name : tingyi (cayman islands) holding corp
country : china
number_of_employees : 62,107
founded_year : None
revenue : $11.5b

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


y


name : construction partners
country : usa
number_of_employees : 5,654
founded_year : not found
revenue : not found

name : construction partners inc
country : united states of america
number_of_employees : 1,035
founded_year : None
revenue : $1.3b

1/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : sumitomo osaka cement
country : japan
number_of_employees : 3005
founded_year : not found
revenue : not found

name : sumitomo osaka cement co ltd
country : japan
number_of_employees : 3,068
founded_year : None
revenue : $1.6b

2/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : nolte gmbh & co.kgaa
country : rp
number_of_employees : 2,082
founded_year : 1921
revenue : 623.1

name : voith gmbh & co kgaa
country : germany
number_of_employees : 20,634
founded_year : None
revenue : $5.0b

3/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : mts systems corporation
country : usa
number_of_employees : 1,776
founded_year : 2009
revenue : not found

name : nrt
country : germany
number_of_employees : 24
founded_year : 2009
revenue : 2,449,370

3/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : sto express
country : israel
number_of_employees : 5,931
founded_year : 2001.0
revenue : $39.2b

name : yto express
country : None
number_of_employees : 924
founded_year : None
revenue : None

3/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : zhejiang huahai pharmaceutical co. ltd
country : None
number_of_employees : 6,655
founded_year : None
revenue : None

name : zhejiang starry pharmaceutical co. ltd
country : None
number_of_employees : 1,729
founded_year : None
revenue : None

3/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : airtac international
country : taiwan
number_of_employees : None
founded_year : None
revenue : None

name : airtac international group
country : None
number_of_employees : not found
founded_year : None
revenue : None

3/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : gett
country : israel
number_of_employees : 964
founded_year : 1977
revenue : 453 million usd

name : getty images
country : usa
number_of_employees : None
founded_year : None
revenue : None

4/10 positive, 4/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : klabin
country : brazil
number_of_employees : 7,000
founded_year : 1866
revenue : 375 million usd

name : klab
country : japan
number_of_employees : None
founded_year : None
revenue : None

4/10 positive, 5/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : intercom
country : usa
number_of_employees : None
founded_year : 2014
revenue : not found

name : interpublic group of companies
country : usa
number_of_employees : 11,012
founded_year : 1949
revenue : 0 million usd

4/10 positive, 6/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : alibaba
country : china
number_of_employees : 243.903
founded_year : None
revenue : $129.98 b

name : hangzhou lianluo interactive information technology
country : china
number_of_employees : None
founded_year : not found
revenue : not found

4/10 positive, 7/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : ample
country : united states
number_of_employees : None
founded_year : 2014
revenue : $271.3m

name : atlas air worldwide holdings inc
country : united states of america
number_of_employees : 4,056
founded_year : None
revenue : $4.0b

4/10 positive, 8/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : wilmar international
country : singapore
number_of_employees : 100,000
founded_year : 1889
revenue : 18,336 million usd

name : wilmar international ltd
country : None
number_of_employees : 100,000
founded_year : None
revenue : None

4/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : post
country : None
number_of_employees : None
founded_year : None
revenue : None

name : postnl
country : netherlands
number_of_employees : None
founded_year : None
revenue : None

5/10 positive, 9/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : bancolombia
country : colombia
number_of_employees : 31900
founded_year : 1902
revenue : 1,004 million usd

name : bancolombia sa
country : colombia
number_of_employees : 30,000
founded_year : None
revenue : $6.4b

5/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : waters
country : united states
number_of_employees : 7,800
founded_year : 1958
revenue : $2.79 b

name : waters corp
country : united states of america
number_of_employees : 7,800
founded_year : None
revenue : $2.8b

6/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : alibaba
country : china
number_of_employees : 243.903
founded_year : None
revenue : $129.98 b

name : alibaba pictures group ltd
country : None
number_of_employees : 1,348
founded_year : None
revenue : None

7/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : holcim
country : None
number_of_employees : not found
founded_year : None
revenue : None

name : holcim group
country : switzerland
number_of_employees : None
founded_year : None
revenue : None

8/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : toyota
country : None
number_of_employees : None
founded_year : None
revenue : None

name : toyota tsusho
country : japan
number_of_employees : None
founded_year : None
revenue : None

9/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : free
country : None
number_of_employees : None
founded_year : 1999
revenue : None

name : freenet
country : germany
number_of_employees : None
founded_year : None
revenue : None

10/10 positive, 10/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : robinhood
country : None
number_of_employees : 3.400
founded_year : None
revenue : $1.21 b

name : robinhood insurance broker
country : None
number_of_employees : None
founded_year : 2008 (15 yrs old)
revenue : None

10/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : la technologies
country : None
number_of_employees : None
founded_year : None
revenue : None

name : sa technologies
country : None
number_of_employees : None
founded_year : 2002 (21 yrs old)
revenue : None

11/10 positive, 11/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : midea group
country : None
number_of_employees : None
founded_year : None
revenue : None

name : sidea group
country : italy
number_of_employees : 49
founded_year : 2015
revenue : 2,429,590

11/10 positive, 12/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : software and management group ltd
country : None
number_of_employees : None
founded_year : 8 january 2019
revenue : None

name : software ag
country : germany
number_of_employees : None
founded_year : None
revenue : None

11/10 positive, 13/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : idex corporation
country : None
number_of_employees : 7,536
founded_year : None
revenue : None

name : idex
country : usa
number_of_employees : None
founded_year : None
revenue : None

11/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : reckitt benckiser group
country : uk
number_of_employees : 43,000
founded_year : 2007
revenue : 15,849 million usd

name : reckitt benckiser
country : united-kingdom
number_of_employees : None
founded_year : None
revenue : None

12/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : ing group
country : None
number_of_employees : None
founded_year : None
revenue : None

name : ing
country : netherlands
number_of_employees : None
founded_year : None
revenue : None

13/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : shanghai international port (group)
country : china
number_of_employees : 14,068
founded_year : 1985
revenue : 279 million usd

name : shanghai international port
country : united states
number_of_employees : 1,854
founded_year : 1993.0
revenue : $5.2b

14/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : druva
country : usa
number_of_employees : 467
founded_year : 2013
revenue : not found

name : druva
country : united states
number_of_employees : None
founded_year : 2007
revenue : $475m

15/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : cogeco communications
country : canada
number_of_employees : 713
founded_year : 1972
revenue : 1,930 million usd

name : cogeco inc
country : canada
number_of_employees : 3,844
founded_year : None
revenue : $2.4b

16/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : h & a holdings limited
country : None
number_of_employees : None
founded_year : 19 june 1973
revenue : None

name : v & a holdings limited
country : None
number_of_employees : None
founded_year : 5 may 1993
revenue : None

17/10 positive, 14/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : china resources mixc lifestyle services limited
country : china
number_of_employees : 20,200
founded_year : 2005
revenue : 939 million usd

name : dxy
country : china
number_of_employees : 568
founded_year : 2011
revenue : not found

17/10 positive, 15/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : pt bank rakyat indonesia (persero) tbk
country : indonesia
number_of_employees : 61,531
founded_year : 2009
revenue : 0 million usd

name : pt semen indonesia (persero) tbk
country : indonesia
number_of_employees : 9,297
founded_year : None
revenue : $2.4b

17/10 positive, 16/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


name : archer daniels midland company careers
country : chicago, il
number_of_employees : 10,000+ employees
founded_year : 1902
revenue : None

name : archer-daniels-midland company
country : None
number_of_employees : 39,218
founded_year : minneapolis, minnesota, u.s.1902
revenue : None

17/10 positive, 17/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : seven west media limited
country : None
number_of_employees : not found
founded_year : None
revenue : None

name : seven west media ltd
country : australia
number_of_employees : 4,528
founded_year : None
revenue : $1.2b

18/10 positive, 17/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : intesa sanpaolo s.p.a.
country : None
number_of_employees : 78,000
founded_year : None
revenue : None

name : intesa sanpaolo spa
country : None
number_of_employees : None
founded_year : 1925
revenue : None

19/10 positive, 17/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : canadian tire corp ltd
country : canada
number_of_employees : 13,435
founded_year : None
revenue : $12.2b

name : canadian tire corporation, limited
country : None
number_of_employees : 13,435
founded_year : None
revenue : None

20/10 positive, 17/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


name : bilt
country : None
number_of_employees : None
founded_year : 1945 (78 yrs old)
revenue : None

name : bilt rewards
country : united states
number_of_employees : None
founded_year : 2021
revenue : $210.3m

21/10 positive, 17/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling


In [128]:
deduper.train()

In [129]:
print('clustering...')

start_time = time.time()
clustered_dupes = deduper.partition(data_d, 0.5)

print('Numero sets duplicati', len(clustered_dupes))
print("Durata clustering: ", time.time() - start_time, "secondi")

clustering...
Numero sets duplicati 34616
Durata clustering:  1408.964673757553 secondi


In [130]:
import csv 

cluster_membership = {}
for cluster_id, (records, scores) in enumerate(clustered_dupes):
    for record_id, score in zip(records, scores):
        cluster_membership[record_id] = {
            "Cluster ID": cluster_id,
            "confidence_score": score
        }

In [131]:
cluster_membership['Apple']

{'Cluster ID': 608, 'confidence_score': 0.5526490198073372}

In [None]:
with open('aziendeMerged_cluster.csv', 'w', encoding="utf8") as f_output, open('aziendeMerged_ordinato.csv', encoding="utf8") as f_input:

    reader = csv.DictReader(f_input)
    fieldnames = ['Cluster ID', 'confidence_score'] + reader.fieldnames

    writer = csv.DictWriter(f_output, fieldnames=fieldnames)
    writer.writeheader()

    for row in reader:
      row_id = str(row['name'])
      row.update(cluster_membership[row_id])
      writer.writerow(row)

## **Model evaluation**

In [26]:
import pandas as pd
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

# Leggi i dati
true_data = pd.read_csv('aziendeMerged_trueCluster.csv', nrows=2000)
cluster_data = pd.read_csv('aziendeMerged_cluster.csv', nrows=2000)

# Seleziona le colonne di interesse
true_data = true_data[['TrueID']]
cluster_data = cluster_data[['Cluster ID']]

# Unisci i dati in un unico dataframe
df = pd.concat([true_data, cluster_data], axis=1)
df

Unnamed: 0,TrueID,Cluster ID
0,415,415
1,698,698
2,27015,27015
3,2343,2343
4,9707,9707
...,...,...
1995,111172,698
1996,111173,698
1997,111174,698
1998,111175,698


In [27]:
# Conta il numero di valori corrispondenti e non corrispondenti
true_positive = df[df['TrueID'] == df['Cluster ID']].shape[0]
false_positive = df[df['TrueID'] != df['Cluster ID']].shape[0]

# Calcola la precision, recall e f1-score
precision = precision_score(df['TrueID'], df['Cluster ID'], average='weighted')
recall = recall_score(df['TrueID'], df['Cluster ID'], average='weighted')
f1score = f1_score(df['TrueID'], df['Cluster ID'], average='weighted')

# Stampa i risultati
print('True positive:', true_positive)
print('False positive:', false_positive)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1score)

True positive: 1907
False positive: 93
Precision: 0.9403592450390117
Recall: 0.9535
F1-score: 0.9438534613812217
