## Record Linkage

In [1]:
!pip install recordlinkage --q

In [2]:
import numpy as np 
import pandas as pd
import recordlinkage

In [3]:
# Load the company table and discard the 'Unnamed: 0' column stored in the CSV.
final = pd.read_csv("src/final_dataset.csv").drop(columns=['Unnamed: 0'])
final.head()

Unnamed: 0,name,country,sector,founded,marketcap,revenue,employees,links,ceo
0,LACEWORK,UNITED STATES,CYBERSECURITY,2015.0,,,,,
1,TIPALTI,UNITED STATES,FINTECH,2010.0,,,,,
2,TEMPUS,UNITED STATES,HEALTH,2015.0,,,,,
3,ANDURIL,UNITED STATES,ARTIFICIAL INTELLIGENCE,2017.0,,,,,
4,BOLT,ESTONIA,AUTO & TRANSPORTATION,2013.0,,,,,


In [4]:
# Order the records by name and renumber the index so that position == label.
final = final.sort_values('name').reset_index(drop=True)
final.head()

Unnamed: 0,name,country,sector,founded,marketcap,revenue,employees,links,ceo
0,ADDUS HOMECARE,USA,,,1690000000.0,,,,
1,#SINOB,GERMANY,RETAIL,2015.0,,4.0,14.0,HTTPS://WWW.SINOB.DE/,
2,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIN,SPAIN,,,870000000.0,,,,
3,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIÓN,SPAIN,,,870000000.0,,,,
4,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIÓN,SPAIN,,,820000000.0,,,,


In [5]:
# Count the missing values in each column.
final.isnull().sum()

name              0
country        6933
sector       122786
founded      132943
marketcap     98028
revenue      121663
employees    130431
links        156234
ceo          151873
dtype: int64

In [6]:
# Report, for every column, how many entries are exactly zero.
for column_name, column in final.items():
    count = (column == 0.0).sum()
    print('Count of zeros in column', column_name, ' is: ', count)

Count of zeros in column name  is:  0
Count of zeros in column country  is:  0
Count of zeros in column sector  is:  0
Count of zeros in column founded  is:  0
Count of zeros in column marketcap  is:  12
Count of zeros in column revenue  is:  416
Count of zeros in column employees  is:  0
Count of zeros in column links  is:  0
Count of zeros in column ceo  is:  0


In [7]:
# Print every column's dtype and replace NaN with 0 in the float columns.
# NOTE(review): after this step a missing value and a genuine 0 can no longer
# be told apart in these columns.
for c, dtype in final.dtypes.items():
    print(dtype)
    if dtype == float:
        final[c] = final[c].fillna(0)

object
object
object
float64
float64
float64
float64
object
object


In [8]:
# Re-count the missing values per column after the float columns were zero-filled.
final.isnull().sum()

name              0
country        6933
sector       122786
founded           0
marketcap         0
revenue           0
employees         0
links        156234
ceo          151873
dtype: int64

In [9]:
# Summarise index range, per-column non-null counts, dtypes and memory usage.
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173033 entries, 0 to 173032
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   name       173033 non-null  object 
 1   country    166100 non-null  object 
 2   sector     50247 non-null   object 
 3   founded    173033 non-null  float64
 4   marketcap  173033 non-null  float64
 5   revenue    173033 non-null  float64
 6   employees  173033 non-null  float64
 7   links      16799 non-null   object 
 8   ceo        21160 non-null   object 
dtypes: float64(4), object(5)
memory usage: 11.9+ MB


## Index

In [10]:
# Build candidate pairs by indexing `final` against itself on the 'name' column.
indexer = recordlinkage.Index()
# Rule 1: blocking on 'name'.
indexer.block('name')
pairs_name = indexer.index(final)
# Rule 2: sorted neighbourhood on 'name' with window=1, added to the SAME indexer.
# NOTE(review): the blocking rule is still registered on `indexer`, so pairs_neigh
# presumably already contains the blocking pairs as well — confirm against the
# recordlinkage Index documentation.
indexer.sortedneighbourhood('name', window=1)
pairs_neigh = indexer.index(final)

In [11]:
from difflib import SequenceMatcher

def similar(a, b):
    """Return the difflib similarity ratio of ``a`` and ``b``.

    The value is a float in [0, 1]: 1.0 for identical sequences and 0.0 when
    the two sequences have no matching elements.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

In [12]:
# from recordlinkage.base import BaseIndexAlgorithm
# import time
# class CustomIndex(BaseIndexAlgorithm):
#     def _link_index(self, df_a, df_b):
#         t0 = time.time()
#         print(len(df_a))
#         indici1=[]
#         indici2=[]
#         x = 50
#         for i in range(0, len(df_a)):
#             if(i%5000 == 0):
#                 print(i)
#             if (i + x) >= len(df_a):
#                 x -= 1
#             for j in range(i, (i+x)):
#                 if(similar(df_a.loc[i, 'name'], df_a.loc[j, 'name'])>0.35):
#                     indici1.append(i)
#                     indici2.append(j)
        
#         indici = [indici1, indici2]
#         t1 = time.time()
#         print(t1-t0)
#         return pd.MultiIndex.from_arrays(indici, names=('first', 'second'))  

In [13]:
# indexer = CustomIndex()
# candidate_pairs = indexer.index(final, final)
# pairs = candidate_pairs

In [14]:
# Union of both candidate sets; a pair found by both rules is kept once.
pairs = pairs_neigh.append(pairs_name).drop_duplicates(keep='first')

In [15]:
# Show the candidate pairs (a MultiIndex of record-index tuples) and how many there are.
print(pairs)

MultiIndex([(     4,      3),
            (     5,      3),
            (     5,      4),
            (     6,      3),
            (     6,      4),
            (     6,      5),
            (     7,      3),
            (     7,      4),
            (     7,      5),
            (     7,      6),
            ...
            (173020, 173019),
            (173022, 173021),
            (173024, 173023),
            (173026, 173025),
            (173028, 173027),
            (173029, 173027),
            (173029, 173028),
            (173030, 173027),
            (173030, 173028),
            (173030, 173029)],
           length=491440)


In [16]:
# 80% training set, 20% test set.
# The split is positional (no shuffling): the first 80% of `pairs` train the
# classifier and the remaining pairs are used for testing.
n_perc = (len(pairs) * 80) // 100
train_pairs, test_pairs = pairs[:n_perc], pairs[n_perc:]

## Training 

In [17]:
from recordlinkage.base import BaseCompareFeature
class CompareNumbers(BaseCompareFeature):
    """Agreement feature for two numeric columns.

    The similarity is ``1 - |a - b| / (a + b)``, computed on absolute values
    and rounded to the nearest integer, so a valid score is either 0 or 1.
    """

    def _compute_vectorized(self, s1, s2):
        """Return the rounded relative similarity of ``s1`` and ``s2``.

        NOTE(review): when both magnitudes are 0 the denominator is 0, so no
        valid score exists for that pair — confirm how callers treat it.
        """
        left, right = abs(s1), abs(s2)
        relative_gap = abs(left - right) / (left + right)
        return round(1 - relative_gap)

In [18]:
# jarowinkler gives priority to the beginning of the string
# levenshtein cares more about the order
# compare.string -> default: levenshtein
compare = recordlinkage.Compare()

# One (column, options) entry per string comparison, in feature order.
# The numeric comparisons (founded, marketcap, revenue, employees through
# CompareNumbers) are disabled; per the original note, add missing_value=pd.NA
# to compare.string if they are re-enabled.
string_rules = [
    ('name', {'threshold': 0.60}),
    ('country', {'threshold': 0.60, 'missing_value': np.nan}),
    ('sector', {'threshold': 0.50, 'missing_value': np.nan}),
    ('links', {'method': 'jarowinkler', 'threshold': 0.50, 'missing_value': np.nan}),
    ('ceo', {'method': 'jarowinkler', 'threshold': 0.60, 'missing_value': np.nan}),
]
for field, options in string_rules:
    compare.string(field, field, label=field, **options)

training_features = compare.compute(train_pairs, final, final)

# score = sum of the comparison results, null_values = number of NaN
# comparison results, both taken over the columns 'name'..'ceo' of each pair.
compared = training_features.loc[:, 'name':'ceo']
training_features['score'] = compared.sum(axis=1)
training_features['null_values'] = compared.isnull().sum(axis=1)

In [19]:
# Preview the comparison vectors of the first ten training pairs.
training_features.head(10)

Unnamed: 0,Unnamed: 1,name,country,sector,links,ceo,score,null_values
4,3,1.0,1.0,,,,2.0,3
5,3,1.0,1.0,,,,2.0,3
5,4,1.0,1.0,,,,2.0,3
6,3,1.0,1.0,,,,2.0,3
6,4,1.0,1.0,,,,2.0,3
6,5,1.0,1.0,,,,2.0,3
7,3,1.0,1.0,,,,2.0,3
7,4,1.0,1.0,,,,2.0,3
7,5,1.0,1.0,,,,2.0,3
7,6,1.0,1.0,,,,2.0,3


In [20]:
# Replace every remaining NaN comparison result with 0.
training_features = training_features.fillna(0)

In [21]:
# Number of compared features: the column count of `final` minus 4
# (presumably the four numeric columns that are not compared — verify).
col = len(final.columns) - 4

# A pair is labelled a match when its score reaches half of the number of
# comparisons that were not missing.
min_score = (col - training_features['null_values']) / 2
matches_train = training_features[training_features['score'] >= min_score]
training_matches = matches_train.reset_index()

In [22]:
# Drop the feature columns so that only the two record-index columns remain.
# (Add 'founded', 'marketcap', 'revenue', 'employees' to the list if the
# numeric comparisons are enabled.)
toDrop = ['name', 'country', 'sector', 'links', 'ceo', 'score', 'null_values']
training_matches = training_matches.drop(columns=toDrop)
training_matches.head()

Unnamed: 0,level_0,level_1
0,4,3
1,5,3
2,5,4
3,6,3
4,6,4


In [23]:
# Turn the remaining record-index columns back into a MultiIndex of matching pairs.
training_matches = pd.MultiIndex.from_frame(training_matches)

In [24]:
# The missing-value counter is not a classifier feature; remove it.
training_features = training_features.drop(columns='null_values')
training_features.head()

Unnamed: 0,Unnamed: 1,name,country,sector,links,ceo,score
4,3,1.0,1.0,0.0,0.0,0.0,2.0
5,3,1.0,1.0,0.0,0.0,0.0,2.0
5,4,1.0,1.0,0.0,0.0,0.0,2.0
6,3,1.0,1.0,0.0,0.0,0.0,2.0
6,4,1.0,1.0,0.0,0.0,0.0,2.0


#### Classifier

In [25]:
# Fit a naive Bayes classifier on the training comparison vectors, using the
# threshold-derived pairs in training_matches as the positive labels.
# NOTE(review): training_features still contains the derived 'score' column,
# from which training_matches was computed — confirm that feeding it to the
# classifier is intended.
classifier = recordlinkage.NaiveBayesClassifier()
classifier.fit(training_features, training_matches)

## Testing

In [26]:
compare = recordlinkage.Compare()

# One (column, options) entry per string comparison, in feature order.
# The numeric comparisons (founded, marketcap, revenue, employees through
# CompareNumbers) are disabled; per the original note, add missing_value=pd.NA
# to compare.string if they are re-enabled.
string_rules = [
    ('name', {'threshold': 0.60}),
    ('country', {'threshold': 0.60, 'missing_value': np.nan}),
    ('sector', {'threshold': 0.50, 'missing_value': np.nan}),
    ('links', {'method': 'jarowinkler', 'threshold': 0.50, 'missing_value': np.nan}),
    ('ceo', {'method': 'jarowinkler', 'threshold': 0.60, 'missing_value': np.nan}),
]
for field, options in string_rules:
    compare.string(field, field, label=field, **options)

test_features = compare.compute(test_pairs, final, final)

# score = sum of the comparison results, null_values = number of NaN
# comparison results, both taken over the columns 'name'..'ceo' of each pair.
compared = test_features.loc[:, 'name':'ceo']
test_features['score'] = compared.sum(axis=1)
test_features['null_values'] = compared.isnull().sum(axis=1)

In [27]:
# Replace every remaining NaN comparison result with 0.
test_features = test_features.fillna(0)

In [28]:
# Number of compared features: the column count of `final` minus 4
# (presumably the four numeric columns that are not compared — verify).
col = len(final.columns) - 4

# A pair is labelled a match when its score reaches half of the number of
# comparisons that were not missing.
min_score = (col - test_features['null_values']) / 2
matches_test = test_features[test_features['score'] >= min_score]
test_matches = matches_test.reset_index()

In [29]:
# Drop the feature columns so that only the two record-index columns remain.
# (Add 'founded', 'marketcap', 'revenue', 'employees' to the list if the
# numeric comparisons are enabled.)
toDrop = ['name', 'country', 'sector', 'links', 'ceo', 'score', 'null_values']
test_matches = test_matches.drop(columns=toDrop)
test_matches.head()

Unnamed: 0,level_0,level_1
0,137955,137954
1,137956,137953
2,137956,137954
3,137956,137955
4,137957,137953


In [30]:
# Turn the remaining record-index columns back into a MultiIndex of matching pairs.
test_matches = pd.MultiIndex.from_frame(test_matches)

In [31]:
# The missing-value counter is not a classifier feature; remove it.
test_features = test_features.drop(columns='null_values')
test_features.head()

Unnamed: 0,Unnamed: 1,name,country,sector,links,ceo,score
137955,137954,1.0,1.0,0.0,0.0,0.0,2.0
137956,137953,1.0,1.0,0.0,0.0,0.0,2.0
137956,137954,1.0,1.0,0.0,0.0,0.0,2.0
137956,137955,1.0,1.0,0.0,0.0,0.0,2.0
137957,137953,1.0,1.0,0.0,0.0,0.0,2.0


In [32]:
# Classify the test comparison vectors with the fitted classifier.
predictions = classifier.predict(test_features)

## Evaluation

In [33]:
# Confusion matrix of the threshold-derived labels (test_matches) against the
# classifier output, over all test pairs.
n_test_pairs = len(test_features)
confusion_matrix = recordlinkage.confusion_matrix(test_matches, predictions, n_test_pairs)
print(confusion_matrix)

# Metrics
recall = recordlinkage.recall(test_matches, predictions)
precision = recordlinkage.precision(test_matches, predictions)
fscore = recordlinkage.fscore(confusion_matrix)
accuracy = recordlinkage.accuracy(test_matches, predictions, n_test_pairs)

print('\nRecall:', recall)
print('Precision:', precision)
print('F-score:', fscore)
print('Accuracy:', accuracy)

[[97478     0]
 [  810     0]]

Recall: 1.0
Precision: 0.9917589125834283
F-score: 0.9958624071595681
Accuracy: 0.9917589125834283


In [34]:
# Pairs labelled as matches that the classifier did not predict.
false_negatives = test_matches.difference(predictions)
false_negatives

MultiIndex([], )

In [35]:
# Show both records of the first false-negative pair, if there is one.
# An explicit length check replaces the bare `except:`, which reported
# "No False Negatives Present" for any error whatsoever.
if len(false_negatives) > 0:
    fn_from_dfA, fn_from_dfB = false_negatives[0]

    display(final[final.index == fn_from_dfA])
    display(final[final.index == fn_from_dfB])
else:
    print("No False Negatives Present")

No False Negatives Present


In [36]:
# Show both records of the second false-negative pair, if there is one.
# An explicit length check replaces the bare `except:`, which reported
# "No False Negatives Present" for any error whatsoever.
if len(false_negatives) > 1:
    fn_from_dfA, fn_from_dfB = false_negatives[1]

    display(final[final.index == fn_from_dfA])
    display(final[final.index == fn_from_dfB])
elif len(false_negatives) == 1:
    print("Only one False Negative Present")
else:
    print("No False Negatives Present")

No False Negatives Present


## Linking tables

### Train set

In [37]:
# Preview the training pairs that were labelled as matches.
matches_train.head()

Unnamed: 0,Unnamed: 1,name,country,sector,links,ceo,score,null_values
4,3,1.0,1.0,0.0,0.0,0.0,2.0,3
5,3,1.0,1.0,0.0,0.0,0.0,2.0,3
5,4,1.0,1.0,0.0,0.0,0.0,2.0,3
6,3,1.0,1.0,0.0,0.0,0.0,2.0,3
6,4,1.0,1.0,0.0,0.0,0.0,2.0,3


In [38]:
# For every record on the first index level, collect the records it was matched
# with (second index level). A single pass over the pair index replaces one
# .loc lookup per pair row, which repeated the same lookup for every pair of a
# record and only produced duplicates that were dropped afterwards.
print(len(matches_train))
partners_by_record = {}
for first_id, second_id in matches_train.index:
    partners_by_record.setdefault(first_id, []).append(second_id)
l = list(partners_by_record.values())

# One row per partner list, padded with NaN up to the longest list;
# identical lists are kept only once.
df = pd.DataFrame(l)
df = df.drop_duplicates()
# Extra last column: number of NaN padding cells in the row, from which the
# real list length can be recovered.
df[len(df.columns)] = df.isnull().sum(axis=1)
no_duplicates = df.values.tolist()


389564


In [39]:
# Map each record index to a group number. Rows are numbered in order and a
# later row overwrites the group number of any record it shares with an
# earlier row.
d = {}
n = 0
for row in no_duplicates:
    # Last cell holds the NaN-padding count; the cells before it are the
    # members followed by the padding.
    member_count = int(len(row) - row[-1] - 1)
    for member in row[:member_count]:
        d[member] = n
    n += 1

In [40]:
# Keep only the records that received a group number and attach it as 'id'.
in_group = final.index.isin(d)
m_train = final[in_group].copy(deep=True)
m_train['id'] = m_train.index.map(d)

In [41]:
# Preview the grouped training records together with their group id.
m_train.head(10)

Unnamed: 0,name,country,sector,founded,marketcap,revenue,employees,links,ceo,id
3,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIÓN,SPAIN,,0.0,870000000.0,0.0,0.0,,,4
4,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIÓN,SPAIN,,0.0,820000000.0,0.0,0.0,,,4
5,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIÓN,SPAIN,SUPERMARKETS FOOD RETAIL,0.0,870000000.0,0.0,0.0,,,4
6,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIÓN,SPAIN,,0.0,820000000.0,0.0,0.0,,,4
7,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIÓN,SPAIN,,0.0,880000000.0,0.0,0.0,,,4
9,(HLBANK) HONG LEONG BANK,MALAYSIA,,0.0,9720000000.0,0.0,0.0,,,9
10,(HLBANK) HONG LEONG BANK,MALAYSIA,,0.0,9700000000.0,0.0,0.0,,,9
11,(HLBANK) HONG LEONG BANK,MALAYSIA,,0.0,10020000000.0,0.0,0.0,,,9
12,(HLBANK) HONG LEONG BANK,MALAYSIA,BANKS FINANCIAL-SERVICES,0.0,9970000000.0,0.0,0.0,,,9
13,(HLBANK) HONG LEONG BANK,MALAYSIA,,0.0,9970000000.0,0.0,0.0,,,9


In [42]:
# Collapse every group to one row, taking the 'first' aggregate of each column.
kept_columns = ['name', 'country', 'sector', 'founded', 'marketcap',
                'revenue', 'employees', 'links', 'ceo']
m_train = m_train.groupby(['id']).agg(dict.fromkeys(kept_columns, 'first'))


In [43]:
# m_train.to_csv('src/train_companies.csv')

### Test set

In [44]:
# Preview the test pairs that were labelled as matches.
matches_test.head()

Unnamed: 0,Unnamed: 1,name,country,sector,links,ceo,score,null_values
137955,137954,1.0,1.0,0.0,0.0,0.0,2.0,3
137956,137953,1.0,1.0,0.0,0.0,0.0,2.0,3
137956,137954,1.0,1.0,0.0,0.0,0.0,2.0,3
137956,137955,1.0,1.0,0.0,0.0,0.0,2.0,3
137957,137953,1.0,1.0,0.0,0.0,0.0,2.0,3


In [45]:
# For every record on the first index level, collect the records it was matched
# with (second index level). A single pass over the pair index replaces one
# .loc lookup per pair row, which repeated the same lookup for every pair of a
# record and only produced duplicates that were dropped afterwards.
print(len(matches_test))
partners_by_record = {}
for first_id, second_id in matches_test.index:
    partners_by_record.setdefault(first_id, []).append(second_id)
l = list(partners_by_record.values())

# One row per partner list, padded with NaN up to the longest list;
# identical lists are kept only once.
df = pd.DataFrame(l)
df = df.drop_duplicates()
# Extra last column: number of NaN padding cells in the row, from which the
# real list length can be recovered.
df[len(df.columns)] = df.isnull().sum(axis=1)
no_duplicates = df.values.tolist()


97478


In [46]:
# Map each record index to a group number. Rows are numbered in order and a
# later row overwrites the group number of any record it shares with an
# earlier row.
d = {}
n = 0
for row in no_duplicates:
    # Last cell holds the NaN-padding count; the cells before it are the
    # members followed by the padding.
    member_count = int(len(row) - row[-1] - 1)
    for member in row[:member_count]:
        d[member] = n
    n += 1

In [47]:
# Keep only the records that received a group number and attach it as 'id'.
in_group = final.index.isin(d)
m_test = final[in_group].copy(deep=True)
m_test['id'] = m_test.index.map(d)

In [48]:
# Preview the grouped test records together with their group id.
m_test.head()

Unnamed: 0,name,country,sector,founded,marketcap,revenue,employees,links,ceo,id
137953,"SICHUAN HONGDA CO.,LTD",CHINA,,0.0,0.0,0.0,0.0,,,2
137954,"SICHUAN HONGDA CO.,LTD",CHINA,,0.0,0.0,0.0,0.0,,,2
137955,"SICHUAN HONGDA CO.,LTD",CHINA,,0.0,0.0,0.0,0.0,HTTP://WWW.SICHUANHONGDA.COM/,,2
137956,"SICHUAN HONGDA CO.,LTD",CHINA,,0.0,1019000000.0,0.0,0.0,,,2
137958,"SICHUAN INJET ELECTRIC STOCK CO.,LTD.",CHINA,,0.0,1234000000.0,0.0,0.0,,,6


In [49]:
# Collapse every group to one row, taking the 'first' aggregate of each column.
kept_columns = ['name', 'country', 'sector', 'founded', 'marketcap',
                'revenue', 'employees', 'links', 'ceo']
m_test = m_test.groupby(['id']).agg(dict.fromkeys(kept_columns, 'first'))


In [50]:
# m_test.to_csv('src/test_companies.csv')

In [51]:
# Stack the collapsed train and test companies into one table.
# NOTE(review): both halves number their groups from 0, so the 'id' index is
# not unique after this step.
new_companies = pd.concat([m_train, m_test])

In [52]:
# Display the combined table.
new_companies

Unnamed: 0_level_0,name,country,sector,founded,marketcap,revenue,employees,links,ceo
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIÓN,SPAIN,SUPERMARKETS FOOD RETAIL,0.0,8.700000e+08,0.0,0.0,,
9,(HLBANK) HONG LEONG BANK,MALAYSIA,BANKS FINANCIAL-SERVICES,0.0,9.720000e+09,0.0,0.0,,
10,0X,UNITED STATES,FINTECH,0.0,0.000000e+00,0.0,0.0,,
15,1&1,GERMANY,,0.0,2.430000e+09,0.0,0.0,,
21,1&1 AG,GERMANY,COMMUNICATION SERVICES,0.0,0.000000e+00,4312000.0,3163.0,HTTPS://WWW.1UND1-DRILLISCH.DE,MR. RALPH DOMMERMUTH
...,...,...,...,...,...,...,...,...,...
23166,ŽELEZNIČNÁ SPOLOČNOSŤ SLOVENSKO,BRATISLAVA,RAILROADS,2004.0,0.000000e+00,0.0,0.0,,
23167,ŽIA VALDA,VILNIUS,INVESTMENT SERVICES,1997.0,0.000000e+00,0.0,0.0,,
23168,ŽIVNOSTENSKÁ BANKA,PRAGUE,BANKS,1868.0,0.000000e+00,0.0,0.0,,
23169,ȚIRIAC HOLDINGS,BUCHAREST,-,1990.0,0.000000e+00,0.0,0.0,,


In [53]:
# Turn zeros into NaN, then move the 'id' index into a column and discard it.
new_companies = (
    new_companies
    .replace(0, np.nan)
    .reset_index()
    .drop('id', axis=1)
)

# Store the numeric columns with the nullable integer dtype 'Int64'.
for numeric_col in ('founded', 'marketcap', 'revenue', 'employees'):
    new_companies[numeric_col] = new_companies[numeric_col].astype(object).astype('Int64')

In [54]:
# Write the linked table to disk. No `index` argument is passed, so the
# pandas default applies.
new_companies.to_csv('src/new_dataset.csv')