## Record Linkage

In [1]:
!pip install recordlinkage --q

In [2]:
import numpy as np 
import pandas as pd
import recordlinkage

In [3]:
final = pd.read_csv("src/final_dataset.csv")
final = final.drop(columns=['Unnamed: 0'])
final.head()

Unnamed: 0,name,country,sector,founded,marketcap,revenue,employees,links,ceo
0,LACEWORK,UNITED STATES,CYBERSECURITY,2015.0,,,,,
1,TIPALTI,UNITED STATES,FINTECH,2010.0,,,,,
2,TEMPUS,UNITED STATES,HEALTH,2015.0,,,,,
3,ANDURIL,UNITED STATES,ARTIFICIAL INTELLIGENCE,2017.0,,,,,
4,BOLT,ESTONIA,AUTO & TRANSPORTATION,2013.0,,,,,


In [4]:
final = final.sort_values('name')
final = final.reset_index(drop=True)
final.head()

Unnamed: 0,name,country,sector,founded,marketcap,revenue,employees,links,ceo
0,ADDUS HOMECARE,USA,,,1690000000.0,,,,
1,#SINOB,GERMANY,RETAIL,2015.0,,4.0,14.0,HTTPS://WWW.SINOB.DE/,
2,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIN,SPAIN,,,870000000.0,,,,
3,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIÓN,SPAIN,,,870000000.0,,,,
4,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIÓN,SPAIN,,,820000000.0,,,,


In [5]:
final.isnull().sum()

name              0
country        6933
sector       122786
founded      132943
marketcap     98028
revenue      121663
employees    130431
links        156234
ceo          151873
dtype: int64

In [6]:
for c in final.columns:
    print(final[c].dtype)
    if final[c].dtype == float:
        final[c] = final[c].replace(np.nan, 0)

object
object
object
float64
float64
float64
float64
object
object


In [7]:
final.isnull().sum()

name              0
country        6933
sector       122786
founded           0
marketcap         0
revenue           0
employees         0
links        156234
ceo          151873
dtype: int64

In [8]:
final.head()

Unnamed: 0,name,country,sector,founded,marketcap,revenue,employees,links,ceo
0,ADDUS HOMECARE,USA,,0.0,1690000000.0,0.0,0.0,,
1,#SINOB,GERMANY,RETAIL,2015.0,0.0,4.0,14.0,HTTPS://WWW.SINOB.DE/,
2,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIN,SPAIN,,0.0,870000000.0,0.0,0.0,,
3,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIÓN,SPAIN,,0.0,870000000.0,0.0,0.0,,
4,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIÓN,SPAIN,,0.0,820000000.0,0.0,0.0,,


In [9]:
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173033 entries, 0 to 173032
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   name       173033 non-null  object 
 1   country    166100 non-null  object 
 2   sector     50247 non-null   object 
 3   founded    173033 non-null  float64
 4   marketcap  173033 non-null  float64
 5   revenue    173033 non-null  float64
 6   employees  173033 non-null  float64
 7   links      16799 non-null   object 
 8   ceo        21160 non-null   object 
dtypes: float64(4), object(5)
memory usage: 11.9+ MB


In [10]:
final.head()

Unnamed: 0,name,country,sector,founded,marketcap,revenue,employees,links,ceo
0,ADDUS HOMECARE,USA,,0.0,1690000000.0,0.0,0.0,,
1,#SINOB,GERMANY,RETAIL,2015.0,0.0,4.0,14.0,HTTPS://WWW.SINOB.DE/,
2,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIN,SPAIN,,0.0,870000000.0,0.0,0.0,,
3,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIÓN,SPAIN,,0.0,870000000.0,0.0,0.0,,
4,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIÓN,SPAIN,,0.0,820000000.0,0.0,0.0,,


## Block Index

In [11]:
indexer = recordlinkage.Index()
indexer.block('name')
pairs = indexer.index(final, final)

In [12]:
print(pairs)

MultiIndex([(     0,      0),
            (     1,      1),
            (     2,      2),
            (     3,      3),
            (     3,      4),
            (     3,      5),
            (     3,      6),
            (     3,      7),
            (     3,      8),
            (     4,      3),
            ...
            (173029, 173027),
            (173029, 173028),
            (173029, 173029),
            (173029, 173030),
            (173030, 173027),
            (173030, 173028),
            (173030, 173029),
            (173030, 173030),
            (173031, 173031),
            (173032, 173032)],
           length=1155913)


In [13]:
# 80% training set, 20% test set
n_perc = int((len(pairs) * 80) / 100)
train_pairs = pairs[:n_perc]
test_pairs = pairs[n_perc:]

## Training 

In [14]:
from recordlinkage.base import BaseCompareFeature
class CompareNumbers(BaseCompareFeature):
    # Similarity score between two numbers
    def _compute_vectorized(self, s1, s2):
        s1 = abs(s1)
        s2 = abs(s2)
        return round(1 - abs(s1 - s2) / (s1 + s2))

In [15]:
# jarowinkler gives priority to the begining of the string
# levenshtein cares more about the order
# compare.string -> default: levenshtein
compare = recordlinkage.Compare()

compare.string('name', 'name', label="name", threshold=0.90)
compare.string('country', 'country', label="country", threshold=0.90)
compare.string('sector', 'sector', label="sector", threshold=0.80)
# compare.add(CompareNumbers('founded', 'founded', label="founded"))
# compare.add(CompareNumbers('marketcap', 'marketcap', label="marketcap"))
# compare.add(CompareNumbers('revenue', 'revenue', label="revenue"))
# compare.add(CompareNumbers('employees', 'employees', label="employees"))
compare.string('links', 'links', method='jarowinkler', label="links", threshold=0.80)
compare.string('ceo', 'ceo', method='jarowinkler', label="ceo", threshold=0.90)

training_features = compare.compute(train_pairs, final, final)
training_features['score'] = training_features.loc[:, 'name':'ceo'].sum(axis=1)
training_features['null_values'] = training_features.loc[:, 'name':'ceo'].isnull().sum(axis=1)

In [16]:
training_features.head(10)

Unnamed: 0,Unnamed: 1,name,country,sector,links,ceo,score,null_values
0,0,1.0,1.0,0.0,0.0,0.0,2.0,0
1,1,1.0,1.0,1.0,1.0,0.0,4.0,0
2,2,1.0,1.0,0.0,0.0,0.0,2.0,0
3,3,1.0,1.0,0.0,0.0,0.0,2.0,0
3,4,1.0,1.0,0.0,0.0,0.0,2.0,0
3,5,1.0,1.0,0.0,0.0,0.0,2.0,0
3,6,1.0,1.0,0.0,0.0,0.0,2.0,0
3,7,1.0,1.0,0.0,0.0,0.0,2.0,0
3,8,1.0,1.0,0.0,0.0,0.0,2.0,0
4,3,1.0,1.0,0.0,0.0,0.0,2.0,0


In [17]:
# for c in training_features.columns:
#     training_features[c] = training_features[c].replace(np.nan, 0)

In [18]:
final.loc[3:8]

Unnamed: 0,name,country,sector,founded,marketcap,revenue,employees,links,ceo
3,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIÓN,SPAIN,,0.0,870000000.0,0.0,0.0,,
4,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIÓN,SPAIN,,0.0,820000000.0,0.0,0.0,,
5,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIÓN,SPAIN,SUPERMARKETS FOOD RETAIL,0.0,870000000.0,0.0,0.0,,
6,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIÓN,SPAIN,,0.0,820000000.0,0.0,0.0,,
7,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIÓN,SPAIN,,0.0,880000000.0,0.0,0.0,,
8,(DIA) DISTRIBUIDORA INTERNACIONAL DE ALIMENTACIÓN,SPAIN,,0.0,830000000.0,0.0,0.0,,


In [19]:
col = len(final.columns) - 4
training_matches = training_features[training_features['score'] > (col - training_features['null_values'])/2].reset_index()

In [20]:
# toDrop = ['name', 'country', 'sector', 'founded', 'marketcap', 'revenue', 'employees', 'links', 'ceo', 'score', 'null_values']
toDrop = ['name', 'country', 'sector', 'links', 'ceo', 'score', 'null_values']
training_matches = training_matches.drop(toDrop, axis=1)
training_matches.head()

Unnamed: 0,level_0,level_1
0,1,1
1,5,5
2,12,12
3,15,15
4,17,17


In [21]:
training_matches = pd.MultiIndex.from_frame(training_matches) #matches conversion

In [22]:
training_features = training_features.drop('null_values', axis=1)
training_features.head()

Unnamed: 0,Unnamed: 1,name,country,sector,links,ceo,score
0,0,1.0,1.0,0.0,0.0,0.0,2.0
1,1,1.0,1.0,1.0,1.0,0.0,4.0
2,2,1.0,1.0,0.0,0.0,0.0,2.0
3,3,1.0,1.0,0.0,0.0,0.0,2.0
3,4,1.0,1.0,0.0,0.0,0.0,2.0


#### Classifier

In [23]:
classifier = recordlinkage.NaiveBayesClassifier()
classifier.fit(training_features, training_matches)

## Testing

In [24]:
compare = recordlinkage.Compare()

compare.string('name', 'name', label="name", threshold=0.90)
compare.string('country', 'country', label="country", threshold=0.90)
compare.string('sector', 'sector', label="sector", threshold=0.80)
# compare.add(CompareNumbers('founded', 'founded', label="founded"))
# compare.add(CompareNumbers('marketcap', 'marketcap', label="marketcap"))
# compare.add(CompareNumbers('revenue', 'revenue', label="revenue"))
# compare.add(CompareNumbers('employees', 'employees', label="employees"))
compare.string('links', 'links', method='jarowinkler', label="links", threshold=0.80)
compare.string('ceo', 'ceo', method='jarowinkler', label="ceo", threshold=0.90)

test_features = compare.compute(test_pairs, final, final)
test_features['score'] = test_features.loc[:, 'name':'ceo'].sum(axis=1)
test_features['null_values'] = test_features.loc[:, 'name':'ceo'].isnull().sum(axis=1)

In [25]:
test_features.head()

Unnamed: 0,Unnamed: 1,name,country,sector,links,ceo,score,null_values
138044,138046,1.0,1.0,0.0,0.0,0.0,2.0,0
138044,138047,1.0,1.0,0.0,0.0,0.0,2.0,0
138044,138048,1.0,1.0,0.0,0.0,0.0,2.0,0
138045,138044,1.0,1.0,0.0,0.0,0.0,2.0,0
138045,138045,1.0,1.0,0.0,0.0,0.0,2.0,0


In [26]:
# for c in test_features.columns:
#     test_features[c] = test_features[c].replace(np.nan, 0)

In [27]:
col = len(final.columns) - 4
test_matches = test_features[test_features['score'] > (col - test_features['null_values'])/2].reset_index()

In [28]:
# toDrop = ['name', 'country', 'sector', 'founded', 'marketcap', 'revenue', 'employees', 'links', 'ceo', 'score', 'null_values']
toDrop = ['name', 'country', 'sector', 'links', 'ceo', 'score', 'null_values']
test_matches = test_matches.drop(toDrop, axis=1)
test_matches.head()

Unnamed: 0,level_0,level_1
0,138050,138050
1,138052,138052
2,138054,138054
3,138055,138055
4,138055,138056


In [29]:
test_matches = pd.MultiIndex.from_frame(test_matches) #matches conversion

In [30]:
test_features = test_features.drop('null_values', axis=1)
test_features.head()

Unnamed: 0,Unnamed: 1,name,country,sector,links,ceo,score
138044,138046,1.0,1.0,0.0,0.0,0.0,2.0
138044,138047,1.0,1.0,0.0,0.0,0.0,2.0
138044,138048,1.0,1.0,0.0,0.0,0.0,2.0
138045,138044,1.0,1.0,0.0,0.0,0.0,2.0
138045,138045,1.0,1.0,0.0,0.0,0.0,2.0


In [31]:
predictions = classifier.predict(test_features)

## Evaluation

In [32]:
# Confusion matrix
confusion_matrix = recordlinkage.confusion_matrix(test_matches, predictions, len(test_features))
print(confusion_matrix)


# Metrics
recall = recordlinkage.recall(test_matches, predictions)
print('\nRecall:', recall)
precision = recordlinkage.precision(test_matches, predictions)
print('Precision:', precision)
fscore = recordlinkage.fscore(confusion_matrix)
print('F-score:', fscore)
accuracy = recordlinkage.accuracy(test_matches, predictions, len(test_features))
print('Accuracy:', accuracy)

[[ 31320      0]
 [     0 199863]]

Recall: 1.0
Precision: 1.0
F-score: 1.0
Accuracy: 1.0


In [33]:
false_negatives = test_matches.difference(predictions)
false_negatives

MultiIndex([], )

In [34]:
try:
    fn_from_dfA = false_negatives[0][0]
    fn_from_dfB = false_negatives[0][1]

    display(final[final.index == fn_from_dfA])
    display(final[final.index == fn_from_dfB])
except:
    print("No False Negatives Present")

No False Negatives Present


In [35]:
try: 
    fn_from_dfA = false_negatives[1][0]
    fn_from_dfB = false_negatives[1][1]

    display(final[final.index == fn_from_dfA])
    display(final[final.index == fn_from_dfB])
except: 
    print("No False Negatives Present")

No False Negatives Present
