## Record Linkage

In [59]:
!pip install recordlinkage --q

In [60]:
import numpy as np 
import pandas as pd
import recordlinkage
import time

In [61]:
t_start = time.time()

In [62]:
final = pd.read_csv("src/aligned_dataset.csv")
final = final.drop(columns=['Unnamed: 0'])
final.head()

Unnamed: 0,name,country,sector,founded,marketcap,revenue,employees,website,ceo
0,LACEWORK,UNITED STATES,CYBERSECURITY,2015.0,,,,,
1,TIPALTI,UNITED STATES,FINTECH,2010.0,,,,,
2,TEMPUS,UNITED STATES,HEALTH,2015.0,,,,,
3,ANDURIL,UNITED STATES,ARTIFICIAL INTELLIGENCE,2017.0,,,,,
4,BOLT,ESTONIA,AUTO TRANSPORTATION,2013.0,,,,,


In [63]:
final = final.sort_values('name')
final = final.reset_index(drop=True)
final.head()

Unnamed: 0,name,country,sector,founded,marketcap,revenue,employees,website,ceo
0,0044 LIMITED,UNITED KINGDOM,,,,,,,
1,0X,UNITED STATES,FINTECH,2016.0,,,,,
2,0X,UNITED STATES,FINTECH,,,,,,
3,1 1,GERMANY,,,2320000000.0,,,,
4,1 1,GERMANY,,,2270000000.0,,,,


In [64]:
# final.to_csv("./src/sorted_final_dataset.csv")

In [65]:
final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 173033 entries, 0 to 173032
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   name       173033 non-null  object 
 1   country    166096 non-null  object 
 2   sector     49000 non-null   object 
 3   founded    40090 non-null   float64
 4   marketcap  75005 non-null   float64
 5   revenue    51370 non-null   float64
 6   employees  42602 non-null   float64
 7   website    16799 non-null   object 
 8   ceo        21160 non-null   object 
dtypes: float64(4), object(5)
memory usage: 11.9+ MB


## Index

In [66]:
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

In [67]:
from recordlinkage.base import BaseIndexAlgorithm
class CustomIndex(BaseIndexAlgorithm):
    def _link_index(self, df_a, df_b):
        t0 = time.time()
        print(len(df_a))
        indici1=[]
        indici2=[]
        x = 50
        for i in range(0, len(df_a)):
            if(i%10000 == 0):
                print(i)
            if (i + x) >= len(df_a):
                    x -= 1
            for j in range(i, (i+x)):
                if(similar(df_a.loc[i, 'name'], df_a.loc[j, 'name'])>0.35):
                    indici1.append(i)
                    indici2.append(j)
        indici = [indici1, indici2]
        t1 = time.time()
        print(t1-t0)
        return pd.MultiIndex.from_arrays(indici, names=('first', 'second'))  

In [68]:
indexer = CustomIndex()
candidate_pairs = indexer.index(final, final)
pairs = candidate_pairs

173033
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
379.6686599254608


In [69]:
pairs

MultiIndex([(     0,      0),
            (     0,     17),
            (     0,     18),
            (     0,     19),
            (     0,     20),
            (     0,     21),
            (     0,     22),
            (     0,     23),
            (     0,     24),
            (     0,     32),
            ...
            (173027, 173030),
            (173028, 173028),
            (173028, 173029),
            (173028, 173030),
            (173029, 173029),
            (173029, 173030),
            (173029, 173031),
            (173030, 173030),
            (173030, 173031),
            (173031, 173031)],
           length=5582851)

In [70]:
# 80% training set, 20% test set
n_perc = int((len(pairs) * 80) / 100)
train_pairs = pairs[:n_perc]
test_pairs = pairs[n_perc:]

## Training 

In [71]:
from recordlinkage.base import BaseCompareFeature
class CompareNumbers(BaseCompareFeature):
    # Similarity score between two numbers
    def _compute_vectorized(self, s1, s2):
        s1 = abs(s1)
        s2 = abs(s2)
        return round(1 - abs(s1 - s2) / (s1 + s2))

In [72]:
# jarowinkler gives priority to the begining of the string
# levenshtein cares more about the order
# compare.string -> default: levenshtein
compare = recordlinkage.Compare()

compare.string('name', 'name', label="name", threshold=0.60)
compare.string('country', 'country', label="country", threshold=0.60, missing_value=np.nan)
compare.string('sector', 'sector', label="sector", threshold=0.50, missing_value=np.nan)
compare.string('website', 'website', method='jarowinkler', label="website", threshold=0.50, missing_value=np.nan)
compare.string('ceo', 'ceo', method='jarowinkler', label="ceo", threshold=0.60, missing_value=np.nan)

training_features = compare.compute(train_pairs, final, final)
training_features['score'] = training_features.loc[:, 'name':'ceo'].sum(axis=1)
training_features['null_values'] = training_features.loc[:, 'name':'ceo'].isnull().sum(axis=1)

In [73]:
training_features.head(10)

Unnamed: 0,Unnamed: 1,name,country,sector,website,ceo,score,null_values
0,0,1.0,1.0,,,,2.0,3
0,17,0.0,0.0,,,,0.0,3
0,18,0.0,0.0,,,,0.0,3
0,19,0.0,0.0,,,,0.0,3
0,20,0.0,0.0,,,,0.0,3
0,21,0.0,,,,,0.0,4
0,22,0.0,0.0,,,,0.0,3
0,23,0.0,0.0,,,,0.0,3
0,24,0.0,0.0,,,,0.0,3
0,32,0.0,0.0,,,,0.0,3


In [74]:
col = len(final.columns) - 4
training_matches = training_features[training_features['score'] >= (col - training_features['null_values'])/2]
matches_train = training_matches
training_matches = training_matches.reset_index()

In [75]:
# toDrop = ['name', 'country', 'sector', 'founded', 'marketcap', 'revenue', 'employees', 'website', 'ceo', 'score', 'null_values']
toDrop = ['name', 'country', 'sector', 'website', 'ceo', 'score', 'null_values']
training_matches = training_matches.drop(toDrop, axis=1)
training_matches.head()

Unnamed: 0,level_0,level_1
0,0,0
1,1,1
2,1,2
3,2,2
4,3,3


In [76]:
training_matches = pd.MultiIndex.from_frame(training_matches)

In [77]:
training_matches

MultiIndex([(     0,      0),
            (     1,      1),
            (     1,      2),
            (     2,      2),
            (     3,      3),
            (     3,      4),
            (     3,      5),
            (     3,      6),
            (     3,      7),
            (     3,      8),
            ...
            (138405, 138414),
            (138406, 138406),
            (138406, 138407),
            (138406, 138408),
            (138406, 138409),
            (138406, 138410),
            (138406, 138411),
            (138406, 138412),
            (138406, 138413),
            (138406, 138414)],
           names=['level_0', 'level_1'], length=1614487)

In [78]:
def createDictionary(matches):
    d = {}
    ignoreSet = set()
    for k, v in matches:
        if k not in d.keys() and k not in ignoreSet:
            d[k] = [k]
            ignoreSet.add(k)
        elif k in d.keys() and v not in ignoreSet:
            d[k].append(v)
            ignoreSet.add(v)
    return d
            

In [79]:
def collapseMatches(df, dict_matches):
    df_collapsed = pd.DataFrame(columns=['name', 'country', 'sector', 'founded', 'marketcap', 'revenue', 'employees', 'website', 'ceo'])
    indexesToDelete = []
    i = 0
    for k in dict_matches.keys():
        if(i%10000==0):
            print(i)
        i+=1
        indexesToDelete.append(k)
        row = df[df.index == k]
        for v in dict_matches[k]:
            indexesToDelete.append(v)
            for field in row:
                if(row[field].isnull().values.any()):
                    if(not df[df.index == v][field].isnull().values.any()):
                        row.at[k, field] = df[df.index==v][field].values[0]
        df_collapsed.loc[len(df_collapsed.index)]=row.values[0]
    return df_collapsed, indexesToDelete

In [80]:
dictionary = createDictionary(training_matches)

In [81]:
df_collapsed_train, indexesToDeleteGlobal = collapseMatches(final, dictionary)
#df_collapsed_train.to_csv('./src/train_matches_collapsed.csv', index=False)

0
10000
20000


In [82]:
df_collapsed_train.head(10)

Unnamed: 0,name,country,sector,founded,marketcap,revenue,employees,website,ceo
0,0044 LIMITED,UNITED KINGDOM,,,,,,,
1,0X,UNITED STATES,FINTECH,2016.0,,,,,
2,1 1,GERMANY,TELECOMMUNICATION INTERNET,1983.0,2320000000.0,4312000.0,3163.0,HTTPS://WWW.1UND1-DRILLISCH.DE,MR RALPH DOMMERMUTH
3,1 800 FLOWERS,USA,E COMMERCE INTERNET TECH,,650000000.0,1860000000.0,4700.0,HTTPS://WWW.1800FLOWERS.COM/,
4,1 INDIA FAMILY MART,INDIA,RETAIL,2012.0,,,,,
5,10 OR,TAIPEI,SMARTPHONES,2004.0,,,,,MR CHI KUAN YANG
6,100 PLUS,,SPORTS DRINK,,,,,,
7,1000MERCIS,FRANCE,,,,,386.0,,MS YSEULYS COSTES
8,1047 GAMES,UNITED STATES,INTERNET SOFTWARE SERVICES,2017.0,,,1239.0,,DR SERGE SAXONOV PH D
9,10DSPACEO 14318875,,,,,,,,


In [83]:
training_features = training_features.fillna(0)
training_features.head()

Unnamed: 0,Unnamed: 1,name,country,sector,website,ceo,score,null_values
0,0,1.0,1.0,0.0,0.0,0.0,2.0,3
0,17,0.0,0.0,0.0,0.0,0.0,0.0,3
0,18,0.0,0.0,0.0,0.0,0.0,0.0,3
0,19,0.0,0.0,0.0,0.0,0.0,0.0,3
0,20,0.0,0.0,0.0,0.0,0.0,0.0,3


#### Classifier

In [84]:
classifier = recordlinkage.NaiveBayesClassifier()
classifier.fit(training_features, training_matches)

## Testing

In [85]:
compare = recordlinkage.Compare()

compare.string('name', 'name', label="name", threshold=0.60)
compare.string('country', 'country', label="country", threshold=0.60, missing_value=np.nan)
compare.string('sector', 'sector', label="sector", threshold=0.50, missing_value=np.nan)
compare.string('website', 'website', method='jarowinkler', label="website", threshold=0.50, missing_value=np.nan)
compare.string('ceo', 'ceo', method='jarowinkler', label="ceo", threshold=0.60, missing_value=np.nan)

test_features = compare.compute(test_pairs, final, final)
test_features['score'] = test_features.loc[:, 'name':'ceo'].sum(axis=1)
test_features['null_values'] = test_features.loc[:, 'name':'ceo'].isnull().sum(axis=1)

In [86]:
col = len(final.columns) - 4
test_matches = test_features[test_features['score'] >= (col - test_features['null_values'])/2]
matches_test = test_matches
test_matches = test_matches.reset_index()

In [87]:
# toDrop = ['name', 'country', 'sector', 'founded', 'marketcap', 'revenue', 'employees', 'website', 'ceo', 'score', 'null_values']
toDrop = ['name', 'country', 'sector', 'website', 'ceo', 'score', 'null_values']
test_matches = test_matches.drop(toDrop, axis=1)
test_matches.head()

Unnamed: 0,level_0,level_1
0,138407,138407
1,138407,138408
2,138407,138409
3,138407,138410
4,138407,138411


In [88]:
test_matches = pd.MultiIndex.from_frame(test_matches)

In [89]:
dictionary_test = createDictionary(test_matches)

In [90]:
df_collapsed_test, indexesToDeleteTest = collapseMatches(final, dictionary_test)
#df_collapsed_test.to_csv('./src/test_matches_collapsed.csv', index=False)

0


In [91]:
test_features = test_features.fillna(0)
test_features.head()

Unnamed: 0,Unnamed: 1,name,country,sector,website,ceo,score,null_values
138406,138415,0.0,0.0,0.0,0.0,0.0,0.0,3
138406,138416,0.0,0.0,0.0,0.0,0.0,0.0,3
138406,138417,0.0,0.0,0.0,0.0,0.0,0.0,3
138406,138418,0.0,0.0,0.0,0.0,0.0,0.0,3
138406,138419,0.0,0.0,0.0,0.0,0.0,0.0,3


In [92]:
predictions = classifier.predict(test_features)

## Evaluation

In [93]:
# Confusion matrix
confusion_matrix = recordlinkage.confusion_matrix(test_matches, predictions, len(test_features))
print(confusion_matrix)


# Metrics
recall = recordlinkage.recall(test_matches, predictions)
print('\nRecall:', recall)
precision = recordlinkage.precision(test_matches, predictions)
print('Precision:', precision)
fscore = recordlinkage.fscore(confusion_matrix)
print('F-score:', fscore)
accuracy = recordlinkage.accuracy(test_matches, predictions, len(test_features))
print('Accuracy:', accuracy)

[[398934      0]
 [ 21868 695769]]

Recall: 1.0
Precision: 0.9480325663851408
F-score: 0.9733231186625938
Accuracy: 0.980415038542108


In [94]:
false_negatives = test_matches.difference(predictions)
false_negatives

MultiIndex([], )

In [95]:
try:
    fn_from_dfA = false_negatives[0][0]
    fn_from_dfB = false_negatives[0][1]

    display(final[final.index == fn_from_dfA])
    display(final[final.index == fn_from_dfB])
except:
    print("No False Negatives Present")

No False Negatives Present


In [96]:
try: 
    fn_from_dfA = false_negatives[1][0]
    fn_from_dfB = false_negatives[1][1]

    display(final[final.index == fn_from_dfA])
    display(final[final.index == fn_from_dfB])
except: 
    print("No False Negatives Present")

No False Negatives Present


## Linking tables

In [97]:
new_companies = pd.concat([df_collapsed_train, df_collapsed_test])

In [98]:
new_companies = new_companies.reset_index()

In [99]:
new_companies = new_companies.drop(columns=['index'])

In [100]:
new_companies.head()

Unnamed: 0,name,country,sector,founded,marketcap,revenue,employees,website,ceo
0,0044 LIMITED,UNITED KINGDOM,,,,,,,
1,0X,UNITED STATES,FINTECH,2016.0,,,,,
2,1 1,GERMANY,TELECOMMUNICATION INTERNET,1983.0,2320000000.0,4312000.0,3163.0,HTTPS://WWW.1UND1-DRILLISCH.DE,MR RALPH DOMMERMUTH
3,1 800 FLOWERS,USA,E COMMERCE INTERNET TECH,,650000000.0,1860000000.0,4700.0,HTTPS://WWW.1800FLOWERS.COM/,
4,1 INDIA FAMILY MART,INDIA,RETAIL,2012.0,,,,,


In [101]:
# new_companies.to_csv('src/first_new_dataset.csv')

## Seconda passata di Matching

In [102]:
indexer = CustomIndex()
candidate_pairs2 = indexer.index(new_companies, new_companies)
pairs2 = candidate_pairs2

35043
0
10000
20000
30000
82.35908126831055


In [103]:
pairs2

MultiIndex([(    0,     0),
            (    0,     3),
            (    0,     6),
            (    0,     8),
            (    0,    13),
            (    0,    15),
            (    0,    26),
            (    0,    38),
            (    0,    39),
            (    1,     1),
            ...
            (35037, 35037),
            (35037, 35038),
            (35037, 35040),
            (35038, 35038),
            (35038, 35040),
            (35039, 35039),
            (35039, 35041),
            (35040, 35040),
            (35040, 35041),
            (35041, 35041)],
           length=711633)

In [104]:
compare = recordlinkage.Compare()
compare.string('name', 'name', label="name", threshold=0.60)

features = compare.compute(pairs2, new_companies, new_companies)

In [105]:
features.head()

Unnamed: 0,Unnamed: 1,name
0,0,1.0
0,3,0.0
0,6,0.0
0,8,0.0
0,13,0.0


In [106]:
matches = features[features['name'] == 1]
matches = matches.drop(columns = ['name'])
matches = matches.reset_index()

In [107]:
matches.head()

Unnamed: 0,level_0,level_1
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4


In [108]:
matches_index = pd.MultiIndex.from_frame(matches)

In [109]:
matches_index

MultiIndex([(    0,     0),
            (    1,     1),
            (    2,     2),
            (    3,     3),
            (    4,     4),
            (    5,     5),
            (    6,     6),
            (    7,     7),
            (    8,     8),
            (    9,     9),
            ...
            (35032, 35032),
            (35033, 35033),
            (35034, 35034),
            (35035, 35035),
            (35036, 35036),
            (35037, 35037),
            (35038, 35038),
            (35039, 35039),
            (35040, 35040),
            (35041, 35041)],
           names=['level_0', 'level_1'], length=41759)

In [110]:
dictionary_final = createDictionary(matches_index)

In [111]:
collapsed_df_final, indexesToDeleteFinal = collapseMatches(new_companies, dictionary_final)

0
10000
20000
30000


In [115]:
collapsed_df_final.founded = collapsed_df_final.founded.astype(object).astype('Int64')
collapsed_df_final.marketcap = collapsed_df_final.marketcap.astype(object).astype('Int64')
collapsed_df_final.revenue = collapsed_df_final.revenue.astype(object).astype('Int64')
collapsed_df_final.employees = collapsed_df_final.employees.astype(object).astype('Int64')

In [116]:
collapsed_df_final.to_csv('./src/linked_dataset.csv')

In [113]:
t_end = time.time()
print(t_end-t_start)

970.8166949748993
