## Record Linkage

In [None]:
!pip install recordlinkage --q

In [None]:
import numpy as np 
import pandas as pd
import recordlinkage
import time

In [None]:
t_start = time.time()

In [None]:
# Load the aligned dataset and discard the 'Unnamed: 0' column
# (presumably an index saved by an earlier export -- TODO confirm).
final = pd.read_csv("src/aligned_dataset.csv").drop(columns=['Unnamed: 0'])
final.head()

In [None]:
# Order the records by name and renumber them 0..n-1: CustomIndex compares
# each record with the ones that follow it and addresses rows by these labels.
final = final.sort_values('name').reset_index(drop=True)
final.head()

In [None]:
# final.to_csv("./src/sorted_final_dataset.csv")

In [None]:
final.info()

## Index

In [None]:
from difflib import SequenceMatcher

def similar(a, b):
    """Return the difflib similarity ratio between sequences *a* and *b*.

    The value is ``SequenceMatcher.ratio()``: 1.0 for identical sequences,
    0.0 when they have nothing in common.
    """
    matcher = SequenceMatcher(a=a, b=b)
    return matcher.ratio()

In [None]:
from recordlinkage.base import BaseIndexAlgorithm
class CustomIndex(BaseIndexAlgorithm):
    """Index algorithm pairing each record with the records that follow it.

    Every record of ``df_a`` is compared, by ``name``, with itself and with
    the records at the next positions (50 candidates at most). A pair is kept
    when the difflib similarity of the two names is above 0.35.
    """

    def _link_index(self, df_a, df_b):
        """Build the candidate pairs for ``df_a``.

        Parameters
        ----------
        df_a : pandas.DataFrame
            Records to pair. Rows are addressed with the labels 0..len-1, so
            the frame must carry a default integer index; it needs a ``name``
            column.
        df_b : pandas.DataFrame
            Ignored: pairs are formed inside ``df_a`` only.

        Returns
        -------
        pandas.MultiIndex
            Two levels, ``first`` and ``second``, with ``first <= second``.
        """
        t0 = time.time()
        n_records = len(df_a)
        print(n_records)

        window = 50            # candidates per record, the record itself included
        min_similarity = 0.35  # pairs must score strictly above this
        names = df_a['name']

        indici1 = []
        indici2 = []
        for i in range(n_records):
            if i % 10000 == 0:
                print(i)
            name_i = names.loc[i]
            # Clamp the window to the end of the frame. The previous shrinking
            # counter stopped one row early, so the last record was never
            # paired (not even with itself), and frames shorter than the
            # window raised KeyError on out-of-range labels.
            for j in range(i, min(i + window, n_records)):
                if similar(name_i, names.loc[j]) > min_similarity:
                    indici1.append(i)
                    indici2.append(j)

        t1 = time.time()
        print(t1 - t0)
        return pd.MultiIndex.from_arrays([indici1, indici2], names=('first', 'second'))

In [None]:
# Build the candidate pairs by indexing the dataset against itself.
indexer = CustomIndex()
candidate_pairs = indexer.index(final, final)
# 'pairs' is the name used by the train/test split below.
pairs = candidate_pairs

In [None]:
pairs

In [None]:
# 80% training set, 20% test set.
# The split is positional: the first 80% of the candidate pairs, in the order
# produced by the indexer, go to training and the remainder to testing.
n_perc = int(len(pairs) * 80 / 100)
train_pairs, test_pairs = pairs[:n_perc], pairs[n_perc:]

## Training 

In [None]:
from recordlinkage.base import BaseCompareFeature
class CompareNumbers(BaseCompareFeature):
    """Comparison feature scoring how close two numbers are."""

    def _compute_vectorized(self, s1, s2):
        """Return the rounded relative closeness of *s1* and *s2*.

        Signs are ignored. The score is ``1 - |a - b| / (a + b)`` computed on
        the absolute values and then rounded with ``round``.

        NOTE(review): when both values are zero the denominator is zero;
        the outcome then depends on the input type (presumably NaN for
        pandas/NumPy inputs) -- confirm before using this feature.
        """
        magnitude_a, magnitude_b = abs(s1), abs(s2)
        relative_gap = abs(magnitude_a - magnitude_b) / (magnitude_a + magnitude_b)
        return round(1 - relative_gap)

In [None]:
# jarowinkler gives priority to the beginning of the string
# levenshtein cares more about the order
# compare.string -> default: levenshtein
compare = recordlinkage.Compare()

# One entry per compared field: the field is compared with itself, the result
# column is labelled after the field, and the dict holds the extra options.
string_rules = [
    ('name', {'threshold': 0.60}),
    ('country', {'threshold': 0.60, 'missing_value': np.nan}),
    ('sector', {'threshold': 0.50, 'missing_value': np.nan}),
    ('website', {'method': 'jarowinkler', 'threshold': 0.50, 'missing_value': np.nan}),
    ('ceo', {'method': 'jarowinkler', 'threshold': 0.60, 'missing_value': np.nan}),
]
for field_name, options in string_rules:
    compare.string(field_name, field_name, label=field_name, **options)

training_features = compare.compute(train_pairs, final, final)

# 'score' adds up the comparison results of a pair; 'null_values' counts how
# many comparisons were left missing (NaN).
compared_columns = training_features.loc[:, 'name':'ceo']
training_features['score'] = compared_columns.sum(axis=1)
training_features['null_values'] = compared_columns.isnull().sum(axis=1)

In [None]:
training_features.head(10)

In [None]:
# Dataset columns minus 4 -- presumably the number of compared string fields
# (TODO confirm against the dataset schema).
col = len(final.columns) - 4

# A pair counts as a match when its score reaches half of the fields that
# could actually be compared (col minus the missing comparisons).
required_score = (col - training_features['null_values']) / 2
matches_train = training_features[training_features['score'] >= required_score]
training_matches = matches_train.reset_index()

In [None]:
# Remove the feature and helper columns so that only the columns created by
# reset_index() (the two sides of each pair) are left.
toDrop = ['name', 'country', 'sector', 'website', 'ceo', 'score', 'null_values']
training_matches = training_matches.drop(columns=toDrop)
training_matches.head()

In [None]:
training_matches = pd.MultiIndex.from_frame(training_matches)

In [None]:
training_matches

In [None]:
def createDictionary(matches):
    """Group matched pairs into clusters keyed by their first record.

    Parameters
    ----------
    matches : iterable of (k, v) pairs
        Matched record labels, e.g. a two-level ``pandas.MultiIndex``.

    Returns
    -------
    dict
        Maps each cluster key ``k`` to a list that starts with ``k`` itself,
        followed by the records merged into it, in order of appearance.

    Notes
    -----
    A record belongs to at most one cluster. Once a record has been absorbed
    into a cluster, later pairs that use it as the key are skipped, so
    matches are not followed transitively.
    """
    d = {}
    ignoreSet = set()  # records already assigned to some cluster
    for k, v in matches:
        if k not in d:
            if k in ignoreSet:
                # k already belongs to another cluster: skip the pair.
                continue
            d[k] = [k]
            ignoreSet.add(k)
        # Also handle the partner of the pair that opened the cluster;
        # previously it was silently dropped unless that pair was (k, k).
        if v not in ignoreSet:
            d[k].append(v)
            ignoreSet.add(v)
    return d
            

In [None]:
def collapseMatches(df, dict_matches):
    """Merge every cluster of matched records into a single record.

    For each key of ``dict_matches`` the key's row is taken as the base
    record; each field that is missing in it is filled with the first
    non-missing value found among the cluster members, in list order.

    Parameters
    ----------
    df : pandas.DataFrame
        Source records. Must have the nine columns name, country, sector,
        founded, marketcap, revenue, employees, website, ceo, in that order
        (values are copied by position). Index labels are expected to be
        unique; with duplicates the first occurrence is used.
    dict_matches : dict
        Maps a record label to the list of labels in its cluster, as built
        by ``createDictionary``.

    Returns
    -------
    tuple
        ``(df_collapsed, indexesToDelete)``: the merged records (one row per
        cluster, object dtype, rows numbered from 0) and the list of every
        label visited (each key followed by its members).

    Raises
    ------
    KeyError
        If a label that has to be read is not present in ``df.index``.
    """
    columns = ['name', 'country', 'sector', 'founded', 'marketcap',
               'revenue', 'employees', 'website', 'ceo']

    # Resolve labels to row positions once, instead of scanning the whole
    # index with a boolean mask for every single lookup.
    position_of = {}
    for position, label in enumerate(df.index):
        position_of.setdefault(label, position)
    # One object array of all cells; rows are copied out of it, so the input
    # frame is never written to.
    values = df.to_numpy(dtype=object)

    merged_rows = []
    indexesToDelete = []
    for count, (k, members) in enumerate(dict_matches.items()):
        if count % 10000 == 0:
            print(count)
        indexesToDelete.append(k)
        merged = values[position_of[k]].copy()
        for v in members:
            indexesToDelete.append(v)
            missing = pd.isnull(merged)
            if not missing.any():
                # Nothing left to fill from this member.
                continue
            candidate = values[position_of[v]]
            fillable = missing & ~pd.isnull(candidate)
            merged[fillable] = candidate[fillable]
        merged_rows.append(merged)

    # Build the frame in one go; dtype=object keeps the cells as they are
    # (no numeric inference on mixed columns).
    df_collapsed = pd.DataFrame(merged_rows, columns=columns, dtype=object)
    return df_collapsed, indexesToDelete

In [None]:
dictionary = createDictionary(training_matches)

In [None]:
df_collapsed_train, indexesToDeleteGlobal = collapseMatches(final, dictionary)
#df_collapsed_train.to_csv('./src/train_matches_collapsed.csv', index=False)

In [None]:
df_collapsed_train.head(10)

In [None]:
# Replace the missing comparison results (NaN) with 0 before the features are
# handed to the classifier.
training_features = training_features.fillna(0)
training_features.head()

#### Classifier

In [None]:
# Fit a Naive Bayes classifier on the training comparison vectors;
# training_matches holds the pairs labelled as matches by the score rule.
# NOTE(review): training_features still contains the derived 'score' and
# 'null_values' columns, from which those labels were computed -- confirm
# that feeding them to the classifier is intended.
classifier = recordlinkage.NaiveBayesClassifier()
classifier.fit(training_features, training_matches)

## Testing

In [None]:
# Same comparison setup as for training, applied to the test pairs.
compare = recordlinkage.Compare()

# One entry per compared field: the field is compared with itself, the result
# column is labelled after the field, and the dict holds the extra options.
string_rules = [
    ('name', {'threshold': 0.60}),
    ('country', {'threshold': 0.60, 'missing_value': np.nan}),
    ('sector', {'threshold': 0.50, 'missing_value': np.nan}),
    ('website', {'method': 'jarowinkler', 'threshold': 0.50, 'missing_value': np.nan}),
    ('ceo', {'method': 'jarowinkler', 'threshold': 0.60, 'missing_value': np.nan}),
]
for field_name, options in string_rules:
    compare.string(field_name, field_name, label=field_name, **options)

test_features = compare.compute(test_pairs, final, final)

# 'score' adds up the comparison results of a pair; 'null_values' counts how
# many comparisons were left missing (NaN).
compared_columns = test_features.loc[:, 'name':'ceo']
test_features['score'] = compared_columns.sum(axis=1)
test_features['null_values'] = compared_columns.isnull().sum(axis=1)

In [None]:
# Dataset columns minus 4 -- presumably the number of compared string fields
# (TODO confirm against the dataset schema).
col = len(final.columns) - 4

# A pair counts as a match when its score reaches half of the fields that
# could actually be compared (col minus the missing comparisons).
required_score = (col - test_features['null_values']) / 2
matches_test = test_features[test_features['score'] >= required_score]
test_matches = matches_test.reset_index()

In [None]:
# Remove the feature and helper columns so that only the columns created by
# reset_index() (the two sides of each pair) are left.
toDrop = ['name', 'country', 'sector', 'website', 'ceo', 'score', 'null_values']
test_matches = test_matches.drop(columns=toDrop)
test_matches.head()

In [None]:
test_matches = pd.MultiIndex.from_frame(test_matches)

In [None]:
dictionary_test = createDictionary(test_matches)

In [None]:
df_collapsed_test, indexesToDeleteTest = collapseMatches(final, dictionary_test)
#df_collapsed_test.to_csv('./src/test_matches_collapsed.csv', index=False)

In [None]:
test_features = test_features.fillna(0)
test_features.head()

In [None]:
predictions = classifier.predict(test_features)

## Evaluation

In [None]:
# Every metric is computed against the rule-based test matches; the total
# number of compared pairs is needed by the confusion matrix and the accuracy.
total_pairs = len(test_features)

# Confusion matrix
confusion_matrix = recordlinkage.confusion_matrix(test_matches, predictions, total_pairs)
print(confusion_matrix)


# Metrics
recall = recordlinkage.recall(test_matches, predictions)
print('\nRecall:', recall)

precision = recordlinkage.precision(test_matches, predictions)
print('Precision:', precision)

fscore = recordlinkage.fscore(confusion_matrix)
print('F-score:', fscore)

accuracy = recordlinkage.accuracy(test_matches, predictions, total_pairs)
print('Accuracy:', accuracy)

In [None]:
false_negatives = test_matches.difference(predictions)
false_negatives

In [None]:
# Show the two records behind the first false negative, if there is one.
try:
    fn_from_dfA, fn_from_dfB = false_negatives[0]
except IndexError:
    # Only a missing entry means "nothing to show"; any other error (for
    # example a failure while displaying) is no longer hidden.
    print("No False Negatives Present")
else:
    display(final[final.index == fn_from_dfA])
    display(final[final.index == fn_from_dfB])

In [None]:
# Show the two records behind the second false negative, if there is one.
try:
    fn_from_dfA, fn_from_dfB = false_negatives[1]
except IndexError:
    # Only a missing entry means "nothing to show"; any other error (for
    # example a failure while displaying) is no longer hidden.
    # NOTE: this message is also printed when exactly one false negative exists.
    print("No False Negatives Present")
else:
    display(final[final.index == fn_from_dfA])
    display(final[final.index == fn_from_dfB])

## Linking tables

In [None]:
new_companies = pd.concat([df_collapsed_train, df_collapsed_test])

In [None]:
new_companies = new_companies.reset_index()

In [None]:
new_companies = new_companies.drop(columns=['index'])

In [None]:
new_companies.head()

In [None]:
# new_companies.to_csv('src/first_new_dataset.csv')

## Second Matching Pass

In [None]:
# Second pass: index the collapsed records against themselves to catch
# duplicates that are still present after the first merge.
indexer = CustomIndex()
candidate_pairs2 = indexer.index(new_companies, new_companies)
pairs2 = candidate_pairs2

In [None]:
pairs2

In [None]:
# In the second pass only the name is compared (same 0.60 threshold as in
# the first pass).
compare = recordlinkage.Compare()
compare.string('name', 'name', label="name", threshold=0.60)

features = compare.compute(pairs2, new_companies, new_companies)

In [None]:
features.head()

In [None]:
# Keep the candidate pairs whose name comparison succeeded (value 1), drop
# the now constant feature column and turn the pair index into columns.
name_matched = features['name'] == 1
matches = features[name_matched].drop(columns=['name']).reset_index()

In [None]:
matches.head()

In [None]:
matches_index = pd.MultiIndex.from_frame(matches)

In [None]:
matches_index

In [None]:
dictionary_final = createDictionary(matches_index)

In [None]:
collapsed_df_final, indexesToDeleteFinal = collapseMatches(new_companies, dictionary_final)

In [None]:
# Store the numeric fields as nullable integers (Int64), going through
# object dtype first, exactly once per column.
for numeric_column in ('founded', 'marketcap', 'revenue', 'employees'):
    collapsed_df_final[numeric_column] = (
        collapsed_df_final[numeric_column].astype(object).astype('Int64')
    )

In [None]:
collapsed_df_final.to_csv('./src/2. linked_dataset.csv')

In [None]:
t_end = time.time()
print(t_end-t_start)