# Data Deduplication on students from Big Dive 7

In [1]:
import pandas as pd
import recordlinkage as rl

## loading dataframe from csv

In [2]:
students = pd.read_csv('big-dive-students.csv')
students.head()

Unnamed: 0,Timestamp,Full Name,Age,Employer,Job Title,Country,City
0,6/29/2018 10:03:48,pier paolo,43,scs,developer,italy,turin
1,6/29/2018 10:03:55,Stefano Menozzi,27,Way Srl,Data Engineer,Italy,Correggio
2,6/29/2018 10:04:00,Gabriele Pece,34,Pirelli Tyre,Product Owner,Italy,Milano
3,6/29/2018 10:04:08,Elisa Reale,28,Università degli studi di Torino,PhD,Italy,Torino
4,6/29/2018 10:04:10,ENRICO LOMBARDO,32,Vem Solutions S..pA,Junior Big Data Analyst,Italy,Turin


In [3]:
# snake_case column names
students = students.rename(lambda t: '_'.join(t.lower().split()), axis='columns')
students.head()

Unnamed: 0,timestamp,full_name,age,employer,job_title,country,city
0,6/29/2018 10:03:48,pier paolo,43,scs,developer,italy,turin
1,6/29/2018 10:03:55,Stefano Menozzi,27,Way Srl,Data Engineer,Italy,Correggio
2,6/29/2018 10:04:00,Gabriele Pece,34,Pirelli Tyre,Product Owner,Italy,Milano
3,6/29/2018 10:04:08,Elisa Reale,28,Università degli studi di Torino,PhD,Italy,Torino
4,6/29/2018 10:04:10,ENRICO LOMBARDO,32,Vem Solutions S..pA,Junior Big Data Analyst,Italy,Turin


In [4]:
students.dtypes

timestamp    object
full_name    object
age           int64
employer     object
job_title    object
country      object
city         object
dtype: object

In [5]:
students.head()

Unnamed: 0,timestamp,full_name,age,employer,job_title,country,city
0,6/29/2018 10:03:48,pier paolo,43,scs,developer,italy,turin
1,6/29/2018 10:03:55,Stefano Menozzi,27,Way Srl,Data Engineer,Italy,Correggio
2,6/29/2018 10:04:00,Gabriele Pece,34,Pirelli Tyre,Product Owner,Italy,Milano
3,6/29/2018 10:04:08,Elisa Reale,28,Università degli studi di Torino,PhD,Italy,Torino
4,6/29/2018 10:04:10,ENRICO LOMBARDO,32,Vem Solutions S..pA,Junior Big Data Analyst,Italy,Turin


## cleaning and preprocessing

In [6]:
students['sorted_name'] = students['full_name'].apply(lambda n: ' '.join(sorted(n.lower().split())))

In [7]:
students.head()

Unnamed: 0,timestamp,full_name,age,employer,job_title,country,city,sorted_name
0,6/29/2018 10:03:48,pier paolo,43,scs,developer,italy,turin,paolo pier
1,6/29/2018 10:03:55,Stefano Menozzi,27,Way Srl,Data Engineer,Italy,Correggio,menozzi stefano
2,6/29/2018 10:04:00,Gabriele Pece,34,Pirelli Tyre,Product Owner,Italy,Milano,gabriele pece
3,6/29/2018 10:04:08,Elisa Reale,28,Università degli studi di Torino,PhD,Italy,Torino,elisa reale
4,6/29/2018 10:04:10,ENRICO LOMBARDO,32,Vem Solutions S..pA,Junior Big Data Analyst,Italy,Turin,enrico lombardo


In [8]:
from recordlinkage.preprocessing import phonetic

students['phonetic_name'] = phonetic(students['full_name'], method='match_rating')
students.head()

Unnamed: 0,timestamp,full_name,age,employer,job_title,country,city,sorted_name,phonetic_name
0,6/29/2018 10:03:48,pier paolo,43,scs,developer,italy,turin,paolo pier,PRPL
1,6/29/2018 10:03:55,Stefano Menozzi,27,Way Srl,Data Engineer,Italy,Correggio,menozzi stefano,STFNZZ
2,6/29/2018 10:04:00,Gabriele Pece,34,Pirelli Tyre,Product Owner,Italy,Milano,gabriele pece,GBRLPC
3,6/29/2018 10:04:08,Elisa Reale,28,Università degli studi di Torino,PhD,Italy,Torino,elisa reale,ELSRL
4,6/29/2018 10:04:10,ENRICO LOMBARDO,32,Vem Solutions S..pA,Junior Big Data Analyst,Italy,Turin,enrico lombardo,ENRBRD


In [9]:
students['lower_city'] = students['city'].apply(lambda c: c.lower())

In [10]:
students.head()

Unnamed: 0,timestamp,full_name,age,employer,job_title,country,city,sorted_name,phonetic_name,lower_city
0,6/29/2018 10:03:48,pier paolo,43,scs,developer,italy,turin,paolo pier,PRPL,turin
1,6/29/2018 10:03:55,Stefano Menozzi,27,Way Srl,Data Engineer,Italy,Correggio,menozzi stefano,STFNZZ,correggio
2,6/29/2018 10:04:00,Gabriele Pece,34,Pirelli Tyre,Product Owner,Italy,Milano,gabriele pece,GBRLPC,milano
3,6/29/2018 10:04:08,Elisa Reale,28,Università degli studi di Torino,PhD,Italy,Torino,elisa reale,ELSRL,torino
4,6/29/2018 10:04:10,ENRICO LOMBARDO,32,Vem Solutions S..pA,Junior Big Data Analyst,Italy,Turin,enrico lombardo,ENRBRD,turin


## indexing

In [11]:
indexer = rl.FullIndex()

In [12]:
pairs = indexer.index(students)

## comparing

In [13]:
comparer = rl.Compare()

In [14]:
comparer.string('sorted_name', 'sorted_name', label='ld_name')
comparer.string('phonetic_name', 'phonetic_name', label='ph_name')
comparer.numeric('age', 'age', offset=1, scale=3, label='age')
comparer.string('country', 'country', label='country')
comparer.string('city', 'city', label='city')

<Compare>

In [15]:
features = comparer.compute(pairs, students)

## matching

In [16]:
# Sum the comparison results.
features.sum(axis=1).value_counts().sort_index(ascending=False)

5.000000     2
4.666667     2
4.500000     2
4.466667     1
4.441176     1
4.380952     1
4.133333     1
3.921569     1
3.833333     1
3.800000     1
3.785714     1
3.647186     1
3.633333     1
3.600000     2
3.533333     1
3.433333     1
3.392157     1
3.387879     1
3.366667     1
3.343137     1
3.333333     1
3.312500     1
3.300000     1
3.294118     1
3.284314     1
3.282456     1
3.271930     1
3.266667     1
3.263158     1
3.233333     1
            ..
0.467419     1
0.458333     2
0.452381     1
0.434046     1
0.427171     2
0.415584     1
0.410237     1
0.405357     1
0.401681     1
0.397436     1
0.384085     1
0.376471     1
0.376190     1
0.366667     2
0.357895     2
0.344444     1
0.333333     5
0.333333     2
0.319328     1
0.317647     1
0.309524     1
0.262500     1
0.258824     1
0.250000     1
0.241228     1
0.200000     3
0.181818     1
0.166667     5
0.142857     5
0.000000    42
Length: 581, dtype: int64

In [17]:
features[features.sum(axis=1) >= 3.6].index.to_frame()

Unnamed: 0,Unnamed: 1,0,1
0,20,0,20
2,17,2,17
2,23,2,23
4,32,4,32
6,21,6,21
8,42,8,42
10,38,10,38
11,26,11,26
14,15,14,15
14,18,14,18


## verifying results

In [18]:
students.loc[[16, 39]]

Unnamed: 0,timestamp,full_name,age,employer,job_title,country,city,sorted_name,phonetic_name,lower_city
16,6/29/2018 10:05:20,Marek Kufel,32,Capgemini,Software Developer,Poland,Wrocław,kufel marek,MRKKFL,wrocław
39,6/29/2018 10:07:46,Marek Fufel,27,Some big company,Developer,Poland,Wroklaw,fufel marek,MRKFFL,wroklaw


In [19]:
students.loc[[2, 23]]

Unnamed: 0,timestamp,full_name,age,employer,job_title,country,city,sorted_name,phonetic_name,lower_city
2,6/29/2018 10:04:00,Gabriele Pece,34,Pirelli Tyre,Product Owner,Italy,Milano,gabriele pece,GBRLPC,milano
23,6/29/2018 10:06:06,Gabriele Pece,27,Pirelli,Data Scientist,italy,Milano,gabriele pece,GBRLPC,milano
