In [2]:
import os 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Build the path to the labeled restaurant CSV under the user's home directory.
data_dir = os.path.expanduser("~/data")
fname = os.path.join(data_dir,"restaurant-nophone-training.csv")
# NOTE(review): this bare expression is not the last line of the cell, so with
# default notebook settings its value is not displayed — confirm it is still wanted.
pd.__version__
%matplotlib inline
# Load the CSV into a DataFrame and preview the first rows.
data = pd.read_csv(fname)
data.head()

Unnamed: 0,name,address,city,cuisine,unique_id
0,arnie morton's of chicago,"""435 s. la cienega blv.""","""los angeles""","""american""",'0'
1,arnie morton's of chicago,"""435 s. la cienega blvd.""","""los angeles""","""steakhouses""",'0'
2,art's delicatessen,"""12224 ventura blvd.""","""studio city""","""american""",'1'
3,art's deli,"""12224 ventura blvd.""","""studio city""","""delis""",'1'
4,hotel bel-air,"""701 stone canyon rd.""","""bel air""","""californian""",'2'


In [3]:
# Per-column summary of the loaded frame (count / unique / top / freq).
data.describe()

Unnamed: 0,name,address,city,cuisine,unique_id
count,864,864,864,864,864
unique,776,772,49,84,752
top,georgia grille,"""3570 las vegas blvd. s""","""new york""","""american""",'5'
freq,2,5,250,152,2


In [4]:
# Same summary, restricted to the 'cuisine' column.
data['cuisine'].describe()

count             864
unique             84
top        "american"
freq              152
Name: cuisine, dtype: object

In [5]:
# Inspect the dtype pandas inferred for each column.
data.dtypes

name         object
address      object
city         object
cuisine      object
unique_id    object
dtype: object

In [9]:
from __future__ import print_function
from future.utils import viewitems
from builtins import range
from itertools import combinations, groupby
import os, re, csv, time, logging

import dedupe
import dedupe.core

# Set the root logger to DEBUG; this is why INFO/DEBUG records from the
# libraries used below appear in the cell output.
log_level = logging.DEBUG
logging.getLogger().setLevel(log_level)
# File used to cache the trained dedupe settings between runs. The path is
# relative, so it resolves against the current working directory.
settings_file = 'canonical_learned_settings'
# Labeled restaurant CSV (same file name that is loaded with pandas earlier).
raw_data = os.path.expanduser('~/data/restaurant-nophone-training.csv')

def preProcess(column):
    """Normalize one raw CSV field so that equal values compare equal.

    Steps, in order:
      1. newlines are turned into spaces,
      2. runs of two or more spaces are collapsed to a single space,
      3. surrounding whitespace and surrounding double/single quote
         characters are removed,
      4. the text is lower-cased.

    Args:
        column: the raw field text, or None for a missing field.

    Returns:
        The cleaned string, or None when the field is missing or nothing
        is left after cleaning.
    """
    if column is None:
        # Nothing to clean; keep the "missing" marker as is.
        return None

    # Newlines must become spaces *before* collapsing, otherwise
    # "a \n b" ends up as "a   b" with the extra spaces left in place.
    column = re.sub('\n', ' ', column)
    column = re.sub('  +', ' ', column)
    # The final strip() removes whitespace that was protected by the quotes,
    # e.g. '" foo "' -> 'foo'.
    column = column.strip().strip('"').strip("'").strip().lower()
    if not column:
        column = None
    return column

def canonicalImport(filename):
    """Read a CSV file into a dict of cleaned records.

    Every value of every row is passed through preProcess. Records are keyed
    by their zero-based position in the file (header row excluded).

    Args:
        filename: path of the CSV file to read.

    Returns:
        A tuple (records, fieldnames): the dict of cleaned row dicts and the
        column names reported by csv.DictReader.
    """
    records = {}
    with open(filename) as handle:
        reader = csv.DictReader(handle)
        for row_number, raw_row in enumerate(reader):
            cleaned = {}
            for field, value in viewitems(raw_row):
                cleaned[field] = preProcess(value)
            records[row_number] = cleaned

    return records, reader.fieldnames


def evaluateDuplicates(found_dupes, true_dupes):
    """Print precision and recall of predicted duplicate pairs.

    Args:
        found_dupes: set of predicted duplicate pairs (hashable items,
            e.g. frozensets of two record ids).
        true_dupes: set of ground-truth duplicate pairs, in the same
            representation as found_dupes.

    Prints the number of predicted pairs, then precision and recall.
    When a denominator would be zero (no predicted pairs, or no true pairs)
    the corresponding metric is reported as 0.0 instead of raising
    ZeroDivisionError.
    """
    true_positives = found_dupes.intersection(true_dupes)
    false_positives = found_dupes.difference(true_dupes)

    # Precision is undefined without predictions; report 0.0 in that case.
    if found_dupes:
        precision = 1 - len(false_positives) / float(len(found_dupes))
    else:
        precision = 0.0

    # Recall is undefined without ground-truth pairs; report 0.0 in that case.
    if true_dupes:
        recall = len(true_positives) / float(len(true_dupes))
    else:
        recall = 0.0

    print('found duplicate')
    print(len(found_dupes))
    print('precision')
    print(precision)
    print('recall')
    print(recall)


# Read and clean the CSV: data_d maps row position -> cleaned record dict,
# header holds the CSV column names.
data_d, header = canonicalImport(raw_data)

# Ask dedupe for labeled training pairs derived from the 'unique_id' field.
# NOTE(review): 5000 is presumably the number of training pairs to generate —
# confirm against the dedupe API documentation for the installed version.
training_pairs = dedupe.trainingDataDedupe(
    data_d, 'unique_id', 5000)

# Ground-truth duplicate pairs: records sharing a 'unique_id' are treated as
# the same entity. groupby only merges adjacent items, so the records are
# sorted by the same key first.
duplicates = set()
for _, group in groupby(sorted(data_d.items(),
                               key=lambda x: x[1]['unique_id']),
                        key=lambda x: x[1]['unique_id']):
    record_ids = [record_id for record_id, _record in group]
    # Emit every pairwise combination so that a unique_id shared by three or
    # more records contributes all of its pairs instead of being silently
    # dropped; a group of one yields nothing, a group of two yields one pair.
    for id_pair in combinations(record_ids, 2):
        duplicates.add(frozenset(id_pair))

# Start of the wall-clock timer reported at the end of the cell. The CSV
# import and training-pair generation above run before this point and are
# therefore not included in the reported time.
t0 = time.time()

print('number of known duplicate pairs', len(duplicates))

# Reuse previously learned settings when the cache file exists; otherwise
# train a new model and write its settings to that file.
if os.path.exists(settings_file):
    with open(settings_file, 'rb') as f:
        # NOTE(review): the second argument is presumably the number of cores;
        # it is 1 here but num_cores=5 in the training branch — confirm this
        # difference is intentional.
        deduper = dedupe.StaticDedupe(f, 1)
        
else:
    # Field definitions handed to dedupe. 'name' is listed twice, once as
    # 'String' and once as 'Exact'; 'cuisine' is flagged as possibly missing.
    fields = [
        {'field' : 'name', 'type': 'String'},
        {'field' : 'name', 'type': 'Exact'},
        {'field' : 'address', 'type': 'String'},
        {'field' : 'cuisine', 'type': 'ShortString', 
         'has missing' : True},
        {'field' : 'city', 'type' : 'ShortString'}
              ]

    deduper = dedupe.Dedupe(fields, num_cores=5)
    # NOTE(review): 10000 is presumably the sample size drawn from data_d —
    # confirm against the dedupe API documentation.
    deduper.sample(data_d, 10000)
    # Feed the labeled pairs built from 'unique_id', then train.
    deduper.markPairs(training_pairs)
    deduper.train()
    # Persist the learned settings so later runs take the branch above.
    with open(settings_file, 'wb') as f:
        deduper.writeSettings(f)

# NOTE(review): presumably selects the match-score threshold for data_d, with
# the second argument weighting recall against precision — confirm.
alpha = deduper.threshold(data_d, 1)

# Cluster the records using the threshold computed above.
print('clustering...')
clustered_dupes = deduper.match(data_d, threshold=alpha)

print('Evaluate Clustering')
# Expand each cluster into all of its pairwise combinations, stored as
# frozensets so they can be compared directly with the pairs in `duplicates`.
confirm_dupes = set([])
for dupes, score in clustered_dupes:
    for pair in combinations(dupes, 2):
        confirm_dupes.add(frozenset(pair))

# Print precision/recall of the predicted pairs against the known pairs.
evaluateDuplicates(confirm_dupes, duplicates)

# Elapsed wall-clock time since t0 was taken.
print('ran in ', time.time() - t0, 'seconds')


number of known duplicate pairs 112


  % (sample_size, len(blocked_sample)))
INFO:rlr.crossvalidation:using cross validation to find optimum alpha...
INFO:rlr.crossvalidation:optimum alpha: 0.000010, score 0.965034558908
INFO:dedupe.blocking:Canopy: TfidfNGramCanopyPredicate: (0.4, cuisine)
INFO:dedupe.blocking:Canopy: TfidfNGramCanopyPredicate: (0.8, cuisine)
INFO:dedupe.blocking:Canopy: TfidfNGramCanopyPredicate: (0.6, cuisine)
INFO:dedupe.blocking:Canopy: TfidfNGramCanopyPredicate: (0.2, cuisine)
INFO:dedupe.blocking:Canopy: LevenshteinCanopyPredicate: (1, cuisine)
INFO:dedupe.blocking:Canopy: LevenshteinCanopyPredicate: (3, cuisine)
INFO:dedupe.blocking:Canopy: LevenshteinCanopyPredicate: (2, cuisine)
INFO:dedupe.blocking:Canopy: LevenshteinCanopyPredicate: (4, cuisine)
INFO:dedupe.blocking:Canopy: TfidfTextCanopyPredicate: (0.4, name)
INFO:dedupe.blocking:Canopy: TfidfTextCanopyPredicate: (0.6, name)
INFO:dedupe.blocking:Canopy: TfidfTextCanopyPredicate: (0.2, name)
INFO:dedupe.blocking:Canopy: TfidfTextCanopyPredica

clustering...


INFO:dedupe.blocking:Canopy: LevenshteinCanopyPredicate: (1, city)
DEBUG:dedupe.api:matching done, begin clustering


Evaluate Clustering
found duplicate
108
precision
0.962962962962963
recall
0.9285714285714286
ran in  132.25648617744446 seconds
