# Helpful Links
* https://www.dataquest.io/blog/jupyter-notebook-tips-tricks-shortcuts/

In [2]:
# %load ../src/utils.py
from collections import defaultdict


def printUniqueTokens(series):
    unique_series = series.unique()
    token_count = {}
    for a in unique_series:
        tokens = a.split(' ')
        for t in tokens:
            if t not in token_count:
                token_count[t] = 1
            else:
                token_count[t] += 1

    for key, value in sorted(token_count.items(), key=lambda item: item[1]):
        print("%s: %s" % (key, value))


# Source: https://www.geeksforgeeks.org/python-merge-list-with-common-elements-in-a-list-of-lists/
# merge function to merge all sublist having common elements.
def merge_common(lists):
    neigh = defaultdict(set)
    visited = set()
    for each in lists:
        for item in each:
            neigh[item].update(each)

    def comp(node, neigh=neigh, visited=visited, vis=visited.add):
        nodes = {node}
        next_node = nodes.pop
        while nodes:
            node = next_node()
            vis(node)
            nodes |= neigh[node] - visited
            yield node

    for node in neigh:
        if node not in visited:
            yield sorted(comp(node))


In [23]:
# %load ../src/clean.py
from typing import List, Any, Set, Tuple

import pandas as pd
import textdistance as td
import numpy as np
import json
from itertools import combinations

def calcDistances(strings: Set[str], completelyInsideOtherBias: float = 0.5) -> List[Tuple[str, str, float]]:
    def bias(s1, s2):
        if s1 in s2 or s2 in s1:
            return completelyInsideOtherBias
        else:
            return 0

    stringCombinations = set(map(frozenset, combinations(set(strings), 2)))

    allDistances = []
    if READ_FROM_FILE:
        with open(PATH_PREFIX + "/" + ALGO + ".json", "r") as file:
            allDistances = json.loads(file.read())
    else:
        allDistances = [(s1, s2, s1 == s2 and -1 or td.jaccard(s1, s2)) for s1, s2 in stringCombinations]
        allDistances.sort(key=lambda x: x[2], reverse=True)
        with open(PATH_PREFIX + "/" + ALGO + ".json", "w+") as file:
            file.write(json.dumps(allDistances))

    if DO_BIAS:
        for i in range(len(allDistances)):
            allDistances[i][2] = allDistances[i][2] + bias(allDistances[i][0], allDistances[i][1])
    return allDistances


def convertToEqualityRings(distanceList: List[Tuple[str, str, float]]) -> List[Set[str]]:
    sets = map(lambda x: set(x[0:2]), distanceList)
    return list(merge_common(sets))


def loadGoldStandard() -> Tuple[List[set], List[int]]:
    dupedf = pd.read_csv(PATH_PREFIX + '/restaurants_DPL.tsv', delimiter='\t')
    dupedict = {}
    for i, dupeRow in dupedf.iterrows():
        if dupeRow[0] not in dupedict:
            dupedict[dupeRow[0]] = set()
            dupedict[dupeRow[0]].add(dupeRow[0])
        # dupedict[dupeRow[0]].add(dupeRow[0])
        dupedict[dupeRow[0]].add(dupeRow[1])

    dupesets: List[Set[Any]] = list(dupedict.values())
    dupeids: List[int] = [y for x in dupesets for y in x]
    return dupesets, dupeids


class Statics:
    CITY_REPLACE_DICT = {
        "w. hollywood": "west hollywood",
        "new york city": "new york",
        "west la": "los angeles",
        "la": "los angeles"
    }
    ADDRESS_REPLACE_DICT = {
        r"(ave|av)": "ave",
        r"(blvd|blv)": "blvd",
        r"(sts)": "st",
        r"s\.": "s",
        r" ?between.*$": ""
    }
    BRACKETS_REGEX = r" ?\(.*\)"
    NON_ALPHA_OR_SPACE_REGEX = r"[^a-zA-Z0-9 ]"
    NON_ALPHA_REGEX = r"[^a-zA-Z0-9]"


def preProcess(df):
    # reformat the index
    # df = df.drop(labels=['id'], axis=1)
    # df.set_index("id", inplace=True)

    for k, v in Statics.ADDRESS_REPLACE_DICT.items():
        df.address = df.address.str.replace(k, v, case=False, regex=True)
    for k, v in Statics.CITY_REPLACE_DICT.items():
        df.city = df.city.str.replace(k, v, case=False)
    df.type = df.type.fillna("")
    df.type = df.type.str.replace(Statics.BRACKETS_REGEX, '', case=False) \
        .str.replace(r"^.*[0-9] ?", "", case=False)
    df.phone = df.phone.str.replace(Statics.NON_ALPHA_REGEX, '', case=False)
    df.name = df.name.str.replace(Statics.BRACKETS_REGEX, '', case=False)

    df["cname"] = df.name.copy()
    df["caddress"] = df.address.copy()
    df.cname = df.cname.str.replace(Statics.NON_ALPHA_OR_SPACE_REGEX, '', case=False) \
        .str.replace('  the$', '', case=False)
    df.caddress = df.caddress.str.replace(Statics.NON_ALPHA_OR_SPACE_REGEX, '', case=False)
    return df


def compareToGold(df, dupesets, dupeids):
    recognizedDuplicates = list(df[df.id.map(len) > 1].id)
    recognizedNonDuplicates = list(df[df.id.map(len) <= 1].id)
    true_positive = set()
    true_negative = set()
    false_negative = set()
    false_positive = set()
    for dupeset in dupesets:
        if dupeset in recognizedDuplicates:
            true_positive.add(frozenset(dupeset))
        else:
            false_negative.add(frozenset(dupeset))
    for recdup in recognizedDuplicates:
        if recdup in dupesets:
            true_positive.add(frozenset(recdup))
        else:
            false_positive.add(frozenset(recdup))
    for recnondup in recognizedNonDuplicates:
        if recnondup not in dupeids:
            true_negative.add(frozenset(recnondup))
            

    # times 2 because we work with sets of 2
    tp = len([e for s in true_positive for e in s])
    tn = len([e for s in true_negative for e in s])
    fn = len([e for s in false_negative for e in s])
    fp = len([e for s in false_positive for e in s])
    print("True positives: " + str(tp))
    print("True negatives: " + str(tn))
    print("False positives: " + str(fp))
    print("False negatives: " + str(fn))
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    fscore = (2 * precision * recall) / (precision + recall)
    print("Precision: " + str(precision))
    print("Recall: " + str(recall))
    print("Fscore: " + str(fscore))
    #print(ALGO,tp,tn,fp,fn,precision,recall,fscore, sep=",")
    listofparams = [ALGO,tp,tn,fp,fn,precision,recall,fscore]
    return true_positive, false_negative, false_positive, true_negative, listofparams


def __findClosestEqRingMatch(eqRing, searchString):
    for ring in eqRing:
        for e in ring:
            if e == searchString:
                return ring[0]
    return searchString


def dedupe(df, eqRing):
    df["tdkey"] = df.cname.copy()
    df["tdkey"] = df["tdkey"].apply(lambda s: __findClosestEqRingMatch(eqRing, s))
    df = df.groupby(["phone", "tdkey"]).agg(set).reset_index()
    return df


def clean(df, completelyInsideOtherBias=0.7, filterCutoff=0.65):
    df = preProcess(df)
    distances = calcDistances(df.cname.unique(), completelyInsideOtherBias)
    filteredDistances = list(filter(lambda x: x[2] >= filterCutoff, distances))
    eqRing = convertToEqualityRings(filteredDistances)
    return dedupe(df, eqRing)

PATH_PREFIX = "../data"
READ_FROM_FILE = True
DO_BIAS = True
ALGO = "jaccard"

if __name__ == '__main__':
    dupesets, dupeids = loadGoldStandard()
    dataframe = pd.read_csv(PATH_PREFIX + '/restaurants.tsv', delimiter='\t')
#    bias = 0.7
#    cutoff = 0.7
#    allparams = []
#    for bias in np.arange(0.0,1.1,0.1):
#        for cutoff in np.arange(0.0,1.1,0.1):
#            cleaned = clean(dataframe, bias, cutoff)
#            a,b,c,d,params = compareToGold(cleaned, dupesets, dupeids)
#            print(bias,cutoff,*params, sep=",")
#            allparams.append([bias,cutoff,*params])
#    print("##### ALL PARAMS:")
#    for p in allparams:
#        print(*p,sep=",")

# Results
ALGO,tp,tn,fp,fn,precision,recall,fscore

## with bias of 0.7 and cutoff 0.7
jaccard,210,654,0,14,1.0,0.9375,0.967741935483871
overlap,208,637,19,16,0.9162995594713657,0.9285714285714286,0.9223946784922396
tanimoto,176,688,0,48,1.0,0.7857142857142857,0.88
tversky,210,654,0,14,1.0,0.9375,0.967741935483871
monge_elkan,204,660,0,20,1.0,0.9107142857142857,0.9532710280373832

## without bias with cutoff 0.7
jaccard,182,682,0,42,1.0,0.8125,0.896551724137931
overlap,208,637,19,16,0.9162995594713657,0.9285714285714286,0.9223946784922396
tanimoto,176,688,0,48,1.0,0.7857142857142857,0.88
tversky,182,682,0,42,1.0,0.8125,0.896551724137931
monge_elkan,176,688,0,48,1.0,0.7857142857142857,0.88

## with bias of 0.7 and cutoff 0.7

In [13]:
dupesets, dupeids = loadGoldStandard()

In [14]:
df = pd.read_csv(PATH_PREFIX + '/restaurants.tsv', delimiter='\t')
df = preProcess(df)
#df = df.drop(["name", "address"], axis=1)
df.describe(include=['object'])

Unnamed: 0,name,address,city,phone,type,cname,caddress
count,864,864,864,864,864,864,864
unique,764,764,46,748,75,759,764
top,spago,3570 las vegas blvd. s,new york,4042372700,american,palm,3570 las vegas blvd s
freq,3,6,338,4,180,3,6


In [36]:
df = pd.read_csv('/data/restaurants.tsv', delimiter='\t')
cleaned = clean(df, 0.7, 0.65)
a, b, c, d, e = compareToGold(cleaned, dupesets, dupeids)
missingIds = [y for x in b for y in x]
falseIds = [y for x in c for y in x]

True positives: 212
True negatives: 652
False positives: 0
False negatives: 12
Precision: 1.0
Recall: 0.9464285714285714
Fscore: 0.9724770642201834


In [61]:
cc = cleaned.copy()
cc.city = cc.city.map(lambda x: next(iter(x)))
cc = cc.groupby(["city", "tdkey"]).agg(list).reset_index()
cc

Unnamed: 0,city,tdkey,phone,id,name,address,type,cname,caddress
0,atlos angelesnta,103 west,[4042335993],[{787}],[{103 west}],[{103 w. paces ferry rd.}],[{continental}],[{103 west}],[{103 w paces ferry rd}]
1,atlos angelesnta,abbey,[4048768532],[{492}],[{abbey}],[{163 ponce de leon ave.}],[{international}],[{abbey}],[{163 ponce de leon ave}]
2,atlos angelesnta,abruzzi,[4042618186],"[{149, 150}]",[{abruzzi}],"[{2355 peachtree rd. ne, 2355 peachtree rd. p...",[{italian}],[{abruzzi}],"[{2355 peachtree rd ne, 2355 peachtree rd pea..."
3,atlos angelesnta,adrianos ristorante,"[4042372663, 4042372700, 4042377601, 404261701...","[{153, 154}, {181, 182}, {831}, {793}, {496}, ...","[{bone's restaurant, bone's}, {dining room ri...","[{3130 piedmont road, 3130 piedmont rd. ne}, {...","[{american, steakhouses}, {american, internati...","[{bones, bones restaurant}, {dining room ritz...","[{3130 piedmont rd ne, 3130 piedmont road}, {3..."
4,atlos angelesnta,alecks barbecue heaven,[4045252062],[{493}],[{aleck's barbecue heaven}],[{783 martin luther king jr. dr.}],[{barbecue}],[{alecks barbecue heaven}],[{783 martin luther king jr dr}]
...,...,...,...,...,...,...,...,...,...
588,west hollywood,dukes,[3106523100],[{663}],[{duke's}],[{8909 sunset blvd.}],[{coffee shops}],[{dukes}],[{8909 sunset blvd}]
589,westlos angeleske villos angelesge,aja,[8054984049],[{648}],[{baja fresh}],[{3345 kimber dr.}],[{mexican}],[{baja fresh}],[{3345 kimber dr}]
590,westlos angeleske villos angelesge,local nochol,[8187067706],[{684}],[{local nochol}],[{30869 thousand oaks blvd.}],[{health food}],[{local nochol}],[{30869 thousand oaks blvd}]
591,westwood,antonios,[3102091422],[{662}],[{don antonio's}],[{1136 westwood blvd.}],[{italian}],[{don antonios}],[{1136 westwood blvd}]


In [19]:
print(missingIds)
print(falseIds)
print(b)

[129, 130, 179, 180, 139, 140, 144, 143, 121, 122, 141, 142]
[]
{frozenset({129, 130}), frozenset({179, 180}), frozenset({139, 140}), frozenset({144, 143}), frozenset({121, 122}), frozenset({141, 142})}


In [18]:
#df[df.id.isin(missingIds)]
df[df.id.isin(missingIds)]

Unnamed: 0,id,name,address,city,phone,type,cname,caddress,tdkey
120,121,shun lee west,43 w. 65th st.,new york,2123718844,asian,shun lee west,43 w 65th st,103 west
121,122,shun lee palace,155 e. 55th st.,new york,2123718844,chinese,shun lee palace,155 e 55th st,shun lee palace
128,129,uncle nick's,747 9th ave.,new york,2123151726,mediterranean,uncle nicks,747 9th ave,uncle nicks
129,130,uncle nick's,747 ninth ave.,new york,2122457992,greek,uncle nicks,747 ninth ave,uncle nicks
138,139,le montrachet,3000 w. paradise rd.,los angeless vegas,7027325111,continental,le montrachet,3000 w paradise rd,alons at the terrace
139,140,le montrachet bistro,3000 paradise rd.,los angeless vegas,7027325651,french bistro,le montrachet bistro,3000 paradise rd,alons at the terrace
140,141,palace court,3570 las vegas blvd. s,los angeless vegas,7027317547,continental,palace court,3570 las vegas blvd s,campton place
141,142,palace court,3570 las vegas blvd. s,los angeless vegas,7027317110,french,palace court,3570 las vegas blvd s,campton place
142,143,second street grille,200 e. fremont st.,los angeless vegas,7023853232,seafood,second street grille,200 e fremont st,ambassador grill
143,144,second street grill,200 e. fremont st.,los angeless vegas,7023856277,pacific rim,second street grill,200 e fremont st,ambassador grill


In [57]:
cleaned

Unnamed: 0,phone,tdkey,id,name,address,city,type,cname,caddress
0,1008138212,indian delights,{809},{indian delights},{3675 satellite blvd.},{},{indian},{indian delights},{3675 satellite blvd}
1,2122060059,big cup,{301},{big cup},{228 8th ave. },{},{coffee bar},{big cup},{228 8th ave }
2,2122132288,marnies noodle shop,{751},{sam's noodle shop},{411 third ave.},{},{chinese},{sams noodle shop},{411 third ave}
3,2122190500,bouterin,{743},{nobu},{105 hudson st.},{},{japanese},{nobu},{105 hudson st}
4,2122192777,alons at the terrace,"{99, 100}",{montrachet},"{239 w. broadway, 239 w. broadway }",{},"{french bistro, french}",{montrachet},"{239 w broadway, 239 w broadway }"
...,...,...,...,...,...,...,...,...,...
753,8188865679,ambassador grill,{653},{brent's deli},{19565 parthenia ave.},{},{delis},{brents deli},{19565 parthenia ave}
754,8189056515,alons at the terrace,{702},{rubin's red hots},{15322 ventura blvd.},{},{hot dogs},{rubins red hots},{15322 ventura blvd}
755,8189068881,mulberry st,{692},{mulberry st.},{17040 ventura blvd.},{},{pizza},{mulberry st},{17040 ventura blvd}
756,8189854669,cadel sol,{232},{ca'del sol},{4100 cahuenga blvd.},{},{italian},{cadel sol},{4100 cahuenga blvd}
