Inspired from [Dedupe's CSV example](https://dedupeio.github.io/dedupe-examples/docs/csv_example.html)

In [1]:
data = {}
for work in Work.objects.filter(category__slug='manga'):
    data[work.id] = {'title': work.title, 'vo_title': work.vo_title}
    for field in ['title', 'vo_title']:
        if not data[work.id][field]:
            data[work.id][field] = None

In [2]:
next(iter(data.values())).keys()

dict_keys(['title', 'vo_title'])

In [3]:
fields = [
    {'field': 'title', 'type': 'String'},
#    {'field': 'ext_poster', 'type': 'String'},
    {'field': 'vo_title', 'type': 'String'},
]

In [5]:
import os
from dedupe import Dedupe, consoleLabel

output_file = 'csv_example_output.csv'
settings_file = 'csv_example_learned_settings'
training_file = 'csv_example_training.json'

deduper = Dedupe(fields)
deduper.sample(data)
consoleLabel(deduper)

if os.path.exists(training_file):
    print('reading labeled examples from ', training_file)
    with open(training_file, 'rb') as f:
        deduper.readTraining(f)

deduper.train()

with open(training_file, 'w') as tf:
    deduper.writeTraining(tf)

with open(settings_file, 'wb') as sf:
    deduper.writeSettings(sf)

  % (sample_size, len(blocked_sample)))
title : IO Memories
vo_title : None

title : Bride Stories - Latitudes
vo_title : 乙嫁語り

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished


n


title : Kiss me princess
vo_title : None

title : Vampire Princess
vo_title : 吸血姫 - ヴァンパイア・プリンセス

0/10 positive, 1/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


title : Vampire
vo_title : ヴァンパイア

title : Vampire Doll
vo_title : バンパイアドール・ギルナザン

0/10 positive, 2/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


n


title : Angel Densetsu
vo_title : None

title : Angel Densetsu
vo_title : None

0/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


title : Ability Shop
vo_title : None

title : Ability Shop
vo_title : None

1/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


title : Bible (la) - Classique
vo_title : 聖書

title : Bible (la)
vo_title : None

2/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


y


title : Dogs
vo_title : Dogs

title : Dogs: Bullets & Carnage
vo_title : Dogs: Bullets & Carnage

3/10 positive, 3/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious


f


Finished labeling
reading training from file
using cross validation to find optimum alpha...


reading labeled examples from  csv_example_training.json


optimum alpha: 0.000010, score 0.486311623286
Canopy: TfidfTextCanopyPredicate: (0.8, vo_title)
Canopy: TfidfTextCanopyPredicate: (0.2, vo_title)
Canopy: TfidfTextCanopyPredicate: (0.6, vo_title)
Canopy: TfidfTextCanopyPredicate: (0.4, vo_title)
Canopy: LevenshteinCanopyPredicate: (3, vo_title)
Canopy: LevenshteinCanopyPredicate: (2, vo_title)
Canopy: LevenshteinCanopyPredicate: (4, vo_title)
Canopy: LevenshteinCanopyPredicate: (1, vo_title)
Canopy: TfidfNGramCanopyPredicate: (0.6, vo_title)
Canopy: TfidfNGramCanopyPredicate: (0.8, vo_title)
Canopy: TfidfNGramCanopyPredicate: (0.4, vo_title)
Canopy: TfidfNGramCanopyPredicate: (0.2, vo_title)
Canopy: TfidfTextCanopyPredicate: (0.8, title)
Canopy: TfidfTextCanopyPredicate: (0.6, title)
Canopy: TfidfTextCanopyPredicate: (0.2, title)
Canopy: TfidfTextCanopyPredicate: (0.4, title)
Canopy: LevenshteinCanopyPredicate: (2, title)
Canopy: LevenshteinCanopyPredicate: (1, title)
Canopy: LevenshteinCanopyPredicate: (3, title)
Canopy: LevenshteinCa

In [6]:
threshold = deduper.threshold(data, recall_weight=2)

print('clustering...')
clustered_dupes = deduper.match(data, threshold)

print('# duplicate sets', len(clustered_dupes))

Maximum expected recall and precision
recall: 0.960
precision: 0.609
With threshold: 0.173


clustering...
# duplicate sets 421


In [15]:
# Don't know if this is useful, anyway we don't have an input file
'''
from dedupe import canonicalize

cluster_membership = {}
cluster_id = 0
for (cluster_id, cluster) in enumerate(clustered_dupes):
    id_set, scores = cluster
    cluster_d = [data[c] for c in id_set]
    canonical_rep = canonicalize(cluster_d)
    for record_id, score in zip(id_set, scores):
        cluster_membership[record_id] = {
            "cluster id" : cluster_id,
            "canonical representation" : canonical_rep,
            "confidence": score
        }

singleton_id = cluster_id + 1

with open(output_file, 'w') as f_output, open(input_file) as f_input:
    writer = csv.writer(f_output)
    reader = csv.reader(f_input)

    heading_row = next(reader)
    heading_row.insert(0, 'confidence_score')
    heading_row.insert(0, 'Cluster ID')
    canonical_keys = canonical_rep.keys()
    for key in canonical_keys:
        heading_row.append('canonical_' + key)

    writer.writerow(heading_row)

    for row in reader:
        row_id = int(row[0])
        if row_id in cluster_membership:
            cluster_id = cluster_membership[row_id]["cluster id"]
            canonical_rep = cluster_membership[row_id]["canonical representation"]
            row.insert(0, cluster_membership[row_id]['confidence'])
            row.insert(0, cluster_id)
            for key in canonical_keys:
                row.append(canonical_rep[key].encode('utf8'))
        else:
            row.insert(0, None)
            row.insert(0, singleton_id)
            singleton_id += 1
            for key in canonical_keys:
                row.append(None)
        writer.writerow(row)'''
None

In [8]:
import csv
input_file = 'lol.csv'
with open(input_file, 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['id', 'title', 'vo_title'])
    for work_id in data:
        writer.writerow([work_id, data[work_id]['title'], data[work_id]['vo_title']])

In [9]:
for ids, _ in clustered_dupes:
    if '5734' in ids:
        print(ids)
    else:
        print(ids)

(4218, 4219)
(15712, 15713)
(6477, 6478)
(4770, 4771, 4772)
(5574, 5575)
(5215, 5216)
(6656, 6657)
(5634, 5635)
(6015, 6016, 6017)
(4622, 4623)
(6434, 6435)
(5869, 5870, 5871)
(8050, 8051)
(6494, 6495)
(5354, 5355, 5356, 5357, 5358, 5359, 5360)
(4794, 4795, 4796)
(7373, 7374)
(5402, 5405)
(4544, 4545)
(6018, 6019)
(7661, 7662)
(5043, 5044)
(7561, 7562)
(15707, 15708)
(15733, 15734)
(5651, 5652)
(6195, 6196)
(4553, 4555)
(4557, 4558)
(4563, 15739)
(5659, 5660, 5661, 5662, 5663, 5664, 5665)
(4618, 4619)
(15735, 15736)
(4774, 4775)
(6308, 6309)
(4760, 4761)
(7682, 15945)
(7683, 7684)
(5361, 5362)
(5370, 5371)
(5489, 5490)
(5780, 5781)
(6345, 6346)
(7122, 7124)
(6045, 6046)
(7183, 7184)
(7824, 7825)
(6783, 6784)
(5926, 5927)
(5545, 5546)
(6695, 6696, 6697)
(5375, 5376)
(5548, 5549, 5550)
(6727, 6728, 6729, 6730, 6731, 6732, 6733, 6734)
(6190, 6191)
(15727, 15728)
(4347, 4348)
(5168, 5169)
(5502, 5503)
(15722, 15723)
(6074, 15718)
(5047, 5048)
(5106, 5107)
(5519, 5520)
(6708, 6709, 6710)
(4

In [18]:
for ids, _ in clustered_dupes[:10]:
    for wid in ids:
        print(data[wid]['title'])
    print('---')

Alive
Alive Last Evolution
---
Chaosic Rune
Chaosic Rune
---
Monde de Maliang (le)
Monde de Maliang (le) - Kantik
---
Chopperman
Chopperman - Coffret intégrale
Chopperman - le Super Docteur des petits et des grands
---
Gunsmith Cats Burst
Gunsmith Cats Revised
---
Empreinte de la passion (l')
Empreinte du mal (l')
---
Noritaka
Noritaka - Kiosque
---
Heaven Eleven
Heaven Eleven - Coffret
---
Kill me, kiss me
Kill me, kiss me - Coffret
Kill me, kiss me - Réédition
---
Brother
Brother X Brother
---


Looks like it was successful for some examples, but needed some more training for the others!