In [1]:
import csv
import logging
import optparse
import os
import re

import dedupe
from unidecode import unidecode

In [2]:
def preProcess(column):
    """Normalise one raw CSV cell value before it is used for matching.

    The value is passed through ``unidecode``, newlines are turned into
    spaces, runs of spaces are collapsed, surrounding whitespace and quote
    characters are stripped, and the result is lower-cased.

    Args:
        column: The raw cell value (a string), or None for a cell that is
            absent from its row.

    Returns:
        The cleaned string, or None when the value is missing or empty
        after cleaning.
    """
    # csv.DictReader fills cells missing from short rows with None; treat
    # that as missing data rather than handing a non-string to unidecode().
    if column is None:
        return None

    column = unidecode(column)
    # Replace newlines *before* collapsing runs of spaces, so that a value
    # like "a \nb" ends up as "a b" instead of "a  b".
    column = re.sub("\n", " ", column)
    column = re.sub("  +", " ", column)
    column = column.strip().strip('"').strip("'").lower().strip()

    # If data is missing, indicate that by setting the value to None
    if not column:
        column = None
    return column


In [3]:
# Read in our data from a CSV file and create a dictionary of records,
# where each key is a unique record ID and each value is a dict of that
# record's cleaned fields.
def readData(filename):
    """Load a CSV file into a dict of cleaned records keyed by record ID.

    Args:
        filename: Path to a CSV file whose header row includes an "ID"
            column.

    Returns:
        A dict mapping each row's "ID" value (exactly as it appears in the
        file) to a dict of ``{column name: preProcess(value)}`` covering
        every column of that row. If an ID occurs more than once, the last
        row with that ID is the one kept.

    Raises:
        KeyError: If the file has no "ID" column.
    """
    data_d = {}
    # The csv module asks for files to be opened with newline="" so that
    # newlines embedded inside quoted fields are parsed correctly.
    with open(filename, newline="") as f:
        reader = csv.DictReader(f)
        for row in reader:
            # The key is the raw ID, not its preprocessed form.
            row_id = row["ID"]
            data_d[row_id] = {k: preProcess(v) for (k, v) in row.items()}

    return data_d

In [4]:
import dedupe

In [5]:
# Dataset to deduplicate (alternative input: "science_with_duplicates.csv").
input_file = "combined_with_duplicates.csv"

# Destination for the copy of the input annotated with cluster information.
output_file = "csv_example_output_combined.csv"

# Artifacts reused between runs: the labeled training examples and the
# learned model settings.
training_file = "combined_csv_example_training.json"
settings_file = "combined_csv_example_learned_settings"

print("importing data ...")
data_d = readData(input_file)

importing data ...


In [8]:
# Build the deduper. If a settings file from an earlier run exists, load the
# already-trained model from it and skip training and learning entirely.
if os.path.exists(settings_file):
    print("reading from", settings_file)
    with open(settings_file, "rb") as f:
        deduper = dedupe.StaticDedupe(f)
else:
    # Data model: the columns dedupe compares. Columns flagged with
    # has_missing=True may hold None (see preProcess).
    fields = [
        dedupe.variables.String("Title"),
        dedupe.variables.String("Author", has_missing=True),
        dedupe.variables.String("Format", has_missing=True),
        dedupe.variables.String("Language"),
        dedupe.variables.String("Published/Created", has_missing=True),
        dedupe.variables.String("Date", has_missing=True),
        dedupe.variables.String("Description"),
        dedupe.variables.String("Edition", has_missing=True),
    ]

    # Create a new deduper object and pass our data model to it.
    deduper = dedupe.Dedupe(fields)

    # Seed the deduper with previously labeled examples when available.
    if os.path.exists(training_file):
        print("reading labeled examples from ", training_file)
        with open(training_file, "rb") as f:
            deduper.prepare_training(data_d, f)
    else:
        deduper.prepare_training(data_d)

    # Everything below must run on BOTH branches above. Previously these
    # steps were nested inside the `else`, so when a training file existed
    # but no settings file did, the deduper was never trained or saved and
    # the later call to deduper.partition() ran on an untrained model.
    print("starting active labeling...")

    dedupe.console_label(deduper)

    # Using the labeled examples, train the deduper and learn blocking
    # predicates.
    deduper.train()

    # When finished, save our training to disk.
    with open(training_file, "w") as tf:
        deduper.write_training(tf)

    # Save our weights and predicates to disk. If the settings file exists,
    # we will skip all the training and learning next time we run this file.
    with open(settings_file, "wb") as sf:
        deduper.write_settings(sf)


reading from combined_csv_example_learned_settings


In [9]:
        # Cluster the records with the trained deduper, then write the
        # original CSV back out with each row's cluster assignment added.
        print("clustering...")
        # 0.5 is the threshold argument passed to partition().
        clustered_dupes = deduper.partition(data_d, 0.5)

        print("# duplicate sets", len(clustered_dupes))

        # Writing Results
        # Map every record ID to its cluster number and confidence score.
        # Each element of clustered_dupes is unpacked as (records, scores),
        # two parallel sequences.
        cluster_membership = {}
        for cluster_id, (records, scores) in enumerate(clustered_dupes):
            for record_id, score in zip(records, scores):
                cluster_membership[record_id] = {
                    "Cluster ID": cluster_id,
                    "confidence_score": score,
                }

        # Write our original data back out to a CSV with new leading columns
        # "Cluster ID" and "confidence_score", which indicate which records
        # refer to each other.
        # Both files are opened with newline="" as the csv module requires:
        # without it the writer can emit blank lines between rows on Windows
        # and the reader can mis-parse newlines embedded in quoted fields.
        with open(output_file, "w", newline="") as f_output, open(
            input_file, newline=""
        ) as f_input:

            reader = csv.DictReader(f_input)
            fieldnames = ["Cluster ID", "confidence_score"] + reader.fieldnames

            writer = csv.DictWriter(f_output, fieldnames=fieldnames)
            writer.writeheader()

            for row in reader:
                # Rows are matched to clusters by their raw "ID" value.
                row_id = row["ID"]
                row.update(cluster_membership[row_id])
                writer.writerow(row)

clustering...
# duplicate sets 143
