Skip to content
This repository has been archived by the owner on Jun 13, 2020. It is now read-only.

Commit

Permalink
include calculation of representative id from sketch duplicate sets
Browse files Browse the repository at this point in the history
  • Loading branch information
matpalm committed Jul 9, 2009
1 parent f624430 commit b6f6e7b
Show file tree
Hide file tree
Showing 6 changed files with 77 additions and 24 deletions.
34 changes: 34 additions & 0 deletions connected_components.rb
@@ -0,0 +1,34 @@
#!/usr/bin/env ruby

require 'rubygems'
require "rgl/adjacency"
require "rgl/traversal"
require "rgl/connected_components"

# Reads "id1 id2 resemblance" lines from STDIN, builds an undirected graph
# with one edge per line, and prints one line per connected component:
# the representative id first, followed by the other ids of the component.
#
# The representative of a component is the vertex whose incident edges
# carry the largest total resemblance.

# vertex id => sum of the resemblance values of every edge touching it
incident_edge_sum = Hash.new(0)

g = RGL::AdjacencyGraph.new

STDIN.each do |line|
  id1, id2, res = line.split
  # Skip blank or truncated lines: nil.to_i is 0, so they would otherwise
  # silently add a bogus vertex 0 to the graph.
  next unless id1 && id2

  id1, id2, res = id1.to_i, id2.to_i, res.to_f
  incident_edge_sum[id1] += res
  incident_edge_sum[id2] += res
  g.add_vertex id1
  g.add_vertex id2
  g.add_edge id1, id2
end

g.each_connected_component do |vertexs|
  # max_by always yields a member of the component, even when every sum is
  # zero; comparing against an initial maximum of 0 left the representative
  # nil in that case.
  max_vert = vertexs.max_by { |vertex| incident_edge_sum[vertex] }
  others = vertexs.reject { |vertex| vertex == max_vert }
  puts "#{max_vert} #{others.join(' ')}"
end
33 changes: 23 additions & 10 deletions erl/map_reduce.rb
Expand Up @@ -6,11 +6,12 @@ def log msg
`echo #{msg} >> stats.out`
end

def run(command)
def run command
@cmd += 1
now = Time.now
puts "#{now} (#{now-@last}sec) #{@cmd} #{command}"
log "S #{@cmd}"
command += " >#{@cmd}.out" unless command.include? '>'
puts "#{now} (#{now-@last}sec) #{@cmd} #{command}"
`#{command}`
log "E #{@cmd} DU #{`du -sh mr`.chomp}"
@last = now
Expand All @@ -27,9 +28,9 @@ def run(command)
ADDR_WEIGHT = 5
PHONE_WEIGHT = 1

def sketch_dedup type
def sketch_dedup type, shingle_size
run "cat split/#{type}.unique | erl -noshell -pa ebin -s prepare -parser prepare_id_text -num_files #{NUM_FILES} -output_dir mr/#{type}.unique"
run "erl -noshell -pa ebin -s map_reduce_s -tasks shingler sketcher -shingle_size 3 -input_dirs mr/#{type}.unique -output_dir mr/#{type}.sketches"
run "erl -noshell -pa ebin -s map_reduce_s -tasks shingler sketcher -shingle_size #{shingle_size} -input_dirs mr/#{type}.unique -output_dir mr/#{type}.sketches"
run "erl -noshell -pa ebin -s shuffle -input_dirs mr/#{type}.sketches -output_dir mr/#{type}.shuffled"
run "erl -noshell -pa ebin -s map_reduce_s -tasks combos -input_dirs mr/#{type}.shuffled -output_dir mr/#{type}.all_combos"
run "erl -noshell -pa ebin -s shuffle -input_dirs mr/#{type}.all_combos -output_dir mr/#{type}.all_combos_shuffled"
Expand All @@ -44,7 +45,7 @@ def sketch_dedup type
#
run "cat mr/#{type}.unique/* > mr/#{type}.unique.all" # hack!
run "cat mr/#{type}.combos_pairs/* > mr/#{type}.combos_pairs.all" # hack2!
run "erl -noshell -pa ebin -s pair_to_jaccard -shingle_size 3 -type #{type} -id_name mr/#{type}.unique.all -id_pairs mr/#{type}.combos_pairs.all -output_file mr/#{type}.sketch.unexploded.result "
run "erl -noshell -pa ebin -s pair_to_jaccard -shingle_size #{shingle_size} -type #{type} -id_name mr/#{type}.unique.all -id_pairs mr/#{type}.combos_pairs.all -output_file mr/#{type}.sketch.unexploded.result "
end

####
Expand All @@ -63,9 +64,9 @@ def extract_exact_duplicates
end

# Runs the sketch-based near-duplicate pass once per field type, each with
# its own shingle size: 3 for names, 6 for addresses.
#
# The earlier generic loop over ['names','addresses'] is gone: it called
# sketch_dedup with a single argument, which no longer matches the
# two-parameter signature (type, shingle_size), and it would have run every
# pass a second time.
def calculate_sketch_near_duplicates
  # makes type.sketch.result
  sketch_dedup 'names', 3
  sketch_dedup 'addresses', 6
end

def explode_sketch_results
Expand All @@ -85,21 +86,33 @@ def combine_results
# mr/result/<type>.sketch.result ; name/address
run "erl -noshell -pa ebin -s shuffle -input_dirs mr/result -output_dir mr/final_result"
run "cat mr/final_result/* > mr/final_result.all"
run "erl -noshell -pa ebin -s calculate_nap -file mr/final_result.all -n #{NAME_WEIGHT} -a #{ADDR_WEIGHT} -p #{PHONE_WEIGHT} | sort -nrk3 > final_res"
run "erl -noshell -pa ebin -s calculate_nap -file mr/final_result.all -n #{NAME_WEIGHT} -a #{ADDR_WEIGHT} -p #{PHONE_WEIGHT} | sort -nrk3 > final_similiarities"
end

####

# Issues, through `run` and in this order, the three shell commands that
# turn the similarity list into sketch.dup.ids:
#   1. create the ccgraph directory (error output discarded),
#   2. pipe final_similiarities through ../filter_under.rb 0.6 into
#      ccgraph/sketched,
#   3. pipe ccgraph/sketched through ../connected_components.rb into
#      sketch.dup.ids.
def choose_representative_id_from_sketch_dups
  pipeline = [
    "mkdir ccgraph 2>/dev/null",
    "cat final_similiarities | ../filter_under.rb 0.6 > ccgraph/sketched",
    "cat ccgraph/sketched | ../connected_components.rb > sketch.dup.ids"
  ]
  # inject hands back the value of the final `run`, exactly as the
  # sequential calls would.
  pipeline.inject(nil) { |_previous, command| run command }
end

####

# Driver: record the run configuration, reset the working directories,
# then call the pipeline stages in order.
msg = "NUM_ENTRIES=#{NUM_ENTRIES} NUM_FILES=#{NUM_FILES} NAME_WEIGHT=#{NAME_WEIGHT} ADDR_WEIGHT=#{ADDR_WEIGHT} PHONE_WEIGHT=#{PHONE_WEIGHT}"
# log appends the message to stats.out
log msg

# TODO: where are combo.ids used? can we use dup.ids there instead??
# Discard the output of any previous run, then recreate the working
# directories (mr, split and mr/result).
`rm -rf mr split`
`mkdir mr split mr/result`
# Stage order matters: each stage is presumably fed by files the previous
# one wrote -- verify against the stage definitions before reordering.
extract_exact_duplicates
calculate_sketch_near_duplicates
explode_sketch_results
combine_results
choose_representative_id_from_sketch_dups

# exact dups in split/nap.dup.ids
# sketch dups in sketch.dup.ids
# (both files have master id followed by slave ids)

#TODO: wc version of scat, look for header and then skip that many bytes

1 change: 1 addition & 0 deletions erl/src/sketcher.erl
Expand Up @@ -21,5 +21,6 @@ shingles_to_sketches([Seed|Seeds], Shingles, Sketches) ->
shingles_to_sketches(Seeds, Shingles, [Sketch|Sketches]).

%% Reduce a list of shingles to one sketch value for the given seed:
%% print the shingle list, hash every shingle with Seed via util:uhash/2,
%% and return the smallest hash.
shingles_to_sketch(Seed, Shingles) ->
    io:format("processing ~p\n",[Shingles]),
    HashWithSeed = fun(Shingle) -> util:uhash(Shingle, Seed) end,
    lists:min(lists:map(HashWithSeed, Shingles)).
14 changes: 1 addition & 13 deletions find_dups.rb
Expand Up @@ -7,7 +7,6 @@

@unique_ids_file = File.open("split/#{PREFIX}.unique",'w')
@dup_ids_file = File.open("split/#{PREFIX}.dup.ids","w")
#@id_combos_file = File.open("#{PREFIX}.combos.ids","w")

def dump
ids = @dup_ids
Expand All @@ -20,17 +19,6 @@ def dump
@dup_ids_file.puts output_ids.join(' ')
end

=begin
ids.unshift master_id
while !ids.empty?
id1 = ids.shift
ids.each do |id2|
id1,id2 = id2,id1 if id1 > id2
@id_combos_file.puts "#{id1} #{id2}"
end
end
=end

@dup_ids = []
end

Expand All @@ -45,4 +33,4 @@ def dump

dump
@dup_ids_file.close
#@id_combos_file.close

14 changes: 14 additions & 0 deletions freq.rb
@@ -0,0 +1,14 @@
#!/usr/bin/env ruby
# Counts how many times each distinct line appears on STDIN and prints one
# "<count> <line>" row per distinct line.
#
# When a first command-line argument is given and it is the literal string
# "false", the columns are swapped to "<line> <count>". Any other argument,
# or none at all, keeps the count-first layout.
# NOTE(review): enabling the line-first layout with the word "false" reads
# as inverted -- presumably the argument means something like "count first";
# confirm against callers before changing it.
line_first = ARGV[0] == "false"

counts = Hash.new(0)
STDIN.each { |row| counts[row.chomp] += 1 }

counts.each do |text, count|
  if line_first
    puts "#{text} #{count}"
  else
    puts "#{count} #{text}"
  end
end
5 changes: 4 additions & 1 deletion split.rb
Expand Up @@ -14,7 +14,10 @@

id = cols[0]
name = cols[1]
address = [2,3,4,5].collect{|i| cols[i]}.join('|')

addr_join_char = single_export ? '|' : ' '
address = [2,3,4,5].collect{|i| cols[i]}.join(addr_join_char).strip

phone = cols[6]

if single_export
Expand Down

0 comments on commit b6f6e7b

Please sign in to comment.