-
Notifications
You must be signed in to change notification settings - Fork 0
/
distribution.rb
executable file
·105 lines (93 loc) · 2.64 KB
/
distribution.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env ruby
require 'getoptlong'
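# Input (STDIN): one record per line, fields separated by the split character
# (',' by default; change it with -t/--splitchar):
#   <value1>,<frequency1>
#   <value2>,<frequency2>
#   ...
# Lines starting with '#' are ignored.
#
# Output: a CSV table of frequency distributions on STDOUT, plus the average
# and percentiles written to a separate file ('percentiles.txt' by default;
# change it with -p/--percentiles_file_out).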
# Natural logarithm that tolerates non-positive input.
#
# Returns Math.log(v) when v is greater than zero; for any other v it
# returns the integer -1 instead of raising or producing -Infinity.
def safe_log(v)
  return -1 unless v > 0

  Math.log(v)
end
# Defaults, each overridable from the command line:
#   -t / --splitchar [CHAR]             separator handed to String#split
#   -p / --percentiles_file_out [FILE]  file that receives the percentile report
#   -v / --value_per_line               treat each line as <id><sep><value>:
#                                       the first field is discarded and the
#                                       second counts as one observation
field_split_char = ','
format_each_line = :VALUE_COUNT
percentile_output_file = 'percentiles.txt'
opts = GetoptLong.new(
  ['-p', '--percentiles_file_out', GetoptLong::OPTIONAL_ARGUMENT],
  ['-t', '--splitchar', GetoptLong::OPTIONAL_ARGUMENT],
  ['-v', '--value_per_line', GetoptLong::NO_ARGUMENT]
)
opts.each do |opt, arg|
  case opt
  when '-t'
    # An OPTIONAL_ARGUMENT option given without a value yields ''. Splitting
    # on '' would break every line into single characters, so keep the default.
    field_split_char = arg unless arg.empty?
  when '-p'
    # Same reasoning: '' is not a usable file name, so keep the default.
    percentile_output_file = arg unless arg.empty?
  when '-v'
    format_each_line = :ID_VALUE
  end
end
#format_each_line = :ID_VALUE if each line is <id> <value> where <id> is to be thrown away
# Main pass: read records from STDIN into a histogram, print the distribution
# table to STDOUT, then write the average and the percentile cut-offs to
# percentile_output_file.
lineno = 0
histo = Hash.new(0) # value => accumulated frequency
tot = wt_tot = 0    # tot: sum of frequencies; wt_tot: sum of freq * value
STDIN.each_line do |line|
  lineno += 1
  next if line =~ /^#/
  # A blank line would otherwise parse as value 0 (and, in :ID_VALUE mode,
  # with frequency 1), adding a phantom entry to the histogram.
  next if line.strip.empty?

  (v, f) = line.split(field_split_char)
  if format_each_line == :ID_VALUE
    # The first field is an id to throw away; the second is one observation.
    value = f.to_i
    freq = 1
  else
    value = v.to_i
    freq = f.to_i
  end
  histo[value] += freq
  tot += freq
  wt_tot += freq * value
  $stderr.puts "Read #{lineno} lines" if lineno % 1_000_000 == 0
end

percentiles = [50, 75, 90, 95, 99, 99.9]
# hash_per[kind][percentile] holds the first (smallest) value at which the
# cumulative share reaches that percentile. A missing key means "not reached";
# it is reported as -1 below. Tracking presence by key (rather than storing -1
# up front) keeps a genuine data value of -1 from being mistaken for "unset".
hash_per = { :le => {}, :wt_le => {} }

le = wt_le = 0
# Convert the totals to floats so every ratio below uses float division.
tot *= 1.0
wt_tot *= 1.0
ge = tot
wt_ge = wt_tot
print "#val,freq,frac(freq),agg_freq_le,frac(agg_freq_le),|6|,"
print "agg_freq_ge,frac(agg_freq_ge),|9|,weight=freq*val,agg_weight_le,frac(agg_weight_le),|13|,"
print "agg_weight_ge,frac(agg_weight_ge),|16|,"
puts "log(val),log(freq),log(agg_freq_le),log(agg_freq_ge),log(agg_weight_le),log(agg_weight_ge)"
histo.sort.each do |k, v|
  wt = v * k
  le += v
  wt_le += wt
  # Cumulative fractions for this row; computed once, not once per percentile.
  { :le => le / tot, :wt_le => wt_le / wt_tot }.each do |sym, currval|
    percentiles.each do |per|
      hash_per[sym][per] = k if (100.0 * currval >= per) && !hash_per[sym].key?(per)
    end
  end
  print "#{k},#{v},#{v/tot},#{le},#{le/tot},|6|,"
  print "#{ge},#{ge/tot},|9|,#{wt},#{wt_le},#{wt_le/wt_tot},|13|,"
  print "#{wt_ge},#{wt_ge/wt_tot},|16|,"
  puts "#{safe_log(k)},#{safe_log(v)},#{safe_log(le)},#{safe_log(ge)},#{safe_log(wt_le)},#{safe_log(wt_ge)}"
  # The "greater or equal" aggregates include the current row, so shrink them
  # only after the row has been printed.
  ge -= v
  wt_ge -= wt
end

# Block form so the file is flushed and closed even if a write raises.
File.open(percentile_output_file, 'w') do |perfile|
  perfile.puts "Average = #{wt_tot/tot}"
  perfile.puts "All percentiles:"
  percentiles.each do |pct|
    perfile.puts "#{pct}%ile values:"
    [:le, :wt_le].each do |sym|
      perfile.puts "#{sym} #{hash_per[sym].fetch(pct, -1)}"
    end
  end
end