From 4bf6ab567b2358122139130dc02932048a2882e8 Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Sat, 11 Nov 2017 12:04:57 +0100
Subject: [PATCH] repetition rate

---
Review notes (text in this section is ignored by git am):
 - count n-grams in one hash per order; [{}]*4 repeated a single shared hash
 - keep the trailing window that is still open when STDIN ends
 - treat an order without n-grams as rate 0.0 instead of dividing 0.0 by 0.0
 - assumes ngrams (zipf) yields each n-gram as an Array of 1..4 tokens; confirm
 - line breaks and indentation of this patch were reconstructed; the
   post-image blob id on the index line predates these changes

 repetition-rate | 51 +++++++++++++++++++++++++++++++++++++--------------
 rr              | 44 --------------------------------------------
 2 files changed, 37 insertions(+), 58 deletions(-)
 delete mode 100755 rr

diff --git a/repetition-rate b/repetition-rate
index b821782..87938ae 100755
--- a/repetition-rate
+++ b/repetition-rate
@@ -2,25 +2,48 @@
 
 require 'zipf'
 
-ng = [{},{},{},{}]
+# Reads whitespace-tokenized segments from STDIN (one per line), groups them
+# into windows of at least 1000 tokens, and prints the repetition rate in
+# percent: the geometric mean over the four n-gram orders of the fraction of
+# distinct n-grams that occur more than once within their window.
 
+windows = []
+cur = []
+cur_sz = 0
 while line = STDIN.gets
-  ngrams(line, 4) { |g|
-    if ng[g.size-1].has_key? g
-      ng[g.size-1][g] += 1
-    else
-      ng[g.size-1][g] = 1
-    end
-  }
+  if cur_sz >= 1000
+    windows << cur
+    cur = []
+    cur_sz = 0
+  end
+  cur << line.strip
+  cur_sz += cur.last.split.size
 end
+# Keep the trailing, possibly shorter, window: otherwise its segments are
+# silently dropped and inputs below 1000 tokens produce no counts at all.
+windows << cur unless cur.empty?
 
-rr = 1.0
-ng.each_with_index { |h,j|
-  singletons = ng[j].reject { |k,v| v > 1 }.size
-  rr *= (ng[j].size - singletons).to_f/ng[j].size.to_f
+enums = [0.0]*4
+denoms = [0.0]*4
+windows.each { |w|
+  # One counter per n-gram order (ng.size-1 picks it); [{}]*4 would alias a
+  # single hash four times and mix all orders into the same table.
+  ng_by_n = Array.new(4) { Hash.new(0) }
+  w.each { |seg|
+    ngrams(seg, 4) { |ng| ng_by_n[ng.size-1][ng] += 1 }
+  }
+  ng_by_n.each_with_index { |ng,j|
+    singletons = ng.reject { |k,v| v > 1 }.size
+    enums[j] += ng.size - singletons
+    denoms[j] += ng.size.to_f
+  }
 }
 
-rr = rr**0.25
+rr = 1.0
+enums.each_with_index { |i,j|
+  # An order with no n-grams has no repetitions; avoids 0.0/0.0 (NaN).
+  rr *= denoms[j].zero? ? 0.0 : i/denoms[j]
+}
 
-puts rr
+puts ((rr**0.25)*100).round 2
 
diff --git a/rr b/rr
deleted file mode 100755
index 87938ae..0000000
--- a/rr
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'zipf'
-
-windows = []
-cur = []
-cur_sz = 0
-while line = STDIN.gets
-  if cur_sz >= 1000
-    windows << cur
-    cur = []
-    cur_sz = 0
-  end
-  cur << line.strip
-  cur_sz += cur.last.split.size
-end
-
-enums = [0.0]*4
-denoms = [0.0]*4
-windows.each { |w|
-  ng_by_n = [{}]*4
-  w.each { |seg|
-    ngrams(seg, 4) { |ng|
-      if ng_by_n[ng.size-1].has_key? ng
-        ng_by_n[ng.size-1][ng] += 1
-      else
-        ng_by_n[ng.size-1][ng] = 1
-      end
-    }
-  }
-  ng_by_n.each_with_index { |ng,j|
-    singletons = ng.reject { |k,v| v > 1 }.size
-    enums[j] += ng.size - singletons
-    denoms[j] += ng.size.to_f
-  }
-}
-
-rr = 1.0
-enums.each_with_index { |i,j|
-  rr *= i/denoms[j]
-}
-
-puts ((rr**0.25)*100).round 2
-