
stopwords helpers in wordcount example

commit 69cf15c2416e56f9ed8a2815d1cd0c582feca202 (parent f71b8c0)
Philip (flip) Kromer authored
3  README-v2.md
@@ -32,8 +32,7 @@ You may also define:
* **easy way to dump to log** --
-* The **bad_record!** should have an easy hook
-
+* The **bad_record!** should have an easy hook to say "shit the bed if there are too many bad records"
on_bad_record do |junk|
@bad_record_count += 1 # ?? how do I make it not instance-y
44 examples/corpus/bnc_word_freq.rb
@@ -0,0 +1,44 @@
+#!/usr/bin/env ruby
+$: << File.dirname(__FILE__)
+require 'rubygems'
+require 'wukong/script'
+
+Settings.define :ripd_root, :default => '/data/chimpmark/ripd'
+BNC_SOURCE_FILE='ucrel.lancs.ac.uk/bncfreq/lists/1_1_all_fullalpha.txt'
+
+# File 1_1_all_fullalpha.txt -- 794771 lines
+#
+# cat /data/chimpmark/ripd/ucrel.lancs.ac.uk/bncfreq/lists/1_1_all_fullalpha.txt | ./bnc_word_freq.rb --map | sort -nk3 > /data/chimpmark/rawd/bnc_word_freq/bnc_word_freq.tsv
+
+class BncParser < Wukong::Streamer::RecordStreamer
+ def before_stream
+ @head_word, @part_of_speech, @head_word_stats = ["","",[]]
+ $stdin.readline
+ $stdin.readline
+ end
+
+ def process _, word, pos, variant, freq_ppm, range, dispersion
+ word_stats = [freq_ppm, range, dispersion]
+
+ unless word == "@" # lemma for a different head word
+ @head_word = word
+ @part_of_speech = pos
+ @head_word_stats = word_stats
+ end
+
+ weirdness = (@head_word =~ /[^a-zA-Z]/)
+
+ if variant == '%' # head word with lemmas
+ word_stats = ['','','']
+ elsif variant == ':' # head word with no lemmas
+ variant = word
+ else
+ weirdness = weirdness || (variant =~ /[^a-zA-Z]/)
+ end
+ yield [@head_word, @part_of_speech, @head_word_stats, variant, word_stats, (weirdness ? 1 : 0)].flatten.join("\t")
+ end
+end
+
+Wukong.run(
+ BncParser, nil
+ )
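
For orientation, here is a standalone sketch (no Wukong) of the head-word/lemma caching rule in BncParser#process, run over two invented rows. Real rows in 1_1_all_fullalpha.txt use the same tab-separated column order, but the values below are made up and the weirdness flag is left out:

    # Head rows carry the word itself; lemma rows put "@" in the word column.
    head_word, part_of_speech, head_word_stats = "", "", []

    rows = [
      # seq, word,      pos,    variant,     freq_ppm, range, dispersion
      ["1",  "abandon", "Verb", "%",         "81",     "97",  "0.95"],
      ["2",  "@",       "",     "abandoned", "31",     "90",  "0.93"],
    ]

    rows.each do |_, word, pos, variant, freq_ppm, range, dispersion|
      word_stats = [freq_ppm, range, dispersion]
      unless word == "@"                    # a new head word: cache its columns
        head_word, part_of_speech, head_word_stats = word, pos, word_stats
      end
      if    variant == '%' then word_stats = ['', '', '']  # head word with lemmas: variant columns stay blank
      elsif variant == ':' then variant    = word          # head word with no lemmas: it is its own variant
      end
      puts [head_word, part_of_speech, head_word_stats, variant, word_stats].flatten.join("\t")
    end
    # => abandon  Verb  81  97  0.95  %          (blank variant-stat columns)
    #    abandon  Verb  81  97  0.95  abandoned  31  90  0.93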
138 examples/corpus/stopwords.rb
@@ -0,0 +1,138 @@
+STOPWORDS_3 = %w[
+ the
+ of
+ and
+ a
+ in
+ to
+ it
+ is
+ was
+ I
+ for
+ that
+ you
+ he
+ be
+ with
+ on
+ by
+ at
+ have
+ are
+ not
+ this
+ but
+ had
+ they
+ his
+ from
+ she
+ which
+ or
+ we
+ an
+ were
+ as
+ do
+ been
+ their
+ has
+ would
+ there
+ what
+ will
+ all
+ if
+ can
+ her
+ said
+ who
+ one
+ so
+ up
+ them
+ when
+ some
+ could
+ him
+ into
+ its
+ then
+ two
+ out
+ time
+ my
+ about
+ did
+ your
+ now
+ me
+ other
+ only
+ just
+ more
+ these
+ also
+ any
+ see
+ very
+ may
+ well
+ should
+ than
+ how
+ get
+ way
+ our
+ made
+ got
+ after
+ many
+ those
+ go
+ being
+ because
+ down
+ such
+ through
+ over
+ must
+ still
+ even
+ take
+ too
+ here
+ come
+ own
+ last
+ does
+ oh
+ say
+ no
+ where
+ us
+ same
+ might
+ yes
+ however
+ put
+ world
+ another
+ want
+ most
+ again
+ never
+ under
+ much
+ why
+ each
+ while
+ off
+ went
+ used
+ without
+ give
+ within
+ ]
+RE_STOPWORDS_15 = '(?:'+STOPWORDS_3[0..15].join("|")+')'
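
RE_STOPWORDS_15 stitches the first sixteen entries of the list (indices 0..15) into a single non-capturing alternation. It isn't referenced elsewhere in this commit, so the call site below is purely hypothetical, a sketch of the kind of pattern it could anchor:

    # Hypothetical use: scrub those leading stopwords out of a line of text.
    line = "the cat and the hat"
    line.gsub(/\b#{RE_STOPWORDS_15}\b/, '').squeeze(' ').strip
    # => "cat hat"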
10 examples/word_count.rb
@@ -23,10 +23,10 @@ def tokenize str
# this includes hyphens (words are split)
str = str.
gsub(/[^a-zA-Z0-9\']+/, ' ').
- gsub(/(\w)\'([std])\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
+ gsub(/(\w)\'([stdm]|re|ve|ll)\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
# Busticate at whitespace
words = str.split(/\s+/)
- words.reject!{|w| w.blank? }
+ words.reject!{|w| w.length < 3 }
words
end
@@ -45,7 +45,7 @@ def process line
#
class Reducer1 < Wukong::Streamer::ListReducer
def finalize
- yield [ key, values.map(&:last).map(&:to_i).inject(0){|x,tot| x+tot } ]
+ yield [ values.map(&:last).map(&:to_i).inject(0){|x,tot| x+tot }, key ]
end
end
@@ -56,7 +56,7 @@ class Reducer2 < Wukong::Streamer::AccumulatingReducer
def start!(*args) @key_count = 0 end
def accumulate(*args) @key_count += 1 end
def finalize
- yield [ key, @key_count ]
+ yield [ @key_count, key ]
end
end
@@ -71,5 +71,5 @@ class Reducer3 < Wukong::Streamer::CountKeys
# Execute the script
Wukong.run(
WordCount::Mapper,
- WordCount::Reducer
+ WordCount::Reducer2
)
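
A quick check of the revised tokenize against a sample line (a sketch that calls the helper directly, the way the mapper does; the expected output follows from the new contraction regex and the length-three cutoff):

    tokenize("He said they've found it, and we'll see")
    # => ["said", "they've", "found", "and", "we'll", "see"]

The reducer edits flip the emitted pairs to count-first, so the count column comes before the word and the job output can be piped straight into a numeric sort; the script now wires in Reducer2, the accumulating key-counter, as its reduce step.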
7 lib/wukong/helper.rb
@@ -0,0 +1,7 @@
+module Wukong
+ module Helper
+
+ autoload :Tokenize, 'wukong/helper/tokenize'
+
+ end
+end
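
The autoload line just maps the constant onto its file, so nothing under helper/ is loaded until first use. A minimal sketch, assuming wukong's lib directory is on the load path and String#blank? (which tokenize relies on) is already provided:

    require 'wukong/helper'

    # 'wukong/helper/tokenize' is only required when the constant is first touched:
    Wukong::Helper::Tokenize.tokenize("lazy loading at work")
    # => ["lazy", "loading", "work"]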
195 lib/wukong/helper/stopwords.rb
@@ -0,0 +1,195 @@
+module Wukong
+ module Corpus
+ STOPWORDS = %w[
+ the
+ of
+ and
+ a
+ in
+ to
+ it
+ is
+ was
+ I
+ for
+ that
+ you
+ he
+ be
+ with
+ on
+ by
+ at
+ have
+ are
+ not
+ this
+ but
+ had
+ they
+ his
+ from
+ she
+ which
+ or
+ we
+ an
+ were
+ as
+ do
+ been
+ their
+ has
+ would
+ there
+ what
+ will
+ all
+ if
+ can
+ her
+ said
+ who
+ so
+ up
+ them
+ when
+ some
+ could
+ him
+ into
+ its
+ then
+ out
+ my
+ about
+ did
+ your
+ me
+ other
+ just
+ more
+ these
+ also
+ any
+ see
+ very
+ may
+ well
+ should
+ than
+ how
+ get
+ way
+ our
+ made
+ got
+ after
+ many
+ those
+ go
+ being
+ because
+ down
+ such
+ over
+ must
+ still
+ even
+ too
+ here
+ come
+ own
+ last
+ does
+ oh
+ no
+ where
+ us
+ same
+ might
+ yes
+ put
+ another
+ most
+ again
+ under
+ much
+ why
+ each
+ while
+ off
+ went
+ used
+ without
+ give
+ within
+
+ am
+ aren't
+ between
+ both
+ can't
+ cannot
+ couldn't
+ didn't
+ doesn't
+ doing
+ don't
+ hadn't
+ hasn't
+ haven't
+ having
+ he'd
+ he'll
+ he's
+ here's
+ hers
+ how's
+ i'd
+ i'll
+ i'm
+ i've
+ isn't
+ it'd
+ it'll
+ it's
+ let's
+ once
+ only
+ ought
+ ours
+ she'd
+ she'll
+ she's
+ shouldn't
+ that's
+ theirs
+ there's
+ they'd
+ they'll
+ they're
+ they've
+ through
+ wasn't
+ we'd
+ we'll
+ we're
+ we've
+ weren't
+ what's
+ where's
+ who's
+ won't
+ wouldn't
+ you'd
+ you'll
+ you're
+ you've
+ yours
+
+ ].to_set
+ STOPWORDS_3 = STOPWORDS.reject{|w| w.length < 3 }.to_set
+
+ end
+end
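
The list lives under Wukong::Corpus even though the file sits in helper/. Array#to_set comes from the stdlib set library, which this file assumes has been required somewhere upstream; a usage sketch with that assumption made explicit:

    require 'set'
    require 'wukong/helper/stopwords'

    Wukong::Corpus::STOPWORDS.include?("won't")   # => true
    Wukong::Corpus::STOPWORDS_3.include?("of")    # => false  (shorter than three characters)
    Wukong::Corpus::STOPWORDS_3.include?("the")   # => true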
35 lib/wukong/helper/tokenize.rb
@@ -0,0 +1,35 @@
+require File.expand_path('stopwords', File.dirname(__FILE__))
+module Wukong
+ module Helper
+
+ module Tokenize
+ #
+ # Split a string into its constituent words.
+ #
+ # This is pretty simpleminded:
+ # * downcase the word
+ # * Split at any non-alphanumeric boundary, including '_'
+ # * However, preserve the special cases of 's, 't, 'd, 'm, 're, 've or 'll
+ # at the end of a word.
+ #
+ # tokenize("Ability is a poor man's wealth #johnwoodenquote")
+ # # => ["ability", "is", "a", "poor", "man's", "wealth", "johnwoodenquote"]
+ #
+ def self.tokenize str
+ return [] if str.blank?
+ str = str.downcase;
+ # kill off all punctuation except [stuff]'s or [stuff]'t
+ # this includes hyphens (words are split)
+ str = str.
+ gsub(/[^a-zA-Z0-9\']+/, ' ').
+ gsub(/(\w)\'([stdm]|re|ve|ll)\b/, '\1!\2').gsub(/\'/, ' ').gsub(/!/, "'")
+ # Busticate at whitespace
+ words = str.split(/\s+/)
+ words.reject!{|w| w.length < 3 || Wukong::Corpus::STOPWORDS_3.include?(w) }
+ words
+ end
+
+ end
+
+ end
+end
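
Putting the two helpers together, a usage sketch of the end result (stopwords, including the contraction forms added above, and tokens shorter than three characters all drop out):

    require 'wukong/helper/tokenize'

    Wukong::Helper::Tokenize.tokenize("Hadoop streaming isn't rocket science, it's plumbing")
    # => ["hadoop", "streaming", "rocket", "science", "plumbing"]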