Skip to content

Commit

Permalink
added new filter, put count and remove methods into text (what a clas…
Browse files Browse the repository at this point in the history
…s!!)
  • Loading branch information
Stephen Hardisty committed Sep 1, 2011
1 parent 9af88e0 commit ba2dd1f
Show file tree
Hide file tree
Showing 13 changed files with 92 additions and 28 deletions.
10 changes: 2 additions & 8 deletions lib/despamilator/filter/html_tags.rb
Expand Up @@ -6,8 +6,8 @@ def parse subject
text = subject.text.downcase

html_tags.each do |tag|
opening_elements = number_of_matches_for(text, /<\s*#{tag}\W/)
closing_elements = number_of_matches_for(text, /\W#{tag}\s*\/>/)
opening_elements = text.count(/<\s*#{tag}\W/)
closing_elements = text.count(/\W#{tag}\s*\/>/)

if opening_elements > 0 or closing_elements > 0
safest_element_count = opening_elements > closing_elements ? opening_elements : closing_elements
Expand Down Expand Up @@ -124,12 +124,6 @@ def html_tags

end

private

# Returns how many non-overlapping matches of +regexp+ occur in +text+.
def number_of_matches_for text, regexp
  occurrences = text.scan(regexp)
  occurrences.size
end

end

end
2 changes: 1 addition & 1 deletion lib/despamilator/filter/ip_address_url.rb
Expand Up @@ -15,7 +15,7 @@ def description
def parse subject
subject.register_match!({
:score => 0.5, :filter => self
}) if subject.text.downcase.scan(/http:\/\/\d+\.\d+\.\d+\.\d+/).length > 0
}) if subject.text.downcase.count(/http:\/\/\d+\.\d+\.\d+\.\d+/) > 0
end

end
Expand Down
11 changes: 2 additions & 9 deletions lib/despamilator/filter/mixed_case.rb
Expand Up @@ -11,18 +11,11 @@ def description

def parse subject
text = subject.text.without_uris
count = count_and_strip(text, /[a-z][A-Z]/)
count += count_and_strip(text, /[a-z][A-Z][a-z]/)
count = text.remove_and_count!(/[a-z][A-Z]/)
count += text.remove_and_count!(/[a-z][A-Z][a-z]/)
subject.register_match!({:score => 0.1 * count, :filter => self}) if count > 0
end

private

# Counts the matches of +regexp+ in +text+, then deletes every match from
# +text+ in place. Returns the count taken before the deletion.
def count_and_strip text, regexp
  found = text.scan(regexp)
  text.gsub!(regexp, '')
  found.length
end
end

end
2 changes: 1 addition & 1 deletion lib/despamilator/filter/obfuscated_urls.rb
Expand Up @@ -21,7 +21,7 @@ def parse subject
private

def find_space_separated_parts text
text.scan(/www\s+\w+\s+com/).length
text.count(/www\s+\w+\s+com/)
end

def find_space_separated_characters text
Expand Down
2 changes: 1 addition & 1 deletion lib/despamilator/filter/prices.rb
Expand Up @@ -10,7 +10,7 @@ def description
end

def parse subject
price_count = subject.text.scan(/\$\s*\d+/).length
price_count = subject.text.count(/\$\s*\d+/)
subject.register_match!({:score => 0.075 * price_count, :filter => self}) if price_count > 0
end

Expand Down
2 changes: 1 addition & 1 deletion lib/despamilator/filter/shouting.rb
Expand Up @@ -19,7 +19,7 @@ def parse subject
return if text.length < 20

uppercased = text.scan(/[A-Z][A-Z]+/).join.length
lowercased = text.scan(/[a-z]/).length
lowercased = text.count(/[a-z]/)

if uppercased > 0
subject.register_match!({
Expand Down
2 changes: 1 addition & 1 deletion lib/despamilator/filter/spammy_tlds.rb
Expand Up @@ -13,7 +13,7 @@ def description
end

def parse subject
matches = subject.text.scan(/\w{5,}\.(info|biz)\b/).length
matches = subject.text.count(/\w{5,}\.(info|biz)\b/)
subject.register_match!({:score => 0.05 * matches, :filter => self}) if matches > 0
end

Expand Down
2 changes: 1 addition & 1 deletion lib/despamilator/filter/urls.rb
Expand Up @@ -14,7 +14,7 @@ def description

def parse subject
text = subject.text.downcase.gsub(/http:\/\/\d+\.\d+\.\d+\.\d+/, '')
matches = text.scan(/http:\/\//).length
matches = text.count(/https?:\/\//)
1.upto(matches > 2 ? 2 : matches) do
subject.register_match!({:score => 0.4, :filter => self})
end
Expand Down
38 changes: 38 additions & 0 deletions lib/despamilator/filter/weird_punctuation.rb
@@ -0,0 +1,38 @@
require 'despamilator/filter'

module DespamilatorFilter

  # Filter that scores a subject on punctuation found in unusual positions.
  class WeirdPunctuation < Despamilator::Filter

    # Display name of this filter.
    def name
      'Weird Punctuation'
    end

    # Short summary of what this filter looks for.
    def description
      'Detects unusual use of punctuation.'
    end

    # Tallies suspicious punctuation in the subject's text and, when there is
    # at least one hit, registers a match scored at 0.015 per hit.
    #
    # URIs are dropped first, as is any ampersand sitting between two word
    # characters (together with those two characters). The four patterns are
    # then applied in order; each one deletes whatever it matched before the
    # next runs, so the sequence is significant.
    def parse subject
      text = subject.text.without_uris
      text.gsub!(/\w&\w/, '')

      patterns = [
        /(?:\W|\s|^)(#{punctuation})/,
        /(#{punctuation})(#{punctuation})/,
        /(#{punctuation})$/,
        /(?:\W|\s|^)\d+(#{punctuation})/
      ]

      matches = patterns.inject(0) do |total, pattern|
        total + text.remove_and_count!(pattern)
      end

      return unless matches > 0

      subject.register_match!({:score => 0.015 * matches, :filter => self})
    end

    private

    # Regexp alternation source ("a|b|c") of the escaped punctuation
    # characters this filter watches for; built once and memoised.
    def punctuation
      @punctuation ||= begin
        characters = %w{~ ` ! @ # $ % ^ & * _ - + = , / ? | \\ : ; ' "}
        characters.map { |character| Regexp.escape(character) }.join('|')
      end
    end

  end

end
14 changes: 12 additions & 2 deletions lib/despamilator/subject/text.rb
Expand Up @@ -10,11 +10,21 @@ def initialize text
end

def without_uris
self.gsub(URI.regexp(['http', 'https', 'mailto', 'ftp']), '')
gsub(URI.regexp(['http', 'https', 'mailto', 'ftp']), '')
end

def words
self.split(/\W+/)
split(/\W+/)
end

# Counts what +pattern+ finds in this text.
#
# For a pattern without capture groups this is the number of matches. When
# the pattern has capture groups, every non-nil capture is counted, so one
# match with two populated groups contributes two.
def count pattern
  captures = scan(pattern).flatten
  captures.compact.size
end

# Counts what +pattern+ finds (via #count), then deletes every match of
# +pattern+ from this text in place. Returns the count taken before deletion.
def remove_and_count! pattern
  count(pattern).tap { gsub!(pattern, '') }
end

end
Expand Down
5 changes: 2 additions & 3 deletions spec/filters/script_tag_spec.rb
Expand Up @@ -8,12 +8,11 @@
a_single_match_of('<script>', should_score: 1)
a_multiple_match_of('<script></script> <script></script>', should_score: 1)

describe "detecting various script tags" do
context "detecting various script tags" do
['<script type="whatever">', '<script></script>', '</script>', '<script>', "<script\n>"].each do |script_tag|
[script_tag.upcase, script_tag.downcase].each do |script_tag|
it "should detect '#{script_tag}' of a script tag" do
dspam = Despamilator.new(script_tag)
dspam.score.should == 1
parsing(script_tag).should have_score(1)
end
end
end
Expand Down
18 changes: 18 additions & 0 deletions spec/filters/weird_punctuation_spec.rb
@@ -0,0 +1,18 @@
# Spec for the WeirdPunctuation filter.
# NOTE(review): the bare calls used here (the_name_should_be,
# a_single_match_of, parsing, have_score, ...) are not defined in this file —
# presumably shared spec macros/matchers; confirm against the spec helper.
describe DespamilatorFilter::WeirdPunctuation do

the_name_should_be 'Weird Punctuation'
the_description_should_be 'Detects unusual use of punctuation.'

despamilator_should_apply_the_filter_for('^this^')

# The filter scores 0.015 per counted hit, so 0.015 means one hit and
# 0.075 means five.
a_single_match_of('&gt', should_score: 0.015)
a_multiple_match_of('%D :-D &gt;:-[ 123, l 89.', should_score: 0.075)

# The filter strips URIs before looking for punctuation.
it 'should ignore weird punctuation in urls' do
parsing('http://www.blah.com?x=1&y=z').should have_score(0)
end

# The filter removes word-character & word-character sequences up front.
it 'should ignore ampersands surrounded by letters' do
parsing('j&r').should have_score(0)
end
end
12 changes: 12 additions & 0 deletions spec/subject_text_spec.rb
Expand Up @@ -20,4 +20,16 @@
).words.should == %w{hello there you rule}
end

# Text#count takes a regexp here: 'yXyXy' contains two X's.
it 'should count the matches for a regular expression' do
Despamilator::Subject::Text.new(
'yXyXy'
).count(/X/).should == 2
end

# Text#remove_and_count! must return the number of matches and strip them
# from the receiver in place.
it 'should remove and count the matches for a regular expression' do
  text = Despamilator::Subject::Text.new('yXyXy').dup
  text.remove_and_count!(/X/).should == 2
  text.should == 'yyy'
end

end

0 comments on commit ba2dd1f

Please sign in to comment.