diff --git a/colander.gemspec b/colander.gemspec index f14d9fb..a30caff 100644 --- a/colander.gemspec +++ b/colander.gemspec @@ -19,7 +19,7 @@ Gem::Specification.new do |s| s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) } s.require_paths = ["lib"] - s.add_dependency "roo", "~> 1.10.1" s.add_dependency "zip", "~> 2.0.2" s.add_development_dependency "rspec" + s.add_development_dependency "pry" end diff --git a/lib/colander/parser/base.rb b/lib/colander/parser/base.rb index d6f22cc..91110a5 100644 --- a/lib/colander/parser/base.rb +++ b/lib/colander/parser/base.rb @@ -8,8 +8,31 @@ def initialize(file_path) end def parse + @emails = collect_emails + rescue Exception => e + raise InvalidFile.new e + end + + def payload raise "plz implement me in" end + + protected + + def collect_emails + parse_file.scan(/\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}\b/).flatten.uniq + end + + def parse_file + ic = Iconv.new("UTF-8//IGNORE", "UTF-8") + std_out, std_err, exit_status = Open3.capture3("strings", :stdin_data => payload) + if exit_status == 0 + ic.iconv(std_out) + else + raise RuntimeError.new(std_err) + end + end + end end -end \ No newline at end of file +end diff --git a/lib/colander/parser/xls.rb b/lib/colander/parser/xls.rb index 42a5683..5f1ac4f 100644 --- a/lib/colander/parser/xls.rb +++ b/lib/colander/parser/xls.rb @@ -1,31 +1,15 @@ require 'colander/invalid_file' require 'colander/parser/base' -require 'roo' -require 'iconv' - +require 'open3' module Colander module Parser class Xls < Base - def parse - spreadsheet = parse_file - @emails = collect_emails spreadsheet - rescue Exception => e - raise InvalidFile.new e - end protected - def parse_file - Excel.new(@file_path,nil,:ignore) + def payload + File.read(@file_path) end - - def collect_emails(spreadsheet) - spreadsheet.sheets.map do |sheet| - spreadsheet.default_sheet = sheet - spreadsheet.to_yaml.scan(/\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,4}\b/) - end.flatten - end - end end -end \ No newline at end of file +end diff --git a/lib/colander/parser/xlsx.rb b/lib/colander/parser/xlsx.rb index 257df72..aab3e73 100644 --- a/lib/colander/parser/xlsx.rb +++ b/lib/colander/parser/xlsx.rb @@ -1,13 +1,21 @@ require 'colander/parser/base' -require 'roo' - +require 'zip' +require 'iconv' module Colander module Parser class Xlsx < Xls + protected - def parse_file - Excelx.new(@file_path,nil,:ignore) + + def payload + ''.tap do |string| + Zip::ZipInputStream::open(@file_path) do |io| + while (entry = io.get_next_entry) + string << io.read + end + end + end end end end -end \ No newline at end of file +end diff --git a/lib/colander/version.rb b/lib/colander/version.rb index 2c14ab3..1be4fce 100644 --- a/lib/colander/version.rb +++ b/lib/colander/version.rb @@ -1,3 +1,3 @@ module Colander - VERSION = "0.0.3" + VERSION = "0.1.0" end diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb index 30297f6..c7f86ca 100644 --- a/spec/spec_helper.rb +++ b/spec/spec_helper.rb @@ -1,3 +1,4 @@ $:.push File.expand_path("../lib", __FILE__) -require 'colander' \ No newline at end of file +require 'colander' +require 'pry' diff --git a/spec/xls_spec.rb b/spec/xls_spec.rb index 669fb8f..cf825d0 100644 --- a/spec/xls_spec.rb +++ b/spec/xls_spec.rb @@ -3,9 +3,8 @@ describe Colander::Parser::Xls do describe "#parse" do it "stores found emails" do - Excel.stub(:new) parser = Colander::Parser::Xls.new("file/path") - parser.should_receive(:collect_emails).and_return(["bruce@wayne.com"]) + parser.should_receive(:parse_file).and_return("bruce@wayne.com") parser.parse parser.emails.sort.should eql(["bruce@wayne.com"]) end @@ -26,21 +25,19 @@ end it "retreives emails from an 95-excel spreadsheet" do - pending "handle encoding error" parser = Colander::Parser::Xls.new("spec/fixtures/excel95.xls") parser.parse - parser.emails.should eql(["markus.nordin@mynewsdesk.com", "markus@hej.se", "sven@bertil.se", "Adam.A@hotmail.com", "apa@elabs.se", "liam@neeson.net", "david@mynewsdesk.com"]) + parser.emails.sort.should eql(["markus.nordin@mynewsdesk.com", "markus@hej.se", "sven@bertil.se", "Adam.A@hotmail.com", "apa@elabs.se", "liam@neeson.net", "david@mynewsdesk.com"].sort) end it "retreives emails from an xls spreadsheet" do parser = Colander::Parser::Xls.new("spec/fixtures/old-format.xls") parser.parse - parser.emails.should eql(["markus.nordin@mynewsdesk.com", "markus@hej.se", "sven@bertil.se", "Adam.A@hotmail.com", "apa@elabs.se", "liam@neeson.net", "david@mynewsdesk.com"]) + parser.emails.sort.should eql(["markus.nordin@mynewsdesk.com", "markus@hej.se", "sven@bertil.se", "Adam.A@hotmail.com", "apa@elabs.se", "liam@neeson.net", "david@mynewsdesk.com"].sort) end it "retreives emails from an 95-excel spreadsheet without file suffix" do - pending "handle encoding error" parser = Colander::Parser::Xls.new("spec/fixtures/excel95-without-file-suffix") parser.parse - parser.emails.should eql(["markus.nordin@mynewsdesk.com", "markus@hej.se", "sven@bertil.se", "Adam.A@hotmail.com", "apa@elabs.se", "liam@neeson.net", "david@mynewsdesk.com"]) + parser.emails.sort.should eql(["markus.nordin@mynewsdesk.com", "markus@hej.se", "sven@bertil.se", "Adam.A@hotmail.com", "apa@elabs.se", "liam@neeson.net", "david@mynewsdesk.com"].sort) end end end diff --git a/spec/xlsx_spec.rb b/spec/xlsx_spec.rb index 852440d..dc7c8a8 100644 --- a/spec/xlsx_spec.rb +++ b/spec/xlsx_spec.rb @@ -8,7 +8,7 @@ it "retreives emails from an xlsx spreadsheet" do parser = Colander::Parser::Xlsx.new("spec/fixtures/new-format.xlsx") parser.parse - parser.emails.should eql(["markus.nordin@mynewsdesk.com", "markus@hej.se", "sven@bertil.se", "Adam.A@hotmail.com", "apa@elabs.se", "liam@neeson.net", "david@mynewsdesk.com"]) + parser.emails.sort.should eql(["markus.nordin@mynewsdesk.com", "markus@hej.se", "sven@bertil.se", "Adam.A@hotmail.com", "apa@elabs.se", "liam@neeson.net", "david@mynewsdesk.com"].sort) end end end