Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add support for .xls (Excel 1997-2004) #599

Merged
merged 2 commits into from
May 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,10 @@ GEM
roo (2.10.1)
nokogiri (~> 1)
rubyzip (>= 1.3.0, < 3.0.0)
roo-xls (1.2.0)
nokogiri
roo (>= 2.0.0, < 3)
spreadsheet (> 0.9.0)
rspec (3.13.0)
rspec-core (~> 3.13.0)
rspec-expectations (~> 3.13.0)
Expand Down Expand Up @@ -374,6 +378,7 @@ GEM
ruby-next-core (1.0.1)
ruby-next-parser (3.2.2.0)
parser (>= 3.0.3.1)
ruby-ole (1.2.13.1)
ruby-openai (6.4.0)
event_stream_parser (>= 0.3.0, < 2.0.0)
faraday (>= 1)
Expand All @@ -389,6 +394,9 @@ GEM
faraday (>= 0.17.5, < 3.a)
jwt (>= 1.5, < 3.0)
multi_json (~> 1.10)
spreadsheet (1.3.1)
bigdecimal
ruby-ole
standard (1.34.0)
language_server-protocol (~> 3.17.0.2)
lint_roller (~> 1.0)
Expand Down Expand Up @@ -488,6 +496,7 @@ DEPENDENCIES
rdiscount (~> 2.2.7)
replicate-ruby (~> 0.2.2)
roo (~> 2.10.0)
roo-xls (~> 1.2.0)
rspec (~> 3.0)
rubocop
ruby-openai (~> 6.4.0)
Expand Down
1 change: 1 addition & 0 deletions langchain.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ Gem::Specification.new do |spec|
spec.add_development_dependency "replicate-ruby", "~> 0.2.2"
spec.add_development_dependency "qdrant-ruby", "~> 0.9.4"
spec.add_development_dependency "roo", "~> 2.10.0"
spec.add_development_dependency "roo-xls", "~> 1.2.0"
spec.add_development_dependency "ruby-openai", "~> 6.4.0"
spec.add_development_dependency "safe_ruby", "~> 1.0.4"
spec.add_development_dependency "sequel", "~> 5.68.0"
Expand Down
27 changes: 27 additions & 0 deletions lib/langchain/processors/xls.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# frozen_string_literal: true

module Langchain
  module Processors
    # Processor for legacy Excel workbooks (".xls" files, content type
    # "application/vnd.ms-excel"). Reading is delegated to Roo::Spreadsheet.
    class Xls < Base
      EXTENSIONS = [".xls"].freeze
      CONTENT_TYPES = ["application/vnd.ms-excel"].freeze

      # Declares the gems this processor relies on ("roo", then "roo-xls")
      # through depends_on. Any arguments are accepted and ignored.
      # NOTE(review): depends_on is defined outside this file; presumably it
      # loads the gem and fails when it is missing - confirm against Base.
      def initialize(*)
        depends_on "roo"
        depends_on "roo-xls"
      end

      # Read every sheet of the workbook and collect all of its rows.
      #
      # @param [File] data the workbook, passed unchanged to Roo::Spreadsheet.open
      # @return [Array<Array<String>>] one entry per row, sheets concatenated in
      #   iteration order; every cell is converted with to_s and stripped
      def parse(data)
        workbook = Roo::Spreadsheet.open(data)
        rows = []
        # each_with_pagename yields a (name, sheet) pair per sheet; the name is unused.
        workbook.each_with_pagename do |_name, sheet|
          sheet.each do |row|
            rows << row.map { |cell| cell.to_s.strip }
          end
        end
        rows
      end
    end
  end
end
Binary file added spec/fixtures/loaders/sample.xls
Binary file not shown.
29 changes: 29 additions & 0 deletions spec/langchain/processors/xls_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# frozen_string_literal: true

RSpec.describe Langchain::Processors::Xls do
  describe "#parse" do
    # The object under test: the rows parsed out of the sample workbook.
    subject(:parsed_rows) { described_class.new.parse(workbook) }

    let(:workbook) { File.open("spec/fixtures/loaders/sample.xls") }

    # Rows expected from the fixture. The two header rows suggest the
    # workbook holds two sheets whose rows are returned back to back
    # (the fixture is binary - confirm by opening it).
    let(:expected_rows) do
      [
        ["Username", "Identifier", "First name", "Last name"],
        ["booker12", "9012", "Rachel", "Booker"],
        ["grey07", "2070", "Laura", "Grey"],
        ["johnson81", "4081", "Craig", "Johnson"],
        ["jenkins46", "9346", "Mary", "Jenkins"],
        ["smith79", "5079", "Jamie", "Smith"],

        ["FirstName", "LastName", "Street", "Town", "ZIP"],
        ["John", "Doe", "120 jefferson st.", "Riverside", "8075"],
        ["Jack", "McGinnis", "220 hobo Av.", "Phila", "9119"],
        ['John "Da Man"', "Repici", "120 Jefferson St.", "Riverside", "8075"],
        ["Stephen", "Tyler", '7452 Terrace "At the Plaza" road', "SomeTown", "91234"],
        ["", "Blankman", "", "SomeTown", "298"],
        ['Joan "the bone", Anne', "Jet", "9th, at Terrace plc", "Desert City", "123"]
      ]
    end

    it "parses the file and returns the text" do
      expect(parsed_rows).to eq(expected_rows)
    end
  end
end