Permalink
Browse files

added notes in solr schema; changed field names; updated routes and m…

…apper
  • Loading branch information...
1 parent 5b3fa40 commit 320904086ea1cb86d9272ec9e02e6688529c385a @mwmitchell committed Dec 21, 2009
View
@@ -368,7 +368,7 @@
Longer patterns will be matched first. If equal-size patterns
both match, the first appearing in the schema will be used. -->
<dynamicField name="*_i" type="sint" indexed="true" stored="true"/>
- <dynamicField name="*_s" type="string" indexed="true" stored="true" termVectors="true"/>
+ <dynamicField name="*_s" type="string" indexed="true" stored="true" termVectors="true" multiValued="false"/>
<dynamicField name="*_mvs" type="string" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_l" type="slong" indexed="true" stored="true"/>
<dynamicField name="*_t" type="text" indexed="true" stored="true"/>
@@ -380,7 +380,8 @@
<dynamicField name="*_facet" type="string" indexed="true" stored="true" multiValued="true"/>
- <dynamicField name="*_id" type="string" indexed="true" stored="true" termVectors="true"/>
+ <!-- a single-valued string with a convenient name -->
+ <dynamicField name="*_id" type="string" indexed="true" stored="true" termVectors="true" multiValued="false"/>
<dynamicField name="random*" type="random" />
@@ -406,16 +407,22 @@
<!-- copyField commands copy one field to another at the time a document
is added to the index. It's used either to index the same field differently,
or to add multiple fields to the same field for easier/faster searching. -->
- <copyField source="title" dest="text"/>
+
+ <!-- <copyField source="title" dest="text"/>
<copyField source="title" dest="titleSort"/>
<copyField source="title" dest="alphaTitleSort"/>
- <copyField source="title" dest="spell"/>
+ <copyField source="title" dest="spell"/> -->
- <copyField source="*_t" dest="text"/>
+ <!-- make these string based fields more searchable -->
+ <copyField source="*_facet" dest="text"/>
<copyField source="*_s" dest="text"/>
<copyField source="*_mvs" dest="text"/>
+ <!-- <copyField source="*_t" dest="text"/>
+ <copyField source="*_s" dest="text"/>
+ <copyField source="*_mvs" dest="text"/> -->
+
<!-- Similarity is the scoring routine for each document vs. a query.
A custom similarity may be specified here, but the default is fine
for most applications. -->
@@ -5,13 +5,12 @@ def index
end
# Controller action: stores in @response the result of
# Swinburne.find_by_poem_title_id for the :poem_title_id request parameter.
def poem
- @response = Swinburne.find_by_poem_slug params[:poem_slug]
+ @response = Swinburne.find_by_poem_title_id params[:poem_title_id]
end
def poem_page
- @response = Swinburne.find_by_local_id params[:local_id]
- doc = @response.docs.first
- @relatives = Swinburne.find :fq => [%(collapse_id:"#{doc[:collapse_id]}"), %(poem_title_facet:"#{doc[:poem_title_facet]}")], :rows => 999999
+ @response = Swinburne.find_by_local_id params[:poem_title_id]
+ @relatives = Swinburne.find_relatives_of @response.docs.first
end
end
@@ -7,25 +7,25 @@ def self.find input_params
:q => input_params[:q],
:qt => "dismax",
:fq => %(collection_id:"swinburne"),
- 'facet.field' => ['poem_title_facet'],
+ 'facet.field' => ['poem_title_s'],
'facet' => true,
'facet.mincount' => 1,
:rows => 2_000_000_000,
'hl' => 'true',
'hl.fl' => 'xml_t',
'hl.fragsize' => 100,
- :fl => 'id,score,poem_title_facet,local_id,page_s'
+ :fl => 'id,score,poem_title_s,local_id,page_number_s'
}.merge(input_params)
connection.find search_params
end
- def self.find_by_poem_slug slug
+ def self.find_by_poem_title_id title_id
connection.find(
- :q=>%(poem_slug_s:"#{slug}"),
+ :q=>%(poem_title_id:"#{poem_title_id}"),
:fq => %(collection_id:"swinburne"),
:rows => 2_000_000_000,
'facet' => true,
- 'facet.field' => ['variant_facet'],
+ 'facet.field' => ['variant_s'],
'facet' => true,
'facet.mincount' => 1
)
@@ -35,4 +35,9 @@ def self.find_by_local_id local_id
connection.find :q => %(id:"swinburne-#{local_id}"), :rows => 1
end
+ # think "more like this"...
+ def self.find_relatives_of solr_doc
+ Swinburne.find :fq => [%(collapse_id:"#{solr_doc[:collapse_id]}"), %(poem_title_facet:"#{solr_doc[:poem_title_facet]}")], :rows => 999999
+ end
+
end
View
@@ -3,7 +3,16 @@
map.root :controller => 'pages', :action => 'index'
map.swinburne '/swinburne', :controller => 'swinburne', :action => 'index'
- map.swinburne_poem '/swinburne/:poem_slug', :controller => 'swinburne', :action => 'poem'
- map.swinburne_poem_page '/swinburne/:poem_slug/:local_id', :controller => 'swinburne', :action => 'poem_page'
+ map.swinburne_poem '/swinburne/:poem_title_id', :controller => 'swinburne', :action => 'poem'
+ map.swinburne_poem_page '/swinburne/:poem_title_id/:local_id', :controller => 'swinburne', :action => 'poem_page'
+
+ # /swinburne
+ # /swinburne/:variant_id
+ # /swinburne/:variant_id/:poem_id
+ # /swinburne/:variant_id/:poem_id/:page_number
+
+ # poem, all variants -- used for comparisons/text-diffs
+ # /swinburne/:poem_id
+ # /swinburne/:poem_id/:page_number
end
@@ -1,5 +1,7 @@
require 'raven'
require 'nokogiri_fragmenter'
+
+# string_ext brings in the to_slug method for strings
require 'string_ext'
class SwinburneMapper
@@ -17,12 +19,19 @@ def shared_fields
@shared_fields ||= (
fname = File.basename(xml_file)
{
+ # the string id of the entire swinburne collection... represents all poems, all variants, all pages etc..
:collection_id => collection_id,
+ # the file path where this info came from
:file_s => xml_file.sub("#{Rails.root}/", ''),
+ # the file-name
:filename_s => fname,
- :variant_facet => variant_id,
+ # the variant (better name?) which currently comes from the file name
+ :variant_s => variant_id,
+ # used to tie similar results together -- a source file's contents should be grouped together using this
:collapse_id => "#{collection_id}-#{variant_id}",
+ # the friendly title of this collection
:collection_title_t => xml.at('//sourceDesc/citnstruct/title').text,
+
:author_t => xml.at('//citnstruct/author').text,
:publisher_t => xml.at('//citnstruct/imprint/publisher').text,
:printer_t => xml.at('//citnstruct/imprint/printer').text,
@@ -42,9 +51,11 @@ def map &block
xml.search('//text').each do |text|
# create a title for the poem
poem_title = text['n'].nil? ? 'n/a' : text['n']
+
+ poem_id = poem_title.to_slug
+
puts "\n** processing new poem... #{poem_title}\n"
# individual pages broken up by tei pb tags....
-
NokogiriFragmenter.fragment(text, 'pb') do |page_fragment|
pb = page_fragment.at('pb')
@@ -57,19 +68,33 @@ def map &block
# the page number label
page_num = pb ? page_fragment.at('pb')['n'].scan(/[0-9]+/).first : 'n/a'
- # the actual page break solr document
+
+ # the TEI page-break solr document id
local_id = "#{variant_id}-#{doc_index}"
+
yield shared_fields.merge({
+ # absolute id, unique to every solr document
:id => "#{collection_id}-#{local_id}",
+ # a short, unique id, local to this collection's poem
:local_id => local_id,
+ # used for displaying/transforming the raw xml
:xml_s => page_fragment.to_xml,
+
+ # raw xml within text field -- seems to work well for source highlighting?
+ :xml_source_t => page_fragment.to_xml,
+
+ # the xml *text only*, used for highlighting and searching
:xml_t => page_fragment.text,
+ # push the xml text into the main "text" field for easy searching
:text => page_fragment.text,
- :poem_title_t => poem_title,
- :poem_title_facet => poem_title,
- :poem_slug_s => poem_title.to_slug,
- :title => "#{poem_title}, Page #{page_num}",
- :page_s => page_num,
+ # the poem title, stored as a facet
+ :poem_title_s => poem_title,
+ # the poem title, transformed into a url friendly value
+ :poem_title_id => poem_id,
+ # this solr document title
+ :title => "#{poem_title}, p. #{page_num}",
+ # the page number of this poem fragment
+ :page_number_s => page_num,
})
doc_index += 1
puts "..."

0 comments on commit 3209040

Please sign in to comment.