# story_parser.rb
# Parse stories from other websites and uploaded files, looking for metadata to harvest
# and put into the archive.
#
class StoryParser
  require 'timeout'
  require 'nokogiri'
  require 'mechanize'
  require 'open-uri'

  include HtmlCleaner

  META_PATTERNS = {
    :title => 'Title',
    :notes => 'Note',
    :summary => 'Summary',
    :freeform_string => 'Tag',
    :fandom_string => 'Fandom',
    :rating_string => 'Rating',
    :warning_string => 'Warning',
    :relationship_string => 'Relationship|Pairing',
    :character_string => 'Character',
    :revised_at => 'Date|Posted|Posted on|Posted at',
    :chapter_title => 'Chapter Title'
  }
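  # For example, a line like "Fandom: Highlander" in the downloaded text
  # sets :fandom_string => "Highlander" (see scan_text_for_meta below).
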
  # Use this for raising custom error messages
  # (so that we can distinguish them from unexpected exceptions due to
  # faulty code)
  class Error < StandardError
  end

  # These attributes need to be moved from the work to the chapter
  # format: {:work_attribute_name => :chapter_attribute_name} (can be the same)
  CHAPTER_ATTRIBUTES_ONLY = {}

  # These attributes need to be copied from the work to the chapter
  CHAPTER_ATTRIBUTES_ALSO = {:revised_at => :published_at}

  ### NOTE ON KNOWN SOURCES
  # Matching stops at the first entry in these lists that matches, so put
  # more-specific patterns towards the front of each list.

  # places for which we have a custom parse_story_from_[source] method
  # for getting information out of the downloaded text
  KNOWN_STORY_PARSERS = %w(deviantart dw lj lotrfanfiction twilightarchives)

  # places for which we have a custom parse_author_from_[source] method
  # which returns an external_author object including an email address
  KNOWN_AUTHOR_PARSERS = %w(lj minotaur)

  # places for which we have a download_from_[source] method
  # used to customize the downloading process
  KNOWN_STORY_LOCATIONS = %w(lj)

  # places for which we have a download_chaptered_from_[source] method
  # to get a set of chapters all together
  CHAPTERED_STORY_LOCATIONS = %w(ffnet thearchive_net efiction)

  # regular expressions to match against the URLs
  SOURCE_LJ = '((live|dead|insane)?journal(fen)?\.com)|dreamwidth\.org'
  SOURCE_DW = 'dreamwidth\.org'
  SOURCE_FFNET = '(^|[^A-Za-z0-9-])fanfiction\.net'
  SOURCE_MINOTAUR = '(bigguns|firstdown)\.slashdom\.net'
  SOURCE_DEVIANTART = 'deviantart\.com'
  SOURCE_LOTRFANFICTION = 'lotrfanfiction\.com'
  SOURCE_TWILIGHTARCHIVES = 'twilightarchives\.com'
  SOURCE_THEARCHIVE_NET = 'the\-archive\.net'
  SOURCE_EFICTION = 'viewstory\.php'

  # time out if we can't download fast enough
  STORY_DOWNLOAD_TIMEOUT = 60
  MAX_CHAPTER_COUNT = 200

  # To check for duplicate chapters, take a slice this long out of the story
  # (in characters)
  DUPLICATE_CHAPTER_LENGTH = 10000

  # Import many stories
  def import_from_urls(urls, options = {})
    # Try to get the works
    works = []
    failed_urls = []
    errors = []
    urls.each do |url|
      begin
        work = download_and_parse_story(url, options)
        if work && work.save
          work.chapters.each {|chap| chap.save}
          works << work
        else
          failed_urls << url
          # guard against work being nil here, since accessing errors on nil would raise
          errors << (work ? work.errors.values.join(", ") : "Work could not be parsed")
          work.delete if work
        end
      rescue Timeout::Error
        failed_urls << url
        errors << "Import has timed out. This may be due to connectivity problems with the source site. Please try again in a few minutes, or check Known Issues to see if there are import problems with this site."
        work.delete if work
      rescue Error => exception
        failed_urls << url
        errors << "We couldn't successfully import that work, sorry: #{exception.message}"
        work.delete if work
      end
    end
    return [works, failed_urls, errors]
  end

  ### DOWNLOAD-AND-PARSE WRAPPERS

  # General pathway for story importing:
  #
  # Starting points:
  # - import_from_urls --> repeatedly calls download_and_parse_story
  # - download_and_parse_story
  # - download_and_parse_chapters_into_story
  # - (download_and_parse_chapter_of_work -- requires an existing work)
  #
  # Each of these will download the content and then hand it off to a parser.
  #
  # Parsers:
  # - parse_story: for a work of one single chapter downloaded as a single text string
  # - parse_chapters_into_story: for a work of multiple chapters downloaded as an array
  #   of text strings (the separate chapter contents)
  # - parse_chapter_of_work: essentially duplicates parse_story, but turns the content
  #   into a chapter of an existing work
  #
  # All of these parsers then go into
  # - parse_common: processes a single text string, cleaning up HTML and looking for meta information
  # - sanitize_params: after processing, clean up the params and strip out bad HTML
  #
  # If the story is from a known source, parse_common hands off to a custom parser built
  # just for that source, eg parse_story_from_lj. If the source is not known, it falls
  # back on parse_story_from_unknown.
  #
  # The various parsers use different methods to collect up metadata, and generically we also use:
  # - scan_text_for_meta: looks for text patterns like [metaname]: [value], eg "Fandom: Highlander"
  #
  # Shared options:
  #
  # :do_not_set_current_author - true means do not save the current user as an author
  # :importing_for_others - true means try and add an external author for the work
  # :pseuds - a list of pseuds to set as authors
  # :set_tags, :fandom, :rating, :warning, :character, :relationship - sets these tags
  # :override_tags - set tag values even if some were parsed out of the work
  # :post_without_preview - if true, mark the story as posted without previewing
  #
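  # A hypothetical usage sketch (the URL and pseud variable are made up for
  # illustration; the option names are from the list above):
  #
  #   works, failed_urls, errors = StoryParser.new.import_from_urls(
  #     ["http://example-archive.net/viewstory.php?sid=123"],
  #     :pseuds => [some_pseud], :post_without_preview => true
  #   )
  #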
  # Downloads a story and passes it on to the parser.
  # If the URL of the story is from a site for which we have special rules
  # (eg, when downloading from a livejournal clone you want to use ?format=light
  # to get a nice and consistent post format), it will pre-process the url
  # according to the rules for that site.
  def download_and_parse_story(location, options = {})
    check_for_previous_import(location)
    work = nil
    source = get_source_if_known(CHAPTERED_STORY_LOCATIONS, location)
    if source.nil?
      story = download_text(location)
      work = parse_story(story, location, options)
    else
      work = download_and_parse_chaptered_story(source, location, options)
    end
    return work
  end

  # download and add a new chapter to the end of a work
  def download_and_parse_chapter_of_work(work, location, options = {})
    chapter_content = download_text(location)
    return parse_chapter_of_work(work, chapter_content, location, options)
  end

  # Given an array of urls for chapters of a single story,
  # download them all and combine into a single work
  def download_and_parse_chapters_into_story(locations, options = {})
    check_for_previous_import(locations.first)
    chapter_contents = []
    locations.each do |location|
      chapter_contents << download_text(location)
    end
    return parse_chapters_into_story(locations.first, chapter_contents, options)
  end

  ### PARSING METHODS

  # Parses the text of a story, optionally from a given location.
  def parse_story(story, location, options = {})
    work_params = parse_common(story, location, options[:encoding])
    # move any attributes from work to chapter if necessary
    return set_work_attributes(Work.new(work_params), location, options)
  end

  # parses and adds a new chapter to the end of the work
  def parse_chapter_of_work(work, chapter_content, location, options = {})
    tmp_work_params = parse_common(chapter_content, location, options[:encoding])
    chapter = get_chapter_from_work_params(tmp_work_params)
    work.chapters << set_chapter_attributes(work, chapter, location, options)
    return work
  end

  def parse_chapters_into_story(location, chapter_contents, options = {})
    work = nil
    chapter_contents.each do |content|
      work_params = parse_common(content, location, options[:encoding])
      if work.nil?
        # create the new work
        work = Work.new(work_params)
      else
        new_chapter = get_chapter_from_work_params(work_params)
        work.chapters << set_chapter_attributes(work, new_chapter, location, options)
      end
    end
    return set_work_attributes(work, location, options)
  end

  # tries to create an external author for a given url
  def parse_author(location, external_author_name, external_author_email)
    # If the external author email is present (an archivist importing from somewhere
    # not supported for automatic author grabbing will have a value there), just pass
    # the values on to create or find the external author. Otherwise continue as usual.
    # Stephanie 8-1-2013
    # might want to add a check for the external author name here as well, steph 12/10/2013
    if external_author_email.present?
      return parse_author_common(external_author_email, external_author_name)
    else
      source = get_source_if_known(KNOWN_AUTHOR_PARSERS, location)
      if !source.nil?
        return send("parse_author_from_#{source.downcase}", location)
      end
      return parse_author_from_unknown(location)
    end
  end

  # Everything below here is protected and should not be touched by outside
  # code -- please use the above functions to parse external works.
  #protected

  # download an entire story from an archive type where we know how to parse multi-chaptered works
  # this should only be called from download_and_parse_story
  def download_and_parse_chaptered_story(source, location, options = {})
    chapter_contents = send("download_chaptered_from_#{source.downcase}", location)
    return parse_chapters_into_story(location, chapter_contents, options)
  end

  # our custom url finder checks for a previously imported URL in almost any format it may have been presented
  def check_for_previous_import(location)
    if Work.find_by_url(location).present?
      raise Error, "A work has already been imported from #{location}."
    end
  end

  def set_chapter_attributes(work, chapter, location, options = {})
    chapter.position = work.chapters.length + 1
    chapter.posted = true # if options[:post_without_preview]
    return chapter
  end

  def set_work_attributes(work, location = "", options = {})
    raise Error, "Work could not be downloaded" if work.nil?
    work.imported_from_url = location
    work.expected_number_of_chapters = work.chapters.length

    # set authors for the work
    pseuds = []
    pseuds << User.current_user.default_pseud unless options[:do_not_set_current_author] || User.current_user.nil?
    pseuds << options[:archivist].default_pseud if options[:archivist]
    pseuds += options[:pseuds] if options[:pseuds]
    pseuds = pseuds.uniq
    raise Error, "A work must have at least one author specified" if pseuds.empty?
    pseuds.each do |pseud|
      work.pseuds << pseud unless work.pseuds.include?(pseud)
      work.chapters.each {|chapter| chapter.pseuds << pseud unless chapter.pseuds.include?(pseud)}
    end

    # handle importing works for others:
    # build an external creatorship for each author
    if options[:importing_for_others]
      external_author_names = options[:external_author_names] || parse_author(location, options[:external_author_name], options[:external_author_email])
      # convert to an array if not already one
      external_author_names = [external_author_names] if external_author_names.is_a?(ExternalAuthorName)
      if options[:external_coauthor_name].present?
        external_author_names << parse_author(location, options[:external_coauthor_name], options[:external_coauthor_email])
      end
      external_author_names.each do |external_author_name|
        if external_author_name && external_author_name.external_author
          if external_author_name.external_author.do_not_import
            # we're not allowed to import works from this address
            raise Error, "Author #{external_author_name.name} at #{external_author_name.external_author.email} does not allow importing their work to this archive."
          end
          work.external_creatorships.build(external_author_name: external_author_name, archivist: (options[:archivist] || User.current_user))
        end
      end
    end

    # lock to registered users if specified or importing for others
    work.restricted = options[:restricted] || options[:importing_for_others] || false

    # set default values for required tags for any works that don't have them
    work.fandom_string = (options[:fandom].blank? ? ArchiveConfig.FANDOM_NO_TAG_NAME : options[:fandom]) if (options[:override_tags] || work.fandoms.empty?)
    work.rating_string = (options[:rating].blank? ? ArchiveConfig.RATING_DEFAULT_TAG_NAME : options[:rating]) if (options[:override_tags] || work.ratings.empty?)
    work.warning_strings = (options[:warning].blank? ? ArchiveConfig.WARNING_DEFAULT_TAG_NAME : options[:warning]) if (options[:override_tags] || work.warnings.empty?)
    work.category_string = options[:category] if !options[:category].blank? && (options[:override_tags] || work.categories.empty?)
    work.character_string = options[:character] if !options[:character].blank? && (options[:override_tags] || work.characters.empty?)
    work.relationship_string = options[:relationship] if !options[:relationship].blank? && (options[:override_tags] || work.relationships.empty?)
    work.freeform_string = options[:freeform] if !options[:freeform].blank? && (options[:override_tags] || work.freeforms.empty?)
    work.summary = options[:summary] if !options[:summary].blank?

    # set collection name if present
    work.collection_names = get_collection_names(options[:collection_names]) if !options[:collection_names].blank?

    # set default language (English)
    work.language_id = options[:language_id] || Language.default.id

    # set default value for title
    work.title = "Untitled Imported Work" if work.title.blank?

    work.posted = true if options[:post_without_preview]
    work.chapters.each do |chapter|
      if chapter.content.length > ArchiveConfig.CONTENT_MAX
        # TODO: eventually: insert a new chapter
        chapter.content.truncate(ArchiveConfig.CONTENT_MAX, :omission => "<strong>WARNING: import truncated automatically because chapter was too long! Please add a new chapter for remaining content.</strong>", :separator => "</p>")
      end
      chapter.posted = true
      # ack! causing the chapters to exist even if work doesn't get created!
      # chapter.save
    end
    return work
  end

  def parse_author_from_lj(location)
    if location.match(/^(http:\/\/)?([^\.]*)\.(livejournal\.com|dreamwidth\.org|insanejournal\.com|journalfen\.net)/)
      email = name = ""
      lj_name = $2
      site_name = $3
      if lj_name == "community"
        # whups
        post_text = download_text(location)
        doc = Nokogiri.parse(post_text)
        lj_name = doc.xpath("/html/body/div[2]/div/div/div/table/tbody/tr/td[2]/span/a[2]/b").content
      end
      profile_url = "http://#{lj_name}.#{site_name}/profile"
      lj_profile = download_text(profile_url)
      doc = Nokogiri.parse(lj_profile)
      contact = doc.css('div.contact').inner_html
      contact.gsub!('<p class="section_body_title">Contact:</p>', "")
      contact.gsub!(/<\/?(span|i)>/, "")
      contact.gsub!(/\n/, "")
      contact.gsub!(/<br\/>/, "")
      if contact.match(/(.*@.*\..*)/)
        email = $1
      end
      if email.blank?
        email = "#{lj_name}@#{site_name}"
      end
      return parse_author_common(email, lj_name)
    end
  end

  def parse_author_from_unknown(location)
    # for now, nothing
    return nil
  end

  # custom author parser for the whitfic and grahamslash archives we're rescuing
  # known problem: this will only find the first author for a given story, not coauthors
  def parse_author_from_minotaur(location)
    # get the index page of the archive
    # and the relative link for the story we are downloading
    if location =~ /firstdown/
      author_index = download_text("http://firstdown.slashdom.net/authors.html")
      storylink = location.gsub("http://firstdown.slashdom.net/", "")
    elsif location =~ /bigguns/
      author_index = download_text("http://bigguns.slashdom.net/stories/authors.html")
      storylink = location.gsub("http://bigguns.slashdom.net/stories/", "")
    end
    doc = Nokogiri.parse(author_index)
    # find the author just before the story
    # (declare authornode outside the block so the assignment inside it is visible afterwards)
    authornode = nil
    doc.search("a").each do |node|
      if node[:href] =~ /mailto:(.*)/
        authornode = node
      end
      if node[:href] == storylink
        # the last-found authornode is the right one
        break
      end
    end
    email = authornode[:href].gsub("mailto:", '')
    name = authornode.inner_text
    return parse_author_common(email, name)
  end

  def parse_author_common(email, name)
    # convert to ASCII and strip out invalid characters (everything except
    # alphanumeric characters, spaces, _, @, . and -)
    name = name.to_ascii.gsub(/[^\w[ \-@\.]]/u, "")
    external_author = ExternalAuthor.find_or_create_by_email(email)
    unless name.blank?
      external_author_name = ExternalAuthorName.where(name: name, external_author_id: external_author.id).first ||
                             ExternalAuthorName.new(name: name)
      external_author.external_author_names << external_author_name
      external_author.save
    end
    external_author_name || external_author.default_name
  end
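  # e.g. parse_author_common("jo@example.com", "Jo Writer!") strips the "!"
  # from the name and returns an ExternalAuthorName tied to that email address
  # (the address and name here are made up for illustration).
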
  def get_chapter_from_work_params(work_params)
    @chapter = Chapter.new(work_params[:chapter_attributes])
    # don't override specific chapter params (eg title) with work params
    chapter_params = work_params.delete_if {|name, param| !@chapter.attribute_names.include?(name.to_s) || !@chapter.send(name.to_s).blank?}
    @chapter.update_attributes(chapter_params)
    return @chapter
  end

  def download_text(location)
    story = ""
    source = get_source_if_known(KNOWN_STORY_LOCATIONS, location)
    if source.nil?
      story = download_with_timeout(location)
    else
      story = send("download_from_#{source.downcase}", location)
    end
    return story
  end

  # canonicalize the url for downloading from lj or clones
  def download_from_lj(location)
    # dup so the in-place gsub! calls below don't mutate the caller's string
    url = location.dup
    url.gsub!(/\#(.*)$/, "") # strip off any anchor information
    url.gsub!(/\?(.*)$/, "") # strip off any existing params at the end
    url.gsub!('_', '-') # convert underscores in usernames to hyphens
    url += "?format=light" # go to light format
    text = download_with_timeout(url)
    if text.match(/adult_check/)
      Timeout::timeout(STORY_DOWNLOAD_TIMEOUT) {
        begin
          agent = Mechanize.new
          form = url.include?("dreamwidth") ? agent.get(url).forms.first : agent.get(url).forms.third
          page = agent.submit(form, form.buttons.first) # submits the adult content confirmation form
          text = page.body.force_encoding(agent.page.encoding)
        rescue
          text = ""
        end
      }
    end
    return text
  end
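  # e.g. "http://some_user.livejournal.com/1234.html#cutid1" is canonicalized
  # to "http://some-user.livejournal.com/1234.html?format=light" before download.
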
  # grab all the chapters of the story from ff.net
  def download_chaptered_from_ffnet(location)
    raise Error, "Sorry, Fanfiction.net does not allow imports from their site."
  end

  # this is an efiction archive, but it doesn't handle chapters normally;
  # the best way to handle it is to get the printable version of the full story.
  # We have to make it a download-chaptered method because otherwise it gets sent to the
  # generic efiction version, since chaptered sources are checked first.
  def download_chaptered_from_thearchive_net(location)
    if location.match(/^(.*)\/.*viewstory\.php.*[^p]sid=(\d+)($|&)/i)
      location = "#{$1}/viewstory.php?action=printable&psid=#{$2}"
    end
    text = download_with_timeout(location)
    text.sub!('</style>', '</style></head>') unless text.match('</head>')
    return [text]
  end

  # grab all the chapters of a story from an efiction-based site
  def download_chaptered_from_efiction(location)
    @chapter_contents = []
    if location.match(/^(.*)\/.*viewstory\.php.*sid=(\d+)($|&)/i)
      site = $1
      storyid = $2
      chapnum = 1
      last_body = ""
      Timeout::timeout(STORY_DOWNLOAD_TIMEOUT) {
        loop do
          url = "#{site}/viewstory.php?action=printable&sid=#{storyid}&chapter=#{chapnum}"
          body = download_with_timeout(url)
          # check for nil before slicing, since slicing nil would raise
          break if body.nil?
          # get a section to check that this isn't a duplicate of the previous chapter
          body_to_check = body.slice(10, DUPLICATE_CHAPTER_LENGTH)
          if body_to_check == last_body || chapnum > MAX_CHAPTER_COUNT || body.match(/<div class='chaptertitle'> by <\/div>/) || body.match(/Access denied./) || body.match(/Chapter : /)
            break
          end
          # save the value to check for duplicate chapters
          last_body = body_to_check
          # clean up the broken head in many efiction printable pages
          body.sub!('</style>', '</style></head>') unless body.match('</head>')
          @chapter_contents << body
          chapnum = chapnum + 1
        end
      }
    end
    return @chapter_contents
  end
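  # e.g. for "http://example-efiction.net/viewstory.php?sid=42" (a made-up address)
  # this fetches .../viewstory.php?action=printable&sid=42&chapter=1, then chapter=2,
  # and so on until a stop condition (duplicate body, error page, or MAX_CHAPTER_COUNT) hits.
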
  # This is the heavy lifter, invoked by all the story and chapter parsers.
  # It takes a single string containing the raw contents of a story, parses it with
  # Nokogiri into the @doc object, and then calls a subparser.
  #
  # If the story source can be identified as one of the sources we know how to parse in some
  # custom/special way, parse_common calls the customized parse_story_from_[source] method.
  # Otherwise, it falls back to parse_story_from_unknown.
  #
  # This produces a hash equivalent to the params hash that is normally created by the standard work
  # upload form.
  #
  # parse_common then calls sanitize_params (which would also be called on the standard work upload
  # form results) and returns the final sanitized hash.
  #
  def parse_common(story, location = nil, encoding = nil)
    work_params = { :title => "UPLOADED WORK", :chapter_attributes => {:content => ""} }
    @doc = Nokogiri::HTML.parse(story, nil, encoding) rescue ""

    # Try to convert all relative links to absolute
    # (guard against a nil location, since this method can be called without one)
    base = @doc.at_css('base') ? @doc.at_css('base')['href'] : (location && location.split('?').first)
    if base.present?
      @doc.css('a').each do |link|
        if link['href'].present?
          begin
            query = link['href'].match(/(\?.*)$/) ? $1 : ''
            link['href'] = URI.join(base, link['href'].gsub(/(\?.*)$/, '')).to_s + query
          rescue
          end
        end
      end
    end

    if location && (source = get_source_if_known(KNOWN_STORY_PARSERS, location))
      params = send("parse_story_from_#{source.downcase}", story)
      work_params.merge!(params)
    else
      work_params.merge!(parse_story_from_unknown(story))
    end
    return shift_chapter_attributes(sanitize_params(work_params))
  end

  # our fallback: parse a story from an unknown source, so we have no special
  # rules.
  def parse_story_from_unknown(story)
    work_params = {:chapter_attributes => {}}
    storyhead = @doc.css("head").inner_html unless @doc.css("head").empty?
    # Story content - look for progressively less specific containers, or grab everything
    element = @doc.at_css('.chapter-content') || @doc.at_css('body') || @doc.at_css('html') || @doc
    storytext = element.inner_html
    meta = {}
    meta.merge!(scan_text_for_meta(storyhead)) unless storyhead.blank?
    meta.merge!(scan_text_for_meta(story))
    meta[:title] ||= @doc.css('title').inner_html
    work_params[:chapter_attributes][:title] = meta.delete(:chapter_title)
    work_params[:chapter_attributes][:content] = clean_storytext(storytext)
    work_params.merge!(meta)
    work_params
  end

  # Parses a story from livejournal or a livejournal equivalent (eg, dreamwidth, insanejournal).
  # Assumes that we have downloaded the story from one of those equivalents (ie, we've downloaded
  # it in format=light, which is a stripped-down plaintext version.)
  #
  def parse_story_from_lj(story)
    work_params = {:chapter_attributes => {}}
    # in LJ "light" format, the story contents are in the second div
    # inside the body.
    body = @doc.css("body")
    storytext = body.css("article.b-singlepost-body").inner_html
    storytext = body.inner_html if storytext.empty?
    # cleanup the text
    # storytext.gsub!(/<br\s*\/?>/i, "\n") # replace the breaks with newlines
    storytext = clean_storytext(storytext)
    work_params[:chapter_attributes][:content] = storytext
    work_params[:title] = @doc.css("title").inner_html
    work_params[:title].gsub!(/^[^:]+: /, "")
    work_params.merge!(scan_text_for_meta(storytext))
    date = @doc.css("time.b-singlepost-author-date")
    unless date.empty?
      work_params[:revised_at] = convert_revised_at(date.first.inner_text)
    end
    return work_params
  end

  def parse_story_from_dw(story)
    work_params = {:chapter_attributes => {}}
    body = @doc.css("body")
    content_divs = body.css("div.contents")
    unless content_divs[0].nil?
      # Get rid of the DW metadata table
      content_divs[0].css("div.currents, ul.entry-management-links, div.header.inner, span.restrictions, h3.entry-title").each do |node|
        node.remove
      end
      storytext = content_divs[0].inner_html
    else
      storytext = body.inner_html
    end
    # cleanup the text
    # storytext.gsub!(/<br\s*\/?>/i, "\n") # replace the breaks with newlines
    storytext = clean_storytext(storytext)
    work_params[:chapter_attributes][:content] = storytext
    work_params[:title] = @doc.css("title").inner_html
    work_params[:title].gsub!(/^[^:]+: /, "")
    work_params.merge!(scan_text_for_meta(storytext))
    font_blocks = @doc.xpath('//font')
    unless font_blocks.empty?
      date = font_blocks.first.inner_text
      work_params[:revised_at] = convert_revised_at(date)
    end
    # get the date from the date span, but don't clobber the font-block date
    # with an empty value if the span is missing
    date = @doc.css("span.date").inner_text
    work_params[:revised_at] = convert_revised_at(date) unless date.blank?
    return work_params
  end

  def parse_story_from_deviantart(story)
    work_params = {:chapter_attributes => {}}
    storytext = ""
    notes = ""
    body = @doc.css("body")
    title = @doc.css("title").inner_html.gsub(/\s*on deviantart$/i, "")

    # Find the image (original size) if it's art
    image_full = body.css("div.dev-view-deviation img.dev-content-full")
    unless image_full[0].nil?
      storytext = "<center><img src=\"#{image_full[0]["src"]}\"></center>"
    end

    # Find the fic text if it's fic (needs the id for disambiguation; the "deviantART loves you" bit in the footer has the same class path)
    text_table = body.css(".grf-indent > div:nth-child(1)")[0]
    unless text_table.nil?
      # Try to remove some metadata (title and author) from the work's text, if possible
      # Try to remove the title: if it exists, and if it's the same as the browser title
      # (escape the h1 text so regexp metacharacters in it don't break the match)
      if text_table.css("h1")[0].present? && title && title.match(Regexp.escape(text_table.css("h1")[0].text))
        text_table.css("h1")[0].remove
      end
      # Try to remove the author: if it exists, and if it follows a certain pattern
      if text_table.css("small")[0].present? && text_table.css("small")[0].inner_html.match(/by ~.*?<a class="u" href=/m)
        text_table.css("small")[0].remove
      end
      storytext = text_table.inner_html
    end

    # cleanup the text
    storytext.gsub!(/<br\s*\/?>/i, "\n") # replace the breaks with newlines
    storytext = clean_storytext(storytext)
    work_params[:chapter_attributes][:content] = storytext

    # Find the notes
    content_divs = body.css("div.text-ctrl div.text")
    unless content_divs[0].nil?
      notes = content_divs[0].inner_html
    end
    # cleanup the notes
    notes.gsub!(/<br\s*\/?>/i, "\n") # replace the breaks with newlines
    notes = clean_storytext(notes)
    work_params[:notes] = notes
    work_params.merge!(scan_text_for_meta(notes))

    work_params[:title] = title
    body.css("div.dev-title-container h1 a").each do |node|
      if node["class"] != "u"
        work_params[:title] = node.inner_html
      end
    end
    tags = []
    @doc.css("div.dev-about-cat-cc a.h").each { |node| tags << node.inner_html }
    work_params[:freeform_string] = clean_tags(tags.join(ArchiveConfig.DELIMITER_FOR_OUTPUT))
    details = @doc.css("div.dev-right-bar-content span[title]")
    unless details[0].nil?
      work_params[:revised_at] = convert_revised_at(details[0].inner_text)
    end
    return work_params
  end

  def parse_story_from_lotrfanfiction(story)
    work_params = parse_story_from_modified_efiction(story, "lotrfanfiction")
    work_params[:fandom_string] = "Lord of the Rings"
    work_params
  end

  def parse_story_from_twilightarchives(story)
    work_params = parse_story_from_modified_efiction(story, "twilightarchives")
    work_params[:fandom_string] = "Twilight"
    work_params
  end

  def parse_story_from_modified_efiction(story, site = "")
    work_params = {:chapter_attributes => {}}
    storytext = @doc.css("div.chapter").inner_html
    storytext = clean_storytext(storytext)
    work_params[:chapter_attributes][:content] = storytext
    work_params[:title] = @doc.css("html body div#pagetitle a").first.inner_text.strip
    work_params[:chapter_attributes][:title] = @doc.css(".chaptertitle").inner_text.gsub(/ by .*$/, '').strip

    # harvest data
    info = @doc.css(".infobox .content").inner_html
    if info.match(/Summary:.*?>(.*?)<br>/m)
      work_params[:summary] = clean_storytext($1)
    end
    infotext = @doc.css(".infobox .content").inner_text

    # Turn categories, genres, warnings into freeform tags
    tags = []
    if infotext.match(/Categories: (.*) Characters:/)
      tags += $1.split(',').map {|c| c.strip}.uniq unless $1 == "None"
    end
    if infotext.match(/Genres: (.*)Warnings/)
      tags += $1.split(',').map {|c| c.strip}.uniq unless $1 == "None"
    end
    if infotext.match(/Warnings: (.*)Challenges/)
      tags += $1.split(',').map {|c| c.strip}.uniq unless $1 == "None"
    end
    work_params[:freeform_string] = clean_tags(tags.join(ArchiveConfig.DELIMITER_FOR_OUTPUT))

    # use the last updated date as the revised_at date
    if site == "lotrfanfiction" && infotext.match(/Updated: (\d\d)\/(\d\d)\/(\d\d)/)
      # need yy/mm/dd to convert
      work_params[:revised_at] = convert_revised_at("#{$3}/#{$2}/#{$1}")
    elsif site == "twilightarchives" && infotext.match(/Updated: (.*)$/)
      work_params[:revised_at] = convert_revised_at($1)
    end

    # get characters
    if infotext.match(/Characters: (.*)Genres:/)
      work_params[:character_string] = $1.split(',').map {|c| c.strip}.uniq.join(',') unless $1 == "None"
    end

    # save the readcount (as an integer, so the comparison below works)
    readcount = 0
    if infotext.match(/Read: (\d+)/)
      readcount = $1.to_i
    end
    work_params[:notes] = (readcount == 0 ? "" : "<p>This work was imported from another site, where it had been read #{readcount} times.</p>")

    # story notes, chapter notes, end notes
    @doc.css(".notes").each do |note|
      if note.inner_html.match(/Story Notes/)
        work_params[:notes] += note.css('.noteinfo').inner_html
      elsif note.inner_html.match(/(Chapter|Author\'s) Notes/)
        work_params[:chapter_attributes][:notes] = note.css('.noteinfo').inner_html
      elsif note.inner_html.match(/End Notes/)
        work_params[:chapter_attributes][:endnotes] = note.css('.noteinfo').inner_html
      end
    end

    if infotext.match(/Completed: No/)
      work_params[:complete] = false
    else
      work_params[:complete] = true
    end
    return work_params
  end

  # Move and/or copy any meta attributes that need to be on the chapter rather
  # than on the work itself
  def shift_chapter_attributes(work_params)
    CHAPTER_ATTRIBUTES_ONLY.each_pair do |work_attrib, chapter_attrib|
      if work_params[work_attrib] && !work_params[:chapter_attributes][chapter_attrib]
        work_params[:chapter_attributes][chapter_attrib] = work_params[work_attrib]
        work_params.delete(work_attrib)
      end
    end
    # copy any attributes from work to chapter as necessary
    CHAPTER_ATTRIBUTES_ALSO.each_pair do |work_attrib, chapter_attrib|
      if work_params[work_attrib] && !work_params[:chapter_attributes][chapter_attrib]
        work_params[:chapter_attributes][chapter_attrib] = work_params[work_attrib]
      end
    end
    work_params
  end
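  # e.g. with CHAPTER_ATTRIBUTES_ALSO = {:revised_at => :published_at}, a parsed
  # :revised_at value is copied into chapter_attributes[:published_at] while
  # also staying on the work itself.
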
  # Find any cases of the given pieces of meta in the given text
  # and return a hash of meta values
  def scan_text_for_meta(text)
    # break up the text with some extra newlines to make matching more likely
    # and strip out some tags
    text = text.gsub(/<br/, "\n<br")
    text.gsub!(/<p/, "\n<p")
    text.gsub!(/<\/?(label|span|div|b)(.*?)?>/, '')
    meta = {}
    metapatterns = META_PATTERNS
    is_tag = Hash.new.tap do |h|
      %w(fandom_string relationship_string freeform_string rating_string warning_string).each do |c|
        h[c.to_sym] = true
      end
    end
    handler = Hash.new.tap do |h|
      %w(rating_string revised_at).each do |c|
        h[c.to_sym] = "convert_#{c.to_s.downcase}"
      end
    end
    # 1. Look for "Pattern: (whatever)", optionally followed by a closing p or div tag
    # 2. Set meta[:metaname] = whatever
    #    eg, if it finds "Fandom: Stargate SG-1" it will set meta[:fandom_string] = "Stargate SG-1"
    # 3. Call convert_<metaname> for cleanup if such a function is defined (eg convert_rating_string)
    metapatterns.each do |metaname, pattern|
      # note the double backslashes: in a double-quoted string, a single "\s" is
      # just a space character, not the whitespace class we want in the regexp
      metapattern = Regexp.new("(?:#{pattern}|#{pattern.pluralize})\\s*:\\s*(.*?)(?:</(?:p|div)>)?$", Regexp::IGNORECASE)
      if text.match(metapattern)
        value = $1
        value = clean_tags(value) if is_tag[metaname]
        value = clean_close_html_tags(value)
        value.strip! # lose leading/trailing whitespace
        value = send(handler[metaname], value) if handler[metaname]
        meta[metaname] = value
      end
    end
    return post_process_meta(meta)
  end
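  # For :fandom_string the constructed pattern is roughly
  # /(?:Fandom|Fandoms)\s*:\s*(.*?)(?:<\/(?:p|div)>)?$/i, so both
  # "Fandom: Highlander" and "<p>Fandoms: Highlander</p>" yield "Highlander".
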
  def download_with_timeout(location, limit = 10)
    story = ""
    Timeout::timeout(STORY_DOWNLOAD_TIMEOUT) {
      begin
        # we do a little cleanup here in case the user hasn't included the 'http://'
        # or if they've used capital letters or an underscore in the hostname
        uri = URI.parse(location)
        uri = URI.parse('http://' + location) if uri.class.name == "URI::Generic"
        uri.host.downcase!
        uri.host.gsub!(/_/, '-')
        response = Net::HTTP.get_response(uri)
        case response
        when Net::HTTPSuccess
          story = response.body
        when Net::HTTPRedirection
          if limit > 0
            story = download_with_timeout(response['location'], limit - 1)
          else
            nil
          end
        else
          nil
        end
      rescue Errno::ECONNREFUSED
        nil
      rescue SocketError
        nil
      rescue EOFError
        nil
      end
    }
    if story.blank?
      raise Error, "We couldn't download anything from #{location}. Please make sure that the URL is correct and complete, and try again."
    end
    # clean up any erroneously included string terminator (Issue 785)
    story.gsub!("\000", "")
    story
  end

  def get_last_modified(location)
    Timeout::timeout(STORY_DOWNLOAD_TIMEOUT) {
      resp = open(location)
      resp.last_modified
    }
  end

  def get_source_if_known(known_sources, location)
    known_sources.each do |source|
      pattern = Regexp.new(self.class.const_get("SOURCE_#{source.upcase}"), Regexp::IGNORECASE)
      if location.match(pattern)
        return source
      end
    end
    nil
  end

  def clean_close_html_tags(value)
    # if there are any closing html tags at the start of the value, let's ditch them
    value.gsub(/^(\s*<\/[^>]+>)+/, '')
  end
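  # e.g. clean_close_html_tags("</p></div> Some text") => " Some text"
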
  # We clean the text as if it had been submitted as the content of a chapter
  def clean_storytext(storytext)
    storytext = storytext.encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "") unless storytext.encoding.name == "UTF-8"
    return sanitize_value("content", storytext)
  end

  # works conservatively -- doesn't split on
  # spaces and truncates instead.
  def clean_tags(tags)
    tags = Sanitize.clean(tags) # no html allowed in tags
    if tags.match(/,/)
      tagslist = tags.split(/,/)
    else
      tagslist = [tags]
    end
    newlist = []
    tagslist.each do |tag|
      tag.gsub!(/[\*\<\>]/, '')
      tag = truncate_on_word_boundary(tag, ArchiveConfig.TAG_MAX)
      newlist << tag unless tag.blank?
    end
    return newlist.join(ArchiveConfig.DELIMITER_FOR_OUTPUT)
  end

  def truncate_on_word_boundary(text, max_length)
    return if text.blank?
    words = text.split()
    truncated = words.first
    if words.length > 1
      words[1..-1].each do |word|
        truncated += " " + word if truncated.length + word.length + 1 <= max_length
      end
    end
    truncated[0..max_length-1]
  end
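  # e.g. truncate_on_word_boundary("a quick brown fox", 10) => "a quick"
  # (words that would push past the limit are dropped rather than split)
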
  # convert space-separated tags to comma-separated
  def clean_and_split_tags(tags)
    if !tags.match(/,/) && tags.match(/\s/)
      tags = tags.split(/\s+/).join(',')
    end
    return clean_tags(tags)
  end

  # Convert the common ratings into whatever ratings we're
  # using on this archive.
  def convert_rating_string(rating)
    rating = rating.downcase
    if rating.match(/^(nc-?1[78]|x|ma|explicit)/)
      ArchiveConfig.RATING_EXPLICIT_TAG_NAME
    elsif rating.match(/^(r|m|mature)/)
      ArchiveConfig.RATING_MATURE_TAG_NAME
    elsif rating.match(/^(pg-?1[35]|t|teen)/)
      ArchiveConfig.RATING_TEEN_TAG_NAME
    elsif rating.match(/^(pg|g|k\+|k|general audiences)/)
      ArchiveConfig.RATING_GENERAL_TAG_NAME
    else
      ArchiveConfig.RATING_DEFAULT_TAG_NAME
    end
  end
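  # e.g. "NC-17", "X" and "Explicit" all map to the explicit rating tag,
  # while "K+", "PG" and "General Audiences" map to the general one.
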
  def convert_revised_at(date_string)
    begin
      date = nil
      if date_string.match(/^(\d+)$/)
        # probably seconds since the epoch; convert to a Date so the
        # comparison with Date.today below doesn't raise
        date = Time.at($1.to_i).to_date
      end
      date ||= Date.parse(date_string)
      return '' if date > Date.today
      return date
    rescue ArgumentError, TypeError
      return ''
    end
  end
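  # e.g. convert_revised_at("1325376000") treats the value as a Unix timestamp
  # (around 2012-01-01, depending on time zone); any date in the future comes back as ''.
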
  # Additional processing for meta -- currently to make sure warnings
  # that aren't Archive warnings become additional tags instead
  def post_process_meta(meta)
    if meta[:warning_string]
      new_warning = ''
      meta[:warning_string].split(/\s?,\s?/).each do |warning|