Skip to content

Commit

Permalink
Latest freelists were not being imported; Added middle name column
Browse files Browse the repository at this point in the history
  • Loading branch information
nigelhorne committed Apr 7, 2021
1 parent 221cd02 commit 5ccaec5
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 26 deletions.
2 changes: 2 additions & 0 deletions Changes
Expand Up @@ -3,6 +3,8 @@ Revision history for Genealogy::ObituaryDailyTimes
0.06
Reduce the amount of memory that bin/create_db uses
Error if 'last' is not given to search()
Latest freelists were not being imported
Added middle name column

0.05 Thu Mar 25 18:02:30 EDT 2021
2021 is v26 on freelists
Expand Down
47 changes: 27 additions & 20 deletions bin/create_db.PL
Expand Up @@ -80,14 +80,14 @@ die "$filename: $!" if(!defined($dbh));

$dbh->do('PRAGMA cache_size = -65536'); # 64MB
$dbh->do('PRAGMA journal_mode = OFF');
$dbh->do('CREATE TABLE obituaries(first VARCHAR, last VARCHAR NOT NULL, maiden VARCHAR, age INTEGER, place VARCHAR, newspaper VARCHAR NOT NULL, date DATE NOT NULL, url VARCHAR NOT NULL)');
# Create the obituaries table. Each column is declared as "name TYPE";
# a stray comma between a name and its type would make SQLite create an
# extra, untyped column called "VARCHAR".
$dbh->do('CREATE TABLE obituaries(first VARCHAR, middle VARCHAR, last VARCHAR NOT NULL, maiden VARCHAR, age INTEGER, place VARCHAR, newspaper VARCHAR NOT NULL, date DATE NOT NULL, url VARCHAR NOT NULL)');

my @queue;
my $page = 1;
# while(mlarchives($ua, $page)) {
# $page++;
# flush($dbh) if(scalar(@queue) > 200_000);
# };
while(mlarchives($ua, $page)) {
$page++;
flush($dbh) if(scalar(@queue) > 200_000);
};

print ' ' x 78, "\r";

Expand Down Expand Up @@ -251,52 +251,53 @@ sub freelists($$) {
# print $data;
# exit;

my @lines = split(/\n|<br>|<br \/>$/ms, $data);
my @lines = split(/<br>|<br \/>$/ms, $data);

use Data::Dumper;
print Data::Dumper->new([\@lines])->Dump();
exit;
my $rc;

my $stage = 0;
while(my $line = shift(@lines)) {
# LAST NAME, First Name (MAIDEN); Age; Place of Death; Newspaper Name; Newspaper date; tagname

$line =~ s/^\n//;
next if($line eq '');
if($line =~ /\-\-\-\-\-\-\-\-/) {
$stage++;
last if($stage >= 3);
next if($stage != 2);
$line =~ s/^.*\-+//;
next if($line eq '');
}
next if($stage != 2);
$line =~ s/&nbsp;/ /g;
$line =~ s/\x{a0}/ /g;
$line = decode_entities($line);
$line =~ s/'/''/g;
my ($name, $age, $place, $newspaper, $date, $tag) = split(/;\s?/, $line);
# print __LINE__, ": $name, $age, $place, $newspaper, $date, $tag\n";
print __LINE__, "($stage): $line\n";
if((!defined($tag)) || ($tag eq '')) {
my $cont = shift(@lines);

$cont =~ s/^\n//;
last if($cont =~ /^\-\-\-\-\-\-\-\-/);
$cont =~ s/&nbsp;/ /g;
$cont =~ s/\x{a0}/ /g;
$cont =~ s/&gt;/>/g;
$line =~ s/&quot;/"/g;
$line .= $cont;
($name, $age, $place, $newspaper, $date, $tag) = split(/;\s?/, $line);
}
use Data::Dumper;
# print "$line\n";
# print __LINE__, "($stage): $line: $name, $age, $place, $newspaper, $date, $tag\n";
# use Data::Dumper;
# print __LINE__, ": $issue: ", Data::Dumper->new([split(/;\s?/, $line)])->Dump();
next unless(defined($name));
my ($last, $first) = split(',\s', $name);
next unless(defined($last));
next unless(defined($first));

# Parsing completely broken
# e.g. https://www.freelists.org/post/obitdailytimes/Obituary-Daily-Times-v25no010
last unless(defined($newspaper));

print __LINE__, "\n";

my $maiden;
if($first) {
if($first =~ /(.+)\s\((.+)\)$/) {
Expand All @@ -308,8 +309,9 @@ exit;
}
# $age //= '';

print "$line->>>>>>>>>>\n",
"\"$last\",\"$first\",\"$maiden\",\"$age\",\"$place\",\"$date\",\"$newspaper\"\n" if($last =~ /^[A-Z]/);
# print "$line:\n",
# "\t\"$last\",\"$first\",\"$age\",\"$place\",\"$date\",\"$newspaper\"\n" if($last =~ /^[A-Z]/);
# # "\t\"$last\",\"$first\",\"$maiden\",\"$age\",\"$place\",\"$date\",\"$newspaper\"\n" if($last =~ /^[A-Z]/);
queue($first, normalise_name($last), $maiden, $age, $place, $date, $newspaper, $url) if($last =~ /^[A-Z]/);
$rc = 1;
}
Expand Down Expand Up @@ -352,16 +354,21 @@ sub flush($)

while(my $row = pop @queue) {
if(!defined($query)) {
$query = 'INSERT INTO obituaries(first, last, maiden, age, place, date, newspaper, url) VALUES (';
$query = 'INSERT INTO obituaries(first, middle, last, maiden, age, place, date, newspaper, url) VALUES (';
} else {
$query .= ',(';
}
my %columns = %{$row};

if($columns{'first'}) {
$query .= "'" . $columns{'first'} . "','" . $columns{'last'} . "',";
if($columns{'first'} =~ /(.+)\s(.+)/) {
# Has a middle name
$query .= "'$1','$2','" . $columns{'last'} . "',";
} else {
$query .= "'" . $columns{'first'} . "',NULL,'" . $columns{'last'} . "',";
}
} else {
$query .= "NULL,'" . $columns{'last'} . "',";
$query .= "NULL,NULL,'" . $columns{'last'} . "',";
}
if($columns{'maiden'} && ($columns{'maiden'} ne ' ')) {
$query .= "'" . $columns{'maiden'} . "',";
Expand Down
15 changes: 9 additions & 6 deletions t/search.t
Expand Up @@ -3,7 +3,7 @@
use strict;

use lib 'lib';
use Test::Most tests => 6;
use Test::Most tests => 8;
use lib 't/lib';
use MyLogger;

Expand All @@ -12,7 +12,7 @@ BEGIN {
}

SKIP: {
skip 'Database not installed', 5, if(!-r 'lib/Genealogy/ObituaryDailyTimes/database/obituaries.sql');
skip 'Database not installed', 7, if(!-r 'lib/Genealogy/ObituaryDailyTimes/database/obituaries.sql');

if($ENV{'TEST_VERBOSE'}) {
Genealogy::ObituaryDailyTimes::DB::init(logger => MyLogger->new());
Expand All @@ -30,12 +30,15 @@ SKIP: {
is($smiths[0]->{'last'}, 'Smith', 'Returned Smiths');

my $baal = $search->search({ first => 'Eric', last => 'Baal' });
is($baal->{'url'}, 'https://mlarchives.rootsweb.com/listindexes/emails?listname=gen-obit&page=96', 'Check URL');
is($baal->{'url'}, 'https://mlarchives.rootsweb.com/listindexes/emails?listname=gen-obit&page=96', 'Check Baal URL');

my $coppage = $search->search({ first => 'John', last => 'Coppage' });
my $coppage = $search->search({ first => 'John', middle => 'W', last => 'Coppage' });
is($coppage->{'middle'}, 'W', 'Test middle initial');
is($coppage->{'url'}, 'https://www.freelists.org/post/obitdailytimes/Obituary-Daily-Times-v26no080', 'Check Coppage URL');

use Data::Dumper;
diag(Data::Dumper->new([$coppage])->Dump());
if($ENV{'TEST_VERBOSE'}) {
diag(Data::Dumper->new([$coppage])->Dump());
}

my @empty = $search->search(last => 'xyzzy');
is(scalar(@empty), 0, 'Search for xyzzy should return an empty list');
Expand Down

0 comments on commit 5ccaec5

Please sign in to comment.