From e5f01480e193be67f72802db47e95271fdffcf51 Mon Sep 17 00:00:00 2001 From: Matthew Somerville Date: Wed, 20 May 2015 16:18:02 +0100 Subject: [PATCH 1/2] Fix whitespace. --- scripts/mpinfoin.pl | 1016 ++++++++--------- scripts/xml2db.pl | 2660 +++++++++++++++++++++---------------------- 2 files changed, 1838 insertions(+), 1838 deletions(-) diff --git a/scripts/mpinfoin.pl b/scripts/mpinfoin.pl index 18334cd148..9e12f14a9a 100755 --- a/scripts/mpinfoin.pl +++ b/scripts/mpinfoin.pl @@ -28,35 +28,35 @@ my %action; my $verbose; foreach (@ARGV) { - if ($_ eq 'publicwhip') { - $action{'pw'} = 1; - } elsif ($_ eq 'expenses') { - $action{'expenses'} = 1; - } elsif ($_ eq 'regmem') { - $action{'regmem'} = 1; - } elsif ($_ eq 'links') { - $action{'links'} = 1; - } elsif ($_ eq 'writetothem') { - $action{'wtt'} = 1; - } elsif ($_ eq 'rankings') { - $action{'rankings'} = 1; - } elsif ($_ eq 'speaker_candidates') { - $action{'speaker_candidates'} = 1; - } elsif ($_ eq 'verbose') { - $verbose = 1; - } else { - print "Action '$_' not known\n"; - exit(0); - } -} -if (scalar(@ARGV) == 0) { + if ($_ eq 'publicwhip') { $action{'pw'} = 1; + } elsif ($_ eq 'expenses') { $action{'expenses'} = 1; + } elsif ($_ eq 'regmem') { $action{'regmem'} = 1; + } elsif ($_ eq 'links') { $action{'links'} = 1; + } elsif ($_ eq 'writetothem') { $action{'wtt'} = 1; + } elsif ($_ eq 'rankings') { $action{'rankings'} = 1; + } elsif ($_ eq 'speaker_candidates') { $action{'speaker_candidates'} = 1; + } elsif ($_ eq 'verbose') { + $verbose = 1; + } else { + print "Action '$_' not known\n"; + exit(0); + } +} +if (scalar(@ARGV) == 0) { + $action{'pw'} = 1; + $action{'expenses'} = 1; + $action{'regmem'} = 1; + $action{'links'} = 1; + $action{'wtt'} = 1; + $action{'rankings'} = 1; + $action{'speaker_candidates'} = 1; } # Fat old hashes intotwixt all the XML is loaded and colated before being squirted to the DB @@ -71,94 +71,94 @@ # Read in all the files my $twig = XML::Twig->new( - twig_handlers => { - 'memberinfo' => \&loadmemberinfo, - 'personinfo' => \&loadpersoninfo, - 'consinfo' => \&loadconsinfo, - 'speakercandidateinfo' => \&loadspeakercandidateinfo, - 'regmem' => \&loadregmeminfo - }, output_filter => 'safe' ); + twig_handlers => { + 'memberinfo' => \&loadmemberinfo, + 'personinfo' => \&loadpersoninfo, + 'consinfo' => \&loadconsinfo, + 'speakercandidateinfo' => \&loadspeakercandidateinfo, + 'regmem' => \&loadregmeminfo + }, output_filter => 'safe' ); if ($action{'regmem'}) { - # TODO: Parse ALL regmem in forwards chronological order, so each MP (even ones left parl) gets their most recent one - print "Parsing register of members' interests\n" if $verbose; - $twig->parsefile(mySociety::Config::get('RAWDATA') . "scrapedxml/regmem/$regmemfile", ErrorContext => 2); + # TODO: Parse ALL regmem in forwards chronological order, so each MP (even ones left parl) gets their most recent one + print "Parsing register of members' interests\n" if $verbose; + $twig->parsefile(mySociety::Config::get('RAWDATA') . "scrapedxml/regmem/$regmemfile", ErrorContext => 2); } if ($action{'links'}) { - print "Parsing links\n" if $verbose; - print " MLA Wikipedia\n" if $verbose; - $twig->parsefile($pwmembers . "wikipedia-mla.xml", ErrorContext => 2); - print " MSP \"\n" if $verbose; - $twig->parsefile($pwmembers . "wikipedia-msp.xml", ErrorContext => 2); - print " MP \"\n" if $verbose; - $twig->parsefile($pwmembers . "wikipedia-commons.xml", ErrorContext => 2); - print " Lords \"\n" if $verbose; - $twig->parsefile($pwmembers . "wikipedia-lords.xml", ErrorContext => 2); - #print " MPs standing down\n" if $verbose; - #$twig->parsefile($pwmembers . "wikipedia-standingdown.xml", ErrorContext => 2); - print " Bishops\n" if $verbose; - $twig->parsefile($pwmembers . "diocese-bishops.xml", ErrorContext => 2); - print " BBC\n" if $verbose; - $twig->parsefile($pwmembers . "bbc-links.xml", ErrorContext => 2); - print " BBC IDs\n" if $verbose; - $twig->parsefile($pwmembers . "bbc-constituency-ids.xml", ErrorContext => 2); - print " PA/Guardian constituency IDs\n" if $verbose; - $twig->parsefile($pwmembers . "constituency-links.xml", ErrorContext => 2); - print " dates of birth\n" if $verbose; - $twig->parsefile($pwmembers . "dates-of-birth.xml", ErrorContext => 2); - # TODO: Update websites (esp. with new MPs) - print " Personal websites\n" if $verbose; - $twig->parsefile($pwmembers . 'websites.xml', ErrorContext => 2); - print " MSP websites\n" if $verbose; - $twig->parsefile($pwmembers . 'websites-sp.xml', ErrorContext => 2); - print " MSP Twitter username\n" if $verbose; - $twig->parsefile($pwmembers . 'twitter.xml', ErrorContext => 2); - chdir $FindBin::Bin; - print " Lords biographies\n" if $verbose; - $twig->parsefile($pwmembers . 'lordbiogs.xml', ErrorContext => 2); - print " Journalisted\n" if $verbose; - $twig->parsefile($pwmembers . 'journa-list.xml', ErrorContext => 2); + print "Parsing links\n" if $verbose; + print " MLA Wikipedia\n" if $verbose; + $twig->parsefile($pwmembers . "wikipedia-mla.xml", ErrorContext => 2); + print " MSP \"\n" if $verbose; + $twig->parsefile($pwmembers . "wikipedia-msp.xml", ErrorContext => 2); + print " MP \"\n" if $verbose; + $twig->parsefile($pwmembers . "wikipedia-commons.xml", ErrorContext => 2); + print " Lords \"\n" if $verbose; + $twig->parsefile($pwmembers . "wikipedia-lords.xml", ErrorContext => 2); + #print " MPs standing down\n" if $verbose; + #$twig->parsefile($pwmembers . "wikipedia-standingdown.xml", ErrorContext => 2); + print " Bishops\n" if $verbose; + $twig->parsefile($pwmembers . "diocese-bishops.xml", ErrorContext => 2); + print " BBC\n" if $verbose; + $twig->parsefile($pwmembers . "bbc-links.xml", ErrorContext => 2); + print " BBC IDs\n" if $verbose; + $twig->parsefile($pwmembers . "bbc-constituency-ids.xml", ErrorContext => 2); + print " PA/Guardian constituency IDs\n" if $verbose; + $twig->parsefile($pwmembers . "constituency-links.xml", ErrorContext => 2); + print " dates of birth\n" if $verbose; + $twig->parsefile($pwmembers . "dates-of-birth.xml", ErrorContext => 2); + # TODO: Update websites (esp. with new MPs) + print " Personal websites\n" if $verbose; + $twig->parsefile($pwmembers . 'websites.xml', ErrorContext => 2); + print " MSP websites\n" if $verbose; + $twig->parsefile($pwmembers . 'websites-sp.xml', ErrorContext => 2); + print " MSP Twitter username\n" if $verbose; + $twig->parsefile($pwmembers . 'twitter.xml', ErrorContext => 2); + chdir $FindBin::Bin; + print " Lords biographies\n" if $verbose; + $twig->parsefile($pwmembers . 'lordbiogs.xml', ErrorContext => 2); + print " Journalisted\n" if $verbose; + $twig->parsefile($pwmembers . 'journa-list.xml', ErrorContext => 2); } if ($action{'wtt'}) { - print "Parsing WTT stats\n" if $verbose; - $twig->parseurl("http://www.writetothem.com/stats/2005/mps?xml=1"); - $twig->parseurl("http://www.writetothem.com/stats/2006/mps?xml=1"); - $twig->parseurl("http://www.writetothem.com/stats/2007/mps?xml=1"); - $twig->parseurl("http://www.writetothem.com/stats/2008/mps?xml=1"); + print "Parsing WTT stats\n" if $verbose; + $twig->parseurl("http://www.writetothem.com/stats/2005/mps?xml=1"); + $twig->parseurl("http://www.writetothem.com/stats/2006/mps?xml=1"); + $twig->parseurl("http://www.writetothem.com/stats/2007/mps?xml=1"); + $twig->parseurl("http://www.writetothem.com/stats/2008/mps?xml=1"); } if ($action{'speaker_candidates'}) { - print "Parsing speaker candidates\n" if $verbose; - $twig->parsefile($pwmembers . 'speaker-candidates.xml', ErrorContext => 2); + print "Parsing speaker candidates\n" if $verbose; + $twig->parsefile($pwmembers . 'speaker-candidates.xml', ErrorContext => 2); } if ($action{'pw'}) { - my $ua = LWP::UserAgent->new( agent => 'mySociety/1.0 (TheyWorkForYou)' ); - print "Parsing Public Whip attendance and policies\n" if $verbose; - $twig->parseurl("http://www.publicwhip.org.uk/feeds/mp-info.xml", $ua); - $twig->parseurl("http://www.publicwhip.org.uk/feeds/mp-info.xml?house=lords", $ua); - # Various policy IDs, see http://www.publicwhip.org.uk/policies.php for what they are - foreach my $dreamid (@policyids) { - $twig->parseurl("http://www.publicwhip.org.uk/feeds/mpdream-info.xml?id=$dreamid", $ua); - } + my $ua = LWP::UserAgent->new( agent => 'mySociety/1.0 (TheyWorkForYou)' ); + print "Parsing Public Whip attendance and policies\n" if $verbose; + $twig->parseurl("http://www.publicwhip.org.uk/feeds/mp-info.xml", $ua); + $twig->parseurl("http://www.publicwhip.org.uk/feeds/mp-info.xml?house=lords", $ua); + # Various policy IDs, see http://www.publicwhip.org.uk/policies.php for what they are + foreach my $dreamid (@policyids) { + $twig->parseurl("http://www.publicwhip.org.uk/feeds/mpdream-info.xml?id=$dreamid", $ua); + } } if ($action{'expenses'}) { - print "Parsing expenses\n" if $verbose; - $twig->parsefile($pwmembers . "expenses200809.xml", ErrorContext => 2); - $twig->parsefile($pwmembers . "expenses200708.xml", ErrorContext => 2); - $twig->parsefile($pwmembers . "expenses200607.xml", ErrorContext => 2); - $twig->parsefile($pwmembers . "expenses200506.xml", ErrorContext => 2); - $twig->parsefile($pwmembers . "expenses200506former.xml", ErrorContext => 2); - $twig->parsefile($pwmembers . "expenses200405.xml", ErrorContext => 2); - $twig->parsefile($pwmembers . "expenses200304.xml", ErrorContext => 2); - $twig->parsefile($pwmembers . "expenses200203.xml", ErrorContext => 2); - $twig->parsefile($pwmembers . "expenses200102.xml", ErrorContext => 2); - makerankings_expenses(); + print "Parsing expenses\n" if $verbose; + $twig->parsefile($pwmembers . "expenses200809.xml", ErrorContext => 2); + $twig->parsefile($pwmembers . "expenses200708.xml", ErrorContext => 2); + $twig->parsefile($pwmembers . "expenses200607.xml", ErrorContext => 2); + $twig->parsefile($pwmembers . "expenses200506.xml", ErrorContext => 2); + $twig->parsefile($pwmembers . "expenses200506former.xml", ErrorContext => 2); + $twig->parsefile($pwmembers . "expenses200405.xml", ErrorContext => 2); + $twig->parsefile($pwmembers . "expenses200304.xml", ErrorContext => 2); + $twig->parsefile($pwmembers . "expenses200203.xml", ErrorContext => 2); + $twig->parsefile($pwmembers . "expenses200102.xml", ErrorContext => 2); + makerankings_expenses(); } # Get any data from the database @@ -166,8 +166,8 @@ my $dbh = DBI->connect($dsn, mySociety::Config::get('TWFY_DB_USER'), mySociety::Config::get('TWFY_DB_PASS'), { RaiseError => 1, PrintError => 0 }); #DBI->trace(2); if ($action{'rankings'}) { - print "Making rankings\n" if $verbose; - makerankings($dbh); + print "Making rankings\n" if $verbose; + makerankings($dbh); } # XXX: Will only ever add/update data now - need way to remove without dropping whole table... @@ -183,42 +183,42 @@ # Write to database - members foreach my $mp_id (keys %$memberinfohash) { - (my $mp_id_num = $mp_id) =~ s#uk.org.publicwhip/(member|lord)/##; - my $data = $memberinfohash->{$mp_id}; - foreach my $key (keys %$data) { - my $new_value = $data->{$key}; - my $curr_value = $dbh->selectrow_array($memberinfocheck, {}, $mp_id_num, $key); - if (!defined $curr_value) { - $memberinfoadd->execute($mp_id_num, $key, $new_value); - } elsif ($curr_value ne $new_value) { - $memberinfoupdate->execute($new_value, $mp_id_num, $key); - } + (my $mp_id_num = $mp_id) =~ s#uk.org.publicwhip/(member|lord)/##; + my $data = $memberinfohash->{$mp_id}; + foreach my $key (keys %$data) { + my $new_value = $data->{$key}; + my $curr_value = $dbh->selectrow_array($memberinfocheck, {}, $mp_id_num, $key); + if (!defined $curr_value) { + $memberinfoadd->execute($mp_id_num, $key, $new_value); + } elsif ($curr_value ne $new_value) { + $memberinfoupdate->execute($new_value, $mp_id_num, $key); } + } } # Write to database - people foreach my $person_id (keys %$personinfohash) { - (my $person_id_num = $person_id) =~ s#uk.org.publicwhip/person/##; - my $data = $personinfohash->{$person_id}; - foreach my $key (keys %$data) { - my $new_value = $data->{$key}; - my $curr_value = $dbh->selectrow_array($personinfocheck, {}, $person_id_num, $key); - if (!defined $curr_value) { - $personinfoadd->execute($person_id_num, $key, $new_value); - } elsif ($curr_value ne $new_value) { - $personinfoupdate->execute($new_value, $person_id_num, $key); - } + (my $person_id_num = $person_id) =~ s#uk.org.publicwhip/person/##; + my $data = $personinfohash->{$person_id}; + foreach my $key (keys %$data) { + my $new_value = $data->{$key}; + my $curr_value = $dbh->selectrow_array($personinfocheck, {}, $person_id_num, $key); + if (!defined $curr_value) { + $personinfoadd->execute($person_id_num, $key, $new_value); + } elsif ($curr_value ne $new_value) { + $personinfoupdate->execute($new_value, $person_id_num, $key); } + } } # Write to database - cons foreach my $constituency (keys %$consinfohash) { - my $data = $consinfohash->{$constituency}; - $constituency = Encode::encode('iso-8859-1', $constituency); - foreach my $key (keys %$data) { - my $value = $data->{$key}; - $consinfoadd->execute($constituency, $key, $value, $value); - } + my $data = $consinfohash->{$constituency}; + $constituency = Encode::encode('iso-8859-1', $constituency); + foreach my $key (keys %$data) { + my $value = $data->{$key}; + $consinfoadd->execute($constituency, $key, $value, $value); + } } # just temporary to check cron working @@ -227,50 +227,50 @@ # Handler for loading data pertaining to a member id sub loadmemberinfo { - my ($twig, $memberinfo) = @_; - my $id = $memberinfo->att('id'); - foreach my $attname ($memberinfo->att_names()) - { - next if $attname eq "id"; - my $value = $memberinfo->att($attname); - $memberinfohash->{$id}->{$attname} = $value; - } + my ($twig, $memberinfo) = @_; + my $id = $memberinfo->att('id'); + foreach my $attname ($memberinfo->att_names()) + { + next if $attname eq "id"; + my $value = $memberinfo->att($attname); + $memberinfohash->{$id}->{$attname} = $value; + } } # Handler for loading data pertaining to a person id sub loadpersoninfo { - my ($twig, $personinfo) = @_; - my $id = $personinfo->att('id'); - foreach my $attname ($personinfo->att_names()) - { - next if $attname eq "id"; - my $value = $personinfo->att($attname); - $personinfohash->{$id}->{$attname} = $value; - } + my ($twig, $personinfo) = @_; + my $id = $personinfo->att('id'); + foreach my $attname ($personinfo->att_names()) + { + next if $attname eq "id"; + my $value = $personinfo->att($attname); + $personinfohash->{$id}->{$attname} = $value; + } } # Handler for loading data pertaining to a speaker candidate sub loadspeakercandidateinfo { - my ($twig, $speakerinfo) = @_; - my $id = $speakerinfo->att('id'); - foreach my $attname ($speakerinfo->att_names()) - { - next if $attname eq "id"; - my $value = $speakerinfo->att($attname); - $personinfohash->{$id}->{$attname} = $value; - } - my $speaker_candidate_response = $speakerinfo->first_child('speakercandidateresponse'); - if ($speaker_candidate_response){ - $personinfohash->{$id}->{'speaker_candidate_response'} = $speaker_candidate_response->xml_string(); - my $speaker_candidate_response_summary = $speakerinfo->first_child('speakercandidateresponsesummary'); - $personinfohash->{$id}->{'speaker_candidate_response_summary'} = $speaker_candidate_response_summary->xml_string(); - } else { - $personinfohash->{$id}->{'speaker_candidate_response'} = ""; - $personinfohash->{$id}->{'speaker_candidate_response_summary'} = ""; - } + my ($twig, $speakerinfo) = @_; + my $id = $speakerinfo->att('id'); + foreach my $attname ($speakerinfo->att_names()) + { + next if $attname eq "id"; + my $value = $speakerinfo->att($attname); + $personinfohash->{$id}->{$attname} = $value; + } + my $speaker_candidate_response = $speakerinfo->first_child('speakercandidateresponse'); + if ($speaker_candidate_response){ + $personinfohash->{$id}->{'speaker_candidate_response'} = $speaker_candidate_response->xml_string(); + my $speaker_candidate_response_summary = $speakerinfo->first_child('speakercandidateresponsesummary'); + $personinfohash->{$id}->{'speaker_candidate_response_summary'} = $speaker_candidate_response_summary->xml_string(); + } else { + $personinfohash->{$id}->{'speaker_candidate_response'} = ""; + $personinfohash->{$id}->{'speaker_candidate_response_summary'} = ""; + } } @@ -278,405 +278,405 @@ sub loadspeakercandidateinfo # Handler for loading data pertaining to a canonical constituency name sub loadconsinfo { - my ($twig, $consinfo) = @_; - my $id = $consinfo->att('canonical'); - foreach my $attname ($consinfo->att_names()) - { - next if $attname eq "canonical"; - my $value = $consinfo->att($attname); - $consinfohash->{$id}->{$attname} = $value; - } + my ($twig, $consinfo) = @_; + my $id = $consinfo->att('canonical'); + foreach my $attname ($consinfo->att_names()) + { + next if $attname eq "canonical"; + my $value = $consinfo->att($attname); + $consinfohash->{$id}->{$attname} = $value; + } } # Handler for loading register of members interests sub loadregmeminfo { - my ($twig, $regmem) = @_; - my $id = $regmem->att('personid'); - - my $htmlcontent = ""; - - for (my $category = $regmem->first_child('category'); $category; - $category = $category->next_sibling('category')) + my ($twig, $regmem) = @_; + my $id = $regmem->att('personid'); + + my $htmlcontent = ""; + + for (my $category = $regmem->first_child('category'); $category; + $category = $category->next_sibling('category')) + { + $htmlcontent .= '
'; + $htmlcontent .= $category->att("type") . ". " . $category->att("name"); + $htmlcontent .= "
\n"; + foreach my $item ($category->descendants('item')) { - $htmlcontent .= '
'; - $htmlcontent .= $category->att("type") . ". " . $category->att("name"); - $htmlcontent .= "
\n"; - foreach my $item ($category->descendants('item')) - { - $htmlcontent .= '
'; - if ($item->att("subcategory")) - { - $htmlcontent .= "(" . $item->att("subcategory") . ") "; - } - $htmlcontent .= $item->sprint(1); - $htmlcontent .= "
\n"; - } + $htmlcontent .= '
'; + if ($item->att("subcategory")) + { + $htmlcontent .= "(" . $item->att("subcategory") . ") "; + } + $htmlcontent .= $item->sprint(1); + $htmlcontent .= "
\n"; } + } - $personinfohash->{$id}->{"register_member_interests_html"} = $htmlcontent; - $personinfohash->{$id}->{"register_member_interests_date"} = $regmem->att('date'); + $personinfohash->{$id}->{"register_member_interests_html"} = $htmlcontent; + $personinfohash->{$id}->{"register_member_interests_date"} = $regmem->att('date'); } # Generate rankings of number of times spoken sub makerankings { - my $dbh = shift; - - # Loop through MPs - my $query = "select person_id,entered_house,left_house from member where person_id in "; - my $sth = $dbh->prepare($query . - #"( 10001 )"); - '(select person_id from member where house=1 AND curdate() <= left_house) order by person_id, entered_house'); - $sth->execute(); - if ($sth->rows == 0 && mySociety::Config::get('DISSOLUTION_DATE')) { - $sth = $dbh->prepare($query . - '(select person_id from member where left_house = ?)'); - $sth->execute(mySociety::Config::get('DISSOLUTION_DATE')); - if ($sth->rows == 0) { - print "Failed to find any MPs for rankings, change dissolution date if you are near one"; - return; - } + my $dbh = shift; + + # Loop through MPs + my $query = "select person_id,entered_house,left_house from member where person_id in "; + my $sth = $dbh->prepare($query . + #"( 10001 )"); + '(select person_id from member where house=1 AND curdate() <= left_house) order by person_id, entered_house'); + $sth->execute(); + if ($sth->rows == 0 && mySociety::Config::get('DISSOLUTION_DATE')) { + $sth = $dbh->prepare($query . + '(select person_id from member where left_house = ?)'); + $sth->execute(mySociety::Config::get('DISSOLUTION_DATE')); + if ($sth->rows == 0) { + print "Failed to find any MPs for rankings, change dissolution date if you are near one"; + return; } - my %first_member; - while ( my @row = $sth->fetchrow_array() ) - { - my $person_id = $row[0]; - my $entered_house = $row[1]; - my $left_house = $row[2]; - my $person_fullid = "uk.org.publicwhip/person/$person_id"; - - my $q = $dbh->prepare('select gid from hansard where major=1 and person_id=? order by hdate,hpos limit 1'); - $q->execute($person_id); - if ($q->rows > 0) { - my @row = $q->fetchrow_array(); - my $maidenspeech = $row[0]; - $personinfohash->{$person_fullid}->{'maiden_speech'} = $maidenspeech; - } - - my $tth = $dbh->prepare("select count(*) from hansard, epobject - where hansard.epobject_id = epobject.epobject_id and person_id = ? and (major = 1 or major = 2) and - hdate >= date_sub(curdate(), interval 1 year) and - body not like '%rose—%' group by section_id"); - my $rows = $tth->execute($person_id); - $personinfohash->{$person_fullid}->{"debate_sectionsspoken_inlastyear"} = int($rows); - - $tth = $dbh->prepare(" - select count(*) from hansard, comments where hansard.epobject_id = comments.epobject_id and visible - and person_id = ?"); - $tth->execute($person_id); - my @thisrow = $tth->fetchrow_array(); - my $comments = $thisrow[0]; - $personinfohash->{$person_fullid}->{"comments_on_speeches"} = int($comments); - - $tth = $dbh->prepare("select count(*) from hansard where person_id = ? and major = 3 and minor = 1 and - hdate >= date_sub(curdate(), interval 1 year) - "); - $tth->execute($person_id); - @thisrow = $tth->fetchrow_array(); - my $speeches = $thisrow[0]; - $personinfohash->{$person_fullid}->{"wrans_asked_inlastyear"} = $speeches; - - $tth = $dbh->prepare("select count(*) from hansard where person_id = ? and major = 3 and minor = 2 and - hdate >= date_sub(curdate(), interval 1 year)"); - $tth->execute($person_id); - @thisrow = $tth->fetchrow_array(); - $speeches = $thisrow[0]; - $personinfohash->{$person_fullid}->{"wrans_answered_inlastyear"} = $speeches; - - $tth = $dbh->prepare("select count(*) as c, body from hansard as h1 - left join epobject on h1.section_id = epobject.epobject_id - where h1.major = 3 and h1.minor = - 1 and h1.person_id = ? group by body"); - $tth->execute($person_id); - while (my @row = $tth->fetchrow_array()) { - my $count = $row[0]; - my $dept = $row[1]; - $personinfohash->{$person_fullid}->{"wrans_departments"}->{$dept} = $count; - } - - $tth = $dbh->prepare("select count(*) as c, body from hansard as h1 - left join epobject on h1.subsection_id = epobject.epobject_id - where h1.major = 3 and h1.minor = - 1 and h1.person_id = ? group by body"); - $tth->execute($person_id); - while (my @row = $tth->fetchrow_array()) { - my $count = $row[0]; - my $subject = $row[1]; - $personinfohash->{$person_fullid}->{"wrans_subjects"}->{$subject} = $count; - } - - $tth = $dbh->prepare("select body from epobject,hansard where hansard.epobject_id = epobject.epobject_id and person_id=? and (major=1 or major=2)"); - $tth->execute($person_id); - $personinfohash->{$person_fullid}->{'three_word_alliterations'} = 0; - $personinfohash->{$person_fullid}->{'three_word_alliteration_content'} = ""; - my $words = 0; my $syllables = 0; my $sentences = 0; - while (my @row = $tth->fetchrow_array()) { - my $body = $row[0]; - $body =~ s/<\/p>/\n\n/g; - $body =~ s/<\/?p[^>]*>//g; - $body =~ s/ hon\. / honourable /g; - if ($body =~ m/\b((\w)\w*\s+\2\w*\s+\2\w*)\b/) { - $personinfohash->{$person_fullid}->{'three_word_alliterations'} += 1; - $personinfohash->{$person_fullid}->{'three_word_alliteration_content'} .= ":$1"; - } - - my @sent = split(/(?:(?{$person_fullid}->{'total_words'} = $words; - $personinfohash->{$person_fullid}->{'total_sents'} = $sentences; - $personinfohash->{$person_fullid}->{'total_sylls'} = $syllables; - - $tth = $dbh->prepare("select count(*) from moffice where person=? and source='chgpages/selctee' and to_date='9999-12-31'"); - $tth->execute($person_id); - my $selctees = ($tth->fetchrow_array())[0]; - $personinfohash->{$person_fullid}->{'select_committees'} = $selctees; - $tth = $dbh->prepare("select count(*) from moffice where person=? and source='chgpages/selctee' and to_date='9999-12-31' and position='Chairman'"); - $tth->execute($person_id); - $selctees = ($tth->fetchrow_array())[0]; - $personinfohash->{$person_fullid}->{'select_committees_chair'} = $selctees; + } + my %first_member; + while ( my @row = $sth->fetchrow_array() ) + { + my $person_id = $row[0]; + my $entered_house = $row[1]; + my $left_house = $row[2]; + my $person_fullid = "uk.org.publicwhip/person/$person_id"; + + my $q = $dbh->prepare('select gid from hansard where major=1 and person_id=? order by hdate,hpos limit 1'); + $q->execute($person_id); + if ($q->rows > 0) { + my @row = $q->fetchrow_array(); + my $maidenspeech = $row[0]; + $personinfohash->{$person_fullid}->{'maiden_speech'} = $maidenspeech; } - # Consolidate wrans departments and subjects, to pick top 5 - foreach (keys %$personinfohash) { - my $key = $_; - my $dept = $personinfohash->{$key}->{'wrans_departments'}; - if (defined($dept)) { - my @ordered = sort { $dept->{$b} <=> $dept->{$a} } keys %$dept; - @ordered = @ordered[0..4] if (scalar(@ordered) > 5); - $personinfohash->{$key}->{'wrans_departments'} = join(', ', @ordered); - } - my $subj = $personinfohash->{$key}->{'wrans_subjects'}; - if (defined($subj)) { - my @ordered = sort { $subj->{$b} <=> $subj->{$a} } keys %$subj; - @ordered = @ordered[0..4] if (scalar(@ordered) > 5); - $personinfohash->{$key}->{'wrans_subjects'} = join(', ', @ordered); - } - #$personinfohash->{$key}->{'reading_ease'} = -1; - if ($personinfohash->{$key}->{'total_sents'} && $personinfohash->{$key}->{'total_words'}) { - $personinfohash->{$key}->{'reading_ease'} = 206.835 - - 1.015 * ($personinfohash->{$key}->{'total_words'} / $personinfohash->{$key}->{'total_sents'}) - - 84.6 * ($personinfohash->{$key}->{'total_sylls'} / $personinfohash->{$key}->{'total_words'}); - $personinfohash->{$key}->{'reading_year'} = 1 -15.59 - + 0.39 * ($personinfohash->{$key}->{'total_words'} / $personinfohash->{$key}->{'total_sents'}) - + 11.8 * ($personinfohash->{$key}->{'total_sylls'} / $personinfohash->{$key}->{'total_words'}); - } - delete $personinfohash->{$key}->{'total_words'}; - delete $personinfohash->{$key}->{'total_sylls'}; - delete $personinfohash->{$key}->{'total_sents'}; + my $tth = $dbh->prepare("select count(*) from hansard, epobject + where hansard.epobject_id = epobject.epobject_id and person_id = ? and (major = 1 or major = 2) and + hdate >= date_sub(curdate(), interval 1 year) and + body not like '%rose—%' group by section_id"); + my $rows = $tth->execute($person_id); + $personinfohash->{$person_fullid}->{"debate_sectionsspoken_inlastyear"} = int($rows); + + $tth = $dbh->prepare(" + select count(*) from hansard, comments where hansard.epobject_id = comments.epobject_id and visible + and person_id = ?"); + $tth->execute($person_id); + my @thisrow = $tth->fetchrow_array(); + my $comments = $thisrow[0]; + $personinfohash->{$person_fullid}->{"comments_on_speeches"} = int($comments); + + $tth = $dbh->prepare("select count(*) from hansard where person_id = ? and major = 3 and minor = 1 and + hdate >= date_sub(curdate(), interval 1 year) + "); + $tth->execute($person_id); + @thisrow = $tth->fetchrow_array(); + my $speeches = $thisrow[0]; + $personinfohash->{$person_fullid}->{"wrans_asked_inlastyear"} = $speeches; + + $tth = $dbh->prepare("select count(*) from hansard where person_id = ? and major = 3 and minor = 2 and + hdate >= date_sub(curdate(), interval 1 year)"); + $tth->execute($person_id); + @thisrow = $tth->fetchrow_array(); + $speeches = $thisrow[0]; + $personinfohash->{$person_fullid}->{"wrans_answered_inlastyear"} = $speeches; + + $tth = $dbh->prepare("select count(*) as c, body from hansard as h1 + left join epobject on h1.section_id = epobject.epobject_id + where h1.major = 3 and h1.minor = + 1 and h1.person_id = ? group by body"); + $tth->execute($person_id); + while (my @row = $tth->fetchrow_array()) { + my $count = $row[0]; + my $dept = $row[1]; + $personinfohash->{$person_fullid}->{"wrans_departments"}->{$dept} = $count; } - # Loop through Lords - $query = "select person_id from member where person_id in "; - $sth = $dbh->prepare($query . - '(select person_id from member where house=2 AND curdate() <= left_house)'); - $sth->execute(); - while ( my @row = $sth->fetchrow_array() ) { - my $person_id = $row[0]; - my $person_fullid = "uk.org.publicwhip/person/$person_id"; - - my $tth = $dbh->prepare("select count(*) from hansard, epobject - where hansard.epobject_id = epobject.epobject_id and person_id = ? and major = 101 and - hdate >= date_sub(curdate(), interval 1 year) and - body not like '%rose—%' group by section_id"); - my $rows = $tth->execute($person_id); - $personinfohash->{$person_fullid}->{"Ldebate_sectionsspoken_inlastyear"} += int($rows); - - $tth = $dbh->prepare(" - select count(*) from hansard, comments where hansard.epobject_id = comments.epobject_id and visible - and person_id = ?"); - $tth->execute($person_id); - my @thisrow = $tth->fetchrow_array(); - my $comments = $thisrow[0]; - $personinfohash->{$person_fullid}->{"Lcomments_on_speeches"} += int($comments); - - $tth = $dbh->prepare("select count(*) from hansard where person_id = ? and major = 3 and minor = 1 and - hdate >= date_sub(curdate(), interval 1 year) - "); - $tth->execute($person_id); - @thisrow = $tth->fetchrow_array(); - my $speeches = $thisrow[0]; - $personinfohash->{$person_fullid}->{"Lwrans_asked_inlastyear"} += $speeches; - - $tth = $dbh->prepare("select count(*) from hansard where person_id = ? and major = 3 and minor = 2 and - hdate >= date_sub(curdate(), interval 1 year)"); - $tth->execute($person_id); - @thisrow = $tth->fetchrow_array(); - $speeches = $thisrow[0]; - $personinfohash->{$person_fullid}->{"Lwrans_answered_inlastyear"} += $speeches; - - $tth = $dbh->prepare("select body from epobject,hansard where hansard.epobject_id = epobject.epobject_id and person_id=? and major=101"); - $tth->execute($person_id); - $personinfohash->{$person_fullid}->{'Lthree_word_alliterations'} = 0; - while (my @row = $tth->fetchrow_array()) { - my $body = $row[0]; - if ($body =~ m/\b((\w)\w*\s+\2\w*\s+\2\w*)\b/) { - $personinfohash->{$person_fullid}->{'Lthree_word_alliterations'} += 1 - } - } + $tth = $dbh->prepare("select count(*) as c, body from hansard as h1 + left join epobject on h1.subsection_id = epobject.epobject_id + where h1.major = 3 and h1.minor = + 1 and h1.person_id = ? group by body"); + $tth->execute($person_id); + while (my @row = $tth->fetchrow_array()) { + my $count = $row[0]; + my $subject = $row[1]; + $personinfohash->{$person_fullid}->{"wrans_subjects"}->{$subject} = $count; } - enrankify($personinfohash, "debate_sectionsspoken_inlastyear", 0); - enrankify($personinfohash, "comments_on_speeches", 0); - enrankify($personinfohash, "wrans_asked_inlastyear", 0); - enrankify($personinfohash, "Ldebate_sectionsspoken_inlastyear", 0); - enrankify($personinfohash, "Lcomments_on_speeches", 0); - enrankify($personinfohash, "Lwrans_asked_inlastyear", 0); - enrankify($personinfohash, "three_word_alliterations", 0); - enrankify($personinfohash, "ending_with_a_preposition", 0); - enrankify($personinfohash, "only_asked_why", 0); - enrankify($personinfohash, "Lthree_word_alliterations", 0); - enrankify($personinfohash, "Lending_with_a_preposition", 0); - enrankify($personinfohash, "reading_ease", 0); - enrankify($personinfohash, "reading_year", 0); - enrankify($personinfohash, "writetothem_responsiveness_mean_2005", 0); -} + $tth = $dbh->prepare("select body from epobject,hansard where hansard.epobject_id = epobject.epobject_id and person_id=? and (major=1 or major=2)"); + $tth->execute($person_id); + $personinfohash->{$person_fullid}->{'three_word_alliterations'} = 0; + $personinfohash->{$person_fullid}->{'three_word_alliteration_content'} = ""; + my $words = 0; my $syllables = 0; my $sentences = 0; + while (my @row = $tth->fetchrow_array()) { + my $body = $row[0]; + $body =~ s/<\/p>/\n\n/g; + $body =~ s/<\/?p[^>]*>//g; + $body =~ s/ hon\. / honourable /g; + if ($body =~ m/\b((\w)\w*\s+\2\w*\s+\2\w*)\b/) { + $personinfohash->{$person_fullid}->{'three_word_alliterations'} += 1; + $personinfohash->{$person_fullid}->{'three_word_alliteration_content'} .= ":$1"; + } -sub makerankings_expenses { - foreach my $mp_id (keys %$personinfohash) { - if (defined($personinfohash->{$mp_id}->{'expenses2007_col5a'})) { - my $total = 0; - foreach my $let ('a'..'f') { - $total += $personinfohash->{$mp_id}->{'expenses2007_col5'.$let}; - } - $personinfohash->{$mp_id}->{'expenses2007_col5'} = $total; - } + my @sent = split(/(?:(?{$person_fullid}->{'total_words'} = $words; + $personinfohash->{$person_fullid}->{'total_sents'} = $sentences; + $personinfohash->{$person_fullid}->{'total_sylls'} = $syllables; + + $tth = $dbh->prepare("select count(*) from moffice where person=? and source='chgpages/selctee' and to_date='9999-12-31'"); + $tth->execute($person_id); + my $selctees = ($tth->fetchrow_array())[0]; + $personinfohash->{$person_fullid}->{'select_committees'} = $selctees; + $tth = $dbh->prepare("select count(*) from moffice where person=? and source='chgpages/selctee' and to_date='9999-12-31' and position='Chairman'"); + $tth->execute($person_id); + $selctees = ($tth->fetchrow_array())[0]; + $personinfohash->{$person_fullid}->{'select_committees_chair'} = $selctees; + } + + # Consolidate wrans departments and subjects, to pick top 5 + foreach (keys %$personinfohash) { + my $key = $_; + my $dept = $personinfohash->{$key}->{'wrans_departments'}; + if (defined($dept)) { + my @ordered = sort { $dept->{$b} <=> $dept->{$a} } keys %$dept; + @ordered = @ordered[0..4] if (scalar(@ordered) > 5); + $personinfohash->{$key}->{'wrans_departments'} = join(', ', @ordered); + } + my $subj = $personinfohash->{$key}->{'wrans_subjects'}; + if (defined($subj)) { + my @ordered = sort { $subj->{$b} <=> $subj->{$a} } keys %$subj; + @ordered = @ordered[0..4] if (scalar(@ordered) > 5); + $personinfohash->{$key}->{'wrans_subjects'} = join(', ', @ordered); + } + #$personinfohash->{$key}->{'reading_ease'} = -1; + if ($personinfohash->{$key}->{'total_sents'} && $personinfohash->{$key}->{'total_words'}) { + $personinfohash->{$key}->{'reading_ease'} = 206.835 + - 1.015 * ($personinfohash->{$key}->{'total_words'} / $personinfohash->{$key}->{'total_sents'}) + - 84.6 * ($personinfohash->{$key}->{'total_sylls'} / $personinfohash->{$key}->{'total_words'}); + $personinfohash->{$key}->{'reading_year'} = 1 -15.59 + + 0.39 * ($personinfohash->{$key}->{'total_words'} / $personinfohash->{$key}->{'total_sents'}) + + 11.8 * ($personinfohash->{$key}->{'total_sylls'} / $personinfohash->{$key}->{'total_words'}); + } + delete $personinfohash->{$key}->{'total_words'}; + delete $personinfohash->{$key}->{'total_sylls'}; + delete $personinfohash->{$key}->{'total_sents'}; + } + + # Loop through Lords + $query = "select person_id from member where person_id in "; + $sth = $dbh->prepare($query . + '(select person_id from member where house=2 AND curdate() <= left_house)'); + $sth->execute(); + while ( my @row = $sth->fetchrow_array() ) { + my $person_id = $row[0]; + my $person_fullid = "uk.org.publicwhip/person/$person_id"; + + my $tth = $dbh->prepare("select count(*) from hansard, epobject + where hansard.epobject_id = epobject.epobject_id and person_id = ? and major = 101 and + hdate >= date_sub(curdate(), interval 1 year) and + body not like '%rose—%' group by section_id"); + my $rows = $tth->execute($person_id); + $personinfohash->{$person_fullid}->{"Ldebate_sectionsspoken_inlastyear"} += int($rows); + + $tth = $dbh->prepare(" + select count(*) from hansard, comments where hansard.epobject_id = comments.epobject_id and visible + and person_id = ?"); + $tth->execute($person_id); + my @thisrow = $tth->fetchrow_array(); + my $comments = $thisrow[0]; + $personinfohash->{$person_fullid}->{"Lcomments_on_speeches"} += int($comments); + + $tth = $dbh->prepare("select count(*) from hansard where person_id = ? and major = 3 and minor = 1 and + hdate >= date_sub(curdate(), interval 1 year) + "); + $tth->execute($person_id); + @thisrow = $tth->fetchrow_array(); + my $speeches = $thisrow[0]; + $personinfohash->{$person_fullid}->{"Lwrans_asked_inlastyear"} += $speeches; + + $tth = $dbh->prepare("select count(*) from hansard where person_id = ? and major = 3 and minor = 2 and + hdate >= date_sub(curdate(), interval 1 year)"); + $tth->execute($person_id); + @thisrow = $tth->fetchrow_array(); + $speeches = $thisrow[0]; + $personinfohash->{$person_fullid}->{"Lwrans_answered_inlastyear"} += $speeches; + + $tth = $dbh->prepare("select body from epobject,hansard where hansard.epobject_id = epobject.epobject_id and person_id=? and major=101"); + $tth->execute($person_id); + $personinfohash->{$person_fullid}->{'Lthree_word_alliterations'} = 0; + while (my @row = $tth->fetchrow_array()) { + my $body = $row[0]; + if ($body =~ m/\b((\w)\w*\s+\2\w*\s+\2\w*)\b/) { + $personinfohash->{$person_fullid}->{'Lthree_word_alliterations'} += 1 + } + } + } + + enrankify($personinfohash, "debate_sectionsspoken_inlastyear", 0); + enrankify($personinfohash, "comments_on_speeches", 0); + enrankify($personinfohash, "wrans_asked_inlastyear", 0); + enrankify($personinfohash, "Ldebate_sectionsspoken_inlastyear", 0); + enrankify($personinfohash, "Lcomments_on_speeches", 0); + enrankify($personinfohash, "Lwrans_asked_inlastyear", 0); + enrankify($personinfohash, "three_word_alliterations", 0); + enrankify($personinfohash, "ending_with_a_preposition", 0); + enrankify($personinfohash, "only_asked_why", 0); + enrankify($personinfohash, "Lthree_word_alliterations", 0); + enrankify($personinfohash, "Lending_with_a_preposition", 0); + enrankify($personinfohash, "reading_ease", 0); + enrankify($personinfohash, "reading_year", 0); + enrankify($personinfohash, "writetothem_responsiveness_mean_2005", 0); +} - for (my $year=2002; $year<=2007; ++$year) { - foreach my $mp_id (keys %$personinfohash) { - if (defined($personinfohash->{$mp_id}->{'expenses'.$year.'_col1'})) { - my $total = 0; my $num; - for (my $col=1; $col<=9; ++$col) { - $num = $personinfohash->{$mp_id}->{'expenses'.$year.'_col'.$col}; - $total += $num; - } - if ($year>=2004) { - $num = $personinfohash->{$mp_id}->{'expenses'.$year.'_col7a'}; - $total += $num; - } - $personinfohash->{$mp_id}->{'expenses'.$year.'_total'} = $total; - } - } +sub makerankings_expenses { + foreach my $mp_id (keys %$personinfohash) { + if (defined($personinfohash->{$mp_id}->{'expenses2007_col5a'})) { + my $total = 0; + foreach my $let ('a'..'f') { + $total += $personinfohash->{$mp_id}->{'expenses2007_col5'.$let}; + } + $personinfohash->{$mp_id}->{'expenses2007_col5'} = $total; } + } + for (my $year=2002; $year<=2007; ++$year) { foreach my $mp_id (keys %$personinfohash) { - for (my $year=2008; $year<=2009; $year++) { - my $prefix = "expenses$year"; - if (defined($personinfohash->{$mp_id}->{$prefix . '_colmp_reg_travel_a'})) { - - my $total = 0; - foreach my $let ('a'..'d') { - $total += $personinfohash->{$mp_id}->{$prefix . '_colmp_reg_travel_'.$let}; - $total += $personinfohash->{$mp_id}->{$prefix . '_colmp_other_travel_'.$let}; - } - $personinfohash->{$mp_id}->{$prefix . '_col5'} = $total; - $personinfohash->{$mp_id}->{$prefix . '_col6'} = $personinfohash->{$mp_id}->{$prefix . '_colemployee_travel_a'}; - $personinfohash->{$mp_id}->{$prefix . '_total'} = $personinfohash->{$mp_id}->{$prefix . '_coltotal_inc_travel'} - if $personinfohash->{$mp_id}->{$prefix . '_coltotal_inc_travel'}; + if (defined($personinfohash->{$mp_id}->{'expenses'.$year.'_col1'})) { + my $total = 0; my $num; + for (my $col=1; $col<=9; ++$col) { + $num = $personinfohash->{$mp_id}->{'expenses'.$year.'_col'.$col}; + $total += $num; + } + if ($year>=2004) { + $num = $personinfohash->{$mp_id}->{'expenses'.$year.'_col7a'}; + $total += $num; } + $personinfohash->{$mp_id}->{'expenses'.$year.'_total'} = $total; } } + } - for (my $year=2002; $year<=2009; ++$year) { - next if $year == 2006; - for (my $col=1; $col<=9; ++$col) { - enrankify($personinfohash, 'expenses'.$year.'_col'.$col, 0); - } - enrankify($personinfohash, 'expenses'.$year.'_col7a', 0) if ($year>=2004); - enrankify($personinfohash, 'expenses'.$year.'_total', 0); + foreach my $mp_id (keys %$personinfohash) { + for (my $year=2008; $year<=2009; $year++) { + my $prefix = "expenses$year"; + if (defined($personinfohash->{$mp_id}->{$prefix . '_colmp_reg_travel_a'})) { + + my $total = 0; + foreach my $let ('a'..'d') { + $total += $personinfohash->{$mp_id}->{$prefix . '_colmp_reg_travel_'.$let}; + $total += $personinfohash->{$mp_id}->{$prefix . '_colmp_other_travel_'.$let}; + } + $personinfohash->{$mp_id}->{$prefix . '_col5'} = $total; + $personinfohash->{$mp_id}->{$prefix . '_col6'} = $personinfohash->{$mp_id}->{$prefix . '_colemployee_travel_a'}; + $personinfohash->{$mp_id}->{$prefix . '_total'} = $personinfohash->{$mp_id}->{$prefix . '_coltotal_inc_travel'} + if $personinfohash->{$mp_id}->{$prefix . '_coltotal_inc_travel'}; } - foreach my $let ('a'..'f') { - enrankify($personinfohash, 'expenses2007_col5'.$let, 0); } + } - foreach my $let ('a'..'d') { - for (my $year=2008; $year<=2009; $year++) { - enrankify($personinfohash, 'expenses' . $year . '_colmp_reg_travel_'.$let, 0); - enrankify($personinfohash, 'expenses' . $year . '_colmp_other_travel_'.$let, 0); - } + for (my $year=2002; $year<=2009; ++$year) { + next if $year == 2006; + for (my $col=1; $col<=9; ++$col) { + enrankify($personinfohash, 'expenses'.$year.'_col'.$col, 0); } + enrankify($personinfohash, 'expenses'.$year.'_col7a', 0) if ($year>=2004); + enrankify($personinfohash, 'expenses'.$year.'_total', 0); + } + foreach my $let ('a'..'f') { + enrankify($personinfohash, 'expenses2007_col5'.$let, 0); + } + + foreach my $let ('a'..'d') { for (my $year=2008; $year<=2009; $year++) { - enrankify($personinfohash, 'expenses' . $year . '_colcomms_allowance', 0); - enrankify($personinfohash, 'expenses' . $year . '_colspouse_travel_a', 0); - enrankify($personinfohash, 'expenses' . $year . '_colfamily_travel_a', 0); - enrankify($personinfohash, 'expenses' . $year . '_coltotal_exc_travel', 0); - enrankify($personinfohash, 'expenses' . $year . '_coltotal_travel', 0); + enrankify($personinfohash, 'expenses' . $year . '_colmp_reg_travel_'.$let, 0); + enrankify($personinfohash, 'expenses' . $year . '_colmp_other_travel_'.$let, 0); } - enrankify($personinfohash, 'expenses2009_colstationery', 0); + } + for (my $year=2008; $year<=2009; $year++) { + enrankify($personinfohash, 'expenses' . $year . '_colcomms_allowance', 0); + enrankify($personinfohash, 'expenses' . $year . '_colspouse_travel_a', 0); + enrankify($personinfohash, 'expenses' . $year . '_colfamily_travel_a', 0); + enrankify($personinfohash, 'expenses' . $year . '_coltotal_exc_travel', 0); + enrankify($personinfohash, 'expenses' . $year . '_coltotal_travel', 0); + } + enrankify($personinfohash, 'expenses2009_colstationery', 0); } # Generate ranks from a data field sub enrankify { - my ($hash, $field, $backwards) = @_; - - # Extract value of $field for each MP who has it - my (%mpsvalue, %valuecount); - foreach my $mp_id (keys %$hash) { - my $value = $hash->{$mp_id}->{$field}; - if (defined $value) { - $value =~ s/%//; # remove % from end - $mpsvalue{$mp_id} = $value; - $valuecount{$value}++; - } + my ($hash, $field, $backwards) = @_; + + # Extract value of $field for each MP who has it + my (%mpsvalue, %valuecount); + foreach my $mp_id (keys %$hash) { + my $value = $hash->{$mp_id}->{$field}; + if (defined $value) { + $value =~ s/%//; # remove % from end + $mpsvalue{$mp_id} = $value; + $valuecount{$value}++; } - - my $count = scalar keys %mpsvalue; - return unless $count; - - # Sort, and calculate ranking for - my @mps; + } + + my $count = scalar keys %mpsvalue; + return unless $count; + + # Sort, and calculate ranking for + my @mps; + if ($backwards) { + @mps = sort { $mpsvalue{$a} <=> $mpsvalue{$b} } keys %mpsvalue; + } else { + @mps = sort { $mpsvalue{$b} <=> $mpsvalue{$a} } keys %mpsvalue; + } + + my @quintile = (); + for (my $i=1; $i<=4; $i++) { + my $q = ($count + 1) * $i / 5; + #$quintile[$i-1] = $q; + $quintile[$i-1] = $mpsvalue{$mps[int($q)]}; # ceil + } + + my $rank = 0; + my $activerank = 0; + my $prevvalue = -1; + foreach my $mp (@mps) { + $rank++; + $activerank = $rank if ($mpsvalue{$mp} != $prevvalue); + my $quintile; if ($backwards) { - @mps = sort { $mpsvalue{$a} <=> $mpsvalue{$b} } keys %mpsvalue; + # copy the below if you ever enrankify() something that is backwards } else { - @mps = sort { $mpsvalue{$b} <=> $mpsvalue{$a} } keys %mpsvalue; - } - - my @quintile = (); - for (my $i=1; $i<=4; $i++) { - my $q = ($count + 1) * $i / 5; - #$quintile[$i-1] = $q; - $quintile[$i-1] = $mpsvalue{$mps[int($q)]}; # ceil - } - - my $rank = 0; - my $activerank = 0; - my $prevvalue = -1; - foreach my $mp (@mps) { - $rank++; - $activerank = $rank if ($mpsvalue{$mp} != $prevvalue); - my $quintile; - if ($backwards) { - # copy the below if you ever enrankify() something that is backwards - } else { - # Ever so slightly biased towards average and above average, I guess - if ($mpsvalue{$mp} <= $quintile[1] && $mpsvalue{$mp} >= $quintile[2]) { - $quintile = 2; - } elsif ($mpsvalue{$mp} <= $quintile[0] && $mpsvalue{$mp} >= $quintile[1]) { - $quintile = 1; - } elsif ($mpsvalue{$mp} <= $quintile[2] && $mpsvalue{$mp} >= $quintile[3]) { - $quintile = 3; - } elsif ($mpsvalue{$mp} >= $quintile[0]) { - $quintile = 0; - } elsif ($mpsvalue{$mp} <= $quintile[3]) { - $quintile = 4; - } else { - die $!; - } - } - #print "$rank $activerank $mpsvalue{$mp} $quintile\n"; - #$quintile++ if ($activerank>$quintile[$quintile]); - #print $field . " " . $mp . " value $activerank of " . $#mps . "\n"; - $hash->{$mp}->{$field . "_rank"} = $activerank; - $hash->{$mp}->{$field . "_rank_joint"} = $valuecount{$mpsvalue{$mp}} if $valuecount{$mpsvalue{$mp}} > 1; - $hash->{$mp}->{$field . "_rank_outof"} = $count; - $hash->{$mp}->{$field . '_quintile'} = $quintile; - $prevvalue = $mpsvalue{$mp}; + # Ever so slightly biased towards average and above average, I guess + if ($mpsvalue{$mp} <= $quintile[1] && $mpsvalue{$mp} >= $quintile[2]) { + $quintile = 2; + } elsif ($mpsvalue{$mp} <= $quintile[0] && $mpsvalue{$mp} >= $quintile[1]) { + $quintile = 1; + } elsif ($mpsvalue{$mp} <= $quintile[2] && $mpsvalue{$mp} >= $quintile[3]) { + $quintile = 3; + } elsif ($mpsvalue{$mp} >= $quintile[0]) { + $quintile = 0; + } elsif ($mpsvalue{$mp} <= $quintile[3]) { + $quintile = 4; + } else { + die $!; + } } + #print "$rank $activerank $mpsvalue{$mp} $quintile\n"; + #$quintile++ if ($activerank>$quintile[$quintile]); + #print $field . " " . $mp . " value $activerank of " . $#mps . "\n"; + $hash->{$mp}->{$field . "_rank"} = $activerank; + $hash->{$mp}->{$field . "_rank_joint"} = $valuecount{$mpsvalue{$mp}} if $valuecount{$mpsvalue{$mp}} > 1; + $hash->{$mp}->{$field . "_rank_outof"} = $count; + $hash->{$mp}->{$field . '_quintile'} = $quintile; + $prevvalue = $mpsvalue{$mp}; + } } diff --git a/scripts/xml2db.pl b/scripts/xml2db.pl index 9c5c940068..f0cd1bb345 100755 --- a/scripts/xml2db.pl +++ b/scripts/xml2db.pl @@ -6,7 +6,7 @@ # # Magic numbers, and other properties of the destination schema # used to be documented here: -# http://web.archive.org/web/20090414002944/http://wiki.theyworkforyou.com/cgi-bin/moin.cgi/DataSchema +# http://web.archive.org/web/20090414002944/http://wiki.theyworkforyou.com/cgi-bin/moin.cgi/DataSchema # ... although please be aware that (as the archive.org URL suggests) # that document is no longer maintained and contains out-of-date information. # For some of the other magic numbers, you can refer to @@ -48,24 +48,24 @@ $scotland $scotwrans $scotqs %scotqspreloaded ); my $result = GetOptions ( "all" => \$all, - "recent" => \$recent, - "date=s" => \$date, - "from=s" => \$datefrom, - "to=s" => \$dateto, - "wrans" => \$wrans, - "westminhall" => \$westminhall, - "debates" => \$debates, - "wms" => \$wms, - "lordsdebates" => \$lordsdebates, - "ni" => \$ni, - "scotland" => \$scotland, - "scotwrans" => \$scotwrans, - "scotqs" => \$scotqs, - "standing" => \$standing, - "force" => \$force, - "quiet" => \$quiet, - "cronquiet" => \$cronquiet, - ); + "recent" => \$recent, + "date=s" => \$date, + "from=s" => \$datefrom, + "to=s" => \$dateto, + "wrans" => \$wrans, + "westminhall" => \$westminhall, + "debates" => \$debates, + "wms" => \$wms, + "lordsdebates" => \$lordsdebates, + "ni" => \$ni, + "scotland" => \$scotland, + "scotwrans" => \$scotwrans, + "scotqs" => \$scotqs, + "standing" => \$standing, + "force" => \$force, + "quiet" => \$quiet, + "cronquiet" => \$cronquiet, + ); my $c = 0; $c++ if $all; @@ -100,29 +100,29 @@ --from=YYYY-MM-DD --to=YYYY-MM-DD - reprocess this date range --force - also delete items from database that weren't in the XML - file (applied per day only) + file (applied per day only) --quiet - don't print the contents whenever an existing entry is - modified or deleted + modified or deleted --cronquiet - stop printing date names as entries are processed END - exit; + exit; } if ($datefrom || $dateto) { - $datefrom = "1000-01-01" if !defined $datefrom; - $dateto = "9999-12-31" if !defined $dateto; + $datefrom = "1000-01-01" if !defined $datefrom; + $dateto = "9999-12-31" if !defined $dateto; } else { - $datefrom = "9999-12-31"; - $dateto = "1000-01-01"; + $datefrom = "9999-12-31"; + $dateto = "1000-01-01"; } if ($date) { - $dateto = $date; - $datefrom = $date; + $dateto = $date; + $datefrom = $date; } db_connect(); @@ -178,89 +178,89 @@ END # Do dates in reverse order sub revsort { - return reverse sort @_; + return reverse sort @_; } # Process debates or wrans etc sub process_type { - my ($xnames, $xdirs, $xdayfunc) = @_; + my ($xnames, $xdirs, $xdayfunc) = @_; - my $process; - my $xsince = 0; - if (open FH, '<' . $lastupdatedir . $xnames->[0] . '-lastload') { - $xsince = readline FH; - close FH; - } - my @xmaxtime; - my $xmaxfile = ""; - for (my $i=0; $i<@$xdirs; $i++) { - my $xname = $xnames->[$i]; - my $xdir = $xdirs->[$i]; - $xmaxtime[$i] = 0; - - # Record which dates have files which have been updated - # (each date can have multiple files, if published Hansard has changed) - my $xwanted = sub { - return unless /^$xname(\d{4}-\d\d-\d\d)([a-z]*)\.xml$/ - || /^(\d{4}-\d\d-\d\d)_(\d+)\.xml$/ - || /^$xname\d{4}-\d\d-\d\d_[^_]*_[^_]*_(\d{4}-\d\d-\d\d)([a-z]*)\.xml$/; - my $xfile = $_; - my @stat = stat($xdir . $xfile); - my $use = ($stat[9] >= $xsince); - my $date_part = $1; - - if ($xmaxtime[$i] < $stat[9]) { - $xmaxfile = $xfile; - $xmaxtime[$i] = $stat[9]; - } + my $process; + my $xsince = 0; + if (open FH, '<' . $lastupdatedir . $xnames->[0] . '-lastload') { + $xsince = readline FH; + close FH; + } + my @xmaxtime; + my $xmaxfile = ""; + for (my $i=0; $i<@$xdirs; $i++) { + my $xname = $xnames->[$i]; + my $xdir = $xdirs->[$i]; + $xmaxtime[$i] = 0; + + # Record which dates have files which have been updated + # (each date can have multiple files, if published Hansard has changed) + my $xwanted = sub { + return unless /^$xname(\d{4}-\d\d-\d\d)([a-z]*)\.xml$/ + || /^(\d{4}-\d\d-\d\d)_(\d+)\.xml$/ + || /^$xname\d{4}-\d\d-\d\d_[^_]*_[^_]*_(\d{4}-\d\d-\d\d)([a-z]*)\.xml$/; + my $xfile = $_; + my @stat = stat($xdir . $xfile); + my $use = ($stat[9] >= $xsince); + my $date_part = $1; + + if ($xmaxtime[$i] < $stat[9]) { + $xmaxfile = $xfile; + $xmaxtime[$i] = $stat[9]; + } - #print $xfile ." ".($use?"t":"f")." $xsince $stat[9]\n"; - if ($all || ($use && $recent) || ($datefrom le $date_part && $date_part le $dateto)) { - $process->{$date_part} = 1; - } - }; - find({ wanted=>$xwanted, preprocess=>\&revsort }, $xdir); - } + #print $xfile ." ".($use?"t":"f")." $xsince $stat[9]\n"; + if ($all || ($use && $recent) || ($datefrom le $date_part && $date_part le $dateto)) { + $process->{$date_part} = 1; + } + }; + find({ wanted=>$xwanted, preprocess=>\&revsort }, $xdir); + } - # Go through dates, and load each one - my $xname = join(',', @$xnames); - foreach my $process_date (sort keys %$process) { - if (!$cronquiet) { - print "db loading $xname $process_date\n"; - } - &$xdayfunc($process_date); - # So we don't do it again - # XXX Doesn't currently apply to any files - #my $xfile = "$process_date.xml"; - #foreach my $xdir (@$xdirs) { - # utime(($xsince - 1), ($xsince - 1), ($xdir.$xfile)); - #} + # Go through dates, and load each one + my $xname = join(',', @$xnames); + foreach my $process_date (sort keys %$process) { + if (!$cronquiet) { + print "db loading $xname $process_date\n"; } + &$xdayfunc($process_date); + # So we don't do it again + # XXX Doesn't currently apply to any files + #my $xfile = "$process_date.xml"; + #foreach my $xdir (@$xdirs) { + # utime(($xsince - 1), ($xsince - 1), ($xdir.$xfile)); + #} + } - # Store that we've done - if ($recent) { - my $xxmaxtime = 0; - for (my $i=0; $i<@$xdirs; $i++) { - my $xdir = $xdirs->[$i]; - (my $sxdir = $xdir) =~ s/lords(wrans|wms)/lordspages/; - # Find last update time - die "xmaxtime[$i] not initialised" unless $xmaxtime[$i]; - if ($xxmaxtime < $xmaxtime[$i]) { - $xxmaxtime = $xmaxtime[$i]; - } - } + # Store that we've done + if ($recent) { + my $xxmaxtime = 0; + for (my $i=0; $i<@$xdirs; $i++) { + my $xdir = $xdirs->[$i]; + (my $sxdir = $xdir) =~ s/lords(wrans|wms)/lordspages/; + # Find last update time + die "xmaxtime[$i] not initialised" unless $xmaxtime[$i]; + if ($xxmaxtime < $xmaxtime[$i]) { + $xxmaxtime = $xmaxtime[$i]; + } + } - if ($xxmaxtime != $xsince) { - # We use the current maxtime, so we run things still at that time again - # (when there was an rsync from parlparse it might have only got one of - # two files set in # the same second, and next time it might get the other) - #print "$xname since: $xsince new max $xmaxtime from changedates\n"; - my $xname = $xnames->[0]; - open FH, ">$lastupdatedir$xname-lastload" or die "couldn't open $lastupdatedir$xname-lastload for writing"; - print FH $xxmaxtime; - close FH; - } + if ($xxmaxtime != $xsince) { + # We use the current maxtime, so we run things still at that time again + # (when there was an rsync from parlparse it might have only got one of + # two files set in # the same second, and next time it might get the other) + #print "$xname since: $xsince new max $xmaxtime from changedates\n"; + my $xname = $xnames->[0]; + open FH, ">$lastupdatedir$xname-lastload" or die "couldn't open $lastupdatedir$xname-lastload for writing"; + print FH $xxmaxtime; + close FH; } + } } # Load member->person data @@ -291,597 +291,597 @@ sub process_type { # Code from: perldoc -q 'How do I test whether two arrays or hashes are equal?' # Why is this not built in? This kind of thing makes me never want to use Perl again. sub compare_arrays { - my ($first, $second) = @_; - return 0 unless @$first == @$second; - for (my $i = 0; $i < @$first; $i++) { - if (defined $first->[$i] or defined $second->[$i]) { - if (!defined $first->[$i] or !defined $second->[$i]) { - return 0; - } - $second->[$i] = '00:00:00' if ($second->[$i] eq 'unknown'); - if ($first->[$i] ne $second->[$i]) { - return 0; - } - } + my ($first, $second) = @_; + return 0 unless @$first == @$second; + for (my $i = 0; $i < @$first; $i++) { + if (defined $first->[$i] or defined $second->[$i]) { + if (!defined $first->[$i] or !defined $second->[$i]) { + return 0; + } + $second->[$i] = '00:00:00' if ($second->[$i] eq 'unknown'); + if ($first->[$i] ne $second->[$i]) { + return 0; + } } - return 1; + } + return 1; } sub describe_compare_arrays { - my ($first, $second) = @_; - my $ret = ""; - if (@$first != @$second) { - die "sizes differ in describe_compare_arrays"; + my ($first, $second) = @_; + my $ret = ""; + if (@$first != @$second) { + die "sizes differ in describe_compare_arrays"; + } + for (my $i = 0; $i < @$first; $i++) + { + my $from = $first->[$i]; + my $to = $second->[$i]; + if (defined $from and (! defined $to)) + { $ret .= "at $i value #$from# to . "; } + elsif ((!defined $from) and defined $to) + { $ret .= "at $i value to #$to#. "; } + elsif (defined $from and defined $to) { + if ("$from" ne "$to") + { $ret .= "at $i value #$from# to #$to#. "; } } - for (my $i = 0; $i < @$first; $i++) - { - my $from = $first->[$i]; - my $to = $second->[$i]; - if (defined $from and (! defined $to)) - { $ret .= "at $i value #$from# to . "; } - elsif ((!defined $from) and defined $to) - { $ret .= "at $i value to #$to#. "; } - elsif (defined $from and defined $to) { - if ("$from" ne "$to") - { $ret .= "at $i value #$from# to #$to#. "; } - } - elsif ((!defined $from) and (!defined $to)) { - # OK - } - else - { die "unknown defined status in describe_compare_arrays"; } + elsif ((!defined $from) and (!defined $to)) { + # OK } - return $ret; + else + { die "unknown defined status in describe_compare_arrays"; } + } + return $ret; } sub array_difference { - my $array1 = shift; - my $array2 = shift; - - my @union = (); - my @intersection = (); - my @difference = (); - - my %count = (); - foreach my $element (@$array1, @$array2) { $count{$element}++ } - foreach my $element (keys %count) { - push @union, $element; - push @{ $count{$element} > 1 ? \@intersection : \@difference }, $element; - } - return \@difference; + my $array1 = shift; + my $array2 = shift; + + my @union = (); + my @intersection = (); + my @difference = (); + + my %count = (); + foreach my $element (@$array1, @$array2) { $count{$element}++ } + foreach my $element (keys %count) { + push @union, $element; + push @{ $count{$element} > 1 ? \@intersection : \@difference }, $element; + } + return \@difference; } sub strip_string { - my $s = shift; - $s =~ s/^\s+//; - $s =~ s/\s+$//; - return $s; + my $s = shift; + $s =~ s/^\s+//; + $s =~ s/\s+$//; + return $s; } # Converts all capital parts of a heading to mixed case sub fix_case { - $_ = shift; -# print "b:" . $_ . "\n"; + $_ = shift; +# print "b:" . $_ . "\n"; - # We work on each hyphen (mdash, —) separated section separately - my @parts = split /—/; - my @fixed_parts = map(&fix_case_part, @parts); - $_ = join(" — ", @fixed_parts); + # We work on each hyphen (mdash, —) separated section separately + my @parts = split /—/; + my @fixed_parts = map(&fix_case_part, @parts); + $_ = join(" — ", @fixed_parts); -# print "a:" . $_ . "\n"; - return $_; +# print "a:" . $_ . "\n"; + return $_; } sub fix_case_part { - # This mainly applies to departmental names for Oral Answers to Questions -# print "fix_case_part " . $_ . "\n"; - - s/\s+$//g; - s/^\s+//g; - s/\s+/ /g; - - # if it is all capitals in Hansard - # e.g. CABINET OFFICE - if (m/^[^a-z]+(&[^a-z]+)*$/) - { - die "undefined part title" if ! $_; -# print "fixing case: $_\n"; - Uncapitalise::format($_); -# print "fixed case: $_\n"; - } - die "not defined title part" if ! $_; + # This mainly applies to departmental names for Oral Answers to Questions +# print "fix_case_part " . $_ . "\n"; + + s/\s+$//g; + s/^\s+//g; + s/\s+/ /g; + + # if it is all capitals in Hansard + # e.g. CABINET OFFICE + if (m/^[^a-z]+(&[^a-z]+)*$/) + { + die "undefined part title" if ! $_; +# print "fixing case: $_\n"; + Uncapitalise::format($_); +# print "fixed case: $_\n"; + } + die "not defined title part" if ! $_; - return $_; + return $_; } # Parse all the files which match the glob using twig. sub parsefile_glob { - my ($twig, $glob) = @_; - my @files = glob($glob); - %ignorehistorygids = (); - foreach (@files) { - $current_file = $_; - #print "twigging: $_\n"; - $twig->parsefile($_); - } + my ($twig, $glob) = @_; + my @files = glob($glob); + %ignorehistorygids = (); + foreach (@files) { + $current_file = $_; + #print "twigging: $_\n"; + $twig->parsefile($_); + } } ########################################################################## # Database my ($dbh, - $epadd, $epcheck, $epupdate, - $hadd, $hcheck, $hupdate, $hdelete, $hdeletegid, - $gradd, $grdeletegid, - $scotqadd, $scotqdelete, $scotqbusinessexist, $scotqholdingexist, - $scotqdategidexist, $scotqreferenceexist, - $lastid); + $epadd, $epcheck, $epupdate, + $hadd, $hcheck, $hupdate, $hdelete, $hdeletegid, + $gradd, $grdeletegid, + $scotqadd, $scotqdelete, $scotqbusinessexist, $scotqholdingexist, + $scotqdategidexist, $scotqreferenceexist, + $lastid); sub db_connect { - # Connect to database, and prepare queries - my $dsn = 'DBI:mysql:database=' . mySociety::Config::get('TWFY_DB_NAME'). ':host=' . mySociety::Config::get('TWFY_DB_HOST'); - $dbh = DBI->connect($dsn, mySociety::Config::get('TWFY_DB_USER'), mySociety::Config::get('TWFY_DB_PASS'), { RaiseError => 1, PrintError => 0 }); - - # epobject queries - $epadd = $dbh->prepare("insert into epobject (title, body, type, created, modified) - values ('', ?, 1, NOW(), NOW())"); - $epcheck = $dbh->prepare("select body from epobject where epobject_id = ?"); - $epupdate = $dbh->prepare("update epobject set body = ?, modified = NOW() where epobject_id = ?"); - - # hansard object queries - $hadd = $dbh->prepare("insert into hansard (epobject_id, gid, colnum, htype, person_id, major, minor, section_id, subsection_id, hpos, hdate, htime, source_url, created, modified) - values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW(), NOW())"); - $hcheck = $dbh->prepare("select epobject_id, gid, colnum, htype, person_id, major, minor, section_id, subsection_id, hpos, hdate, htime, source_url from hansard where gid = ?"); - $hupdate = $dbh->prepare("update hansard set gid = ?, colnum = ?, htype = ?, person_id = ?, major = ?, minor = ?, section_id = ?, subsection_id = ?, hpos = ?, hdate = ?, htime = ?, source_url = ?, modified = NOW() - where epobject_id = ? and gid = ?"); - $hdelete = $dbh->prepare("delete from hansard where gid = ? and epobject_id = ?"); - $hdeletegid = $dbh->prepare("delete from hansard where gid = ?"); - - # gidredirect entries - $gradd = $dbh->prepare("replace into gidredirect (gid_from, gid_to, hdate, major) values (?,?,?,?)"); - $grdeletegid = $dbh->prepare("delete from gidredirect where gid_from = ?"); - - # scottish question mentions - $scotqadd = $dbh->prepare("insert into mentions (gid, type, date, url, mentioned_gid) values (?,?,?,?,?)"); - $scotqbusinessexist = $dbh->prepare("select mention_id from mentions where gid = ? and type = ? and date = ? and url = ?"); - $scotqholdingexist = $dbh->prepare("select mention_id from mentions where gid = ? and type = ? and date = ?"); - $scotqdategidexist = $dbh->prepare("select mention_id from mentions where gid = ? and type = ? and date = ? and mentioned_gid = ?"); - $scotqreferenceexist = $dbh->prepare("select mention_id from mentions where gid = ? and type = ? and mentioned_gid = ?"); - $scotqdelete = $dbh->prepare("delete from mentions where mention_id = ?"); - - # other queries - $lastid = $dbh->prepare("select last_insert_id()"); - - # Clear any half made previous attempts. - delete_lonely_epobjects() + # Connect to database, and prepare queries + my $dsn = 'DBI:mysql:database=' . mySociety::Config::get('TWFY_DB_NAME'). ':host=' . mySociety::Config::get('TWFY_DB_HOST'); + $dbh = DBI->connect($dsn, mySociety::Config::get('TWFY_DB_USER'), mySociety::Config::get('TWFY_DB_PASS'), { RaiseError => 1, PrintError => 0 }); + + # epobject queries + $epadd = $dbh->prepare("insert into epobject (title, body, type, created, modified) + values ('', ?, 1, NOW(), NOW())"); + $epcheck = $dbh->prepare("select body from epobject where epobject_id = ?"); + $epupdate = $dbh->prepare("update epobject set body = ?, modified = NOW() where epobject_id = ?"); + + # hansard object queries + $hadd = $dbh->prepare("insert into hansard (epobject_id, gid, colnum, htype, person_id, major, minor, section_id, subsection_id, hpos, hdate, htime, source_url, created, modified) + values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW(), NOW())"); + $hcheck = $dbh->prepare("select epobject_id, gid, colnum, htype, person_id, major, minor, section_id, subsection_id, hpos, hdate, htime, source_url from hansard where gid = ?"); + $hupdate = $dbh->prepare("update hansard set gid = ?, colnum = ?, htype = ?, person_id = ?, major = ?, minor = ?, section_id = ?, subsection_id = ?, hpos = ?, hdate = ?, htime = ?, source_url = ?, modified = NOW() + where epobject_id = ? and gid = ?"); + $hdelete = $dbh->prepare("delete from hansard where gid = ? and epobject_id = ?"); + $hdeletegid = $dbh->prepare("delete from hansard where gid = ?"); + + # gidredirect entries + $gradd = $dbh->prepare("replace into gidredirect (gid_from, gid_to, hdate, major) values (?,?,?,?)"); + $grdeletegid = $dbh->prepare("delete from gidredirect where gid_from = ?"); + + # scottish question mentions + $scotqadd = $dbh->prepare("insert into mentions (gid, type, date, url, mentioned_gid) values (?,?,?,?,?)"); + $scotqbusinessexist = $dbh->prepare("select mention_id from mentions where gid = ? and type = ? and date = ? and url = ?"); + $scotqholdingexist = $dbh->prepare("select mention_id from mentions where gid = ? and type = ? and date = ?"); + $scotqdategidexist = $dbh->prepare("select mention_id from mentions where gid = ? and type = ? and date = ? and mentioned_gid = ?"); + $scotqreferenceexist = $dbh->prepare("select mention_id from mentions where gid = ? and type = ? and mentioned_gid = ?"); + $scotqdelete = $dbh->prepare("delete from mentions where mention_id = ?"); + + # other queries + $lastid = $dbh->prepare("select last_insert_id()"); + + # Clear any half made previous attempts. + delete_lonely_epobjects() } # Process a file of "mentions"... sub process_mentions { - my ($xgidtype,$xdir) = @_; + my ($xgidtype,$xdir) = @_; - # Nasty cut-and-pasting of process_type, more or less: + # Nasty cut-and-pasting of process_type, more or less: - if ($xgidtype eq "spq") { + if ($xgidtype eq "spq") { - # Checking one-by-one which rows are already in the DB is - # horrendously slow, so we could load in the entire table as - # the first thing we do: + # Checking one-by-one which rows are already in the DB is + # horrendously slow, so we could load in the entire table as + # the first thing we do: - my $preload_table = 1; - if ($preload_table) { - %scotqspreloaded = (); - my $scotqsall = $dbh->prepare("select mention_id, gid, type, date, url, mentioned_gid from mentions"); - my $rows = $scotqsall->execute(); - while (my @row = $scotqsall->fetchrow_array()) { - my $s = join('|', map { defined $_ ? $_ : '' } @row[1..5]); - $scotqspreloaded{$s} = 1; - } - } - - my $process; - my $xsince = 0; - if (open FH, '<' . $lastupdatedir . $xgidtype . '-lastload') { - $xsince = readline FH; - close FH; - } + my $preload_table = 1; + if ($preload_table) { + %scotqspreloaded = (); + my $scotqsall = $dbh->prepare("select mention_id, gid, type, date, url, mentioned_gid from mentions"); + my $rows = $scotqsall->execute(); + while (my @row = $scotqsall->fetchrow_array()) { + my $s = join('|', map { defined $_ ? $_ : '' } @row[1..5]); + $scotqspreloaded{$s} = 1; + } + } - my $xmaxtime = 0; - my $xmaxfile = ""; + my $process; + my $xsince = 0; + if (open FH, '<' . $lastupdatedir . $xgidtype . '-lastload') { + $xsince = readline FH; + close FH; + } - # Record which dates have files which have been updated: + my $xmaxtime = 0; + my $xmaxfile = ""; - my $xwanted = sub { - return unless /^up-to-(\d{4}-\d\d-\d\d)(.*)\.xml$/; - my $xfile = $_; - my @stat = stat($xdir . $xfile); - my $use = ($stat[9] >= $xsince); - my $date_part = $1; + # Record which dates have files which have been updated: - if ($xmaxtime < $stat[9]) { - $xmaxfile = $xfile; - $xmaxtime = $stat[9]; - } + my $xwanted = sub { + return unless /^up-to-(\d{4}-\d\d-\d\d)(.*)\.xml$/; + my $xfile = $_; + my @stat = stat($xdir . $xfile); + my $use = ($stat[9] >= $xsince); + my $date_part = $1; - #print $xfile ." ".($use?"t":"f")." $xsince $stat[9]\n"; - if ($all || ($use && $recent) || ($datefrom le $date_part && $date_part le $dateto)) { - $process->{$date_part} = $xfile; - } - }; + if ($xmaxtime < $stat[9]) { + $xmaxfile = $xfile; + $xmaxtime = $stat[9]; + } - find({ wanted=>$xwanted, preprocess=>\&revsort }, $xdir); + #print $xfile ." ".($use?"t":"f")." $xsince $stat[9]\n"; + if ($all || ($use && $recent) || ($datefrom le $date_part && $date_part le $dateto)) { + $process->{$date_part} = $xfile; + } + }; - # Go through dates, and load each one - foreach my $process_date (sort keys %$process) { - my $xfile = $process->{$process_date}; - if (!$cronquiet) { - print "db loading $process_date (file: $xfile)\n"; - } - my $twig = XML::Twig->new(twig_handlers => - { 'question' => \&loadspq }); - $twig->parsefile($xdir . $xfile); - $twig->dispose(); - } + find({ wanted=>$xwanted, preprocess=>\&revsort }, $xdir); - # Store that we've done - if ($recent) { - die "xmaxtime not initialised" unless $xmaxtime; - if ($xmaxtime != $xsince) { - open FH, ">$lastupdatedir$xgidtype-lastload" or die "couldn't open $lastupdatedir$xgidtype-lastload for writing"; - print FH $xmaxtime; - close FH; - } - } + # Go through dates, and load each one + foreach my $process_date (sort keys %$process) { + my $xfile = $process->{$process_date}; + if (!$cronquiet) { + print "db loading $process_date (file: $xfile)\n"; + } + my $twig = XML::Twig->new(twig_handlers => + { 'question' => \&loadspq }); + $twig->parsefile($xdir . $xfile); + $twig->dispose(); + } - } else { - die "Unknown gid type in process_mentions ($xgidtype)" + # Store that we've done + if ($recent) { + die "xmaxtime not initialised" unless $xmaxtime; + if ($xmaxtime != $xsince) { + open FH, ">$lastupdatedir$xgidtype-lastload" or die "couldn't open $lastupdatedir$xgidtype-lastload for writing"; + print FH $xmaxtime; + close FH; + } } + + } else { + die "Unknown gid type in process_mentions ($xgidtype)" + } } sub db_disconnect { - $dbh->disconnect(); + $dbh->disconnect(); } sub delete_lonely_epobjects() { - # We assume all epobjects are type 1, i.e. have hansard table entries for now - my $r = $dbh->selectcol_arrayref("select count(*) from epobject where type <> 1;"); - my $c = $r->[0]; - die "Unknown type not 1 entries in epobject table, lots of code needs fixing" if $c > 0; - - # Quick check using counts - my $r1 = $dbh->selectcol_arrayref("select count(*) from epobject"); - my $c1 = $r1->[0]; - my $r2 = $dbh->selectcol_arrayref("select count(*) from hansard"); - my $c2 = $r2->[0]; - return if $c2 == $c1; - - print "Fixing up lonely epobjects. Counts: $c1 $c2\n" unless $cronquiet; - my $q = $dbh->prepare("select epobject_id from epobject"); - $q->execute(); - my $left; - while (my @row = $q->fetchrow_array) { - $left->{$row[0]} = 1; - } - $q = $dbh->prepare("select epobject_id from hansard"); - $q->execute(); - while (my @row = $q->fetchrow_array) { - delete($left->{$row[0]}); - } + # We assume all epobjects are type 1, i.e. have hansard table entries for now + my $r = $dbh->selectcol_arrayref("select count(*) from epobject where type <> 1;"); + my $c = $r->[0]; + die "Unknown type not 1 entries in epobject table, lots of code needs fixing" if $c > 0; + + # Quick check using counts + my $r1 = $dbh->selectcol_arrayref("select count(*) from epobject"); + my $c1 = $r1->[0]; + my $r2 = $dbh->selectcol_arrayref("select count(*) from hansard"); + my $c2 = $r2->[0]; + return if $c2 == $c1; + + print "Fixing up lonely epobjects. Counts: $c1 $c2\n" unless $cronquiet; + my $q = $dbh->prepare("select epobject_id from epobject"); + $q->execute(); + my $left; + while (my @row = $q->fetchrow_array) { + $left->{$row[0]} = 1; + } + $q = $dbh->prepare("select epobject_id from hansard"); + $q->execute(); + while (my @row = $q->fetchrow_array) { + delete($left->{$row[0]}); + } - my @array = keys(%$left); - my $rows = @array; - print "Lonely epobject count: $rows\n" unless $cronquiet; - if ($rows > 0) { - my $delids = join(", ", @array); - my $qq = $dbh->prepare("delete from epobject where epobject_id in (" . $delids . ")"); - my $delrows = $qq->execute(); - $qq->finish(); - die "deleted " . $delrows . " but thought " . $rows if $delrows != $rows; - } - $q->finish(); + my @array = keys(%$left); + my $rows = @array; + print "Lonely epobject count: $rows\n" unless $cronquiet; + if ($rows > 0) { + my $delids = join(", ", @array); + my $qq = $dbh->prepare("delete from epobject where epobject_id in (" . $delids . ")"); + my $delrows = $qq->execute(); + $qq->finish(); + die "deleted " . $delrows . " but thought " . $rows if $delrows != $rows; + } + $q->finish(); } # Check that there are no extra gids in db that weren't in xml sub check_extra_gids { - my $date = shift; - my $gidsref = shift; - my $where = shift; - - my $q = $dbh->prepare("select gid from hansard where hdate = ? and gid not like '%L' and $where"); - my $rows = $q->execute($date); - my $array_ref1 = $q->fetchall_arrayref(); - $q->finish(); - $q = $dbh->prepare("select gid_from from gidredirect where hdate = ? and $where"); - $rows = $q->execute($date); - my $array_ref2 = $q->fetchall_arrayref(); - $q->finish(); - - my @mysql_gids1 = map $_->[0], @$array_ref1; - my @mysql_gids2 = map $_->[0], @$array_ref2; - my @mysql_allgids = sort(@mysql_gids1, @mysql_gids2); - - my @xml_gids = sort @$gidsref; - - # Find items in MySQL which aren't in XML -- this shouldn't - # happen, the Public Whip parser should never allow it. This - # code is partly a double check. - my %xml_hash; - foreach my $gid (@xml_gids) { - $xml_hash{$gid} = 1; - } - my $missing = 0; - foreach my $gid (@mysql_allgids) { - if (!$xml_hash{$gid}) { - # in MySQL, not in XML - $missing++; - my $vital = 0; - # check no comments, votes etc. - for my $entry (["comments", "epobject_id",], - ["anonvotes", "epobject_id",], - ["uservotes", "epobject_id",], - ["trackbacks", "epobject_id",], - ["editqueue", "epobject_id_l",], - ["editqueue", "epobject_id_h",], - ) { - my ($table, $field) = @$entry; - my $epuse_comments = $dbh->prepare("select count(*) from epobject, hansard, $table - where epobject.epobject_id = $table.$field and epobject.epobject_id = hansard.epobject_id and - hansard.gid = ?"); - $epuse_comments->execute($gid); - my $num_rows = $epuse_comments->fetchrow_array(); - $epuse_comments->finish(); - if ($num_rows > 0) { - if ($gid =~ /wrans/ && !$cronquiet) { - my $search_gid = $gid; - $search_gid =~ s/(\d\d\d\d-\d)\d-\d\d\w(\.\d+\.)/$1%$2/; - my $daychange = $dbh->prepare('SELECT gid,epobject_id FROM hansard WHERE gid like ? AND gid != ?'); - $daychange->execute($search_gid, $gid); - my ($new_gid, $new_epobjectid) = $daychange->fetchrow_array(); - if ($new_epobjectid) { - my $hgetid = $dbh->prepare("select epobject_id from hansard where gid = ?"); - $hgetid->execute($gid); - my $old_epobjectid = $hgetid->fetchrow_array(); - $hgetid->finish(); - print "POSSIBLE FIX: $gid -> $new_gid, $old_epobjectid -> $new_epobjectid ?\n"; - my $yes = ; - if ($yes =~ /^y$/i) { - update_eid($table, $field, $old_epobjectid, $new_epobjectid); - next; - } - } - } - print "VITAL ERROR! gid $gid needs deleting, has an entry in table $table, but no gid redirect\n"; - $vital++; - } - } - # either fix it, or display it - if ($force) { - if ($vital > 0) { - die "Refusing to even force delete, when there are references in other tables\n"; - } else { - $hdeletegid->execute($gid); - $hdeletegid->finish(); - $grdeletegid->execute($gid); - $grdeletegid->finish(); - print "FORCED deleting $gid from db, wasn't in XML\n"; - } - } - else { - print "gid $gid in database not in XML, run again with --force to delete\n"; + my $date = shift; + my $gidsref = shift; + my $where = shift; + + my $q = $dbh->prepare("select gid from hansard where hdate = ? and gid not like '%L' and $where"); + my $rows = $q->execute($date); + my $array_ref1 = $q->fetchall_arrayref(); + $q->finish(); + $q = $dbh->prepare("select gid_from from gidredirect where hdate = ? and $where"); + $rows = $q->execute($date); + my $array_ref2 = $q->fetchall_arrayref(); + $q->finish(); + + my @mysql_gids1 = map $_->[0], @$array_ref1; + my @mysql_gids2 = map $_->[0], @$array_ref2; + my @mysql_allgids = sort(@mysql_gids1, @mysql_gids2); + + my @xml_gids = sort @$gidsref; + + # Find items in MySQL which aren't in XML -- this shouldn't + # happen, the Public Whip parser should never allow it. This + # code is partly a double check. + my %xml_hash; + foreach my $gid (@xml_gids) { + $xml_hash{$gid} = 1; + } + my $missing = 0; + foreach my $gid (@mysql_allgids) { + if (!$xml_hash{$gid}) { + # in MySQL, not in XML + $missing++; + my $vital = 0; + # check no comments, votes etc. + for my $entry (["comments", "epobject_id",], + ["anonvotes", "epobject_id",], + ["uservotes", "epobject_id",], + ["trackbacks", "epobject_id",], + ["editqueue", "epobject_id_l",], + ["editqueue", "epobject_id_h",], + ) { + my ($table, $field) = @$entry; + my $epuse_comments = $dbh->prepare("select count(*) from epobject, hansard, $table + where epobject.epobject_id = $table.$field and epobject.epobject_id = hansard.epobject_id and + hansard.gid = ?"); + $epuse_comments->execute($gid); + my $num_rows = $epuse_comments->fetchrow_array(); + $epuse_comments->finish(); + if ($num_rows > 0) { + if ($gid =~ /wrans/ && !$cronquiet) { + my $search_gid = $gid; + $search_gid =~ s/(\d\d\d\d-\d)\d-\d\d\w(\.\d+\.)/$1%$2/; + my $daychange = $dbh->prepare('SELECT gid,epobject_id FROM hansard WHERE gid like ? AND gid != ?'); + $daychange->execute($search_gid, $gid); + my ($new_gid, $new_epobjectid) = $daychange->fetchrow_array(); + if ($new_epobjectid) { + my $hgetid = $dbh->prepare("select epobject_id from hansard where gid = ?"); + $hgetid->execute($gid); + my $old_epobjectid = $hgetid->fetchrow_array(); + $hgetid->finish(); + print "POSSIBLE FIX: $gid -> $new_gid, $old_epobjectid -> $new_epobjectid ?\n"; + my $yes = ; + if ($yes =~ /^y$/i) { + update_eid($table, $field, $old_epobjectid, $new_epobjectid); + next; + } } + } + print "VITAL ERROR! gid $gid needs deleting, has an entry in table $table, but no gid redirect\n"; + $vital++; } - } - if ($missing) { - if ($force) { - delete_lonely_epobjects(); + } + # either fix it, or display it + if ($force) { + if ($vital > 0) { + die "Refusing to even force delete, when there are references in other tables\n"; } else { - die; + $hdeletegid->execute($gid); + $hdeletegid->finish(); + $grdeletegid->execute($gid); + $grdeletegid->finish(); + print "FORCED deleting $gid from db, wasn't in XML\n"; } + } + else { + print "gid $gid in database not in XML, run again with --force to delete\n"; + } + } + } + if ($missing) { + if ($force) { + delete_lonely_epobjects(); + } else { + die; } + } } sub delete_redirected_gids { - my ($date, $grdests) = @_; - my $q_redirect = $dbh->prepare('SELECT gid_to from gidredirect WHERE gid_from = ?'); - my $hgetid = $dbh->prepare("select epobject_id from hansard where gid = ?"); - open FP, '>>' . $lastupdatedir . 'deleted-gids'; - foreach my $from_gid (sort keys %$grdests) { - my $to_gid = $grdests->{$from_gid}[0]; - my $matchtype = $grdests->{$from_gid}[1]; - my $loop; - do { - $loop = 0; - $q_redirect->execute($to_gid); - my $lookup = $q_redirect->fetchrow_array(); - if ($lookup) { - $loop = 1; - $to_gid = $lookup; - } - } while ($loop); - $hcheck->execute($to_gid); - my $new_epobjectid = ($hcheck->fetchrow_array())[0]; - $hcheck->finish(); - unless ($new_epobjectid) { - #print "PROBLEM: $from_gid\n"; - next; - } + my ($date, $grdests) = @_; + my $q_redirect = $dbh->prepare('SELECT gid_to from gidredirect WHERE gid_from = ?'); + my $hgetid = $dbh->prepare("select epobject_id from hansard where gid = ?"); + open FP, '>>' . $lastupdatedir . 'deleted-gids'; + foreach my $from_gid (sort keys %$grdests) { + my $to_gid = $grdests->{$from_gid}[0]; + my $matchtype = $grdests->{$from_gid}[1]; + my $loop; + do { + $loop = 0; + $q_redirect->execute($to_gid); + my $lookup = $q_redirect->fetchrow_array(); + if ($lookup) { + $loop = 1; + $to_gid = $lookup; + } + } while ($loop); + $hcheck->execute($to_gid); + my $new_epobjectid = ($hcheck->fetchrow_array())[0]; + $hcheck->finish(); + unless ($new_epobjectid) { + #print "PROBLEM: $from_gid\n"; + next; + } - # move comments and votes and so forth to redirected gid destination - for my $entry (["comments", "epobject_id",], - ["anonvotes", "epobject_id",], - ["uservotes", "epobject_id",], - ["trackbacks", "epobject_id",], - ["editqueue", "epobject_id_l",], - ["editqueue", "epobject_id_h",], - ) { - my ($table, $field) = @$entry; - my $epuse_comments = $dbh->prepare("select count(*) from epobject, hansard, $table - where epobject.epobject_id = $table.$field and epobject.epobject_id = hansard.epobject_id and - hansard.gid = ?"); - $epuse_comments->execute($from_gid); - my $num_rows = $epuse_comments->fetchrow_array(); - $epuse_comments->finish(); - if ($num_rows > 0) { - $hgetid->execute($from_gid); - my $old_epobjectid = $hgetid->fetchrow_array(); - $hgetid->finish(); - - print "gid $from_gid has $num_rows " . ($num_rows==1?'entry':'entries') . " in table $table, new gid $to_gid\n" unless $cronquiet; - update_eid($table, $field, $old_epobjectid, $new_epobjectid); - } - } + # move comments and votes and so forth to redirected gid destination + for my $entry (["comments", "epobject_id",], + ["anonvotes", "epobject_id",], + ["uservotes", "epobject_id",], + ["trackbacks", "epobject_id",], + ["editqueue", "epobject_id_l",], + ["editqueue", "epobject_id_h",], + ) { + my ($table, $field) = @$entry; + my $epuse_comments = $dbh->prepare("select count(*) from epobject, hansard, $table + where epobject.epobject_id = $table.$field and epobject.epobject_id = hansard.epobject_id and + hansard.gid = ?"); + $epuse_comments->execute($from_gid); + my $num_rows = $epuse_comments->fetchrow_array(); + $epuse_comments->finish(); + if ($num_rows > 0) { + $hgetid->execute($from_gid); + my $old_epobjectid = $hgetid->fetchrow_array(); + $hgetid->finish(); + + print "gid $from_gid has $num_rows " . ($num_rows==1?'entry':'entries') . " in table $table, new gid $to_gid\n" unless $cronquiet; + update_eid($table, $field, $old_epobjectid, $new_epobjectid); + } + } - # Maintain video bits - if ($matchtype eq 'missing') { - $dbh->do('update video_timestamps set deleted=2 where gid=?', {}, $from_gid); - } else { - $dbh->do('update video_timestamps set gid=? where gid=?', {}, $to_gid, $from_gid); - } - my $video_update = $dbh->selectrow_array('select video_status from hansard where gid=?', {}, $from_gid); - $dbh->do('update hansard set video_status=? where gid=?', {}, $video_update, $to_gid) - if defined $video_update; - - # delete the now obsolete "from record" (which is replaced by its "to record") - my $c = $hdeletegid->execute($from_gid); - if ($c > 0) { - print "deleted $from_gid which is now redirected to $to_gid\n" unless $cronquiet; - print FP "$from_gid\n"; - } - $hdeletegid->finish(); - } - close FP; + # Maintain video bits + if ($matchtype eq 'missing') { + $dbh->do('update video_timestamps set deleted=2 where gid=?', {}, $from_gid); + } else { + $dbh->do('update video_timestamps set gid=? where gid=?', {}, $to_gid, $from_gid); + } + my $video_update = $dbh->selectrow_array('select video_status from hansard where gid=?', {}, $from_gid); + $dbh->do('update hansard set video_status=? where gid=?', {}, $video_update, $to_gid) + if defined $video_update; + + # delete the now obsolete "from record" (which is replaced by its "to record") + my $c = $hdeletegid->execute($from_gid); + if ($c > 0) { + print "deleted $from_gid which is now redirected to $to_gid\n" unless $cronquiet; + print FP "$from_gid\n"; + } + $hdeletegid->finish(); + } + close FP; } sub update_eid { - my ($table, $field, $old_epobjectid, $new_epobjectid) = @_; - print "updating epobject id from $old_epobjectid => $new_epobjectid\n" unless $cronquiet; - if ($table eq 'anonvotes') { - my $epalready = $dbh->prepare("select epobject_id,yes_votes,no_votes from anonvotes where epobject_id=?"); - $epalready->execute($new_epobjectid); - my @arr = $epalready->fetchrow_array(); - if ($arr[0]) { - my $epdelete = $dbh->prepare('delete from anonvotes where epobject_id=?'); - $epdelete->execute($new_epobjectid); - $epdelete->finish(); - my $epuse_updateid = $dbh->prepare("update anonvotes set yes_votes=yes_votes+$arr[1], - no_votes=no_votes+$arr[2], epobject_id=? where epobject_id = ?"); - $epuse_updateid->execute($new_epobjectid, $old_epobjectid); - $epuse_updateid->finish(); - return; - } + my ($table, $field, $old_epobjectid, $new_epobjectid) = @_; + print "updating epobject id from $old_epobjectid => $new_epobjectid\n" unless $cronquiet; + if ($table eq 'anonvotes') { + my $epalready = $dbh->prepare("select epobject_id,yes_votes,no_votes from anonvotes where epobject_id=?"); + $epalready->execute($new_epobjectid); + my @arr = $epalready->fetchrow_array(); + if ($arr[0]) { + my $epdelete = $dbh->prepare('delete from anonvotes where epobject_id=?'); + $epdelete->execute($new_epobjectid); + $epdelete->finish(); + my $epuse_updateid = $dbh->prepare("update anonvotes set yes_votes=yes_votes+$arr[1], + no_votes=no_votes+$arr[2], epobject_id=? where epobject_id = ?"); + $epuse_updateid->execute($new_epobjectid, $old_epobjectid); + $epuse_updateid->finish(); + return; } - my $epuse_updateid = $dbh->prepare("update $table set $field = ? where $field = ?"); - $epuse_updateid->execute($new_epobjectid, $old_epobjectid); - $epuse_updateid->finish(); + } + my $epuse_updateid = $dbh->prepare("update $table set $field = ? where $field = ?"); + $epuse_updateid->execute($new_epobjectid, $old_epobjectid); + $epuse_updateid->finish(); } sub db_addpair { - my $epparams = shift; - my $hparams = shift; - my $gid = $$hparams[0]; - my $major = $$hparams[3]; + my $epparams = shift; + my $hparams = shift; + my $gid = $$hparams[0]; + my $major = $$hparams[3]; - $ignorehistorygids{$gid} = 1; + $ignorehistorygids{$gid} = 1; - # Depending on what mode we're in - if ($tallygidsmode) { - die "Got gid $gid twice in XML file" if (defined $gids{$gid}); - $gids{$gid} = 1; - $tallygidsmodedummycount++; - return $tallygidsmodedummycount; - } + # Depending on what mode we're in + if ($tallygidsmode) { + die "Got gid $gid twice in XML file" if (defined $gids{$gid}); + $gids{$gid} = 1; + $tallygidsmodedummycount++; + return $tallygidsmodedummycount; + } - # Delete any redirect of this, should there be one - $grdeletegid->execute($gid); - $grdeletegid->finish(); + # Delete any redirect of this, should there be one + $grdeletegid->execute($gid); + $grdeletegid->finish(); - # See if we already have a hansard object with this global identifier (gid) - my $q = $hcheck->execute($gid); - die "More than one existing hansard object of same gid " . $gid if ($q > 1); + # See if we already have a hansard object with this global identifier (gid) + my $q = $hcheck->execute($gid); + die "More than one existing hansard object of same gid " . $gid if ($q > 1); + + if ($q == 1) + { + my @hvals = $hcheck->fetchrow_array(); + $hcheck->finish(); + my $epid = shift @hvals; + if ($hvals[9] gt $hparams->[9]) { # the hdate column + print "not updating hansard object $gid, db date of $hvals[9] greater than $hparams->[9]\n"; + return $epid; + } - if ($q == 1) + # Check matching epobject exists + my $q = $epcheck->execute($epid); + my @epvals = $epcheck->fetchrow_array(); + $epcheck->finish(); + die "More than one existing epobject of same id " . $epid if ($q > 1); + if ($q != 1) { - my @hvals = $hcheck->fetchrow_array(); - $hcheck->finish(); - my $epid = shift @hvals; - if ($hvals[9] gt $hparams->[9]) { # the hdate column - print "not updating hansard object $gid, db date of $hvals[9] greater than $hparams->[9]\n"; - return $epid; + print "strange, missing epobject $epid for gid $gid - part of db unexpectedly missing\n"; + print "deleting hansard object $gid and rebuilding\n"; + my $delcount = $hdelete->execute($gid, $epid); + $hdelete->finish(); + die "Deleted " . $delcount . " rows when expected to delete one for " . $gid if $delcount != 1; + } else { + # Check to see if the existing hansard object and new one are the same + if (!compare_arrays(\@hvals, $hparams)) + { + # They differ - update the existing hansard object + die "Sizes incompatible when comparing hansard objects (in " . $gid . ")" if $#hvals != $#$hparams; + if (!$quiet) { + print "updating hansard object " . $gid . ", changing: "; + print describe_compare_arrays(\@hvals, $hparams) . "\n"; } + $hupdate->execute(@$hparams, $epid, $gid); + $hupdate->finish(); + } - # Check matching epobject exists - my $q = $epcheck->execute($epid); - my @epvals = $epcheck->fetchrow_array(); - $epcheck->finish(); - die "More than one existing epobject of same id " . $epid if ($q > 1); - if ($q != 1) - { - print "strange, missing epobject $epid for gid $gid - part of db unexpectedly missing\n"; - print "deleting hansard object $gid and rebuilding\n"; - my $delcount = $hdelete->execute($gid, $epid); - $hdelete->finish(); - die "Deleted " . $delcount . " rows when expected to delete one for " . $gid if $delcount != 1; - } else { - # Check to see if the existing hansard object and new one are the same - if (!compare_arrays(\@hvals, $hparams)) - { - # They differ - update the existing hansard object - die "Sizes incompatible when comparing hansard objects (in " . $gid . ")" if $#hvals != $#$hparams; - if (!$quiet) { - print "updating hansard object " . $gid . ", changing: "; - print describe_compare_arrays(\@hvals, $hparams) . "\n"; - } - $hupdate->execute(@$hparams, $epid, $gid); - $hupdate->finish(); - } - - # Check epobject is also the same - if (!compare_arrays(\@epvals, $epparams)) - { - # They differ - update the existing epobject - die "Sizes incompatible when comparing epobjects (in " . $gid . ")" if $#epvals != $#$epparams;; - if (!$quiet) { - print "updating epobject epid " . $epid . " for gid " . $gid . "\n"; - } - $epupdate->execute(@$epparams, $epid); - $epupdate->finish(); - } - - # Happy new and old objects are the same - #print "existing object " . $gid . " ignored\n"; - return $epid; + # Check epobject is also the same + if (!compare_arrays(\@epvals, $epparams)) + { + # They differ - update the existing epobject + die "Sizes incompatible when comparing epobjects (in " . $gid . ")" if $#epvals != $#$epparams;; + if (!$quiet) { + print "updating epobject epid " . $epid . " for gid " . $gid . "\n"; } + $epupdate->execute(@$epparams, $epid); + $epupdate->finish(); + } + + # Happy new and old objects are the same + #print "existing object " . $gid . " ignored\n"; + return $epid; } - $hcheck->finish(); - - $epadd->execute(@$epparams); - my $epid = last_id(); - $epadd->finish(); - $hadd->execute($epid, @$hparams); - $hadd->finish(); - - # print "added " . $gid . "\n"; - - return $epid; + } + $hcheck->finish(); + + $epadd->execute(@$epparams); + my $epid = last_id(); + $epadd->finish(); + $hadd->execute($epid, @$hparams); + $hadd->finish(); + + # print "added " . $gid . "\n"; + + return $epid; } # Autoincrement id of last added item sub last_id { - $lastid->execute(); - my @arr = $lastid->fetchrow_array(); - $lastid->finish(); - return $arr[0]; + $lastid->execute(); + my @arr = $lastid->fetchrow_array(); + $lastid->finish(); + return $arr[0]; } ########################################################################## @@ -889,65 +889,65 @@ sub last_id sub add_wrans_day { - my ($date) = @_; - - use vars qw($lordshead); - my $twig = XML::Twig->new(twig_handlers => { - 'ques' => sub { do_load_speech($_, 3, 1, $_->sprint(1)) }, - 'reply' => sub { do_load_speech($_, 3, 2, $_->sprint(1)) }, - 'minor-heading' => sub { - my $subheading = $_; - if ($lordshead==1) { - my $ohgid = $_->att('id'); - $ohgid =~ s/\d+\.\d+$//; - my ($lett) = $ohgid =~ /\d\d\d\d-\d\d-\d\d(.)/; - for ('a'..$lett) { - next if $_ eq $lett; - (my $oldgid = $ohgid) =~ s/$lett\.$/$_\./; - $hdeletegid->execute($oldgid.'L') unless $tallygidsmode; - } - my $ohcolnum = $_->att('colnum'); - my $ohurl = $_->att('url'); - my $overhead = XML::Twig::Elt->new('major-heading', - { id=>$ohgid . 'L', - colnum=> $ohcolnum, - url=> $ohurl, - nospeaker=>'true' - }, 'HOUSE OF LORDS'); - do_load_heading($overhead, 3, strip_string($overhead->sprint(1))); - $lordshead = 2; - } - do_load_subheading($subheading, 3, strip_string($subheading->sprint(1))) - }, - 'major-heading' => sub { do_load_heading($_, 3, strip_string($_->sprint(1))) }, - 'gidredirect' => sub { do_load_gidredirect($_, 3) }, - }, output_filter => $outputfilter ); - $curdate = $date; - - # find out what gids there are (using tallygidsmode) - $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; $inoralanswers = 0; - $tallygidsmode = 1; %gids = (); %grdests = (); $tallygidsmodedummycount = 10; - $lordshead = 0; - parsefile_glob($twig, $parldata . "scrapedxml/wrans/answers" . $curdate. "*.xml"); - # On 2015-01-26 Lords switched to a system that does give department names, like the Commons - $lordshead = 1 if $curdate lt '2015-01-26'; - parsefile_glob($twig, $parldata . "scrapedxml/lordswrans/lordswrans" . $curdate. "*.xml"); - # see if there are deleted gids - my @gids = keys %gids; - check_extra_gids($date, \@gids, "major = 3"); - - # make the modifications - $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; $inoralanswers = 0; - $tallygidsmode = 0; %gids = (); - $lordshead = 0; - parsefile_glob($twig, $parldata . "scrapedxml/wrans/answers" . $curdate. "*.xml"); - $lordshead = 1 if $curdate lt '2015-01-26'; - parsefile_glob($twig, $parldata . "scrapedxml/lordswrans/lordswrans" . $curdate. "*.xml"); - - # and delete anything that has been redirected (moving comments etc) - delete_redirected_gids($date, \%grdests); - - undef $twig; + my ($date) = @_; + + use vars qw($lordshead); + my $twig = XML::Twig->new(twig_handlers => { + 'ques' => sub { do_load_speech($_, 3, 1, $_->sprint(1)) }, + 'reply' => sub { do_load_speech($_, 3, 2, $_->sprint(1)) }, + 'minor-heading' => sub { + my $subheading = $_; + if ($lordshead==1) { + my $ohgid = $_->att('id'); + $ohgid =~ s/\d+\.\d+$//; + my ($lett) = $ohgid =~ /\d\d\d\d-\d\d-\d\d(.)/; + for ('a'..$lett) { + next if $_ eq $lett; + (my $oldgid = $ohgid) =~ s/$lett\.$/$_\./; + $hdeletegid->execute($oldgid.'L') unless $tallygidsmode; + } + my $ohcolnum = $_->att('colnum'); + my $ohurl = $_->att('url'); + my $overhead = XML::Twig::Elt->new('major-heading', + { id=>$ohgid . 'L', + colnum=> $ohcolnum, + url=> $ohurl, + nospeaker=>'true' + }, 'HOUSE OF LORDS'); + do_load_heading($overhead, 3, strip_string($overhead->sprint(1))); + $lordshead = 2; + } + do_load_subheading($subheading, 3, strip_string($subheading->sprint(1))) + }, + 'major-heading' => sub { do_load_heading($_, 3, strip_string($_->sprint(1))) }, + 'gidredirect' => sub { do_load_gidredirect($_, 3) }, + }, output_filter => $outputfilter ); + $curdate = $date; + + # find out what gids there are (using tallygidsmode) + $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; $inoralanswers = 0; + $tallygidsmode = 1; %gids = (); %grdests = (); $tallygidsmodedummycount = 10; + $lordshead = 0; + parsefile_glob($twig, $parldata . "scrapedxml/wrans/answers" . $curdate. "*.xml"); + # On 2015-01-26 Lords switched to a system that does give department names, like the Commons + $lordshead = 1 if $curdate lt '2015-01-26'; + parsefile_glob($twig, $parldata . "scrapedxml/lordswrans/lordswrans" . $curdate. "*.xml"); + # see if there are deleted gids + my @gids = keys %gids; + check_extra_gids($date, \@gids, "major = 3"); + + # make the modifications + $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; $inoralanswers = 0; + $tallygidsmode = 0; %gids = (); + $lordshead = 0; + parsefile_glob($twig, $parldata . "scrapedxml/wrans/answers" . $curdate. "*.xml"); + $lordshead = 1 if $curdate lt '2015-01-26'; + parsefile_glob($twig, $parldata . "scrapedxml/lordswrans/lordswrans" . $curdate. "*.xml"); + + # and delete anything that has been redirected (moving comments etc) + delete_redirected_gids($date, \%grdests); + + undef $twig; } ########################################################################## @@ -955,99 +955,99 @@ sub add_wrans_day sub add_debates_day { - my ($date) = @_; - my $twig = XML::Twig->new(twig_handlers => { - 'speech' => sub { do_load_speech($_, 1, 0, $_->sprint(1)) }, - 'minor-heading' => sub { do_load_subheading($_, 1, strip_string($_->sprint(1))) }, - 'major-heading' => sub { load_debate_heading($_, 1) }, - 'oral-heading' => sub { $inoralanswers = 1 }, - 'division' => sub { load_debate_division($_, 1) }, - 'gidredirect' => sub { do_load_gidredirect($_, 1) }, - }, output_filter => $outputfilter ); - $curdate = $date; - - # find out what gids there are (using tallygidsmode) - $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; $inoralanswers = 0; - $tallygidsmode = 1; %gids = (); %grdests = (); $tallygidsmodedummycount = 10; - parsefile_glob($twig, $parldata . "scrapedxml/debates/debates" . $curdate. "*.xml"); - # see if there are deleted gids - my @gids = keys %gids; - check_extra_gids($date, \@gids, "major = 1"); - - # make the modifications - $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; $inoralanswers = 0; - $tallygidsmode = 0; %gids = (); - parsefile_glob($twig, $parldata . "scrapedxml/debates/debates" . $curdate. "*.xml"); - - # and delete anything that has been redirected (moving comments etc) - delete_redirected_gids($date, \%grdests); - - undef $twig; + my ($date) = @_; + my $twig = XML::Twig->new(twig_handlers => { + 'speech' => sub { do_load_speech($_, 1, 0, $_->sprint(1)) }, + 'minor-heading' => sub { do_load_subheading($_, 1, strip_string($_->sprint(1))) }, + 'major-heading' => sub { load_debate_heading($_, 1) }, + 'oral-heading' => sub { $inoralanswers = 1 }, + 'division' => sub { load_debate_division($_, 1) }, + 'gidredirect' => sub { do_load_gidredirect($_, 1) }, + }, output_filter => $outputfilter ); + $curdate = $date; + + # find out what gids there are (using tallygidsmode) + $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; $inoralanswers = 0; + $tallygidsmode = 1; %gids = (); %grdests = (); $tallygidsmodedummycount = 10; + parsefile_glob($twig, $parldata . "scrapedxml/debates/debates" . $curdate. "*.xml"); + # see if there are deleted gids + my @gids = keys %gids; + check_extra_gids($date, \@gids, "major = 1"); + + # make the modifications + $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; $inoralanswers = 0; + $tallygidsmode = 0; %gids = (); + parsefile_glob($twig, $parldata . "scrapedxml/debates/debates" . $curdate. "*.xml"); + + # and delete anything that has been redirected (moving comments etc) + delete_redirected_gids($date, \%grdests); + + undef $twig; } # load tags sub load_debate_heading { - my ($speech, $major) = @_; - # we merge together the Oral Answers to Questions major heading with the - # major headings "under" it. - my $text = strip_string($speech->sprint(1)); - if ($inoralanswers) { - if ($wrans_major_headings !~ /,\Q$text\E,/) { # $text =~ m/[a-z]/ || $text eq 'BILL PRESENTED' || $text eq 'NEW MEMBER' || $text eq 'POINT OF ORDER') { - $inoralanswers = 0; - } else { - # — is mdash (apparently some browsers don't know —) - $text = "Oral Answers to Questions — " . fix_case($text); - } + my ($speech, $major) = @_; + # we merge together the Oral Answers to Questions major heading with the + # major headings "under" it. + my $text = strip_string($speech->sprint(1)); + if ($inoralanswers) { + if ($wrans_major_headings !~ /,\Q$text\E,/) { # $text =~ m/[a-z]/ || $text eq 'BILL PRESENTED' || $text eq 'NEW MEMBER' || $text eq 'POINT OF ORDER') { + $inoralanswers = 0; + } else { + # — is mdash (apparently some browsers don't know —) + $text = "Oral Answers to Questions — " . fix_case($text); } - do_load_heading($speech, $major, $text); + } + do_load_heading($speech, $major, $text); } # load tags sub load_debate_division { - my ($division, $major) = @_; - my $divdate = $division->att('divdate'); - my $divnumber = $division->att('divnumber'); + my ($division, $major) = @_; + my $divdate = $division->att('divdate'); + my $divnumber = $division->att('divnumber'); - my $text = + my $text = "

Division number $divnumber

See full + $text .= '&house=lords' if $major == 101; + $text .= "&showall=yes#voters\">See full list of votes (From The Public Whip)

"; - my $divcount = $division->first_child('divisioncount'); # attr ayes noes tellerayes tellernoes - my ($votes_tag, $vote_tag); - if ($major == 101) { - $votes_tag = 'lordlist'; - $vote_tag = 'lord'; - } else { - $votes_tag = 'mplist'; - $vote_tag = 'mpname'; - } - my @lists = $division->children($votes_tag); - foreach my $list (@lists) { - my $side = $list->att('vote'); - die unless $side =~ /^(aye|no|content|not-content)$/; - $text .= "

\u$side

    "; - my @names = $list->children($vote_tag); # attr ids vote (teller), text is name - foreach my $person (@names) { - my $person_id = $person->att('person_id') || $membertoperson{$person->att('id')}; - $person_id =~ s/.*\///; - my $vote = $person->att('vote'); - die unless $vote eq $side; - my $teller = $person->att('teller'); - my $name = $person->sprint(1); - $name =~ s/ *\[Teller\]//; # In Lords - $name =~ s/^(.*), (.*)$/$2 $1/; - $name =~ s/^(rh|Mr|Sir|Ms|Mrs|Dr) //; - $text .= "
  • $name"; - $text .= ' (teller)' if $teller; - $text .= "
  • \n"; - } - $text .= "
"; + my $divcount = $division->first_child('divisioncount'); # attr ayes noes tellerayes tellernoes + my ($votes_tag, $vote_tag); + if ($major == 101) { + $votes_tag = 'lordlist'; + $vote_tag = 'lord'; + } else { + $votes_tag = 'mplist'; + $vote_tag = 'mpname'; + } + my @lists = $division->children($votes_tag); + foreach my $list (@lists) { + my $side = $list->att('vote'); + die unless $side =~ /^(aye|no|content|not-content)$/; + $text .= "

\u$side

    "; + my @names = $list->children($vote_tag); # attr ids vote (teller), text is name + foreach my $person (@names) { + my $person_id = $person->att('person_id') || $membertoperson{$person->att('id')}; + $person_id =~ s/.*\///; + my $vote = $person->att('vote'); + die unless $vote eq $side; + my $teller = $person->att('teller'); + my $name = $person->sprint(1); + $name =~ s/ *\[Teller\]//; # In Lords + $name =~ s/^(.*), (.*)$/$2 $1/; + $name =~ s/^(rh|Mr|Sir|Ms|Mrs|Dr) //; + $text .= "
  • $name"; + $text .= ' (teller)' if $teller; + $text .= "
  • \n"; } + $text .= "
"; + } - do_load_speech($division, $major, 0, $text); + do_load_speech($division, $major, 0, $text); } ########################################################################## @@ -1055,33 +1055,33 @@ sub load_debate_division { sub add_lordsdebates_day { - my ($date) = @_; - my $twig = XML::Twig->new(twig_handlers => { - 'speech' => sub { do_load_speech($_, 101, 0, $_->sprint(1)) }, - 'minor-heading' => sub { do_load_subheading($_, 101, strip_string($_->sprint(1))) }, - 'major-heading' => sub { do_load_heading($_, 101, strip_string($_->sprint(1))) }, - 'division' => sub { load_debate_division($_, 101) }, - 'gidredirect' => sub { do_load_gidredirect($_, 101) }, - }, output_filter => $outputfilter ); - $curdate = $date; - - # find out what gids there are (using tallygidsmode) - $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; $inoralanswers = 0; - $tallygidsmode = 1; %gids = (); %grdests = (); $tallygidsmodedummycount = 10; - parsefile_glob($twig, $parldata . "scrapedxml/lordspages/daylord" . $curdate. "*.xml"); - # see if there are deleted gids - my @gids = keys %gids; - check_extra_gids($date, \@gids, "major = 101"); - - # make the modifications - $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; $inoralanswers = 0; - $tallygidsmode = 0; %gids = (); - parsefile_glob($twig, $parldata . "scrapedxml/lordspages/daylord" . $curdate. "*.xml"); - - # and delete anything that has been redirected (moving comments etc) - delete_redirected_gids($date, \%grdests); - - undef $twig; + my ($date) = @_; + my $twig = XML::Twig->new(twig_handlers => { + 'speech' => sub { do_load_speech($_, 101, 0, $_->sprint(1)) }, + 'minor-heading' => sub { do_load_subheading($_, 101, strip_string($_->sprint(1))) }, + 'major-heading' => sub { do_load_heading($_, 101, strip_string($_->sprint(1))) }, + 'division' => sub { load_debate_division($_, 101) }, + 'gidredirect' => sub { do_load_gidredirect($_, 101) }, + }, output_filter => $outputfilter ); + $curdate = $date; + + # find out what gids there are (using tallygidsmode) + $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; $inoralanswers = 0; + $tallygidsmode = 1; %gids = (); %grdests = (); $tallygidsmodedummycount = 10; + parsefile_glob($twig, $parldata . "scrapedxml/lordspages/daylord" . $curdate. "*.xml"); + # see if there are deleted gids + my @gids = keys %gids; + check_extra_gids($date, \@gids, "major = 101"); + + # make the modifications + $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; $inoralanswers = 0; + $tallygidsmode = 0; %gids = (); + parsefile_glob($twig, $parldata . "scrapedxml/lordspages/daylord" . $curdate. "*.xml"); + + # and delete anything that has been redirected (moving comments etc) + delete_redirected_gids($date, \%grdests); + + undef $twig; } ########################################################################## @@ -1089,542 +1089,542 @@ sub add_lordsdebates_day sub add_westminhall_day { - my ($date) = @_; - my $twig = XML::Twig->new(twig_handlers => { - 'speech' => sub { do_load_speech($_, 2, 0, $_->sprint(1)) }, - 'minor-heading' => sub { do_load_subheading($_, 2, strip_string($_->sprint(1))) }, - 'major-heading' => sub { load_debate_heading($_, 2) }, - 'oral-heading' => sub { $inoralanswers = 1; }, - 'division' => sub { die "Division in Westminter Hall, not handled yet!" }, - 'gidredirect' => sub { do_load_gidredirect($_, 2) }, - }, output_filter => $outputfilter ); - $curdate = $date; - - # find out what gids there are (using tallygidsmode) - $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; $inoralanswers = 0; - $tallygidsmode = 1; %gids = (); %grdests = (); $tallygidsmodedummycount = 10; - parsefile_glob($twig, $parldata . "scrapedxml/westminhall/westminster" . $curdate. "*.xml"); - # see if there are deleted gids - my @gids = keys %gids; - check_extra_gids($date, \@gids, "major = 2"); - - # make the modifications - $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; $inoralanswers = 0; - $tallygidsmode = 0; %gids = (); - parsefile_glob($twig, $parldata . "scrapedxml/westminhall/westminster" . $curdate. "*.xml"); - - # and delete anything that has been redirected (moving comments etc) - delete_redirected_gids($date, \%grdests); - - undef $twig; + my ($date) = @_; + my $twig = XML::Twig->new(twig_handlers => { + 'speech' => sub { do_load_speech($_, 2, 0, $_->sprint(1)) }, + 'minor-heading' => sub { do_load_subheading($_, 2, strip_string($_->sprint(1))) }, + 'major-heading' => sub { load_debate_heading($_, 2) }, + 'oral-heading' => sub { $inoralanswers = 1; }, + 'division' => sub { die "Division in Westminter Hall, not handled yet!" }, + 'gidredirect' => sub { do_load_gidredirect($_, 2) }, + }, output_filter => $outputfilter ); + $curdate = $date; + + # find out what gids there are (using tallygidsmode) + $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; $inoralanswers = 0; + $tallygidsmode = 1; %gids = (); %grdests = (); $tallygidsmodedummycount = 10; + parsefile_glob($twig, $parldata . "scrapedxml/westminhall/westminster" . $curdate. "*.xml"); + # see if there are deleted gids + my @gids = keys %gids; + check_extra_gids($date, \@gids, "major = 2"); + + # make the modifications + $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; $inoralanswers = 0; + $tallygidsmode = 0; %gids = (); + parsefile_glob($twig, $parldata . "scrapedxml/westminhall/westminster" . $curdate. "*.xml"); + + # and delete anything that has been redirected (moving comments etc) + delete_redirected_gids($date, \%grdests); + + undef $twig; } sub add_wms_day { - my ($date) = @_; - use vars qw($heading $subheading $overhead); - my $twig = XML::Twig->new(twig_handlers => { - 'minor-heading' => sub { $subheading = $_; }, - 'major-heading' => sub { $heading = $_; }, - 'gidredirect' => sub { do_load_gidredirect($_, 4) }, - }, output_filter => $outputfilter ); - $curdate = $date; - - # On 2015-01-26 Lords switched to a system that does give department names, like the Commons - my $lordsfn = \&load_lords_wms_speech; - $lordsfn = \&load_wms_speech if $curdate ge '2015-01-26'; - - $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; $inoralanswers = 0; - $tallygidsmode = 1; %gids = (); %grdests = (); $tallygidsmodedummycount = 10; - $twig->setTwigHandler('speech', $lordsfn); - $heading = ''; $subheading = ''; - parsefile_glob($twig, $parldata . "scrapedxml/lordswms/lordswms" . $curdate . "*.xml"); - $twig->setTwigHandler('speech', \&load_wms_speech); - $heading = ''; $subheading = ''; - parsefile_glob($twig, $parldata . "scrapedxml/wms/ministerial" . $curdate . "*.xml"); - # see if there are deleted gids - my @gids = keys %gids; - check_extra_gids($date, \@gids, "major = 4"); - - # make the modifications - $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; $inoralanswers = 0; - $tallygidsmode = 0; %gids = (); $overhead = undef; - $twig->setTwigHandler('speech', $lordsfn); - $heading = ''; $subheading = ''; - parsefile_glob($twig, $parldata . "scrapedxml/lordswms/lordswms" . $curdate . "*.xml"); - $twig->setTwigHandler('speech', \&load_wms_speech); - $heading = ''; $subheading = ''; - parsefile_glob($twig, $parldata . "scrapedxml/wms/ministerial" . $curdate . "*.xml"); - - # and delete anything that has been redirected - delete_redirected_gids($date, \%grdests); - - undef $twig; + my ($date) = @_; + use vars qw($heading $subheading $overhead); + my $twig = XML::Twig->new(twig_handlers => { + 'minor-heading' => sub { $subheading = $_; }, + 'major-heading' => sub { $heading = $_; }, + 'gidredirect' => sub { do_load_gidredirect($_, 4) }, + }, output_filter => $outputfilter ); + $curdate = $date; + + # On 2015-01-26 Lords switched to a system that does give department names, like the Commons + my $lordsfn = \&load_lords_wms_speech; + $lordsfn = \&load_wms_speech if $curdate ge '2015-01-26'; + + $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; $inoralanswers = 0; + $tallygidsmode = 1; %gids = (); %grdests = (); $tallygidsmodedummycount = 10; + $twig->setTwigHandler('speech', $lordsfn); + $heading = ''; $subheading = ''; + parsefile_glob($twig, $parldata . "scrapedxml/lordswms/lordswms" . $curdate . "*.xml"); + $twig->setTwigHandler('speech', \&load_wms_speech); + $heading = ''; $subheading = ''; + parsefile_glob($twig, $parldata . "scrapedxml/wms/ministerial" . $curdate . "*.xml"); + # see if there are deleted gids + my @gids = keys %gids; + check_extra_gids($date, \@gids, "major = 4"); + + # make the modifications + $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; $inoralanswers = 0; + $tallygidsmode = 0; %gids = (); $overhead = undef; + $twig->setTwigHandler('speech', $lordsfn); + $heading = ''; $subheading = ''; + parsefile_glob($twig, $parldata . "scrapedxml/lordswms/lordswms" . $curdate . "*.xml"); + $twig->setTwigHandler('speech', \&load_wms_speech); + $heading = ''; $subheading = ''; + parsefile_glob($twig, $parldata . "scrapedxml/wms/ministerial" . $curdate . "*.xml"); + + # and delete anything that has been redirected + delete_redirected_gids($date, \%grdests); + + undef $twig; } sub is_dupe { my $rthon = '(right[ ])? (hon(\.|ourable)[ ])?'; my $statement = 'Statement'; if ($curdate ge '2015-01-26') { - $rthon = '((right|rt)[ ])? (hon(\.|ourable)?[ ])?'; - $statement = '(Statement|Announcement)'; + $rthon = '((right|rt)[ ])? (hon(\.|ourable)?[ ])?'; + $statement = '(Statement|Announcement)'; } return 1 if $_[0] =~ / - My[ ] - $rthon - (and[ ])? (noble[ ])? - friend\s*.*?[ ] - (has[ ])? (today[ ])? - ( (made|issued)[ ]the[ ]following[ ] (Written[ ])? (Ministerial[ ])? $statement - | published[ ]a[ ]report - ) - /ix; + My[ ] + $rthon + (and[ ])? (noble[ ])? + friend\s*.*?[ ] + (has[ ])? (today[ ])? + ( (made|issued)[ ]the[ ]following[ ] (Written[ ])? (Ministerial[ ])? $statement + | published[ ]a[ ]report + ) + /ix; return 0; } sub load_wms_speech { - my ($twig, $speech) = @_; - my $text = $speech->sprint(1); - return 1 if is_dupe($text); - do_load_heading($heading, 4, strip_string($heading->sprint(1))) if $heading; - do_load_subheading($subheading, 4, strip_string($subheading->sprint(1))) if $subheading; - do_load_speech($speech, 4, 0, $text); + my ($twig, $speech) = @_; + my $text = $speech->sprint(1); + return 1 if is_dupe($text); + do_load_heading($heading, 4, strip_string($heading->sprint(1))) if $heading; + do_load_subheading($subheading, 4, strip_string($subheading->sprint(1))) if $subheading; + do_load_speech($speech, 4, 0, $text); } sub load_lords_wms_speech { - my ($twig, $speech) = @_; - my $text = $speech->sprint(1); - return 1 if is_dupe($text); - my $firsthead = $heading || $subheading; - if (!$overhead && $firsthead) { - my $ohgid = $firsthead->att('id'); - $ohgid =~ s/\d+\.\d+$//; - my ($lett) = $ohgid =~ /\d\d\d\d-\d\d-\d\d(.)/; - for ('a'..$lett) { - next if $_ eq $lett; - (my $oldgid = $ohgid) =~ s/$lett\.$/$_\./; - $hdeletegid->execute($oldgid.'L') unless $tallygidsmode; - } - my $ohcolnum = $firsthead->att('colnum'); - my $ohurl = $firsthead->att('url'); - $overhead = XML::Twig::Elt->new('major-heading', - { id=>$ohgid . 'L', - colnum=> $ohcolnum, - url=> $ohurl, - nospeaker=>'true' - }, 'HOUSE OF LORDS'); - do_load_heading($overhead, 4, strip_string($overhead->sprint(1))); + my ($twig, $speech) = @_; + my $text = $speech->sprint(1); + return 1 if is_dupe($text); + my $firsthead = $heading || $subheading; + if (!$overhead && $firsthead) { + my $ohgid = $firsthead->att('id'); + $ohgid =~ s/\d+\.\d+$//; + my ($lett) = $ohgid =~ /\d\d\d\d-\d\d-\d\d(.)/; + for ('a'..$lett) { + next if $_ eq $lett; + (my $oldgid = $ohgid) =~ s/$lett\.$/$_\./; + $hdeletegid->execute($oldgid.'L') unless $tallygidsmode; } - do_load_subheading($firsthead, 4, strip_string($firsthead->sprint(1))) if $firsthead; - do_load_speech($speech, 4, 0, $text); + my $ohcolnum = $firsthead->att('colnum'); + my $ohurl = $firsthead->att('url'); + $overhead = XML::Twig::Elt->new('major-heading', + { id=>$ohgid . 'L', + colnum=> $ohcolnum, + url=> $ohurl, + nospeaker=>'true' + }, 'HOUSE OF LORDS'); + do_load_heading($overhead, 4, strip_string($overhead->sprint(1))); + } + do_load_subheading($firsthead, 4, strip_string($firsthead->sprint(1))) if $firsthead; + do_load_speech($speech, 4, 0, $text); } ########################################################################## # Northern Ireland Assembly sub add_ni_day { - my ($date) = @_; - my $twig = XML::Twig->new(twig_handlers => { - 'speech' => sub { - my $speech = $_; - if (!$currsection && !$currsubsection) { - my $overhead = XML::Twig::Elt->new('major-heading', - { id=>'uk.org.publicwhip/ni/'.$date.'.0.0', - url=>'', - nospeaker=>'true' - }, 'Northern Ireland Assembly'); - do_load_heading($overhead, 5, strip_string($overhead->sprint(1))); - } - do_load_speech($speech, 5, 0, $speech->sprint(1)) - }, - 'minor-heading' => sub { do_load_subheading($_, 5, strip_string($_->sprint(1))) }, - 'oral-heading/major-heading' => sub { load_ni_heading($_, 1) }, - 'major-heading' => sub { load_ni_heading($_, 0) }, - }, output_filter => $outputfilter ); - $curdate = $date; - - # find out what gids there are (using tallygidsmode) - $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; - $tallygidsmode = 1; %gids = (); $tallygidsmodedummycount = 10; - parsefile_glob($twig, $parldata . "scrapedxml/ni/ni" . $curdate. "*.xml"); - # see if there are deleted gids - my @gids = keys %gids; - check_extra_gids($date, \@gids, "major = 5"); - - # make the modifications - $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; - $tallygidsmode = 0; %gids = (); - parsefile_glob($twig, $parldata . "scrapedxml/ni/ni" . $curdate. "*.xml"); - - undef $twig; + my ($date) = @_; + my $twig = XML::Twig->new(twig_handlers => { + 'speech' => sub { + my $speech = $_; + if (!$currsection && !$currsubsection) { + my $overhead = XML::Twig::Elt->new('major-heading', + { id=>'uk.org.publicwhip/ni/'.$date.'.0.0', + url=>'', + nospeaker=>'true' + }, 'Northern Ireland Assembly'); + do_load_heading($overhead, 5, strip_string($overhead->sprint(1))); + } + do_load_speech($speech, 5, 0, $speech->sprint(1)) + }, + 'minor-heading' => sub { do_load_subheading($_, 5, strip_string($_->sprint(1))) }, + 'oral-heading/major-heading' => sub { load_ni_heading($_, 1) }, + 'major-heading' => sub { load_ni_heading($_, 0) }, + }, output_filter => $outputfilter ); + $curdate = $date; + + # find out what gids there are (using tallygidsmode) + $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; + $tallygidsmode = 1; %gids = (); $tallygidsmodedummycount = 10; + parsefile_glob($twig, $parldata . "scrapedxml/ni/ni" . $curdate. "*.xml"); + # see if there are deleted gids + my @gids = keys %gids; + check_extra_gids($date, \@gids, "major = 5"); + + # make the modifications + $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; + $tallygidsmode = 0; %gids = (); + parsefile_glob($twig, $parldata . "scrapedxml/ni/ni" . $curdate. "*.xml"); + + undef $twig; } sub load_ni_heading { - my ($speech, $inoralanswers) = @_; - my $text = strip_string($speech->sprint(1)); - if ($inoralanswers) { - $text = "Oral Answers to Questions — " . fix_case($text); - } - do_load_heading($speech, 5, $text); - return 0; # Do not chain handlers + my ($speech, $inoralanswers) = @_; + my $text = strip_string($speech->sprint(1)); + if ($inoralanswers) { + $text = "Oral Answers to Questions — " . fix_case($text); + } + do_load_heading($speech, 5, $text); + return 0; # Do not chain handlers } ########################################################################## # Scottish Parliament sub add_scotland_day { - my ($date) = @_; - - # This script now is hardcoded to only use the new Scottish - # Parliament data. This exists for the whole of the - # parliament, but we should only use it for days after - # 2011-01-13, since the earlier data from before the - # parliament website was changed is much higher quality. - if ($date lt "2011-01-14") { - return; - } + my ($date) = @_; + + # This script now is hardcoded to only use the new Scottish + # Parliament data. This exists for the whole of the + # parliament, but we should only use it for days after + # 2011-01-13, since the earlier data from before the + # parliament website was changed is much higher quality. + if ($date lt "2011-01-14") { + return; + } - my $twig = XML::Twig->new(twig_handlers => { - 'speech' => sub { do_load_speech($_, 7, 0, $_->sprint(1)) }, - 'minor-heading' => sub { do_load_subheading($_, 7, strip_string($_->sprint(1))) }, - 'major-heading' => sub { do_load_heading($_, 7, strip_string($_->sprint(1))) }, - 'division' => sub { load_scotland_division($_) }, - }, output_filter => $outputfilter ); - $curdate = $date; - - # find out what gids there are (using tallygidsmode) - $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; - $tallygidsmode = 1; %gids = (); $tallygidsmodedummycount = 10; - parsefile_glob($twig, $parldata . "scrapedxml/sp-new/meeting-of-the-parliament/" . $curdate. "*.xml"); - # see if there are deleted gids - my @gids = keys %gids; - check_extra_gids($date, \@gids, "major = 7"); - - # make the modifications - $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; - $tallygidsmode = 0; %gids = (); - parsefile_glob($twig, $parldata . "scrapedxml/sp-new/meeting-of-the-parliament/" . $curdate. "*.xml"); - - undef $twig; + my $twig = XML::Twig->new(twig_handlers => { + 'speech' => sub { do_load_speech($_, 7, 0, $_->sprint(1)) }, + 'minor-heading' => sub { do_load_subheading($_, 7, strip_string($_->sprint(1))) }, + 'major-heading' => sub { do_load_heading($_, 7, strip_string($_->sprint(1))) }, + 'division' => sub { load_scotland_division($_) }, + }, output_filter => $outputfilter ); + $curdate = $date; + + # find out what gids there are (using tallygidsmode) + $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; + $tallygidsmode = 1; %gids = (); $tallygidsmodedummycount = 10; + parsefile_glob($twig, $parldata . "scrapedxml/sp-new/meeting-of-the-parliament/" . $curdate. "*.xml"); + # see if there are deleted gids + my @gids = keys %gids; + check_extra_gids($date, \@gids, "major = 7"); + + # make the modifications + $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; + $tallygidsmode = 0; %gids = (); + parsefile_glob($twig, $parldata . "scrapedxml/sp-new/meeting-of-the-parliament/" . $curdate. "*.xml"); + + undef $twig; } # load tags sub load_scotland_division { - my ($division) = @_; - my $divnumber = $division->att('divnumber') + 1; # Own internal numbering from 0, per day - my $text = $division->sprint(1); - my %out; - while ($text =~ m#(.*?)\s\(.*?#g) { - push @{$out{$2}}, '' . $3 . ''; - } - while ($text =~ m#(.*?)\s\(.*?#g) { - push @{$out{$2}}, '' . $3 . ''; - } - $text = "

Division number $divnumber

"; - foreach ('for','against','abstentions','spoiled votes') { - next unless $out{$_}; - $text .= "\u$_: "; - $text .= join(', ', @{$out{$_}}); - $text .= '
'; - } - $text .= '

'; - do_load_speech($division, 7, 0, $text); + my ($division) = @_; + my $divnumber = $division->att('divnumber') + 1; # Own internal numbering from 0, per day + my $text = $division->sprint(1); + my %out; + while ($text =~ m#(.*?)\s\(.*?#g) { + push @{$out{$2}}, '' . $3 . ''; + } + while ($text =~ m#(.*?)\s\(.*?#g) { + push @{$out{$2}}, '' . $3 . ''; + } + $text = "

Division number $divnumber

"; + foreach ('for','against','abstentions','spoiled votes') { + next unless $out{$_}; + $text .= "\u$_: "; + $text .= join(', ', @{$out{$_}}); + $text .= '
'; + } + $text .= '

'; + do_load_speech($division, 7, 0, $text); } sub add_scotwrans_day { - my ($date) = @_; - my $twig = XML::Twig->new(twig_handlers => { - 'ques' => sub { do_load_speech($_, 8, 1, $_->sprint(1)) }, - 'reply' => sub { do_load_speech($_, 8, 2, $_->sprint(1)) }, - 'minor-heading' => sub { do_load_heading($_, 8, strip_string($_->sprint(1))) }, - #'major-heading' => sub { do_load_heading($_, 8, strip_string($_->sprint(1))) }, - }, output_filter => $outputfilter ); - $curdate = $date; - - # find out what gids there are (using tallygidsmode) - $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; - $tallygidsmode = 1; %gids = (); $tallygidsmodedummycount = 10; - parsefile_glob($twig, $parldata . "scrapedxml/sp-written/spwa" . $curdate. "*.xml"); - # see if there are deleted gids - my @gids = keys %gids; - check_extra_gids($date, \@gids, "major = 8"); - - # make the modifications - $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; - $tallygidsmode = 0; %gids = (); - parsefile_glob($twig, $parldata . "scrapedxml/sp-written/spwa" . $curdate. "*.xml"); - - undef $twig; + my ($date) = @_; + my $twig = XML::Twig->new(twig_handlers => { + 'ques' => sub { do_load_speech($_, 8, 1, $_->sprint(1)) }, + 'reply' => sub { do_load_speech($_, 8, 2, $_->sprint(1)) }, + 'minor-heading' => sub { do_load_heading($_, 8, strip_string($_->sprint(1))) }, + #'major-heading' => sub { do_load_heading($_, 8, strip_string($_->sprint(1))) }, + }, output_filter => $outputfilter ); + $curdate = $date; + + # find out what gids there are (using tallygidsmode) + $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; + $tallygidsmode = 1; %gids = (); $tallygidsmodedummycount = 10; + parsefile_glob($twig, $parldata . "scrapedxml/sp-written/spwa" . $curdate. "*.xml"); + # see if there are deleted gids + my @gids = keys %gids; + check_extra_gids($date, \@gids, "major = 8"); + + # make the modifications + $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; + $tallygidsmode = 0; %gids = (); + parsefile_glob($twig, $parldata . "scrapedxml/sp-written/spwa" . $curdate. "*.xml"); + + undef $twig; } ########################################################################## # Standing/Public Bill Committees sub add_bill { - my ($bill, $url) = @_; - my $lords = $bill =~ s/\s*\[Lords\]//; - my $session; - if ($url =~ /(\d\d\d\d)-?(\d\d)/) { - $session = "$1-$2"; - } else { - die "Couldn't get session out of $url, $bill, $date"; - } - # Get bill ID - my $bill_id = $dbh->selectrow_array('select id from bills where url=?', {}, $url); - if (!$bill_id) { - $bill_id = $dbh->selectrow_array('select id from bills where title=? and session=?', {}, $bill, $session); - } - if (!$bill_id) { - $dbh->do('insert into bills (session, title, lords, url, standingprefix) values (?,?,?,?,"")', - {}, $session, $bill, $lords, $url); - $bill_id = last_id(); - } - return $bill_id; + my ($bill, $url) = @_; + my $lords = $bill =~ s/\s*\[Lords\]//; + my $session; + if ($url =~ /(\d\d\d\d)-?(\d\d)/) { + $session = "$1-$2"; + } else { + die "Couldn't get session out of $url, $bill, $date"; + } + # Get bill ID + my $bill_id = $dbh->selectrow_array('select id from bills where url=?', {}, $url); + if (!$bill_id) { + $bill_id = $dbh->selectrow_array('select id from bills where title=? and session=?', {}, $bill, $session); + } + if (!$bill_id) { + $dbh->do('insert into bills (session, title, lords, url, standingprefix) values (?,?,?,?,"")', + {}, $session, $bill, $lords, $url); + $bill_id = last_id(); + } + return $bill_id; } sub add_standing_title { - my ($heading, $bill, $bill_id, @preheadingspeech) = @_; - $heading->att('id') =~ /^.*\/(.*?_.*?_)/; - my $prefix = $1; - $dbh->do('update bills set standingprefix=? where id=?', {}, $prefix, $bill_id); - do_load_heading($heading, 6, $bill, $bill_id); - foreach (@preheadingspeech) { - do_load_speech($_, 6, $bill_id, $_->sprint(1)); - } + my ($heading, $bill, $bill_id, @preheadingspeech) = @_; + $heading->att('id') =~ /^.*\/(.*?_.*?_)/; + my $prefix = $1; + $dbh->do('update bills set standingprefix=? where id=?', {}, $prefix, $bill_id); + do_load_heading($heading, 6, $bill, $bill_id); + foreach (@preheadingspeech) { + do_load_speech($_, 6, $bill_id, $_->sprint(1)); + } } sub add_standing_day { - my ($date) = @_; - use vars qw($bill $bill_id $majorheadingstate @preheadingspeech); - $majorheadingstate = 0; - my $twig = XML::Twig->new(twig_handlers => { - 'bill' => sub { - $bill = strip_string($_->att('title')); - my $url = $_->att('url'); - $bill_id = add_bill($bill, $url); - $majorheadingstate = 1; # Got a - }, - 'committee' => sub { - my @names = $_->descendants('mpname'); - foreach (@names) { - my $chairman = ($_->parent()->tag() eq 'chairmen'); - my $attending = ($_->att('attending') eq 'true'); - my $person_id = $_->att('person_id') || $membertoperson{$_->att('memberid')}; - $current_file =~ /_(\d\d-\d)_/; - my $sitting = $1; - if (my ($id, $curr_attending) = $dbh->selectrow_array('select id,attending from pbc_members where person_id=? and bill_id=? - and sitting=?', {}, $person_id, $bill_id, $sitting)) { - if ($curr_attending != $attending) { - $dbh->do('update pbc_members set attending=? where id=?', {}, - $attending, $id); - } - } else { - $dbh->do('insert into pbc_members (bill_id, sitting, person_id, attending, chairman) values - (?, ?, ?, ?, ?)', {}, $bill_id, $sitting, $person_id, $attending, $chairman); - } - } - }, - 'major-heading' => sub { - my $commhead = $_->sprint(1) =~ /(Standing Committee [A-H]|Special Standing Committee|Second Reading Committee)\s*$/; - if ($_->sprint(1) =~ /^\s*Public Bill Commit?tee\s*$/) { - # All PBCs have a - add_standing_title($_, $bill, $bill_id, @preheadingspeech); - $majorheadingstate = 9; # No more headings - } elsif ($majorheadingstate==1 && $commhead) { - # A , an old SC heading, the bill title will come again - add_standing_title($_, $bill, $bill_id, @preheadingspeech); - $majorheadingstate = 3; - } elsif ($majorheadingstate==3) { - # Ignore this heading of the bill title - $majorheadingstate = 9; - } elsif ($commhead) { - # No , we're going to get another major-heading with the bill title in it... - $majorheadingstate = 2; - } elsif ($majorheadingstate==2) { - $bill = strip_string($_->sprint(1)); - my $url = $_->att('url'); - $bill_id = add_bill($bill, $url); - add_standing_title($_, $bill, $bill_id, @preheadingspeech); - $majorheadingstate = 9; - } elsif ($majorheadingstate==0 || $majorheadingstate==1) { - die "Odd first major heading: " . $_->sprint(1); - } else { - # Any major heading other than the ones above I'm assuming is part of an amendment - # So load as a normal speech! - do_load_speech($_, 6, $bill_id, $_->sprint(1)); - } - }, - 'speech' => sub { - if ($currsection==0) { - push @preheadingspeech, $_; - } else { - do_load_speech($_, 6, $bill_id, Encode::encode('iso-8859-1', $_->sprint(1))) - } - }, - 'publicwhip' => sub { - # Clear variables for next file - $majorheadingstate = 0; $bill = ''; $bill_id = 0; - @preheadingspeech = (); - }, - 'minor-heading' => sub { do_load_subheading($_, 6, strip_string($_->sprint(1)), $bill_id) }, - 'divisioncount' => sub { load_standing_division($_, $bill_id) }, - 'gidredirect' => sub { do_load_gidredirect($_, 6) }, - }, output_filter => $outputfilter ); - $curdate = $date; - - # find out what gids there are (using tallygidsmode) - $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; - $tallygidsmode = 1; %gids = (); %grdests = (); $tallygidsmodedummycount = 10; - parsefile_glob($twig, $parldata . "scrapedxml/standing/standing*_*_*_" . $curdate. "*.xml"); - # see if there are deleted gids - my @gids = keys %gids; - check_extra_gids($date, \@gids, "major = 6"); - - # make the modifications - $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; - $tallygidsmode = 0; %gids = (); - parsefile_glob($twig, $parldata . "scrapedxml/standing/standing*_*_*_" . $curdate. "*.xml"); - - # and delete anything that has been redirected (moving comments etc) - delete_redirected_gids($date, \%grdests); - - undef $twig; + my ($date) = @_; + use vars qw($bill $bill_id $majorheadingstate @preheadingspeech); + $majorheadingstate = 0; + my $twig = XML::Twig->new(twig_handlers => { + 'bill' => sub { + $bill = strip_string($_->att('title')); + my $url = $_->att('url'); + $bill_id = add_bill($bill, $url); + $majorheadingstate = 1; # Got a + }, + 'committee' => sub { + my @names = $_->descendants('mpname'); + foreach (@names) { + my $chairman = ($_->parent()->tag() eq 'chairmen'); + my $attending = ($_->att('attending') eq 'true'); + my $person_id = $_->att('person_id') || $membertoperson{$_->att('memberid')}; + $current_file =~ /_(\d\d-\d)_/; + my $sitting = $1; + if (my ($id, $curr_attending) = $dbh->selectrow_array('select id,attending from pbc_members where person_id=? and bill_id=? + and sitting=?', {}, $person_id, $bill_id, $sitting)) { + if ($curr_attending != $attending) { + $dbh->do('update pbc_members set attending=? where id=?', {}, + $attending, $id); + } + } else { + $dbh->do('insert into pbc_members (bill_id, sitting, person_id, attending, chairman) values + (?, ?, ?, ?, ?)', {}, $bill_id, $sitting, $person_id, $attending, $chairman); + } + } + }, + 'major-heading' => sub { + my $commhead = $_->sprint(1) =~ /(Standing Committee [A-H]|Special Standing Committee|Second Reading Committee)\s*$/; + if ($_->sprint(1) =~ /^\s*Public Bill Commit?tee\s*$/) { + # All PBCs have a + add_standing_title($_, $bill, $bill_id, @preheadingspeech); + $majorheadingstate = 9; # No more headings + } elsif ($majorheadingstate==1 && $commhead) { + # A , an old SC heading, the bill title will come again + add_standing_title($_, $bill, $bill_id, @preheadingspeech); + $majorheadingstate = 3; + } elsif ($majorheadingstate==3) { + # Ignore this heading of the bill title + $majorheadingstate = 9; + } elsif ($commhead) { + # No , we're going to get another major-heading with the bill title in it... + $majorheadingstate = 2; + } elsif ($majorheadingstate==2) { + $bill = strip_string($_->sprint(1)); + my $url = $_->att('url'); + $bill_id = add_bill($bill, $url); + add_standing_title($_, $bill, $bill_id, @preheadingspeech); + $majorheadingstate = 9; + } elsif ($majorheadingstate==0 || $majorheadingstate==1) { + die "Odd first major heading: " . $_->sprint(1); + } else { + # Any major heading other than the ones above I'm assuming is part of an amendment + # So load as a normal speech! + do_load_speech($_, 6, $bill_id, $_->sprint(1)); + } + }, + 'speech' => sub { + if ($currsection==0) { + push @preheadingspeech, $_; + } else { + do_load_speech($_, 6, $bill_id, Encode::encode('iso-8859-1', $_->sprint(1))) + } + }, + 'publicwhip' => sub { + # Clear variables for next file + $majorheadingstate = 0; $bill = ''; $bill_id = 0; + @preheadingspeech = (); + }, + 'minor-heading' => sub { do_load_subheading($_, 6, strip_string($_->sprint(1)), $bill_id) }, + 'divisioncount' => sub { load_standing_division($_, $bill_id) }, + 'gidredirect' => sub { do_load_gidredirect($_, 6) }, + }, output_filter => $outputfilter ); + $curdate = $date; + + # find out what gids there are (using tallygidsmode) + $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; + $tallygidsmode = 1; %gids = (); %grdests = (); $tallygidsmodedummycount = 10; + parsefile_glob($twig, $parldata . "scrapedxml/standing/standing*_*_*_" . $curdate. "*.xml"); + # see if there are deleted gids + my @gids = keys %gids; + check_extra_gids($date, \@gids, "major = 6"); + + # make the modifications + $hpos = 0; $currsection = 0; $currsubsection = 0; $promotedheading = 0; + $tallygidsmode = 0; %gids = (); + parsefile_glob($twig, $parldata . "scrapedxml/standing/standing*_*_*_" . $curdate. "*.xml"); + + # and delete anything that has been redirected (moving comments etc) + delete_redirected_gids($date, \%grdests); + + undef $twig; } sub load_standing_division { - my ($division, $id) = @_; - my $divnumber = $division->att('divnumber'); - my $ayes = $division->att('ayes'); - my $noes = $division->att('noes'); - my @names = $division->descendants('mpname'); - my %out = ( aye => '', no => '' ); - foreach (@names) { - my $person_id = $_->att('person_id') || $membertoperson{$_->att('memberid')}; - $person_id =~ s/.*\///; - my $name = $_->att('membername'); - my $v = $_->att('vote'); - $out{$v} .= '' . $name . ', '; - } - $out{aye} =~ s/, $//; - $out{no} =~ s/, $//; - my $text = "

Division number $divnumber - $ayes yes, $noes no

+ my ($division, $id) = @_; + my $divnumber = $division->att('divnumber'); + my $ayes = $division->att('ayes'); + my $noes = $division->att('noes'); + my @names = $division->descendants('mpname'); + my %out = ( aye => '', no => '' ); + foreach (@names) { + my $person_id = $_->att('person_id') || $membertoperson{$_->att('memberid')}; + $person_id =~ s/.*\///; + my $name = $_->att('membername'); + my $v = $_->att('vote'); + $out{$v} .= '' . $name . ', '; + } + $out{aye} =~ s/, $//; + $out{no} =~ s/, $//; + my $text = "

Division number $divnumber - $ayes yes, $noes no

Voting yes: $out{aye}

Voting no: $out{no}

"; - # Standing XML is UTF-8, so transcode - do_load_speech($division, 6, $id, Encode::encode('iso-8859-1', $text)); + # Standing XML is UTF-8, so transcode + do_load_speech($division, 6, $id, Encode::encode('iso-8859-1', $text)); } sub loadspq { - my ($twig, $question) = @_; - my %typemap = ( - 'business-today', 1, - 'business-oral', 2, - 'business-written', 3, - 'answer', 4, - 'holding', 5, - 'oral-asked-in-official-report', 6, - 'referenced-in-question-text', 7 ); - - my $gid = $question->att('gid'); - if (!$quiet) { - print "Scottish Parliament question ID $gid\n"; + my ($twig, $question) = @_; + my %typemap = ( + 'business-today', 1, + 'business-oral', 2, + 'business-written', 3, + 'answer', 4, + 'holding', 5, + 'oral-asked-in-official-report', 6, + 'referenced-in-question-text', 7 ); + + my $gid = $question->att('gid'); + if (!$quiet) { + print "Scottish Parliament question ID $gid\n"; + } + + my @mentions = $question->children(); + foreach my $mention (@mentions) { + my $mentiontype = $typemap{$mention->att('type')}; + my $mentionname = $mention->att('type'); + unless ($mentiontype) { + die "Unknown mention type ($mentiontype) found."; } + my $mentiondate = $mention->att('date'); + my $doit = (!$mentiondate) || $all || $recent || ($datefrom le $mentiondate && $mentiondate le $dateto); + print " ($mentionname) " unless $quiet; + if (!$doit) { + print " skipping\n" unless $quiet; + next; + } + my $mentionurl = $mention->att('url'); + my $mentiongid; + my $rows; +# Need to pick out a few attributes in some cases: + if ($mentiontype == 4) { + $mentiongid = $mention->att('spwrans'); + } elsif ($mentiontype == 6) { + $mentiongid = $mention->att('orgid'); + } elsif ($mentiontype == 7) { + $mentiongid = $mention->att('referrer'); + } + + my $preload_hash_key; + if (%scotqspreloaded) { + my @row = ($gid,$mentiontype,$mentiondate,$mentionurl,$mentiongid); + $preload_hash_key = join('|', map { defined $_ ? $_ : '' } @row ); - my @mentions = $question->children(); - foreach my $mention (@mentions) { - my $mentiontype = $typemap{$mention->att('type')}; - my $mentionname = $mention->att('type'); - unless ($mentiontype) { - die "Unknown mention type ($mentiontype) found."; + if ($scotqspreloaded{$preload_hash_key}) { + $rows = 1; + } else { + $rows = 0; + } + } else { + if ($mentiontype >= 1 && $mentiontype <= 3) { # 'business-*' + $rows = $scotqbusinessexist->execute($gid,$mentiontype,$mentiondate,$mentionurl); + if ($rows > 1) { + die "Multiple rows matched $gid, $mentiontype, $mentiondate, $mentionurl"; } - my $mentiondate = $mention->att('date'); - my $doit = (!$mentiondate) || $all || $recent || ($datefrom le $mentiondate && $mentiondate le $dateto); - print " ($mentionname) " unless $quiet; - if (!$doit) { - print " skipping\n" unless $quiet; - next; + my @row = $scotqbusinessexist->fetchrow_array(); + $scotqbusinessexist->finish(); + } elsif ($mentiontype == 4) { # 'answer' + $rows = $scotqdategidexist->execute($gid,$mentiontype,$mentiondate,$mentiongid); + if ($rows > 1) { + die "Multiple rows matched $gid, $mentiontype, $mentiondate, $mentiongid"; } - my $mentionurl = $mention->att('url'); - my $mentiongid; - my $rows; -# Need to pick out a few attributes in some cases: - if ($mentiontype == 4) { - $mentiongid = $mention->att('spwrans'); - } elsif ($mentiontype == 6) { - $mentiongid = $mention->att('orgid'); - } elsif ($mentiontype == 7) { - $mentiongid = $mention->att('referrer'); + my @row = $scotqdategidexist->fetchrow_array(); + $scotqdategidexist->finish(); + } elsif ($mentiontype == 5) { # 'holding' + $rows = $scotqholdingexist->execute($gid,$mentiontype,$mentiondate); + if ($rows > 1) { + die "Multiple rows matched $gid, $mentiontype, $mentiondate"; } - - my $preload_hash_key; - if (%scotqspreloaded) { - my @row = ($gid,$mentiontype,$mentiondate,$mentionurl,$mentiongid); - $preload_hash_key = join('|', map { defined $_ ? $_ : '' } @row ); - - if ($scotqspreloaded{$preload_hash_key}) { - $rows = 1; - } else { - $rows = 0; - } - } else { - if ($mentiontype >= 1 && $mentiontype <= 3) { # 'business-*' - $rows = $scotqbusinessexist->execute($gid,$mentiontype,$mentiondate,$mentionurl); - if ($rows > 1) { - die "Multiple rows matched $gid, $mentiontype, $mentiondate, $mentionurl"; - } - my @row = $scotqbusinessexist->fetchrow_array(); - $scotqbusinessexist->finish(); - } elsif ($mentiontype == 4) { # 'answer' - $rows = $scotqdategidexist->execute($gid,$mentiontype,$mentiondate,$mentiongid); - if ($rows > 1) { - die "Multiple rows matched $gid, $mentiontype, $mentiondate, $mentiongid"; - } - my @row = $scotqdategidexist->fetchrow_array(); - $scotqdategidexist->finish(); - } elsif ($mentiontype == 5) { # 'holding' - $rows = $scotqholdingexist->execute($gid,$mentiontype,$mentiondate); - if ($rows > 1) { - die "Multiple rows matched $gid, $mentiontype, $mentiondate"; - } - my @row = $scotqholdingexist->fetchrow_array(); - $scotqholdingexist->finish(); - } elsif ($mentiontype == 6) { # 'oral-asked-in-official-report' - $rows = $scotqdategidexist->execute($gid,$mentiontype,$mentiondate,$mentiongid); - if ($rows > 1) { - die "Multiple rows matched $gid, $mentiontype, $mentiondate, $mentiongid"; - } - my @row = $scotqdategidexist->fetchrow_array(); - $scotqdategidexist->finish(); - } elsif ($mentiontype == 7) { # 'referenced-in-question-text' - $rows = $scotqreferenceexist->execute($gid,$mentiontype,$mentiongid); - if ($rows > 1) { - die "Multiple rows matched $gid, $mentiontype, $mentiongid"; - } - my @row = $scotqreferenceexist->fetchrow_array(); - $scotqreferenceexist->finish(); - } + my @row = $scotqholdingexist->fetchrow_array(); + $scotqholdingexist->finish(); + } elsif ($mentiontype == 6) { # 'oral-asked-in-official-report' + $rows = $scotqdategidexist->execute($gid,$mentiontype,$mentiondate,$mentiongid); + if ($rows > 1) { + die "Multiple rows matched $gid, $mentiontype, $mentiondate, $mentiongid"; } - - if( $rows == 1 ) { - if (!$quiet) { print "already present\n"; } - } else { - if (!$quiet) { print "inserting\n"; } - $scotqadd->execute($gid,$mentiontype,$mentiondate,$mentionurl,$mentiongid); - $scotqadd->finish(); - if (%scotqspreloaded) { - $scotqspreloaded{$preload_hash_key} = 1; - } + my @row = $scotqdategidexist->fetchrow_array(); + $scotqdategidexist->finish(); + } elsif ($mentiontype == 7) { # 'referenced-in-question-text' + $rows = $scotqreferenceexist->execute($gid,$mentiontype,$mentiongid); + if ($rows > 1) { + die "Multiple rows matched $gid, $mentiontype, $mentiongid"; } + my @row = $scotqreferenceexist->fetchrow_array(); + $scotqreferenceexist->finish(); + } + } + + if( $rows == 1 ) { + if (!$quiet) { print "already present\n"; } + } else { + if (!$quiet) { print "inserting\n"; } + $scotqadd->execute($gid,$mentiontype,$mentiondate,$mentionurl,$mentiongid); + $scotqadd->finish(); + if (%scotqspreloaded) { + $scotqspreloaded{$preload_hash_key} = 1; + } } - $twig->purge(); + } + $twig->purge(); } sub canon_time { - my $t = shift; - $t = "$t"; - $t = substr($t, 1) if $t =~ /^0\d\d:/; - $t .= ":00" if $t =~ /^\d\d?:\d\d$/; # Standing - $t = "0$t" if $t =~ /^\d:/; # Standing - return $t; + my $t = shift; + $t = "$t"; + $t = substr($t, 1) if $t =~ /^0\d\d:/; + $t .= ":00" if $t =~ /^\d\d?:\d\d$/; # Standing + $t = "0$t" if $t =~ /^\d:/; # Standing + return $t; } ########################################################################## @@ -1633,159 +1633,159 @@ sub canon_time { sub do_load_speech { - my ($speech, $major, $minor, $text) = @_; + my ($speech, $major, $minor, $text) = @_; - my $id = $speech->att('id'); - my $colnum = $speech->att('colnum'); - $colnum =~ s/[^\d]//g if $colnum; + my $id = $speech->att('id'); + my $colnum = $speech->att('colnum'); + $colnum =~ s/[^\d]//g if $colnum; - my $len = length($speech->sprint(1)); - return if ($len == 0); + my $len = length($speech->sprint(1)); + return if ($len == 0); - if (defined $ignorehistorygids{$id}) { - # This happens to historical speeches which have already been - # redirected, and aren't needed to be repeated - #print "Ignoring historical " . $id . "\n"; - return; - } + if (defined $ignorehistorygids{$id}) { + # This happens to historical speeches which have already been + # redirected, and aren't needed to be repeated + #print "Ignoring historical " . $id . "\n"; + return; + } - die "speech without (sub)heading $id '$text'" if $currsection == 0 and $currsubsection == 0; + die "speech without (sub)heading $id '$text'" if $currsection == 0 and $currsubsection == 0; - $hpos++; - my $htime = $speech->att('time'); - $htime = canon_time($htime) if defined $htime; - my $url = $speech->att('url') || ''; + $hpos++; + my $htime = $speech->att('time'); + $htime = canon_time($htime) if defined $htime; + my $url = $speech->att('url') || ''; - my $type; - my $speaker = 0; - my $pretext = ""; - if ($speech->att('person_id') || $speech->att('speakerid')) { - $type = 12; - if ($speech->att('person_id')) { - ($speaker = $speech->att('person_id')) =~ s#uk.org.publicwhip/person/##; - } else { - $speaker = $membertoperson{$speech->att('speakerid')} || 'unknown'; - } - if ($speaker eq "unknown") { - $speaker = 0; - my $encoded = HTML::Entities::encode_entities($speech->att('speakername')); - $pretext = '

' . $encoded . ':

'; - } + my $type; + my $speaker = 0; + my $pretext = ""; + if ($speech->att('person_id') || $speech->att('speakerid')) { + $type = 12; + if ($speech->att('person_id')) { + ($speaker = $speech->att('person_id')) =~ s#uk.org.publicwhip/person/##; } else { - # procedural - $type = 13; + $speaker = $membertoperson{$speech->att('speakerid')} || 'unknown'; + } + if ($speaker eq "unknown") { + $speaker = 0; + my $encoded = HTML::Entities::encode_entities($speech->att('speakername')); + $pretext = '

' . $encoded . ':

'; } + } else { + # procedural + $type = 13; + } - my @epparam = ($pretext . $text); - my @hparam = ($id, $colnum, $type, $speaker, $major, $minor, $currsection, $currsubsection, $hpos, $curdate, $htime, $url); - my $epid = db_addpair(\@epparam, \@hparam); + my @epparam = ($pretext . $text); + my @hparam = ($id, $colnum, $type, $speaker, $major, $minor, $currsection, $currsubsection, $hpos, $curdate, $htime, $url); + my $epid = db_addpair(\@epparam, \@hparam); } sub do_load_heading { - my ($speech, $major, $text, $minor) = @_; - $minor ||= 0; - #print "heading " . $text . "\n"; - - if (defined $ignorehistorygids{$speech->att('id')}) { - # This happens to historical headings which have already been - # redirected, and aren't needed to be repeated - #print "Ignoring historical " . $speech->att('id') . "\n"; - return; - } + my ($speech, $major, $text, $minor) = @_; + $minor ||= 0; + #print "heading " . $text . "\n"; + + if (defined $ignorehistorygids{$speech->att('id')}) { + # This happens to historical headings which have already been + # redirected, and aren't needed to be repeated + #print "Ignoring historical " . $speech->att('id') . "\n"; + return; + } - $hpos++; - my $htime = $speech->att('time'); - $htime = canon_time($htime) if defined $htime; - my $url = $speech->att('url') || ''; - my $colnum = $speech->att('colnum'); - $colnum =~ s/[^\d]//g if $colnum; + $hpos++; + my $htime = $speech->att('time'); + $htime = canon_time($htime) if defined $htime; + my $url = $speech->att('url') || ''; + my $colnum = $speech->att('colnum'); + $colnum =~ s/[^\d]//g if $colnum; - my $type = 10; - my $speaker = 0; + my $type = 10; + my $speaker = 0; - my @epparam = (fix_case($text)); - my @hparam = ($speech->att('id'), $colnum, $type, $speaker, $major, $minor, 0, 0, $hpos, $curdate, $htime, $url); - my $epid = db_addpair(\@epparam, \@hparam); + my @epparam = (fix_case($text)); + my @hparam = ($speech->att('id'), $colnum, $type, $speaker, $major, $minor, 0, 0, $hpos, $curdate, $htime, $url); + my $epid = db_addpair(\@epparam, \@hparam); - $currsubsection = $epid; - $currsection = $epid; + $currsubsection = $epid; + $currsection = $epid; } sub do_load_subheading { - my ($speech, $major, $text, $minor) = @_; - $minor ||= 0; - #print "subheading " . $speech->att('id') . "\n"; - - if (defined $ignorehistorygids{$speech->att('id')}) { - # This happens to historical headings which have already been - # redirected, and aren't needed to be repeated - #print "Ignoring historical " . $speech->att('id') . "\n"; - return; - } + my ($speech, $major, $text, $minor) = @_; + $minor ||= 0; + #print "subheading " . $speech->att('id') . "\n"; + + if (defined $ignorehistorygids{$speech->att('id')}) { + # This happens to historical headings which have already been + # redirected, and aren't needed to be repeated + #print "Ignoring historical " . $speech->att('id') . "\n"; + return; + } - # if the current section heading is a promoted one, clear it as if - # it weren't there (to stop this subsection heading going under it) - if ($promotedheading == $currsection) - { - $currsection = 0; - } - # fawkes PHP scripts don't display minor headings without a major - # heading before them. so we make such minor headings into major - # headings. this only happens at the start of a file. - if ($currsection == 0) - { - # print "subheading promoted to heading " . $speech->att('id') . " $text\n"; - do_load_heading($speech, $major, $text, $minor); - # store so we promote other subheadings, rather than putting under this - $promotedheading = $currsection; - return; - } + # if the current section heading is a promoted one, clear it as if + # it weren't there (to stop this subsection heading going under it) + if ($promotedheading == $currsection) + { + $currsection = 0; + } + # fawkes PHP scripts don't display minor headings without a major + # heading before them. so we make such minor headings into major + # headings. this only happens at the start of a file. + if ($currsection == 0) + { + # print "subheading promoted to heading " . $speech->att('id') . " $text\n"; + do_load_heading($speech, $major, $text, $minor); + # store so we promote other subheadings, rather than putting under this + $promotedheading = $currsection; + return; + } - $hpos++; - my $htime = $speech->att('time'); - $htime = canon_time($htime) if defined $htime; - my $url = $speech->att('url') || ''; - my $colnum = $speech->att('colnum'); - $colnum =~ s/[^\d]//g if $colnum; + $hpos++; + my $htime = $speech->att('time'); + $htime = canon_time($htime) if defined $htime; + my $url = $speech->att('url') || ''; + my $colnum = $speech->att('colnum'); + $colnum =~ s/[^\d]//g if $colnum; - my $type = 11; - my $speaker = 0; + my $type = 11; + my $speaker = 0; - my @epparam = (fix_case($text)); - my @hparam = ($speech->att('id'), $colnum, $type, $speaker, $major, $minor, $currsection, 0, $hpos, $curdate, $htime, $url); - my $epid = db_addpair(\@epparam, \@hparam); + my @epparam = (fix_case($text)); + my @hparam = ($speech->att('id'), $colnum, $type, $speaker, $major, $minor, $currsection, 0, $hpos, $curdate, $htime, $url); + my $epid = db_addpair(\@epparam, \@hparam); - $currsubsection = $epid; + $currsubsection = $epid; } sub do_load_gidredirect { - my ($gidredirect, $major) = @_; + my ($gidredirect, $major) = @_; - my $oldgid = $gidredirect->att('oldgid'); - my $newgid = $gidredirect->att('newgid'); - my $matchtype = $gidredirect->att('matchtype'); - # if matchtype is multiplecover, let through >1 identical GIDs + my $oldgid = $gidredirect->att('oldgid'); + my $newgid = $gidredirect->att('newgid'); + my $matchtype = $gidredirect->att('matchtype'); + # if matchtype is multiplecover, let through >1 identical GIDs - $ignorehistorygids{$oldgid} = 1; + $ignorehistorygids{$oldgid} = 1; - if ($tallygidsmode) { - if ($matchtype ne 'multiplecover' && defined $gids{$oldgid} && $grdests{$oldgid}[0] ne $newgid) { - die "Got gid $oldgid twice, with different destinations, in XML file"; - } - $gids{$oldgid} = 1; - $grdests{$oldgid} = [ $newgid, $matchtype ]; - return; - } else { - return if ($matchtype eq 'multiplecover' && defined $gids{$oldgid}); - $gids{$oldgid} = 1; - return if ($matchtype eq 'removed'); + if ($tallygidsmode) { + if ($matchtype ne 'multiplecover' && defined $gids{$oldgid} && $grdests{$oldgid}[0] ne $newgid) { + die "Got gid $oldgid twice, with different destinations, in XML file"; } + $gids{$oldgid} = 1; + $grdests{$oldgid} = [ $newgid, $matchtype ]; + return; + } else { + return if ($matchtype eq 'multiplecover' && defined $gids{$oldgid}); + $gids{$oldgid} = 1; + return if ($matchtype eq 'removed'); + } - $gradd->execute($oldgid, $newgid, $curdate, $major); - $gradd->finish(); + $gradd->execute($oldgid, $newgid, $curdate, $major); + $gradd->finish(); } # TODO From b05b79331b74d8420a99b1500c6742da24b9ef7f Mon Sep 17 00:00:00 2001 From: Matthew Somerville Date: Wed, 20 May 2015 16:31:31 +0100 Subject: [PATCH 2/2] Utilise any 'redirect' entries in JSON data. This allows us to load in people that have been merged without having to update every relevant XML file or search index, and redirect at the front end. --- scripts/load-people | 25 +++++++++--- scripts/xml2db.pl | 43 +++++++++++++------- www/includes/easyparliament/member.php | 14 +++++++ www/includes/easyparliament/searchengine.php | 21 ++++++++-- 4 files changed, 80 insertions(+), 23 deletions(-) diff --git a/scripts/load-people b/scripts/load-people index 0bc8c46029..8c24cc9552 100755 --- a/scripts/load-people +++ b/scripts/load-people @@ -51,7 +51,7 @@ verbose("End"); # --- -my ($dbh, $memberadd, $memberexist, $membercheck, $nameadd, $nameupdate, $namefetch, $namedelete); +my ($dbh, $memberadd, $memberexist, $membercheck, $nameadd, $nameupdate, $namefetch, $namedelete, $gradd, $grdelete); sub db_connect { #DBI->trace(1); @@ -73,6 +73,9 @@ sub db_connect { family_name, lordofname, start_date, end_date) values (?, ?, ?, ?, ?, ?, ?, ?)"); $namefetch = $dbh->prepare("select id from person_names where person_id = ? and end_date >= ? ORDER BY id"); $namedelete = $dbh->prepare('delete from person_names where id=?'); + + $gradd = $dbh->prepare("replace into gidredirect (gid_from, gid_to) values (?,?)"); + $grdelete = $dbh->prepare("delete from gidredirect where gid_from = ?"); } my %organizations; @@ -142,10 +145,18 @@ sub load_people { $posts{$_->{id}} = $_; } foreach (@{$j->{memberships}}) { - load_member($_); + if ($_->{redirect}) { + $gradd->execute($_->{id}, $_->{redirect}); + } else { + load_member($_); + } } foreach (@{$j->{persons}}) { - load_names($_); + if ($_->{redirect}) { + $gradd->execute($_->{id}, $_->{redirect}); + } else { + load_names($_); + } } } @@ -154,7 +165,9 @@ my %member_ids = (); sub load_member { my ($member) = @_; - (my $id = $member->{id}) =~ s:uk.org.publicwhip/(member|lord|royal)/::; + my $id = $member->{id}; + $grdelete->execute($id); + $id =~ s:uk.org.publicwhip/(member|lord|royal)/::; (my $person_id = $member->{person_id}) =~ s#uk.org.publicwhip/person/##; $member_ids{$id} = 1; @@ -195,7 +208,9 @@ sub load_member { sub load_names { my $person = shift; - (my $id = $person->{id}) =~ s#uk.org.publicwhip/person/##; + my $id = $person->{id}; + $grdelete->execute($id); + $id =~ s#uk.org.publicwhip/person/##; my @names; foreach my $name (@{$person->{other_names}}) { diff --git a/scripts/xml2db.pl b/scripts/xml2db.pl index f0cd1bb345..1f53050600 100755 --- a/scripts/xml2db.pl +++ b/scripts/xml2db.pl @@ -132,7 +132,7 @@ END use vars qw($hpos $curdate); use vars qw($currsection $currsubsection $inoralanswers $promotedheading); use vars qw(%gids %grdests %ignorehistorygids $tallygidsmode $tallygidsmodedummycount); -use vars qw(%membertoperson); +use vars qw(%membertoperson %personredirect); use vars qw($current_file); use vars qw($debatesdir $wransdir $lordswransdir $westminhalldir $wmsdir @@ -267,9 +267,20 @@ sub process_type { my $pwmembers = mySociety::Config::get('PWMEMBERS'); my $j = decode_json(read_file($pwmembers . 'people.json')); foreach (@{$j->{memberships}}) { + next if $_->{redirect}; (my $person_id = $_->{person_id}) =~ s#uk.org.publicwhip/person/##; $membertoperson{$_->{id}} = $person_id; } +foreach (@{$j->{memberships}}) { + next unless $_->{redirect}; + $membertoperson{$_->{id}} = $membertoperson{$_->{redirect}}; +} +foreach (@{$j->{persons}}) { + next unless $_->{redirect}; + (my $id = $_->{id}) =~ s#uk.org.publicwhip/person/##; + (my $redirect = $_->{redirect}) =~ s#uk.org.publicwhip/person/##; + $personredirect{$id} = $redirect; +} # Process main data process_type(["debates"], [$debatesdir], \&add_debates_day) if ($debates) ; @@ -884,6 +895,14 @@ sub last_id return $arr[0]; } +sub person_id { + my ($item, $member_id_attr) = @_; + my $person_id = $item->att('person_id') || $membertoperson{$item->att($member_id_attr)} || 'unknown'; + $person_id =~ s/.*\///; + $person_id = $personredirect{$person_id} || $person_id; + return $person_id; +} + ########################################################################## # Written Answers @@ -1030,13 +1049,12 @@ sub load_debate_division { die unless $side =~ /^(aye|no|content|not-content)$/; $text .= "

\u$side

    "; my @names = $list->children($vote_tag); # attr ids vote (teller), text is name - foreach my $person (@names) { - my $person_id = $person->att('person_id') || $membertoperson{$person->att('id')}; - $person_id =~ s/.*\///; - my $vote = $person->att('vote'); + foreach my $vote (@names) { + my $person_id = person_id($vote, 'id'); + my $vote = $vote->att('vote'); die unless $vote eq $side; - my $teller = $person->att('teller'); - my $name = $person->sprint(1); + my $teller = $vote->att('teller'); + my $name = $vote->sprint(1); $name =~ s/ *\[Teller\]//; # In Lords $name =~ s/^(.*), (.*)$/$2 $1/; $name =~ s/^(rh|Mr|Sir|Ms|Mrs|Dr) //; @@ -1407,7 +1425,7 @@ sub add_standing_day { foreach (@names) { my $chairman = ($_->parent()->tag() eq 'chairmen'); my $attending = ($_->att('attending') eq 'true'); - my $person_id = $_->att('person_id') || $membertoperson{$_->att('memberid')}; + my $person_id = person_id($_, 'memberid'); $current_file =~ /_(\d\d-\d)_/; my $sitting = $1; if (my ($id, $curr_attending) = $dbh->selectrow_array('select id,attending from pbc_members where person_id=? and bill_id=? @@ -1497,8 +1515,7 @@ sub load_standing_division { my @names = $division->descendants('mpname'); my %out = ( aye => '', no => '' ); foreach (@names) { - my $person_id = $_->att('person_id') || $membertoperson{$_->att('memberid')}; - $person_id =~ s/.*\///; + my $person_id = person_id($_, 'memberid'); my $name = $_->att('membername'); my $v = $_->att('vote'); $out{$v} .= '' . $name . ', '; @@ -1661,11 +1678,7 @@ sub do_load_speech my $pretext = ""; if ($speech->att('person_id') || $speech->att('speakerid')) { $type = 12; - if ($speech->att('person_id')) { - ($speaker = $speech->att('person_id')) =~ s#uk.org.publicwhip/person/##; - } else { - $speaker = $membertoperson{$speech->att('speakerid')} || 'unknown'; - } + $speaker = person_id($speech, 'speakerid'); if ($speaker eq "unknown") { $speaker = 0; my $encoded = HTML::Entities::encode_entities($speech->att('speakername')); diff --git a/www/includes/easyparliament/member.php b/www/includes/easyparliament/member.php index 925c58f998..459f14d220 100644 --- a/www/includes/easyparliament/member.php +++ b/www/includes/easyparliament/member.php @@ -81,6 +81,13 @@ public function MEMBER($args) { $person_id = $this->postcode_to_person_id($args['postcode'], $house); } elseif (isset($args['person_id']) && is_numeric($args['person_id'])) { $person_id = $args['person_id']; + $q = $this->db->query("SELECT gid_to FROM gidredirect + WHERE gid_from = :gid_from", + array(':gid_from' => "uk.org.publicwhip/person/$person_id") + ); + if ($q->rows > 0) { + $person_id = str_replace('uk.org.publicwhip/person/', '', $q->field(0, 'gid_to')); + } } if (!$person_id) { @@ -183,6 +190,13 @@ public function member_id_to_person_id($member_id) { WHERE member_id = :member_id", array(':member_id' => $member_id) ); + if ($q->rows == 0) { + $q = $this->db->query("SELECT person_id FROM gidredirect, member + WHERE gid_from = :gid_from AND + CONCAT('uk.org.publicwhip/member/', member_id) = gid_to", + array(':gid_from' => "uk.org.publicwhip/member/$member_id") + ); + } if ($q->rows > 0) { return $q->field(0, 'person_id'); } else { diff --git a/www/includes/easyparliament/searchengine.php b/www/includes/easyparliament/searchengine.php index 47af6cb94d..b57c39dcbb 100644 --- a/www/includes/easyparliament/searchengine.php +++ b/www/includes/easyparliament/searchengine.php @@ -178,6 +178,15 @@ public function SEARCHENGINE($query) { $this->query .= " 19990101..$to"; } + # Merged people + $db = new ParlDB; + $merged = $db->query('SELECT * FROM gidredirect WHERE gid_from LIKE :gid_from', array(':gid_from' => "uk.org.publicwhip/person/%")); + for ($n=0; $n<$merged->rows(); $n++) { + $from_id = str_replace('uk.org.publicwhip/person/', '', $merged->field($n, 'gid_from')); + $to_id = str_replace('uk.org.publicwhip/person/', '', $merged->field($n, 'gid_to')); + $this->query = preg_replace("#speaker:($from_id|$to_id)#i", "(speaker:$from_id OR speaker:$to_id)", $this->query); + } + twfy_debug("SEARCH", "prefixed: " . var_export($this->prefixed, true)); twfy_debug("SEARCH", "query -- ". $this->query); @@ -253,11 +262,17 @@ public function SEARCHENGINE($query) { } # Speakers + for ($n=0; $n<$merged->rows(); $n++) { + $from_id = str_replace('uk.org.publicwhip/person/', '', $merged->field($n, 'gid_from')); + $to_id = str_replace('uk.org.publicwhip/person/', '', $merged->field($n, 'gid_to')); + $qd = preg_replace("#\(S$from_id OR S$to_id\)#", "S$to_id", $qd); + } + preg_match_all('#S(\d+)#', $qd, $m); foreach ($m[1] as $mm) { - $member = new MEMBER(array('person_id' => $mm)); - $name = iconv('iso-8859-1', 'utf-8//TRANSLIT', $member->full_name()); # Names are currently in ISO-8859-1 - $qd = str_replace("S$mm", "speaker:$name", $qd); + $member = new MEMBER(array('person_id' => $mm)); + $name = iconv('iso-8859-1', 'utf-8//TRANSLIT', $member->full_name()); # Names are currently in ISO-8859-1 + $qd = str_replace("S$mm", "speaker:$name", $qd); } # Simplify display of excluded words