-
Notifications
You must be signed in to change notification settings - Fork 52
/
parse_ministerial_cttes.pl
executable file
·170 lines (143 loc) · 4.43 KB
/
parse_ministerial_cttes.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#!/usr/bin/perl
use warnings;
use strict;
use LWP::Simple;
# The PDF location comes from the first command-line argument; without one
# the script prints a usage line and stops.
# (The December 2006 edition was http://www.pm.gov.uk/files/pdf/Acr3BB.pdf)
my $url = shift || die "usage: $0 http://www.foo.com/path/to/file.pdf\n";

# Pipeline: load the text lines, parse them into committees, tidy up the
# member lists, then print the result as tab-separated rows.
{
    # read() keeps its & sigil: a bare read(...) would call Perl's builtin.
    my $parsed = process(&read);
    my $tidied = cleanup_members($parsed);
    output($tidied);
}
# Load the text version of the committee list.
#
# Takes no arguments.  Returns the lines of /tmp/cab_cttes.txt as a list,
# chomped, with lines that are empty or whitespace-only removed.
# Dies if the text file cannot be opened.
#
# The download-and-convert step is switched off with "if (0)", so the text
# file must already exist; flip the condition to fetch $url and run
# pdftotext again.
#
# NOTE: this sub shares its name with Perl's builtin read(), so callers
# have to invoke it as &read (or main::read()).
sub read {
    my $pdf_path = '/tmp/cab_cttes.pdf';
    my $txt_path = '/tmp/cab_cttes.txt';

    if (0) {
        my $content = get($url) || die "couldn't fetch $url\n";
        open(my $pdf, '>', $pdf_path)
            or die "can't write to temp file $pdf_path:$!";
        binmode $pdf;    # PDF is binary data; avoid newline translation
        print {$pdf} $content;
        # Buffered write errors only show up when the handle is closed.
        close($pdf) or die "can't finish writing $pdf_path:$!";
        # List form runs pdftotext directly, without going through a shell.
        system('pdftotext', '-layout', $pdf_path) == 0
            or die "pdftotext failed on $pdf_path (status $?)\n";
    }

    my @lines;
    open(my $in, '<', $txt_path) or die "can't open $txt_path $!";
    # Read line by line rather than building the whole file as a list first.
    while (defined(my $line = <$in>)) {
        chomp($line);
        next if $line =~ m#^\s*$#;
        push @lines, $line;
    }
    close($in);
    return (@lines);
}
# Parse the text lines into a nested hash of committees.
#
# Arguments: the non-blank lines of the text dump, in document order.
# Returns:   a hash reference shaped as
#              { $committee => { $subcommittee => {
#                    composition      => [ raw member lines ],
#                    termsofreference => "text",
#              } } }
#            $subcommittee is '' when no SUB-COMMITTEE heading follows the
#            title; otherwise it is the heading text.  Sub-committee names
#            and terms of reference are built by appending " " . line, so
#            both carry a leading space.  An empty hash reference is
#            returned when nothing could be parsed.
#
# NOTE(review): the heading and end-of-composition patterns below anchor on
# a form feed (\f).  pdftotext writes one at every page break, so "\f" at
# the start of a line marks the first line of a new page.  The patterns as
# previously written ("^(.*)" and "^(?:TERMS OF REF|)") matched every line,
# which looks like a literal form-feed character lost in transit - confirm
# against real pdftotext output.
#
# Every inner loop checks that lines remain, so truncated input ends the
# parse instead of spinning forever on an empty list.
sub process {
    my @lines  = @_;
    my $groups = {};    # all committees parsed so far

    # Interesting stuff starts on page 4: discard everything up to and
    # including the line that carries only that page number.
    while (defined(my $skipped = shift @lines)) {
        last if $skipped =~ m#^\s+4\s*$#;
    }

    # "defined" rather than truth, so a line consisting of "0" does not
    # silently end the parse.
    while (defined(my $line = shift @lines)) {
        if ($line =~ m#^\f(.*)#) {
            my $group   = $1;    # committee title, may span several lines
            my $subctte = '';

            # Title continues until a SUB-COMMITTEE or COMPOSITION heading.
            while (@lines and $lines[0] !~ m#^(?:SUB\-COMMITTEE|COMPOSITION)#) {
                $group .= " " . shift @lines;
            }
            if (!@lines) {
                warn "input ended inside the heading for '$group'\n";
                last;
            }

            # Optional sub-committee name, running up to COMPOSITION.
            if ($lines[0] =~ m#^SUB\-COMMITTEE#) {
                do {
                    $subctte .= " " . shift @lines;
                } while (@lines and $lines[0] !~ m#COMPOSITION#);
            }

            if (@lines and $lines[0] =~ m#COMPOSITION#) {
                shift @lines;    # composition heading. ignore
                my $members = $groups->{$group}{$subctte}{'composition'} ||= [];
                # The first line after the heading is always taken; after
                # that, stop in front of TERMS OF REF or a new page.  A
                # trailing page-number line can end up in this list.
                while (@lines) {
                    push @{$members}, shift @lines;
                    last if @lines and $lines[0] =~ m#^(?:TERMS OF REF|\f)#;
                }
            }

            if (@lines and $lines[0] =~ m#TERMS OF REF#) {
                shift @lines;    # heading. ignore
                # Terms run up to (not including) the next page-number line.
                while (@lines and $lines[0] !~ m#^\s+\d+\s*$#) {
                    $groups->{$group}{$subctte}{'termsofreference'} .= " " . shift @lines;
                }
            }
        } elsif ($line =~ m#^\s+\d+\s*$#) {
            # page number. ignore
        } else {
            # Anything else is unexpected; report it and carry on.
            warn $line;
        }
    }
    return ($groups);
}
# Append $text to the last element of @$list, separated by a space.
# When the list is still empty there is nothing to continue, so $text
# becomes the first element instead ($list->[-1] on an empty array is a
# fatal error).
sub _append_or_push {
    my ($list, $text) = @_;
    if (@{$list}) {
        $list->[-1] .= " " . $text;
    } else {
        push @{$list}, $text;
    }
    return;
}

# Turn the raw composition lines of every (sub-)committee into one entry
# per member.
#
# Argument: the hash reference built by process().
# Returns:  the same reference; each {composition} array is replaced by the
#           cleaned-up list (an empty list when there was none).
#
# Pass 1 glues wrapped lines back together using layout rules; pass 2 uses
# title keywords to merge what is left and to tag members listed under an
# "In attendance" / "Also has the right to attend" heading with a
# "[In Attendance] " or "[Right to Attend] " prefix.
sub cleanup_members {
    my $ref = shift;

    # Words a member's title is expected to start with.
    my $title_start = qr#^(?:Prime|Minster|Minister|Deputy|Secretary|Chief Whip|Chief Sec|Chancellor|Leader|The|Attorney|Other|Advocate|Paymaster|Solicitor|Parliamentary|When)#;

    foreach my $ctte (keys %{$ref}) {
        foreach my $subctte (keys %{ $ref->{$ctte} }) {
            my $entry = $ref->{$ctte}{$subctte};

            # pass 1 - rules. does most of the checkable cleanup.
            my @joined;
            foreach my $raw (@{ $entry->{'composition'} || [] }) {
                next if not defined $raw;
                if ($raw =~ m#^[\(a-z]#) {
                    # titles all start with a capital letter
                    _append_or_push(\@joined, $raw);
                } elsif ($raw =~ m#^[^(]+\)#) {
                    # a closing bracket with no opening one is a continuation
                    _append_or_push(\@joined, $raw);
                } elsif ($raw =~ m#^\s+\d+\s*$#) {
                    # errant page number: drop it
                } elsif ($raw =~ m#^\S+\s*$#) {
                    # positions all have more than one word
                    _append_or_push(\@joined, $raw);
                } else {
                    push @joined, $raw;
                }
            }

            # pass 2. Look for keywords. Only doable after previous step
            my @members;
            my $attendance = '';    # prefix for members after a heading
            foreach my $candidate (@joined) {
                if ($candidate =~ $title_start) {
                    my $member = $candidate;
                    if ($member =~ m#^(?:The )?(.*) also has the right to attend#) {
                        $member = "[Right to Attend] $1";
                    }
                    push @members, $attendance . $member;
                } elsif ($candidate =~ m#In attendance#i) {
                    $attendance = "[In Attendance] ";
                } elsif ($candidate =~ m#^Also has the right to attend$#) {
                    $attendance = "[Right to Attend] ";
                } else {
                    # no recognised title: treat as the rest of the previous member
                    _append_or_push(\@members, $candidate);
                }
            }
            $entry->{'composition'} = \@members;
        }
    }
    return ($ref);
}
# Print the parsed committees to STDOUT as tab-separated rows.
#
# Argument: the hash reference produced by process() / cleanup_members().
#
# The first row is the header "ctte subctte type value note".  Each
# (sub-)committee then gets an optional "terms" row followed by one
# "composition" row per member.  A member stored as "[Note] Name" is split
# so that Name goes in the value column and Note in the note column; other
# rows have no note column.
#
# Committees and sub-committees are written in sorted order so that two
# runs over the same data produce identical output (plain hash order
# changes from run to run).
sub output {
    my $ref = shift;
    print "ctte\tsubctte\ttype\tvalue\tnote\n";
    foreach my $ctte (sort keys %{$ref}) {
        foreach my $subctte (sort keys %{ $ref->{$ctte} }) {
            my $entry = $ref->{$ctte}{$subctte};
            if ($entry->{'termsofreference'}) {
                print "$ctte\t$subctte\tterms\t$entry->{'termsofreference'}\n";
            }
            # "|| []" keeps a missing member list from being created here.
            foreach my $member (@{ $entry->{'composition'} || [] }) {
                if ($member =~ m#^\[([^\]]+)\] (.*)#) {
                    print "$ctte\t$subctte\tcomposition\t$2\t$1\n";
                } else {
                    print "$ctte\t$subctte\tcomposition\t$member\n";
                }
            }
        }
    }
    return;
}