From 3236e4cf63d0eba122f43d84afec5a856db97c60 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Wed, 25 Nov 2015 23:49:25 +0100 Subject: [PATCH 1/2] [enh] support extra field (unremunerated) in findecl table A,D (added: 15/5/2015) --- parltrack/scrapers/findecl.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/parltrack/scrapers/findecl.py b/parltrack/scrapers/findecl.py index 9ac36a7e..f7da2df0 100755 --- a/parltrack/scrapers/findecl.py +++ b/parltrack/scrapers/findecl.py @@ -53,15 +53,23 @@ def parse_table(rows, threshold=3): for row in rows[1:]: if not row.strip(): if row_texts: - x_pos = max(len(l) for l in row_texts) - 1 + cut_pos = x_pos = max(len(l) for l in row_texts) - 1 + pos = -1 + + x_found = False + for row in row_texts: + if row.strip().endswith(' X'): + x_found = True + cut_pos = len(row[:-1].strip()) + 1 + pos = 0 + break - if x_pos > min(column_index.values()) - threshold: - row_text = ' '.join(x[:x_pos-1].strip() for x in row_texts) + if x_found: + row_text = ' '.join(x[:cut_pos].strip() for x in row_texts) else: row_text = ' '.join(x.strip() for x in row_texts) if len(row_text) > 5: - pos = -1 for i,v in column_index.items(): if x_pos <= v + threshold and x_pos >= v - threshold: pos = i From 87a583c0b3a18f4f647074e1bd53102886abb7d1 Mon Sep 17 00:00:00 2001 From: Adam Tauber Date: Sat, 28 Nov 2015 22:29:27 +0100 Subject: [PATCH 2/2] [fix] findecl table row whitespaces --- parltrack/scrapers/findecl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parltrack/scrapers/findecl.py b/parltrack/scrapers/findecl.py index f7da2df0..22f6720e 100755 --- a/parltrack/scrapers/findecl.py +++ b/parltrack/scrapers/findecl.py @@ -65,7 +65,7 @@ def parse_table(rows, threshold=3): break if x_found: - row_text = ' '.join(x[:cut_pos].strip() for x in row_texts) + row_text = ' '.join(x.strip()[:cut_pos] for x in row_texts) else: row_text = ' '.join(x.strip() for x in row_texts)