From 34ac55e01c2e4a38eb5a6f27a6cf0f63d55284e8 Mon Sep 17 00:00:00 2001 From: Friedrich Lindenberg Date: Mon, 4 Nov 2013 10:44:21 +0100 Subject: [PATCH] egofaktor refinements. --- Ego.yaml | 34 ++++++++++++++++++++++++++++++++++ contrib/egofaktor.py | 7 ++++++- 2 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 Ego.yaml diff --git a/Ego.yaml b/Ego.yaml new file mode 100644 index 0000000..7272c5b --- /dev/null +++ b/Ego.yaml @@ -0,0 +1,34 @@ +common: + database: postgresql://localhost/parlament_etl + prefix: data-exports/ + format: csv + +exports: + + - query: > + SELECT partei, (sum(egos)::float/sum(words)::float)*100 AS faktor, SUM(words) FROM egos + WHERE partei IS NOT NULL GROUP BY partei ORDER BY (sum(egos)::float/sum(words)::float) DESC; + filename: ego-parteien.csv + + - query: > + SELECT fingerprint, (egos::float/words::float)*100 AS faktor, words FROM egos + ORDER BY (egos::float/words::float) DESC; + filename: ego-personen.csv + + - query: > + SELECT partei, (sum(egos)::float/sum(words)::float)*100 AS faktor, + (1/(sum(egos)::float/sum(words)::float))::bigint AS wordnum, SUM(words) FROM egos + WHERE partei IS NOT NULL GROUP BY partei ORDER BY (sum(egos)::float/sum(words)::float) DESC; + format: 'json' + filename: parteien.json + + - query: > + SELECT e.fingerprint, p.partei, p.geschlecht, p.vorname, p.nachname, + (e.egos::float/e.words::float)*100 AS faktor, + (1/(e.egos::float/e.words::float))::bigint AS wordnum, e.words FROM egos e + LEFT JOIN person p ON e.fingerprint = p.fingerprint + ORDER BY (e.egos::float/e.words::float) DESC; + format: 'json' + filename: personen.json + + diff --git a/contrib/egofaktor.py b/contrib/egofaktor.py index 9d87314..c96f4d0 100644 --- a/contrib/egofaktor.py +++ b/contrib/egofaktor.py @@ -3,7 +3,7 @@ from collections import defaultdict from unicodedata import normalize as ucnorm, category -egos = re.compile(r'\b(ich|mir|mein|meiner|meines|mich)\b') +egos = re.compile(r'\b(ich|mir|mein|meiner|meines|mich|meines)\b', re.U) words = re.compile('\w{2,}') engine = dataset.connect('postgresql://localhost/parlament_etl') @@ -43,8 +43,13 @@ def normalize(text): egofaktor = engine['egos'] egofaktor.delete() for fp in num_egos.keys(): + pers = engine['person'].find_one(fingerprint=fp) or {} + partei = None + if 'partei' in pers: + partei = pers['partei'] egofaktor.upsert({ 'fingerprint': fp, + 'partei': partei, 'egos': num_egos[fp], 'words': num_words[fp] }, ['fingerprint'])