More configs (#102)
* Add Petr2 config file

* Use config file for Mongo details

* Add Petr2 config file

* Forgot about the scraper conns here
ahalterman authored and johnb30 committed May 1, 2017
1 parent af5c2b0 commit d1639c5
Showing 7 changed files with 163 additions and 17 deletions.
8 changes: 6 additions & 2 deletions PHOX_config.ini
@@ -9,7 +9,7 @@ geo_service = Mordecai
cliff_host = http://localhost
cliff_port = 8080
mordecai_host = http://localhost
mordecai_port = 5011
mordecai_port = 5000

[Pipeline]
scraper_stem = scraper_results_
@@ -20,11 +20,15 @@ dupfile_stem = Phoenix.dupindex.
outputfile_stem = Phoenix.events.20
newsourcestem = newsources.

oneaday_filter = True
oneaday_filter = False

[Petrarch]
petrarch_version = 2

[Mongo]
db = event_scrape
collection = stories

#[Logging]
#log_file = /root/logs/pipeline.log

24 changes: 20 additions & 4 deletions README.md
@@ -6,8 +6,9 @@ phoenix_pipeline

Turning news into events since 2014.

This system links a series of Python programs to convert the files which have
been downloaded by a [web scraper](https://github.com/openeventdata/scraper) to

This system links a series of Python programs to convert the files which have been
downloaded by a [web scraper](https://github.com/openeventdata/scraper) to
coded event data which is uploaded to a web site designated in the config file.
The system processes a single day of information, but this can be derived from
multiple text files. The pipeline also implements a filter for source URLs as
@@ -16,6 +17,7 @@ defined by the keys in the `source_keys.txt` file. These keys correspond to the

For more information please visit the [documentation](http://phoenix-pipeline.readthedocs.org/en/latest/).


## Requirements

The pipeline requires either
@@ -41,8 +43,22 @@ setup instructions [here](https://github.com/openeventdata/mordecai). The
version of the pipeline deployed in production currently uses CLIFF/CLAVIN, but
future development will focus on improvements to Mordecai.

##Running
## Configuration

The pipeline has two configuration files. `PHOX_config.ini` specifies which
geolocation system to use, how to name the files produced by the pipeline, and
how to upload the files to a remote server if desired.

`petr_config.ini` is the configuration file for Petrarch2 itself, including the
location of dictionaries, new actor extraction options, and the one-a-day filter. For
more details see the main [Petrarch2 repo](https://github.com/openeventdata/petrarch2/).
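
When the `[Mongo]` section is absent, the pipeline falls back to its previous hardcoded database and collection names. A minimal sketch of that lookup, mirroring `utilities.parse_config` as changed in this commit (the Python 3 `configparser` import is an assumption; the codebase may use the Python 2 module name):

```python
from configparser import ConfigParser  # Python 2: import ConfigParser

parser = ConfigParser()
parser.read('PHOX_config.ini')

# Optional [Mongo] section; fall back to the old hardcoded names if absent.
if 'Mongo' in parser.sections():
    db_db = parser.get('Mongo', 'db')                  # e.g. event_scrape
    db_collection = parser.get('Mongo', 'collection')  # e.g. stories
else:
    db_db = 'event_scrape'
    db_collection = 'stories'
```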

## Running

To run the program:

`python pipeline.py`
```
python pipeline.py
```


6 changes: 4 additions & 2 deletions geolocation.py
@@ -288,7 +288,8 @@ def mordecai(events, file_details, server_details, geo_details):
Same as in the parameter but with the addition of a value that is
a list of lon, lat, placeName, stateName, countryCode.
"""
coll = utilities.make_conn(file_details.auth_db, file_details.auth_user,
coll = utilities.make_conn(file_details.db_db, file_details.db_collection,
file_details.auth_db, file_details.auth_user,
file_details.auth_pass)

for event in events:
@@ -329,7 +330,8 @@ def cliff(events, file_details, server_details, geo_details):
Same as in the parameter but with the addition of a value that is
a list of lon, lat, placeName, stateName, countryCode.
"""
coll = utilities.make_conn(file_details.auth_db, file_details.auth_user,
coll = utilities.make_conn(file_details.db_db, file_details.db_collection,
file_details.auth_db, file_details.auth_user,
file_details.auth_pass)

for event in events:
106 changes: 106 additions & 0 deletions petr_config.ini
@@ -0,0 +1,106 @@
# Configuration file for release version of PETRARCH event coder
# Codes the GigaWord.sample.PETR.txt using current dictionaries and default options
# Last update: 30 April 2015

[Dictionaries]
# See the PETRreader.py file for the purpose and format of these files
verbfile_name = CAMEO.2.0.txt
actorfile_list = Phoenix.Countries.actors.txt, Phoenix.International.actors.txt, Phoenix.MilNonState.actors.txt
agentfile_name = Phoenix.agents.txt
discardfile_name = Phoenix.discards.txt
issuefile_name = Phoenix.IssueCoding.txt




[Options]
# textfile_list is a comma-delimited list of text files to code. This list has priority if
# both a textfile_list and textfile_name are present
textfile_list = data/text/GigaWord.sample.PETR.xml
#textfile_list = AFP0808-01.xml, AFP0909-01.xml, AFP1210-01.xml
# textfile_name is the name of a file containing a list of names of files to code, one
# file name per line.
#textfile_name = PETR.textfiles.benchmark.txt

# eventfile_name is the output file for the events
eventfile_name = events.PETR-Demo.txt


# INTERFACE OPTIONS: uncomment to activate
# Default: set all of these false, which is equivalent to an A)utocode in TABARI

# code_by_sentence: show events after each sentence has been coded; default is to
# show events after all of the sentences in a story have been coded
code_by_sentence = True
# pause_by_sentence: pause after the coding of each sentence. Entering 'Return' will
# cause the next sentence to be coded; entering any character will
# cause the program to exit. Default is to code without any pausing.
#pause_by_sentence = True
# pause_by_story: pause after the coding of each story.
#pause_by_story = True


# CODING OPTIONS:
# Defaults are more or less equivalent to TABARI

# write_actor_root: If True, the event record will include the text of the actor root:
# The root is the text at the head of the actor synonym set in the
# dictionary. Default is False
write_actor_root = False

# write_actor_text: If True, the event record will include the complete text of
# the noun phrase that was used to identify the actor. Default is False
write_actor_text = True

# write_event_text: If True, the event record will include the complete text of
# the verb phrase that was used to identify the event. Default is False
write_event_text = True

# NULL CODING OPTIONS
# null_verbs: If True, only get verb phrases that are not in the dictionary but are associated
# with coded noun phrases
null_verbs = False

# null_actors: If True, only get actor phrases that are not in the dictionary but associated with
# coded verb phrases. This also requires new_actor_length to be set to a value > 0:
# typically a value of 4 to 8 will give good results.
null_actors = False

# new_actor_length: Maximum length for new actors extracted from noun phrases if no
# actor or agent generating a code is found. To disable and just
# use null codes "---", set to zero; this is the default.
# Setting this to a large number will extract anything found in a (NP
# noun phrase, though usually true actors contain a small number of words.
# This must be an integer.
new_actor_length = 0

# require_dyad: Events require a non-null source and target: setting this false is likely
# to result in a very large number of nonsense events. As happened with the
# infamous GDELT data set of 2013-2014. And certainly no one wants to see
# that again. So the default is True
require_dyad = False

# stop_on_error: If True, parsing errors cause the program to halt; typically used for
# debugging. With the default [False], the error is written to the error
# file, the record is skipped, and processing continues.
stop_on_error = False

# commas: These adjust the length (in words) of comma-delimited clauses that are eliminated
# from the parse. To deactivate, set the max to zero.
# Defaults, based on TABARI, are in ()
# comma_min : internal clause minimum length [2]
# comma_max : internal clause maximum length [8]
# comma_bmin : initial ("begin") clause minimum length [0]
# comma_bmax : initial clause maximum length [0 : deactivated by default]
# comma_emin : terminal ("end") clause minimum length [2]
# comma_emax : terminal clause maximum length [8]
comma_min = 2
comma_max = 8
comma_bmin = 0
comma_bmax = 0
comma_emin = 2
comma_emax = 8

[StanfordNLP]
stanford_dir = ~/stanford-corenlp/

9 changes: 6 additions & 3 deletions pipeline.py
@@ -14,7 +14,7 @@
import scraper_connection


def main(file_details, geo_details, server_details, petrarch_version, logger_file=None, run_filter=None,
def main(file_details, geo_details, server_details, petrarch_version, mongo_details, logger_file=None, run_filter=None,
run_date='', version=''):
"""
Main function to run all the things.
@@ -32,6 +32,9 @@ def main(file_details, geo_details, server_details, petrarch_version, logger_fil
Config information specifically related to the remote
server for FTP uploading.
petrarch_version: String.
Which version of Petrarch to use. Must be '1' or '2'
logger_file: String.
Path to a log file. Defaults to ``None`` and opens a
``PHOX_pipeline.log`` file in the current working
@@ -111,12 +114,12 @@ def main(file_details, geo_details, server_details, petrarch_version, logger_fil
# petrarch.run_pipeline(formatted,
# '{}{}.txt'.format(file_details.fullfile_stem,
# date_string), parsed=True)
petr_results = petrarch.run_pipeline(formatted, write_output=False,
petr_results = petrarch.run_pipeline(formatted, config = "petr_config.ini", write_output=False,
parsed=True)
elif run_filter == 'True':
print('Running PETRARCH and returning output.')
logger.info('Running PETRARCH and returning output.')
petr_results = petrarch.run_pipeline(formatted, write_output=False,
petr_results = petrarch.run_pipeline(formatted, config = "petr_config.ini", write_output=False,
parsed=True)
else:
print("""Can't run with the options you've specified. You need to fix
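Both PETRARCH branches above now pass the new config file explicitly. A sketch of the updated call as it runs inside `main()` (the import path is an assumption — the pipeline selects PETRARCH 1 or 2 via `petrarch_version` — and `formatted` holds the parsed stories assembled earlier in `main()`):

```python
from petrarch2 import petrarch2 as petrarch  # assumed import for version '2'

petr_results = petrarch.run_pipeline(formatted,                # parsed stories
                                     config="petr_config.ini", # new Petrarch2 settings file
                                     write_output=False,       # keep results in memory
                                     parsed=True)              # input is already parsed
```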
4 changes: 3 additions & 1 deletion scraper_connection.py
@@ -74,6 +74,7 @@ def query_all(collection, lt_date, gt_date, sources, write_file=False):
posts = collection.find({"$and": [{"date_added": {"$lte": lt_date}},
{"date_added": {"$gt": gt_date}},
{"source": {"$in": sources}}]})
#posts = collection.find()

print('Total number of stories: {}'.format(posts.count()))
logger.info('Total number of stories: {}'.format(posts.count()))
@@ -145,7 +146,8 @@ def main(current_date, file_details, write_file=False, file_stem=None):
"""
sources = _get_sources('source_keys.txt')
conn = utilities.make_conn(file_details.auth_db, file_details.auth_user,
conn = utilities.make_conn(file_details.db_db, file_details.db_collection,
file_details.auth_db, file_details.auth_user,
file_details.auth_pass, file_details.db_host)

less_than = datetime.datetime(current_date.year, current_date.month,
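For reference, a self-contained sketch of the single-day window that `query_all` issues against the stories collection (the unauthenticated local connection, dates, and source keys here are hypothetical; in the pipeline, `make_conn` builds the collection from the config values):

```python
import datetime

from pymongo import MongoClient

collection = MongoClient()['event_scrape']['stories']  # hypothetical local conn

current_date = datetime.datetime(2017, 5, 1)
less_than = datetime.datetime(current_date.year, current_date.month,
                              current_date.day)
greater_than = less_than - datetime.timedelta(days=1)
sources = ['bbc_news', 'reuters_world']  # hypothetical keys from source_keys.txt

# Same $and query as query_all(): one day of stories from known sources.
posts = collection.find({"$and": [{"date_added": {"$lte": less_than}},
                                  {"date_added": {"$gt": greater_than}},
                                  {"source": {"$in": sources}}]})
print('Total number of stories: {}'.format(posts.count()))
```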
23 changes: 18 additions & 5 deletions utilities.py
@@ -101,6 +101,15 @@ def parse_config(config_filename):

petrarch_version = parser.get('Petrarch', 'petrarch_version')

if 'Mongo' in parser.sections():
db_db = parser.get('Mongo', 'db')
db_collection = parser.get('Mongo', 'collection')
else:
db_db = 'event_scrape'
db_collection = 'stories'



file_attrs = namedtuple('FileAttributes', ['scraper_stem',
'recordfile_stem',
'fullfile_stem',
@@ -112,12 +121,16 @@
'auth_db',
'auth_user',
'auth_pass',
'db_host'])
'db_host',
'db_db',
'db_collection'])

file_list = file_attrs(scraper_stem, recordfile_stem, fullfile_stem,
eventfile_stem, dupfile_stem, outputfile_stem,
oneaday_filter, log_file, auth_db, auth_user,
auth_pass, db_host)
auth_pass, db_host, db_db, db_collection)



return server_list, geo_list, file_list, petrarch_version
except Exception as e:
@@ -161,7 +174,7 @@ def do_RuntimeError(st1, filename='', st2=''):
raise RuntimeError(st1 + ' ' + filename + ' ' + st2)


def make_conn(db_auth, db_user, db_pass, db_host=None):
def make_conn(db_db, db_collection, db_auth, db_user, db_pass, db_host=None):
"""
Function to establish a connection to a local MongoDB instance.
@@ -192,8 +205,8 @@ def make_conn(db_db, db_collection, db_auth, db_user, db_pass, db_host=None):
client = MongoClient()
if db_auth:
client[db_auth].authenticate(db_user, db_pass)
database = client.event_scrape
collection = database['stories']
database = client[db_db]
collection = database[db_collection]
return collection


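With the reordered signature, callers pass the database and collection names ahead of the auth details, as the `geolocation.py` and `scraper_connection.py` hunks above show. A usage sketch (the config filename is an assumption):

```python
import utilities

# parse_config returns the server, geo, and file attribute bundles plus the
# Petrarch version string.
server_details, geo_details, file_details, petrarch_version = \
    utilities.parse_config('PHOX_config.ini')

# New argument order: database and collection names first, then auth, then host.
coll = utilities.make_conn(file_details.db_db, file_details.db_collection,
                           file_details.auth_db, file_details.auth_user,
                           file_details.auth_pass, file_details.db_host)
```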
