Merge pull request #18 from oduwsdl/stopword-lists

Stopword lists
oduwsdl · Jul 13, 2020 · 17cd8a3 · 17cd8a3
2 parents 2e071d7 + bf68c78
commit 17cd8a3
Show file tree

Hide file tree

Showing 6 changed files with 30 additions and 96 deletions.
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -26,7 +26,7 @@
 # The short X.Y version
 version = u''
 # The full version, including alpha/beta/rc tags
-release = u'0.2020.06.24.032337'
+release = u'0.2020.07.13.224110'
 
 # -- General configuration ---------------------------------------------------
 

diff --git a/hypercane/__init__.py b/hypercane/__init__.py
@@ -0,0 +1,3 @@
+import os
+
+package_directory = os.path.dirname(os.path.abspath(__file__))
diff --git a/hypercane/actions/report.py b/hypercane/actions/report.py
@@ -249,6 +249,10 @@ def report_ranked_terms(args):
     parser.add_argument('--sumgrams', '--use-sumgrams', help="If specified, generate sumgrams rather than n-grams.",
         action='store_true', default=False, dest='use_sumgrams'
     )
+
+    parser.add_argument('--added-stopwords', help="If specified, add stopwords from this file.",
+        dest='added_stopword_filename', default=None
+    )
 
     args = process_input_args(args, parser)
     output_type = 'mementos'
@@ -268,11 +272,22 @@ def report_ranked_terms(args):
         session, discover_mementos_by_input_type
     )
 
+    added_stopwords = []
+
+    if args.added_stopword_filename is not None:
+        with open(args.added_stopword_filename) as f:
+            for line in f:
+                added_stopwords.append(line.strip())
+
     if args.use_sumgrams is True:
 
         from hypercane.report.sumgrams import generate_sumgrams
+        from hypercane import package_directory
 
-        ranked_terms = generate_sumgrams(list(urimdata.keys()), args.cache_storage)
+        ranked_terms = generate_sumgrams(
+            list(urimdata.keys()), args.cache_storage,
+            added_stopwords=added_stopwords
+            )
 
         with open(args.output_filename, 'w') as f:
 
@@ -286,7 +301,10 @@ def report_ranked_terms(args):
     else:
         from hypercane.report.terms import generate_ranked_terms
 
-        ranked_terms = generate_ranked_terms(list(urimdata.keys()), args.cache_storage, ngram_length=args.ngram_length)
+        ranked_terms = generate_ranked_terms(
+            list(urimdata.keys()), args.cache_storage, 
+            ngram_length=args.ngram_length,
+            added_stopwords=added_stopwords)
 
         with open(args.output_filename, 'w') as f:
 

diff --git a/hypercane/report/sumgrams.py b/hypercane/report/sumgrams.py
@@ -5,7 +5,7 @@
 
 module_logger = logging.getLogger('hypercane.report.sumgrams')
 
-def generate_sumgrams(urimlist, cache_storage):
+def generate_sumgrams(urimlist, cache_storage, added_stopwords=[]):
 
     import concurrent.futures
     import nltk
@@ -42,52 +42,6 @@ def generate_sumgrams(urimlist, cache_storage):
 
     now = datetime.now()
     current_year = now.year
-    last_year = current_year - 1
-    current_date = now.day
-
-    # sumgram processes stop words at two levels:
-    # 1. when the vocabulary is built
-    # 2. stopwords are applied when finding sumgrams
-    # start with single terms before moving on to bigrams, etc.
-
-    # TODO: load these from a file
-    added_stopwords = [
-        "associated press",
-        "com",
-        "donald trump",
-        "fox news",
-        "abc news",
-        "getty images",
-        "last month",
-        "last week",
-        "last year",
-        "pic",
-        "pinterest reddit",
-        "pm et",
-        "president donald",
-        "president donald trump",
-        "president trump",
-        "president trump's",
-        "print mail",
-        "reddit print",
-        "said statement",
-        "send whatsapp",
-        "sign up",
-        "trump administration",
-        "trump said",
-        "twitter",
-        "united states",
-        "washington post",
-        "white house",
-        "whatsapp pinterest",
-        "subscribe whatsapp",
-        "york times",
-        "privacy policy",
-        "terms use"
-    ]
-
-    added_stopwords.append( "{} read".format(last_year) )
-    added_stopwords.append( "{} read".format(current_year) )
 
     stopmonths = [
         "january",
@@ -104,9 +58,6 @@ def generate_sumgrams(urimlist, cache_storage):
         "december"
     ]
 
-    # add just the month to the stop words
-    added_stopwords.extend(stopmonths)
-
     stopmonths_short = [
         "jan",
         "feb",
@@ -122,45 +73,6 @@ def generate_sumgrams(urimlist, cache_storage):
         "dec"
     ]
 
-    added_stopwords.extend(stopmonths_short)
-
-    # add the day of the week, too
-    added_stopwords.extend([
-        "monday",
-        "tuesday",
-        "wednesday",
-        "thursday",
-        "friday",
-        "saturday",
-        "sunday"
-    ])
-
-    added_stopwords.extend([
-        "mon",
-        "tue",
-        "wed",
-        "thu",
-        "fri",
-        "sat",
-        "sun"
-    ])
-
-    # for i in range(1, 13):
-    #     added_stopwords.append(
-    #         datetime(current_year, i, current_date).strftime('%b %Y')
-    #     )
-    #     added_stopwords.append(
-    #         datetime(last_year, i, current_date).strftime('%b %Y')
-    #     )
-
-    # for i in range(1, 13):
-    #     added_stopwords.append(
-    #         datetime(current_year, i, current_date).strftime('%B %Y')
-    #     )
-    #     added_stopwords.append(
-    #         datetime(last_year, i, current_date).strftime('%B %Y')
-    #     )
-
     params = {
         "add_stopwords": ", ".join(added_stopwords),
         "top_sumgram_count": 20,

diff --git a/hypercane/report/terms.py b/hypercane/report/terms.py
@@ -3,7 +3,7 @@
 
 module_logger = logging.getLogger('hypercane.report.terms')
 
-def get_document_tokens(urim, cache_storage, ngram_length):
+def get_document_tokens(urim, cache_storage, ngram_length, added_stopwords=[]):
 
     from hypercane.utils import get_boilerplate_free_content
     from nltk.corpus import stopwords
@@ -12,6 +12,7 @@ def get_document_tokens(urim, cache_storage, ngram_length):
 
     # TODO: stoplist based on language of the document
     stoplist = list(set(stopwords.words('english')))
+    stoplist.extend(added_stopwords)
     punctuation = [ i for i in string.punctuation ]
     additional_stopchars = [ '’', '‘', '“', '”', '•', '·', '—', '–', '›', '»']
     stop_numbers = [ str(i) for i in range(0, 11) ]
@@ -27,7 +28,7 @@ def get_document_tokens(urim, cache_storage, ngram_length):
 
     return list(doc_ngrams)
 
-def generate_ranked_terms(urimlist, cache_storage, ngram_length=1):
+def generate_ranked_terms(urimlist, cache_storage, ngram_length=1, added_stopwords=[]):
 
     import concurrent.futures
     import nltk
@@ -38,7 +39,7 @@ def generate_ranked_terms(urimlist, cache_storage, ngram_length=1):
     # with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
     with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
 
-        future_to_urim = { executor.submit(get_document_tokens, urim, cache_storage, ngram_length): urim for urim in urimlist }
+        future_to_urim = { executor.submit(get_document_tokens, urim, cache_storage, ngram_length, added_stopwords=added_stopwords): urim for urim in urimlist }
 
         for future in concurrent.futures.as_completed(future_to_urim):
 

diff --git a/hypercane/version.py b/hypercane/version.py
@@ -1,3 +1,3 @@
 __appname__ = "hypercane"
-__appversion__ = '0.2020.06.24.032337'
+__appversion__ = '0.2020.07.13.224110'
 __useragent__ = "{}/{}".format(__appname__, __appversion__)