Skip to content

Commit

Permalink
Merge pull request #18 from oduwsdl/stopword-lists
Browse files: browse the repository at this point in the history
Stopword lists
  • Loading branch information
shawnmjones committed Jul 13, 2020
2 parents 2e071d7 + bf68c78 commit 17cd8a3
Show file tree
Hide file tree
Showing 6 changed files with 30 additions and 96 deletions.
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
# The short X.Y version
version = u''
# The full version, including alpha/beta/rc tags
release = u'0.2020.06.24.032337'
release = u'0.2020.07.13.224110'

# -- General configuration ---------------------------------------------------

Expand Down
3 changes: 3 additions & 0 deletions hypercane/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import os

# Absolute path of the directory containing this module file, computed once
# at import time. The head component of os.path.split() is the same value
# that os.path.dirname() returns for the same path.
package_directory = os.path.split(os.path.abspath(__file__))[0]
22 changes: 20 additions & 2 deletions hypercane/actions/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,10 @@ def report_ranked_terms(args):
parser.add_argument('--sumgrams', '--use-sumgrams', help="If specified, generate sumgrams rather than n-grams.",
action='store_true', default=False, dest='use_sumgrams'
)

parser.add_argument('--added-stopwords', help="If specified, add stopwords from this file.",
dest='added_stopword_filename', default=None
)

args = process_input_args(args, parser)
output_type = 'mementos'
Expand All @@ -268,11 +272,22 @@ def report_ranked_terms(args):
session, discover_mementos_by_input_type
)

added_stopwords = []

if args.added_stopword_filename is not None:
with open(args.added_stopword_filename) as f:
for line in f:
added_stopwords.append(line.strip())

if args.use_sumgrams is True:

from hypercane.report.sumgrams import generate_sumgrams
from hypercane import package_directory

ranked_terms = generate_sumgrams(list(urimdata.keys()), args.cache_storage)
ranked_terms = generate_sumgrams(
list(urimdata.keys()), args.cache_storage,
added_stopwords=added_stopwords
)

with open(args.output_filename, 'w') as f:

Expand All @@ -286,7 +301,10 @@ def report_ranked_terms(args):
else:
from hypercane.report.terms import generate_ranked_terms

ranked_terms = generate_ranked_terms(list(urimdata.keys()), args.cache_storage, ngram_length=args.ngram_length)
ranked_terms = generate_ranked_terms(
list(urimdata.keys()), args.cache_storage,
ngram_length=args.ngram_length,
added_stopwords=added_stopwords)

with open(args.output_filename, 'w') as f:

Expand Down
90 changes: 1 addition & 89 deletions hypercane/report/sumgrams.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

module_logger = logging.getLogger('hypercane.report.sumgrams')

def generate_sumgrams(urimlist, cache_storage):
def generate_sumgrams(urimlist, cache_storage, added_stopwords=[]):

import concurrent.futures
import nltk
Expand Down Expand Up @@ -42,52 +42,6 @@ def generate_sumgrams(urimlist, cache_storage):

now = datetime.now()
current_year = now.year
last_year = current_year - 1
current_date = now.day

# sumgram processes stop words at two levels:
# 1. when the vocabulary is built
# 2. stopwords are applied when finding sumgrams
# start with single terms before moving on to bigrams, etc.

# TODO: load these from a file
added_stopwords = [
"associated press",
"com",
"donald trump",
"fox news",
"abc news",
"getty images",
"last month",
"last week",
"last year",
"pic",
"pinterest reddit",
"pm et",
"president donald",
"president donald trump",
"president trump",
"president trump's",
"print mail",
"reddit print",
"said statement",
"send whatsapp",
"sign up",
"trump administration",
"trump said",
"twitter",
"united states",
"washington post",
"white house",
"whatsapp pinterest",
"subscribe whatsapp",
"york times",
"privacy policy",
"terms use"
]

added_stopwords.append( "{} read".format(last_year) )
added_stopwords.append( "{} read".format(current_year) )

stopmonths = [
"january",
Expand All @@ -104,9 +58,6 @@ def generate_sumgrams(urimlist, cache_storage):
"december"
]

# add just the month to the stop words
added_stopwords.extend(stopmonths)

stopmonths_short = [
"jan",
"feb",
Expand All @@ -122,45 +73,6 @@ def generate_sumgrams(urimlist, cache_storage):
"dec"
]

added_stopwords.extend(stopmonths_short)

# add the day of the week, too
added_stopwords.extend([
"monday",
"tuesday",
"wednesday",
"thursday",
"friday",
"saturday",
"sunday"
])

added_stopwords.extend([
"mon",
"tue",
"wed",
"thu",
"fri",
"sat",
"sun"
])

# for i in range(1, 13):
# added_stopwords.append(
# datetime(current_year, i, current_date).strftime('%b %Y')
# )
# added_stopwords.append(
# datetime(last_year, i, current_date).strftime('%b %Y')
# )

# for i in range(1, 13):
# added_stopwords.append(
# datetime(current_year, i, current_date).strftime('%B %Y')
# )
# added_stopwords.append(
# datetime(last_year, i, current_date).strftime('%B %Y')
# )

params = {
"add_stopwords": ", ".join(added_stopwords),
"top_sumgram_count": 20,
Expand Down
7 changes: 4 additions & 3 deletions hypercane/report/terms.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

module_logger = logging.getLogger('hypercane.report.terms')

def get_document_tokens(urim, cache_storage, ngram_length):
def get_document_tokens(urim, cache_storage, ngram_length, added_stopwords=[]):

from hypercane.utils import get_boilerplate_free_content
from nltk.corpus import stopwords
Expand All @@ -12,6 +12,7 @@ def get_document_tokens(urim, cache_storage, ngram_length):

# TODO: stoplist based on language of the document
stoplist = list(set(stopwords.words('english')))
stoplist.extend(added_stopwords)
punctuation = [ i for i in string.punctuation ]
additional_stopchars = [ '’', '‘', '“', '”', '•', '·', '—', '–', '›', '»']
stop_numbers = [ str(i) for i in range(0, 11) ]
Expand All @@ -27,7 +28,7 @@ def get_document_tokens(urim, cache_storage, ngram_length):

return list(doc_ngrams)

def generate_ranked_terms(urimlist, cache_storage, ngram_length=1):
def generate_ranked_terms(urimlist, cache_storage, ngram_length=1, added_stopwords=[]):

import concurrent.futures
import nltk
Expand All @@ -38,7 +39,7 @@ def generate_ranked_terms(urimlist, cache_storage, ngram_length=1):
# with concurrent.futures.ProcessPoolExecutor(max_workers=5) as executor:
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:

future_to_urim = { executor.submit(get_document_tokens, urim, cache_storage, ngram_length): urim for urim in urimlist }
future_to_urim = { executor.submit(get_document_tokens, urim, cache_storage, ngram_length, added_stopwords=added_stopwords): urim for urim in urimlist }

for future in concurrent.futures.as_completed(future_to_urim):

Expand Down
2 changes: 1 addition & 1 deletion hypercane/version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
__appname__ = "hypercane"
__appversion__ = '0.2020.06.24.032337'
__appversion__ = '0.2020.07.13.224110'
__useragent__ = "{}/{}".format(__appname__, __appversion__)

0 comments on commit 17cd8a3

Please sign in to comment.