
Commit

Create token_pool.py
tokenize articles
DonggeLiu committed Jun 29, 2017
1 parent 08b0329 commit e24f3b7
Showing 1 changed file with 104 additions and 0 deletions.
104 changes: 104 additions & 0 deletions mediacloud/mediawords/util/topic_modeling/token_pool.py
@@ -0,0 +1,104 @@
from mediawords.db import connect_to_db
import json
import re
# from stop_words import get_stop_words


class TokenPool:
    """Fetch sentences from the database and break them down into words."""
    DB_QUERY = """SELECT stories_id, sentence FROM story_sentences"""
    STOP_WORDS = "lib/MediaWords/Languages/resources/en_stopwords.txt"
    # A raw string avoids the invalid escape sequence in "\w";
    # the pattern splits on any non-word character
    DELIMITERS = r"[^\w]"

    def __init__(self):
        """Initialisations"""
        pass

    def fetch_sentences(self):
        """
        Fetch the sentences from the DB
        :return: the sentences in json format
        """
        db_connection = connect_to_db()
        sentences_hash = db_connection.query(self.DB_QUERY).hashes()
        # The json round trip converts the DB result rows
        # into plain, json-compatible dicts and lists
        sentences_json = json.loads(s=json.dumps(obj=sentences_hash))
        db_connection.disconnect()

        return sentences_json
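        # Illustrative shape of the return value (hypothetical rows):
        # [{'stories_id': 1, 'sentence': 'The U.S. economy grew 2.3%.'}, ...]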

    def tokenize_sentence(self, sentences):
        """
        Break the sentences down into tokens and group them by article ID
        :param sentences: a json list of sentences and their article ids
        :return: a dictionary of articles and the words in them
        """
        articles = {}

        for sentence in sentences:
            if sentence['stories_id'] not in articles:
                articles[sentence['stories_id']] = []
            articles[sentence['stories_id']] \
                += self.eliminate_symbols(article_sentence=sentence['sentence'])

        return articles
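        # Illustrative result for the hypothetical rows above:
        # {1: ['The', 'U', 'S', 'economy', 'grew', '2', '3']}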

    def eliminate_symbols(self, article_sentence):
        """
        Split the given sentence on symbols, keeping only word tokens
        :param article_sentence: a sentence in an article
        :return: a list of non-symbol tokens
        """
        # Drop the empty strings that re.split produces between
        # adjacent delimiters (e.g. the ". " in "U.S. ")
        return [token for token in
                re.split(pattern=self.DELIMITERS, string=article_sentence) if token]
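        # Example: "U.S. economy grew 2.3%"
        # -> ['U', 'S', 'economy', 'grew', '2', '3']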

    def fetch_stopwords(self):
        """
        Fetch the stopwords from the file en_stopwords.txt
        :return: a list of all stopwords in the file
        """
        # strip() is safer than slicing off the last character, which
        # corrupts the final line when the file lacks a trailing newline
        with open(self.STOP_WORDS) as stopword_file:
            stopwords = [line.strip() for line in stopword_file]
        return stopwords

    def eliminate_stopwords(self, article_words):
        """
        Remove stopwords from the given list of words in an article
        :param article_words: a list containing all words in an article
        :return: a list of all the meaningful words
        """
        stopwords = self.fetch_stopwords()
        # stopwords_package = get_stop_words('en')

        # Keep tokens longer than one character that are not stopwords
        # (compared case-insensitively); note that no stemming happens here
        meaningful_tokens_via_file = [word for word in article_words
                                      if ((len(word) > 1)
                                          and (word.lower() not in stopwords))]

        # meaningful_tokens_via_package = [word for word in article_words
        #                                  if ((len(word) > 1)
        #                                      and (word.lower() not in stopwords_package))]

        # print(set(meaningful_tokens_via_file) - set(meaningful_tokens_via_package))
        # print(set(meaningful_tokens_via_package) - set(meaningful_tokens_via_file))

        return meaningful_tokens_via_file
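        # Example (assuming 'the' is in the stopword list):
        # ['The', 'U', 'S', 'economy', 'grew'] -> ['economy', 'grew']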

    def output_tokens(self):
        """
        Go through each step to output the tokens of articles
        :return: a dictionary with the id of each article as key
            and the useful tokens in it as value
        """
        sentences = self.fetch_sentences()
        tokens = self.tokenize_sentence(sentences=sentences)
        meaningful_tokens = {}

        # Only the first five articles are processed for now,
        # to keep the sample output small
        counter = 0
        for article_id, article_tokens in tokens.items():
            meaningful_tokens[article_id] = self.eliminate_stopwords(article_words=article_tokens)
            counter += 1
            if counter > 4:
                break

        return meaningful_tokens


# A sample output
# pool = TokenPool()
# print(pool.output_tokens().popitem())
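# The printed item is a (stories_id, tokens) pair; its exact contents
# depend on the story_sentences table, e.g. (hypothetically):
# (1, ['economy', 'grew'])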
