Skip to content
This repository has been archived by the owner on Dec 14, 2023. It is now read-only.

Commit

Permalink
Added more test cases checking the accuracy of the model via likelihood comparisons
Browse files Browse the repository at this point in the history
  • Loading branch information
DonggeLiu committed Aug 14, 2017
1 parent 8e77ed4 commit 809aad7
Showing 1 changed file with 69 additions and 5 deletions.
74 changes: 69 additions & 5 deletions mediacloud/mediawords/util/topic_modeling/test_model_lda.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,14 @@ def setUp(self):
self._flat_story_tokens = self._flatten_story_tokens()
self._lda_model = ModelLDA()
self._lda_model.add_stories(self._story_tokens)
self._topics = self._lda_model.summarize_topic()
self._optimal_topic_num_poly = self._lda_model.tune_with_polynomial()
self._optimal_topic_num_iter = self._lda_model.tune_with_iteration()

self._topics_via_poly \
= self._lda_model.summarize_topic(total_topic_num=self._optimal_topic_num_poly)
self._topics_via_iter \
= self._lda_model.summarize_topic(total_topic_num=self._optimal_topic_num_iter)

logging.getLogger("lda").setLevel(logging.WARNING)
logging.getLogger("gensim").setLevel(logging.WARNING)

Expand All @@ -44,11 +51,18 @@ def _flatten_story_tokens(self) -> Dict[int, List[str]]:
return flat_story_tokens

def test_one_to_one_relationship(self):
    """
    Run _check_one_to_one_relationship() on the topics of each tuning method.

    The polynomial-tuned topics are checked first, then the iteration-tuned ones.
    """
    check = self._check_one_to_one_relationship
    check(topics=self._topics_via_poly)
    check(topics=self._topics_via_iter)

def _check_one_to_one_relationship(self, topics: Dict[int, List]):
"""
Test if there is one-to-one relationship for articles and topics
(i.e. no mysteries topic id or missing article id)
"""
topic_ids = self._topics.keys()
topic_ids = topics.keys()
story_ids = self._story_tokens.keys()

for topic_id in topic_ids:
Expand All @@ -64,6 +78,13 @@ def test_one_to_one_relationship(self):
msg="Missing article id: {}".format(article_id))

def test_story_contains_topic_word(self):
    """
    Run _check_story_contains_topic_word() on the topics of each tuning method.

    The polynomial-tuned topics are checked first, then the iteration-tuned ones.
    """
    check = self._check_story_contains_topic_word
    check(topics=self._topics_via_poly)
    check(topics=self._topics_via_iter)

def _check_story_contains_topic_word(self, topics: Dict[int, List]):
"""
Test if each story contains at least one of the topic words
"""
Expand All @@ -75,24 +96,67 @@ def test_story_contains_topic_word(self):
if len(self._flat_story_tokens.get(story_id)) < 25:
return
exist = False
for topic in iter(self._topics.get(story_id)):
for topic in iter(topics.get(story_id)):
exist = topic in self._flat_story_tokens.get(story_id) or exist
if exist:
break
if not exist:
raise ValueError("Story {id} does not contain any of its topic words: {topic}\n"
"Story tokens:\n {tokens}"
.format(id=story_id, topic=self._topics.get(story_id),
.format(id=story_id, topic=topics.get(story_id),
tokens=self._flat_story_tokens.get(story_id)))

def test_default_topic_params(self):
    """
    Run _check_default_topic_params() on the topics of each tuning method.

    The polynomial-tuned topics are checked first, then the iteration-tuned ones.
    """
    check = self._check_default_topic_params
    check(topics=self._topics_via_poly)
    check(topics=self._topics_via_iter)

def _check_default_topic_params(self, topics: Dict[int, List[str]]):
"""
Test if the correct number of words for each topic is returned
"""
default_word_num = 4
for topics in self._topics.values():
for topics in topics.values():
unittest.TestCase.assertEqual(
self=self, first=default_word_num, second=len(topics),
msg="Default word number ({}) != word number ({})\nTopic = {}"
.format(default_word_num, len(topics), topics))

def test_highest_likelihood(self):
    """
    Run _check_highest_likelihood() on the topic number of each tuning method.

    The iteration-tuned number is checked first, then the polynomial-tuned one.
    """
    check = self._check_highest_likelihood
    check(num=self._optimal_topic_num_iter, name="Iteration")
    check(num=self._optimal_topic_num_poly, name="Polynomial")

def _check_highest_likelihood(self, num: int, name: str):
"""
Test if the result is the most accurate one
"""
optimal_likelihood = self._lda_model.evaluate()[1]
other_nums = [0, 1, num-1, num+1, num*2]

for other_num in other_nums:
if (other_num == num) or num < 0:
continue
other_likelihood = self._lda_model.evaluate(topic_num=other_num)[1]
unittest.TestCase.assertGreaterEqual(
self=self,
a=optimal_likelihood,
b=other_likelihood,
msg="Topic num {} has a better likelihood {} than {} with {}:{}"
.format(other_num, other_likelihood, name, num, optimal_likelihood))

def test_equal_likelihood(self):
    """
    The likelihood of both methods should be the same (i.e. the max),
    However, the total topic nums do not have to be the same
    """
    # Compare the likelihoods themselves rather than the topic summaries:
    # the summaries may differ whenever the two topic numbers differ.
    likelihood_iter = self._lda_model.evaluate(topic_num=self._optimal_topic_num_iter)[1]
    likelihood_poly = self._lda_model.evaluate(topic_num=self._optimal_topic_num_poly)[1]

    # NOTE(review): exact equality is kept from the original assertion; if the
    # likelihood is a float, assertAlmostEqual may be the safer check -- confirm.
    unittest.TestCase.assertEqual(
        self=self, first=likelihood_iter, second=likelihood_poly,
        msg="Iter: {}\nPoly: {}".format(likelihood_iter, likelihood_poly))


# Run the test cases of this module when the file is executed as a script
if __name__ == "__main__":
    unittest.main()

0 comments on commit 809aad7

Please sign in to comment.