added more test cases on checking the accuracy of the model via likelihood comparisons
mediacloud · Aug 14, 2017 · 809aad7 · 809aad7
1 parent 8e77ed4
commit 809aad7
Showing 1 changed file with 69 additions and 5 deletions.
diff --git a/mediacloud/mediawords/util/topic_modeling/test_model_lda.py b/mediacloud/mediawords/util/topic_modeling/test_model_lda.py
@@ -26,7 +26,14 @@ def setUp(self):
         self._flat_story_tokens = self._flatten_story_tokens()
         self._lda_model = ModelLDA()
         self._lda_model.add_stories(self._story_tokens)
-        self._topics = self._lda_model.summarize_topic()
+        self._optimal_topic_num_poly = self._lda_model.tune_with_polynomial()
+        self._optimal_topic_num_iter = self._lda_model.tune_with_iteration()
+
+        self._topics_via_poly \
+            = self._lda_model.summarize_topic(total_topic_num=self._optimal_topic_num_poly)
+        self._topics_via_iter \
+            = self._lda_model.summarize_topic(total_topic_num=self._optimal_topic_num_iter)
+
         logging.getLogger("lda").setLevel(logging.WARNING)
         logging.getLogger("gensim").setLevel(logging.WARNING)
 
@@ -44,11 +51,18 @@ def _flatten_story_tokens(self) -> Dict[int, List[str]]:
         return flat_story_tokens
 
     def test_one_to_one_relationship(self):
+        """
+        Run _check_one_to_one_relationship() on the topics produced by each
+        tuning method (polynomial first, then iteration)
+        """
+        for topics in (self._topics_via_poly, self._topics_via_iter):
+            self._check_one_to_one_relationship(topics=topics)
+
+    def _check_one_to_one_relationship(self, topics: Dict[int, List]):
         """
         Test if there is one-to-one relationship for articles and topics
         (i.e. no mysteries topic id or missing article id)
         """
-        topic_ids = self._topics.keys()
+        topic_ids = topics.keys()
         story_ids = self._story_tokens.keys()
 
         for topic_id in topic_ids:
@@ -64,6 +78,13 @@ def test_one_to_one_relationship(self):
                 msg="Missing article id: {}".format(article_id))
 
     def test_story_contains_topic_word(self):
+        """
+        Run _check_story_contains_topic_word() on the topics produced by each
+        tuning method (polynomial first, then iteration)
+        """
+        for topics in (self._topics_via_poly, self._topics_via_iter):
+            self._check_story_contains_topic_word(topics=topics)
+
+    def _check_story_contains_topic_word(self, topics: Dict[int, List]):
         """
         Test if each story contains at least one of the topic words
         """
@@ -75,24 +96,67 @@ def test_story_contains_topic_word(self):
             if len(self._flat_story_tokens.get(story_id)) < 25:
                 return
             exist = False
-            for topic in iter(self._topics.get(story_id)):
+            for topic in iter(topics.get(story_id)):
                 exist = topic in self._flat_story_tokens.get(story_id) or exist
                 if exist:
                     break
             if not exist:
                 raise ValueError("Story {id} does not contain any of its topic words: {topic}\n"
                                  "Story tokens:\n {tokens}"
-                                 .format(id=story_id, topic=self._topics.get(story_id),
+                                 .format(id=story_id, topic=topics.get(story_id),
                                          tokens=self._flat_story_tokens.get(story_id)))
 
     def test_default_topic_params(self):
+        """
+        Run _check_default_topic_params() on the topics produced by each
+        tuning method (polynomial first, then iteration)
+        """
+        for topics in (self._topics_via_poly, self._topics_via_iter):
+            self._check_default_topic_params(topics=topics)
+
+    def _check_default_topic_params(self, topics: Dict[int, List[str]]):
+        """
+        Test if the correct number of words for each topic is returned
+
+        :param topics: mapping whose values are the word lists being counted;
+                       every value is expected to hold default_word_num entries
+        """
         default_word_num = 4
-        for topics in self._topics.values():
+        # NOTE(review): the loop variable reuses the name of the `topics`
+        # parameter; topics.values() is evaluated before the first rebinding
+        for topics in topics.values():
             unittest.TestCase.assertEqual(
                 self=self, first=default_word_num, second=len(topics),
                 msg="Default word number ({}) != word number ({})\nTopic = {}"
                     .format(default_word_num, len(topics), topics))
 
+    def test_highest_likelihood(self):
+        """
+        Pass the topic number found by each tuning method to
+        _check_highest_likelihood() (iteration first, then polynomial)
+        """
+        candidates = (
+            (self._optimal_topic_num_iter, "Iteration"),
+            (self._optimal_topic_num_poly, "Polynomial"),
+        )
+        for num, name in candidates:
+            self._check_highest_likelihood(num=num, name=name)
+
+    def _check_highest_likelihood(self, num: int, name: str):
+        """
+        Test if the given topic number scores at least as high a likelihood
+        as a handful of alternative topic numbers (0, 1, num-1, num+1, num*2)
+
+        :param num: topic number selected by the tuning method under test
+        :param name: label of the tuning method, used only in the failure message
+        """
+        # Pass topic_num explicitly so the score really belongs to `num`,
+        # which is what the failure message below reports it as
+        optimal_likelihood = self._lda_model.evaluate(topic_num=num)[1]
+        other_nums = [0, 1, num-1, num+1, num*2]
+
+        for other_num in other_nums:
+            # Skip the candidate itself and any negative alternative
+            # (num-1 and num*2 can drop below zero when num <= 0)
+            if (other_num == num) or other_num < 0:
+                continue
+            other_likelihood = self._lda_model.evaluate(topic_num=other_num)[1]
+            unittest.TestCase.assertGreaterEqual(
+                self=self,
+                a=optimal_likelihood,
+                b=other_likelihood,
+                msg="Topic num {} has a better likelihood {} than {}  with {}:{}"
+                    .format(other_num, other_likelihood, name, num, optimal_likelihood))
+
+    def test_equal_likelihood(self):
+        """
+        The likelihood of both methods should be the same (i.e. the max),
+        however, the total topic nums do not have to be the same
+        """
+        iter_likelihood = self._lda_model.evaluate(
+            topic_num=self._optimal_topic_num_iter)[1]
+        poly_likelihood = self._lda_model.evaluate(
+            topic_num=self._optimal_topic_num_poly)[1]
+
+        # Compare the likelihoods themselves: the summarized topics are allowed
+        # to differ whenever the two methods settle on different topic numbers
+        unittest.TestCase.assertEqual(
+            self=self, first=iter_likelihood, second=poly_likelihood,
+            msg="Iter: {}\nPoly: {}".format(iter_likelihood, poly_likelihood))
+
 
+# Run the tests in this module when the file is executed as a script
 if __name__ == '__main__':
     unittest.main()