Skip to content

Commit

Permalink
Merge pull request #58 from navigating-stories/issue39
Browse files Browse the repository at this point in the history
fixes issue #39 and others
  • Loading branch information
kodymoodley committed Apr 18, 2024
2 parents 1b94d3e + fcf635c commit d7015b9
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 25 deletions.
22 changes: 20 additions & 2 deletions orangecontrib/storynavigation/modules/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import string
import pandas as pd
import storynavigation.modules.constants as constants
from nltk.tokenize import sent_tokenize

def entity_tag_already_exists(ents, start, end):
for ent in ents:
Expand Down Expand Up @@ -83,8 +84,25 @@ def load_spacy_pipeline(name):
nlp.add_pipe("sentencizer")
return nlp


def preprocess_text(text):
    """Split raw story text into a list of sentences.

    Repairs sentence breaks written as bare newlines (a letter followed by
    optional whitespace, a newline and a capital letter gets a fullstop
    inserted), flattens any remaining newlines to spaces, strips straight
    and curly quote characters, then tokenizes the cleaned text into
    sentences with NLTK.
    """
    # A letter, optional whitespace, a newline, then a capital letter marks
    # a sentence boundary where the fullstop was forgotten, e.g.
    #   "Its where Kody went
    #    He landed in India."
    # becomes
    #   "Its where Kody went. He landed in India."
    missing_stop = r'([a-zA-Z])\s*\n([A-Z])'
    with_stops = re.sub(missing_stop, r'\1. \2', text)
    # Any newlines that survive the repair are treated as plain spaces.
    flattened = with_stops.replace("\n", " ")
    # Drop all quote characters (ASCII and typographic) entirely.
    unquoted = re.sub(r'[\'\"‘’“”]', '', flattened)
    # Hand the cleaned text to NLTK's sentence tokenizer.
    return sent_tokenize(unquoted)

def preprocess_text_complex(text):
"""Preprocesses story text. A lot of stories in the Corona in de stad dataset
have sentences with no period at the end followed immediately by newline characters.
This function processes these and other issues to make the resulting text suitable for
Expand All @@ -101,7 +119,7 @@ def preprocess_text(text):
for i in re.finditer("\n[A-Z]", text):
startindex = i.start()
match_indices.append(startindex + 1)
match_indices.append(None)
# match_indices.append(None)
# split the text into clauses (based on regex matches) - clauses can be single or multiple sentences
clauses = [
text[match_indices[i] : match_indices[i + 1]]
Expand Down
59 changes: 36 additions & 23 deletions orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@
import storynavigation.modules.util as util
import storynavigation.modules.error_handling as error_handling

from thefuzz import fuzz
from thefuzz import process

HTML = """
<!doctype html>
<html>
Expand Down Expand Up @@ -639,6 +642,7 @@ def advance(progress):

selected_storyids = []
otherids = []

for doc_count, c_index in enumerate(sorted(self.selected_documents)):
selected_storyids.append('ST' + str(c_index))
otherids.append(str(c_index))
Expand All @@ -656,7 +660,6 @@ def advance(progress):

return self.actor_results_df, self.valid_stories, self.selected_actor_results_df, self.selected_custom_freq, self.full_custom_freq


def reset_widget(self):
self.stories = None
self.story_elements = None
Expand Down Expand Up @@ -714,9 +717,33 @@ def list_docs(self):
docs = self.regenerate_docs()
self.doc_list_model.setup_data(self.stories.titles.tolist(), docs)

def get_el_story_text(self, df):
    """Rebuild a single story string from the unique sentences in *df*.

    The Elements table stores one row per sentence occurrence; deduplicating
    via ``unique()`` and space-joining reconstructs the story text.
    """
    unique_sentences = df['sentence'].unique().tolist()
    return ' '.join(unique_sentences)

def fuzzy_match_text(self, text1, text2):
    """Return the fuzzy similarity score (0-100) between two story texts."""
    similarity = fuzz.ratio(text1, text2)
    return similarity

def find_matching_story_in_story_elements(self, c_index, story_text):
    """Look up the Elements-table storyid whose text matches *story_text*.

    Iterates the per-story dataframes in ``self.story_elements_dict``,
    rebuilds each candidate story string, and returns the first storyid
    whose fuzzy similarity to *story_text* reaches 90. Falls back to the
    doclist-model index *c_index* when nothing matches.
    """
    for storyid, story_df in self.story_elements_dict.items():
        # Rebuild the candidate story from its per-sentence dataframe rows.
        candidate_text = self.get_el_story_text(story_df)
        # Accept the first candidate that is (near-)identical to the
        # selected story text.
        if self.fuzzy_match_text(candidate_text, story_text) >= 90:
            return int(storyid)
    # No sufficiently similar story found: keep the model-supplied index.
    return c_index

def get_selected_indexes(self) -> Set[int]:
    """Return the set of story ids for the stories currently selected in the doc list.

    NOTE(review): this block is diff residue from the pasted commit page — the
    first ``return`` line appears to be the pre-change implementation left in
    place, which would make everything after it unreachable. Confirm against
    the repository which version is intended before relying on this text.
    """
    m = self.doc_list.model().mapToSource
    return {m(i).row() for i in self.doc_list.selectionModel().selectedRows()}
    result = set()
    for i in self.doc_list.selectionModel().selectedRows():  # Each i represents a new selected story
        c_index = m(i).row()  # Get the currently selected story i index (int)
        obj = self.regenerate_docs()[c_index]  # get the story object at c_index location in the doc_list model, obj (str) : has the structure 'filename path/to/filename.ext story-text'
        story_text = ' '.join(obj.split()[2:])  # Only select the story text itself from obj (third component)
        sentences = util.preprocess_text(story_text)  # Preprocess story i text to match similar output sentences to Elements table (sentences)
        sen_fullstop = [sen+'.' for sen in sentences]  # Add a fullstop after each sentence
        proc_story_text = ' '.join(sen_fullstop)  # Concatenate sentences together to create a story string
        correct_story_id = self.find_matching_story_in_story_elements(c_index, proc_story_text)  # Find the matching story in Elements table for story i
        result.add(correct_story_id)  # Add the correct story_id to the selected documents index
    return result

def set_selection(self) -> None:
"""
Expand Down Expand Up @@ -841,10 +868,11 @@ def show_docs(self, slider_engaged=False):
value = os.path.join(feature.attributes.get("origin", ""), value)
value = '<img src="{}"></img>'.format(value)

text += (
f'<tr><td class="variables"><strong>{feature.name}:</strong></td>'
f'<td class="content">{value}</td></tr>'
)
if feature.name.lower() == "content" or feature.name.lower() == "text":
text += (
# f'<tr><td class="variables"><strong>{feature.name}:</strong></td>'
f'<td class="content">{value}</td></tr>'
)

parts.append(text)

Expand Down Expand Up @@ -926,6 +954,8 @@ def on_done(self, res: int):

# deal with stories that do not have entry in story elements frame
if self.stories is not None:
# print("3. (on_done func): ", self.stories)
# print()
domain = Domain([], metas=self.display_features)
metas = []
for item in self.valid_stories:
Expand Down Expand Up @@ -962,19 +992,6 @@ def on_done(self, res: int):
def on_exception(self, ex):
raise ex

# def update_info(self):
# # self.pos_checkboxes = [self.sc, self.nc]
# if self.stories is not None:
# has_tokens = self.stories.has_tokens()
# self.n_matching = f"{self.doc_list.model().rowCount()}/{len(self.stories)}"
# self.n_tokens = sum(map(len, self.stories.tokens)) if has_tokens else "n/a"
# self.n_types = len(self.stories.dictionary) if has_tokens else "n/a"
# else:
# self.n_matching = "n/a"
# self.n_matches = "n/a"
# self.n_tokens = "n/a"
# self.n_types = "n/a"

@gui.deferred
def commit(self):
# self.pos_checkboxes = [self.sc, self.nc]
Expand Down Expand Up @@ -1045,8 +1062,4 @@ def migrate_context(cls, context, version):

if __name__ == "__main__":
from orangewidget.utils.widgetpreview import WidgetPreview
# from orangecontrib.text.preprocess import BASE_TOKENIZER
# corpus_ = Corpus.from_file("book-excerpts")
# corpus_ = corpus_[:3]
# corpus_ = BASE_TOKENIZER(corpus_)
WidgetPreview(OWSNActorAnalysis).run(None)
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@ Orange3-Text
Orange3-network
pandas
spacy
nltk
dhtmlparser3
textblob
textblob-nl
pydot
graphviz
scipy==1.12.0
thefuzz
beautifulsoup4
coverage
Expand Down
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ install_requires =
spacy >= 3.7.2
scipy == 1.12.0
dhtmlparser3 >= 3.0.17
nltk >= 3.8.1
textblob >= 0.17.1
textblob-nl >= 0.0.1
pydot >= 1.4.2
Expand Down

0 comments on commit d7015b9

Please sign in to comment.