Skip to content

Commit

Permalink
Merge pull request #58 from navigating-stories/issue39
Browse files Browse the repository at this point in the history
fixes issue #39 and others
  • Loading branch information
kodymoodley committed Apr 18, 2024
2 parents 1b94d3e + fcf635c commit d7015b9
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 25 deletions.
22 changes: 20 additions & 2 deletions orangecontrib/storynavigation/modules/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import string
import pandas as pd
import storynavigation.modules.constants as constants
from nltk.tokenize import sent_tokenize

def entity_tag_already_exists(ents, start, end):
for ent in ents:
Expand Down Expand Up @@ -83,8 +84,25 @@ def load_spacy_pipeline(name):
nlp.add_pipe("sentencizer")
return nlp


def preprocess_text(text):
    """Split raw story text into a list of sentences.

    Repairs sentence breaks written as bare newlines (a letter followed by
    optional whitespace, a newline and a capital letter gets a fullstop
    inserted), flattens any remaining newlines to spaces, strips straight
    and curly quote characters, then tokenizes the cleaned text into
    sentences with NLTK.
    """
    # A letter, optional whitespace, a newline, then a capital letter marks
    # a sentence boundary where the fullstop was forgotten, e.g.
    #   "Its where Kody went
    #    He landed in India."
    # becomes
    #   "Its where Kody went. He landed in India."
    missing_stop = r'([a-zA-Z])\s*\n([A-Z])'
    with_stops = re.sub(missing_stop, r'\1. \2', text)
    # Any newlines that survive the repair are treated as plain spaces.
    flattened = with_stops.replace("\n", " ")
    # Drop all quote characters (ASCII and typographic) entirely.
    unquoted = re.sub(r'[\'\"‘’“”]', '', flattened)
    # Hand the cleaned text to NLTK's sentence tokenizer.
    return sent_tokenize(unquoted)

def preprocess_text_complex(text):
"""Preprocesses story text. A lot of stories in the Corona in de stad dataset
have sentences with no period at the end followed immediately by newline characters.
This function processes these and other issues to make the resulting text suitable for
Expand All @@ -101,7 +119,7 @@ def preprocess_text(text):
for i in re.finditer("\n[A-Z]", text):
startindex = i.start()
match_indices.append(startindex + 1)
match_indices.append(None)
# match_indices.append(None)
# split the text into clauses (based on regex matches) - clauses can be single or multiple sentences
clauses = [
text[match_indices[i] : match_indices[i + 1]]
Expand Down
59 changes: 36 additions & 23 deletions orangecontrib/storynavigation/widgets/OWSNActorAnalysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@
import storynavigation.modules.util as util
import storynavigation.modules.error_handling as error_handling

from thefuzz import fuzz
from thefuzz import process

HTML = """
<!doctype html>
<html>
Expand Down Expand Up @@ -639,6 +642,7 @@ def advance(progress):

selected_storyids = []
otherids = []

for doc_count, c_index in enumerate(sorted(self.selected_documents)):
selected_storyids.append('ST' + str(c_index))
otherids.append(str(c_index))
Expand All @@ -656,7 +660,6 @@ def advance(progress):

return self.actor_results_df, self.valid_stories, self.selected_actor_results_df, self.selected_custom_freq, self.full_custom_freq


def reset_widget(self):
self.stories = None
self.story_elements = None
Expand Down Expand Up @@ -714,9 +717,33 @@ def list_docs(self):
docs = self.regenerate_docs()
self.doc_list_model.setup_data(self.stories.titles.tolist(), docs)

def get_el_story_text(self, df):
    """Rebuild a single story string from the unique sentences in *df*.

    The Elements table stores one row per sentence occurrence; deduplicating
    via ``unique()`` and space-joining reconstructs the story text.
    """
    unique_sentences = df['sentence'].unique().tolist()
    return ' '.join(unique_sentences)

def fuzzy_match_text(self, text1, text2):
    """Return the fuzzy similarity score (0-100) between two story texts."""
    similarity = fuzz.ratio(text1, text2)
    return similarity

def find_matching_story_in_story_elements(self, c_index, story_text):
    """Look up the Elements-table storyid whose text matches *story_text*.

    Iterates the per-story dataframes in ``self.story_elements_dict``,
    rebuilds each candidate story string, and returns the first storyid
    whose fuzzy similarity to *story_text* reaches 90. Falls back to the
    doclist-model index *c_index* when nothing matches.
    """
    for storyid, story_df in self.story_elements_dict.items():
        # Rebuild the candidate story from its per-sentence dataframe rows.
        candidate_text = self.get_el_story_text(story_df)
        # Accept the first candidate that is (near-)identical to the
        # selected story text.
        if self.fuzzy_match_text(candidate_text, story_text) >= 90:
            return int(storyid)
    # No sufficiently similar story found: keep the model-supplied index.
    return c_index

def get_selected_indexes(self) -> Set[int]:
    """Return the set of story ids for the stories currently selected in the doc list.

    NOTE(review): this block is diff residue from the pasted commit page — the
    first ``return`` line appears to be the pre-change implementation left in
    place, which would make everything after it unreachable. Confirm against
    the repository which version is intended before relying on this text.
    """
    m = self.doc_list.model().mapToSource
    return {m(i).row() for i in self.doc_list.selectionModel().selectedRows()}
    result = set()
    for i in self.doc_list.selectionModel().selectedRows():  # Each i represents a new selected story
        c_index = m(i).row()  # Get the currently selected story i index (int)
        obj = self.regenerate_docs()[c_index]  # get the story object at c_index location in the doc_list model, obj (str) : has the structure 'filename path/to/filename.ext story-text'
        story_text = ' '.join(obj.split()[2:])  # Only select the story text itself from obj (third component)
        sentences = util.preprocess_text(story_text)  # Preprocess story i text to match similar output sentences to Elements table (sentences)
        sen_fullstop = [sen+'.' for sen in sentences]  # Add a fullstop after each sentence
        proc_story_text = ' '.join(sen_fullstop)  # Concatenate sentences together to create a story string
        correct_story_id = self.find_matching_story_in_story_elements(c_index, proc_story_text)  # Find the matching story in Elements table for story i
        result.add(correct_story_id)  # Add the correct story_id to the selected documents index
    return result

def set_selection(self) -> None:
"""
Expand Down Expand Up @@ -841,10 +868,11 @@ def show_docs(self, slider_engaged=False):
value = os.path.join(feature.attributes.get("origin", ""), value)
value = '<img src="{}"></img>'.format(value)

text += (
f'<tr><td class="variables"><strong>{feature.name}:</strong></td>'
f'<td class="content">{value}</td></tr>'
)
if feature.name.lower() == "content" or feature.name.lower() == "text":
text += (
# f'<tr><td class="variables"><strong>{feature.name}:</strong></td>'
f'<td class="content">{value}</td></tr>'
)

parts.append(text)

Expand Down Expand Up @@ -926,6 +954,8 @@ def on_done(self, res: int):

# deal with stories that do not have entry in story elements frame
if self.stories is not None:
# print("3. (on_done func): ", self.stories)
# print()
domain = Domain([], metas=self.display_features)
metas = []
for item in self.valid_stories:
Expand Down Expand Up @@ -962,19 +992,6 @@ def on_done(self, res: int):
def on_exception(self, ex):
raise ex

# def update_info(self):
# # self.pos_checkboxes = [self.sc, self.nc]
# if self.stories is not None:
# has_tokens = self.stories.has_tokens()
# self.n_matching = f"{self.doc_list.model().rowCount()}/{len(self.stories)}"
# self.n_tokens = sum(map(len, self.stories.tokens)) if has_tokens else "n/a"
# self.n_types = len(self.stories.dictionary) if has_tokens else "n/a"
# else:
# self.n_matching = "n/a"
# self.n_matches = "n/a"
# self.n_tokens = "n/a"
# self.n_types = "n/a"

@gui.deferred
def commit(self):
# self.pos_checkboxes = [self.sc, self.nc]
Expand Down Expand Up @@ -1045,8 +1062,4 @@ def migrate_context(cls, context, version):

if __name__ == "__main__":
from orangewidget.utils.widgetpreview import WidgetPreview
# from orangecontrib.text.preprocess import BASE_TOKENIZER
# corpus_ = Corpus.from_file("book-excerpts")
# corpus_ = corpus_[:3]
# corpus_ = BASE_TOKENIZER(corpus_)
WidgetPreview(OWSNActorAnalysis).run(None)
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@ Orange3-Text
Orange3-network
pandas
spacy
nltk
dhtmlparser3
textblob
textblob-nl
pydot
graphviz
scipy==1.12.0
thefuzz
beautifulsoup4
coverage
Expand Down
2 changes: 2 additions & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ install_requires =
spacy >= 3.7.2
scipy == 1.12.0
dhtmlparser3 >= 3.0.17
nltk >= 3.8.1
textblob >= 0.17.1
textblob-nl >= 0.0.1
pydot >= 1.4.2
Expand Down

0 comments on commit d7015b9

Please sign in to comment.