In [2]:
from decimal import Decimal, ROUND_HALF_UP
from constants import EXPONENTIAL_K_VALUE
from intervaltree import Interval, IntervalTree
from NewsSentiment import TargetSentimentClassifier


def scaling(avg_array_list, k=3, linear=False):
    neutral_points = positive_points = negative_points = 0

    for avg_array in avg_array_list:
        for i, avg_value in enumerate(avg_array):

            if linear:
                points = avg_value * 100
            else:
                points = (avg_value ** k) * 100

            if i == 0:
                neutral_points += points
            elif i == 1:
                positive_points += points
            elif i == 2:
                negative_points += points
    return [neutral_points, positive_points, negative_points]


def average_array(probabilities):
    num_probabilities = len(probabilities)
    neutral_total = positive_total = negative_total = 0

    for prob_data in probabilities:
        if not prob_data:
            continue
        class_prob = prob_data['class_prob']
        class_label = prob_data['class_label']

        if class_label == 'neutral':
            neutral_total += class_prob
        elif class_label == 'positive':
            positive_total += class_prob
        elif class_label == 'negative':
            negative_total += class_prob

    neutral_avg = neutral_total / num_probabilities if num_probabilities > 0 else 0
    positive_avg = positive_total / num_probabilities if num_probabilities > 0 else 0
    negative_avg = negative_total / num_probabilities if num_probabilities > 0 else 0

    return [neutral_avg, positive_avg, negative_avg]


def round_array_to_1dp(arr):
    decimal_array = [Decimal(str(x)) for x in arr]
    rounded_array = [x.quantize(Decimal('0.0'), rounding=ROUND_HALF_UP) for x in decimal_array]
    rounded_sum = sum(rounded_array)
    adjustment = Decimal('100') - rounded_sum
    rounded_array[-1] += adjustment
    rounded_array = [float(x) for x in rounded_array]
    return rounded_array


def percentage_contribution(elements):
    total = sum(elements)
    percentage_contributions = [(element / total) * 100 for element in elements]
    return round_array_to_1dp(percentage_contributions)


class SentimentAnalyser:

    def __init__(self):
        self.tsc = TargetSentimentClassifier()

    def bounds_sentiment(self, mention_start, mention_end, sentence_start, sentence_end,
                         article_text, database_id):
        try:
            left_segment = article_text[sentence_start:mention_start]
            mention_segment = article_text[mention_start:mention_end]
            right_segment = article_text[mention_end:sentence_end]

            # Could add logging here to see the quality of sentence segmentation.
            start_time = time.time()
            sentiment = self.tsc.infer_from_text(left_segment, mention_segment, right_segment)
            elapsed_time = time.time() - start_time

            if elapsed_time > 5:
                print(f"News Sentiment Time > 5 seconds so: {elapsed_time} seconds")
            # print(sentiment[0])

            return sentiment[0]

        except Exception as e:
            print(f"Error during sentiment analysis: {e}")
            print(f"LEFT: {left_segment}")
            print(f"MENTION: {mention_segment}")
            print(f"RIGHT: {right_segment}")

            print(f"Creating BoundError:\n"
                  f"Article ID: {database_id}\n"
                  f"Bound Start: {mention_start}\n"
                  f"Bound End: {mention_end}\n"
                  f"Left Segment: {left_segment}\n"
                  f"Mention Segment: {mention_segment}\n"
                  f"Right Segment: {right_segment}\n"
                  f"Error Message: Exception during sentiment analysis")

            return None

    def process_clustered_entities(self, clustered_entities, sentence_bounds, article_text,
                                   database_id,
                                   debug):
        START_HIGHLIGHT = '\033[0m'
        END_HIGHLIGHT = '\033[94m'
        GREEN = '\033[92m'
        END_COLOR = '\033[0m'

        bounds_tree = IntervalTree(Interval(start, end) for start, end in
                                   sentence_bounds)

        bounds_sentiment = {}

        ''' Some entities at this point may not have been fully consolidated.
            Running the model is the most intensive part of this process.
            Since non consolidated entities likely have the same coref cluster.
            Running model over save cluster more than once is wasteful.'''

        cluster_id_mapping = {}  # Map cluster_id to bounds_sentiment

        for entity in clustered_entities:
            entity_name = entity['Entity Name']
            if 'entity_db_id' in entity:
                entity_db_id = entity['entity_db_id']
            else:
                print("Process clustered entities would skip...")
                print(entity_name)
                continue
            cluster_positions = entity['Cluster Info']['Cluster Positions']
            cluster_id = entity['Cluster Info']['Cluster ID']

            # Check if the cluster_id has been seen before
            if cluster_id in cluster_id_mapping:
                # print('Using cached bounds sentiment')
                # If so, use the cached bounds_sentiment
                for entry in cluster_id_mapping[cluster_id]:
                    bounds_key = entry['bounds_key']

                    if entity_name not in bounds_sentiment[bounds_key]:
                        bounds_sentiment[bounds_key][entity_name] = {}

                    if entity_db_id not in bounds_sentiment[bounds_key][entity_name]:
                        bounds_sentiment[bounds_key][entity_name][entity_db_id] = []

                    bounds_sentiment[bounds_key][entity_name][entity_db_id].append(entry['result'])

            else:
                cluster_id_mapping[cluster_id] = []

                for mention_start, mention_end in cluster_positions:
                    overlap = bounds_tree.overlap(mention_start, mention_end)
                    if overlap:
                        for interval in overlap:
                            sentence_start, sentence_end = interval.begin, interval.end
                            bounds_key = (sentence_start, sentence_end)

                            if bounds_key not in bounds_sentiment:
                                bounds_sentiment[bounds_key] = {}

                            if entity_name not in bounds_sentiment[bounds_key]:
                                bounds_sentiment[bounds_key][entity_name] = {}

                            if entity_db_id not in bounds_sentiment[bounds_key][entity_name]:
                                bounds_sentiment[bounds_key][entity_name][entity_db_id] = []

                            highlighted_text = (
                                    START_HIGHLIGHT +
                                    article_text[sentence_start:mention_start] + END_HIGHLIGHT +
                                    article_text[mention_start:mention_end] + START_HIGHLIGHT +
                                    article_text[mention_end:sentence_end] + END_HIGHLIGHT)

                            result = self.bounds_sentiment(mention_start, mention_end,
                                                           sentence_start, sentence_end,
                                                           article_text, database_id)

                            bounds_sentiment[bounds_key][entity_name][entity_db_id].append(
                                result)

                            cluster_id_mapping[cluster_id].append({
                                'bounds_key': bounds_key,
                                'result': result
                            })

                            if debug:
                                print(
                                    START_HIGHLIGHT + f"{entity_name} - Mention ({mention_start}, {mention_end}) is within bounds ({sentence_start}, {sentence_end})")
                                print(highlighted_text)
                                print(
                                    GREEN + f"NewsSentiment Candidateappearance{len(bounds_sentiment[bounds_key][entity_name][entity_db_id])}" + END_COLOR)

        return bounds_sentiment

    @staticmethod
    def average_sentiment_results(source_article_id, bounds_sentiment, article_text):
        if bounds_sentiment is None:
            print("Error: bounds_sentiment is None")
            return
        entity_averages = {}
        for bounds_key, entity_results in bounds_sentiment.items():
            for entity_name, entity_db_ids in entity_results.items():
                # print("Entity DB IDs: ")
                # print(entity_db_ids)
                for entity_db_id, results in entity_db_ids.items():

                    if not results:  # Empty results for an entity? Skip...
                        continue
                    # print(results)
                    avg = average_array(results)

                    # Store entity - bound mention - bound text - average result in database

                    if entity_name not in entity_averages:
                        entity_averages[entity_name] = {
                            "entity_db_ids": [entity_db_id],
                            "bounds_keys": [bounds_key],
                            "sentiment_scores": [avg],
                            "text": [article_text[bounds_key[0]:bounds_key[1]]],
                        }
                    else:
                        entity_averages[entity_name]["entity_db_ids"].append(entity_db_id)
                        entity_averages[entity_name]["bounds_keys"].append(bounds_key)
                        entity_averages[entity_name]["sentiment_scores"].append(avg)
                        entity_averages[entity_name]["text"].append(
                            article_text[bounds_key[0]:bounds_key[1]])

        # print('Sentiment Scores Format: [Neutral, Positive, Negative]')
        for entity_name, averages in entity_averages.items():
            entity_db_id = averages['entity_db_ids'][0]
            # print(f"Averages for {entity_name} (Entity DB ID: {entity_db_id}):")
            sentiment_scores = averages['sentiment_scores']
            text = averages['text']
            bounds_keys = averages['bounds_keys']

            for i, scores in enumerate(sentiment_scores):
                # print("Sentiment Scores:", scores)
                # print("Text:", text[i])
                # print("Bounds Keys:", bounds_keys[i])
                print()

                # DatabaseUtils.insert_bound_mention_data(entity_name, source_article_id,
                #                                         entity_db_id,
                #                                         scores, text[i],
                #                                         bounds_keys[i])
                
                print(f"Inserting bound mention data: \n"
                      f"Entity Name: {entity_name}\n"
                      f"Source Article ID: {source_article_id}\n"
                      f"Entity Database ID: {entity_db_id}\n"
                      f"Scores: {scores}\n"
                      f"Text Snippet: {text[i]}\n"
                      f"Bounds Keys: {bounds_keys[i]}")

            num_bound = len(averages['sentiment_scores'])
            scaled_classification = scaling(averages['sentiment_scores'],
                                            k=EXPONENTIAL_K_VALUE)

            # Can't scale an array of [0, 0, 0] -> Divide by zero error.
            if sum(scaled_classification) == 0:
                # print(scaled_classification)
                continue

            exp_percent = percentage_contribution(scaled_classification)

            linear_scaled_classification = scaling(averages['sentiment_scores'],
                                                   linear=True)
            linear_percent = percentage_contribution(
                linear_scaled_classification)
            
              # DatabaseUtils.insert_overall_sentiment(source_article_id, entity_db_id, num_bound,
              #                              linear_percent[0],
              #                              linear_percent[1],
              #                              linear_percent[2],
              #                              exp_percent[0], exp_percent[1], exp_percent[2])

            print("Inserting overall sentiment data:")
            print(f"Source Article ID: {source_article_id}")
            print(f"Entity Name: {entity_name}")
            print(f"Number of Bounds: {num_bound}")
            print(f"Linear Percent - Neutral: {linear_percent[0]}")
            print(f"Linear Percent - Positive: {linear_percent[1]}")
            print(f"Linear Percent - Negative: {linear_percent[2]}")
            print(f"Exponential Percent - Neutral: {exp_percent[0]}")
            print(f"Exponential Percent - Positive: {exp_percent[1]}")
            print(f"Exponential Percent - Negative: {exp_percent[2]}")


In [59]:
import nltk
import ppdeep
from lexicalrichness import LexicalRichness


def calculate_statistics(text_body):
    lex = LexicalRichness(text_body)
    common_stop_words = ["the", "and", "is", "of", "in", "it", "that", "to", "with"]
    tokens = nltk.word_tokenize(text_body)
    stop_word_counts = {word: tokens.count(word) for word in common_stop_words}

    try:
        vocd_int = lex.vocd()
    except ValueError:
        vocd_int = None

    linguistic_stats = {
        "fuzzy_hash": ppdeep.hash(text_body),
        "word_count": len(tokens),
        "terms_count": lex.terms,
        "vocd": vocd_int,
        "yulek": lex.yulek,
        "simpsond": lex.simpsond,
        "the_count": stop_word_counts["the"],
        "and_count": stop_word_counts["and"],
        "is_count": stop_word_counts["is"],
        "of_count": stop_word_counts["of"],
        "in_count": stop_word_counts["in"],
        "it_count": stop_word_counts["it"],
        "that_count": stop_word_counts["that"],
        "to_count": stop_word_counts["to"],
        "with_count": stop_word_counts["with"],
    }

    return linguistic_stats



class ArticleUpdate:
    def __init__(self, text_body, article_model):
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')

        self.text_body = text_body  # Added by trafilatura
        self.linguistic_stats = None
        self.article_model = article_model

    def get_statistics(self):
        self.linguistic_stats = calculate_statistics(self.text_body)



In [60]:
import math
import re
import urllib.robotparser
from collections import defaultdict
from functools import reduce
import requests
from bs4 import BeautifulSoup
from textblob import TextBlob
from constants import (ENTITY_THRESHOLD_PERCENT,
                        MENTION_REQ_PER,
                        MERGE_REMOVAL_INDICATOR,
                        COMBINED_REMOVAL_INDICATOR,
                        COMBINED_CLUSTER_ID_SEPARATOR,
                        SIMILAR_SEARCH_DAYS,
                        PREVIEW_IMG_TIMEOUT)


def can_fetch_url(url_to_check):
    """Determine if the URL can be fetched by all crawlers - adding politeness / adherence to robot
    policy."""
    parsed_url = urlparse(url_to_check)
    base_url = parsed_url.scheme + "://" + parsed_url.netloc
    rules = urllib.robotparser.RobotFileParser()
    rules.set_url(base_url + "/robots.txt")
    rules.read()
    return rules.can_fetch("*", url_to_check)


def get_preview_image_url(url):
    try:
        response = requests.get(url, timeout=PREVIEW_IMG_TIMEOUT)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')

            og_image = soup.find('meta', property='og:image')
            if og_image:
                return og_image['content']

            # Twitter Card image tag
            twitter_image = soup.find(name='twitter:image')
            if twitter_image:
                return twitter_image['content']

    except Exception as e:
        print(f"Error fetching preview image URL: {e}")


def merge_positions(entities, word):
    """Merge all instances of the same entity into a single entry with multiple
    positions. Where same means a lowercase word match. Place them into an
    'entities' dictionary."""
    entity_key = (word.text + word.label_).lower()
    if entity_key in entities:
        entities[entity_key][1].append([word.start_char, word.end_char])
    else:
        entities[entity_key] = [word.text, [[word.start_char, word
        .end_char]], word.label_]
    return entities


def cleanse_cluster_text(cluster_text):
    return [word.strip() for word in cluster_text if word.lower().strip() not
            in undesired_words]

undesired_words = ["i", "he", "his", "she", "they", "it", "this", "that",
                   "these", "those", "the", "a", "an", "of"]


def remove_titles(text):
    title_pattern = r"^(Mr|Mrs|Ms|Miss|Dr|Prof|Rev|Capt|Sir|Madam|Mx|Esq|Hon|Gen|Col|Sgt|Fr|Sr|Jr|Lord|Lady)\s"
    text = re.sub(title_pattern, "", text)

    '''10th Nov adding as:
        {'Entity Name': 'Keith', 'Positions': [[11664, 11669], [12301, 12306], [14453, 14458], 
        [15005, 15010], [15286, 15291]], 'Label': 'PERSON', 'Num Positions': 5, 'Cluster Info': 
        {'Cluster ID': 12, 'Cluster Text': ['Keith', 'Keith', 'Keith', 'Keith', 'Hugo Keith KC',
        'Keith', 'Keith', 'Keith'], 'Cluster Positions': [(11664, 11669), (12301, 12306), 
        (12312, 12314), (13026, 13031), (13464, 13469), (14374, 14387), (14453, 14458),
        (15005, 15010), (15286, 15291)]}}
        would have been updated to Hugo Keith had it not been for KC which made it 3 words'''

    # Pattern for titles at the end
    title_pattern_end = r"\s*(KC|QC)\s*$"
    text = re.sub(title_pattern_end, "", text)

    return text


def insert_intervals(initial_list, new_values):
    """The insert_intervals function enables the segmentation of sentences provided by
    TextBlob (as shown in the cell below) to be further divided into smaller boundaries. This
     division is based on the identification of specific points where it is deemed necessary
     to split sentences, particularly within the context of news articles."""

    def insert_recursive(intervals, values):
        if not values:
            return intervals  # Base case: Return the intervals when there are no more values to insert.

        value = values[0]
        result = []
        for interval in intervals:
            if interval[0] <= value <= interval[1]:
                # If the value falls within an existing interval, split the interval into two parts.
                # The first part goes from the interval's start to the value (inclusive), and
                # the second part goes from the value+1 to the interval's end.
                if interval[0] < value:
                    # To mess around with intervals change value + - offset here.
                    result.append((interval[0], value))
                if value < interval[1]:
                    # To mess around with intervals change value + - offset here.
                    result.append((value + 1, interval[1]))
            else:
                # If the value doesn't fall within the interval, keep the interval as is.
                result.append(interval)
        # Recursively process other values.
        return insert_recursive(result, values[1:])

        # Recursive function call

    updated_list = insert_recursive(initial_list, new_values)
    return updated_list


def is_substring(entity1, entity2):
    return entity1.lower() in entity2.lower() or entity2.lower() in entity1.lower()


def combine_entities(entities, cluster_text):
    combined_entity = None

    for entity1 in entities:
        for entity2 in entities:
            if entity1 != entity2:
                combined1 = entity1 + ' ' + entity2
                combined2 = entity2 + ' ' + entity1

                if combined1 in cluster_text or combined2 in cluster_text:
                    combined_entity = combined1 if combined1 in cluster_text \
                        else combined2
                    break

    return combined_entity


def update_entity_name(entry):
    """Calls remove titles and removes possessives before checking if an entity name is a
     substring of a 2 word entry in the coref cluster e.g. Johnson should become Boris Johnson"""

    entity_name = entry['Entity Name']
    cluster_text = entry['Cluster Info']['Cluster Text']

    for text in cluster_text:
        # Remove titles as not relevant
        text = remove_titles(text)
        # Replaces left / right quotation mark with standard single quotation mark
        text = text.replace('’', "'").replace('‘', "'")
        # Remove possessive markers for comparison
        text = text.replace("’s", "")
        # Remove space and quote
        text = text.replace(" '", "")
        # Check if the current entity name is a substring of a 2-word cluster text entry
        if len(text.split()) == 2 and entity_name in text:
            entry['Entity Name'] = text
            break
    return entry


def clean_up_substrings(clustered_entities):
    longest_names = {}
    entities_to_keep = []

    # Identify longest names in cluster ID and remove shorter ones
    for entity in clustered_entities:
        cluster_id = entity['Cluster Info']['Cluster ID']
        entity_name = entity['Entity Name']
        current_longest = longest_names.get(cluster_id, "")

        if len(entity_name) > len(current_longest):
            # Remove the shorter entity without adding it to the list of entities to keep
            if current_longest:
                entities_to_keep = [e for e in clustered_entities if
                                    e['Cluster Info']['Cluster ID'] != cluster_id]
            longest_names[cluster_id] = entity_name

            # Add the entity to the list of entities to keep
            entities_to_keep.append(entity)

    # Set merge indicator for entities with more than one associated name in the original list
    for entity in clustered_entities:
        cluster_id = entity['Cluster Info']['Cluster ID']
        if len([e for e in entities_to_keep if
                e['Cluster Info']['Cluster ID'] == cluster_id]) > 1:
            entity['Num Positions'] = int(MERGE_REMOVAL_INDICATOR)
            entity['Positions'] = int(MERGE_REMOVAL_INDICATOR)
    return entities_to_keep


def create_entity_entry(entity_name, positions, label, num_positions):
    return {
        'Entity Name': entity_name,
        'Positions': positions,
        'Label': label,
        'Num Positions': num_positions,
        'Cluster Info': []
    }


class Article:

    def __init__(self, url, headline, text_body, NER, date, author, site_name):
        self.url = url
        self.NER = NER
        self.headline = headline
        self.image_url = None
        self.description = None
        self.text_body = text_body  # Added by trafilatura
        self.coref_clusters = None
        self.people_entities = None  # NER results here.
        self.sentence_bounds = None
        self.num_sentences = None
        self.mention_threshold = None
        self.entity_to_cluster_mapping = []
        self.clustered_entities = None
        self.database_candidate = False
        self.database_id = None
        self.bounds_sentiment = None
        self.sentiment_analyser = None
        self.publication_date = date
        self.author = author
        self.site_name = site_name
        self.linguistic_stats = None

    def set_sentiment_analyser(self, sa):

        if sa is None:
            self.sentiment_analyser = SentimentAnalyser()
        else:
            self.sentiment_analyser = sa

    def get_bounds_sentiment(self):
            self.bounds_sentiment = self.sentiment_analyser.process_clustered_entities(
                clustered_entities=self.clustered_entities, sentence_bounds=self.sentence_bounds,
                article_text=self.text_body, database_id=self.database_id,
                debug=True)

    def print_clustered_entities(self):
        for entry in self.clustered_entities:
            print(entry)
            print()

    def print_entity_to_cluster_mapping(self):
        for entry in self.entity_to_cluster_mapping:
            print(entry)
            print()

    def determine_entity_to_cluster_mapping(self):

        """
        Removing pronouns & other 'useless words' (his, her, he, I etc) then doing a % match rate on
        entity text across cluster entries for each entity. If match rate % exceeds threshold
        pair them.

        Improvement: Spliting the entity names into part words e.g. Sadiq Khan will be split into
        Sadiq and Khan for evaluation purposes. This way if they are mostly mentioned by first
        or second name the match still has an opportunity to take place.
        """

        for entity_type, entities in self.people_entities.items():
            for entity in entities:
                entity_name, positions, label, num_positions = entity
                entity_entry = create_entity_entry(entity_name, positions, label,
                                                   num_positions)
                self.process_clusters_for_entity(entity_entry, entity_name)

    def process_clusters_for_entity(self, entity_entry, entity_name):
        cluster_id = 0
        print(f"Processing clusters for entity: {entity_name}")
    
        for index, (cluster_text, cluster_positions, _) in self.coref_clusters:
            cluster_id += 1
            print(f"\nCluster ID: {cluster_id}, Original Text: {cluster_text}")
    
            cluster_text = cleanse_cluster_text(cluster_text)
            cleaned_cluster_text = [remove_titles(text) for text in cluster_text]
    
            if len(cleaned_cluster_text) < 4:
                print(f"Skipping cluster {cluster_id} as it has length < 4")
                continue
    
            total_coref_words = " ".join(cleaned_cluster_text)
            entity_parts = entity_name.split()
            max_percentage = 0.0
            winning_entity_part = None
    
            for entity_part in entity_parts:
                entity_count = total_coref_words.count(entity_part)
                percentage = entity_count / len(cleaned_cluster_text) * 100  # Adjusted for readability
                
                if percentage > max_percentage:
                    max_percentage = percentage
                    winning_entity_part = entity_part
                    
                print(f"Matching '{entity_part}' in cluster {cluster_id}: {percentage:.2f}% match.")
    
            if max_percentage >= ENTITY_THRESHOLD_PERCENT:
                print(f"Accepting cluster {cluster_id} for '{entity_name}' with {max_percentage:.2f}% match on part '{winning_entity_part}'.")
                cluster_entry = {
                    'Cluster ID': cluster_id,
                    'Cluster Text': cluster_text,
                    'Cluster Positions': cluster_positions
                }
                
                entity_entry['Cluster Info'] = cluster_entry
                self.entity_to_cluster_mapping.append(entity_entry)
                break
    
        if not entity_entry.get('Cluster Info'):
            print(f"No suitable clusters found for entity: {entity_name}")


    def set_coref_clusters(self, sorted_combined_clusters):
        # Add an ID to each cluster
        self.coref_clusters = list(enumerate(sorted_combined_clusters))

    def source_ner_people(self):

        """SpaCy is a popular NLP library that offers pre-trained models for various languages, and
            its NER component is capable of recognising and categorising named entities within text.
            It is utilised here to identify PERSON entities"""

        NER = spacy.load("en_core_web_sm")
        article_text = self.text_body
        article = NER(article_text)

        '''# Recommended mention - 'Discard a cluster c in a document d if |Mc| ≤ 0.2|Sd|,  
        where |...| is the number of mentions of a cluster (Mc) and sentences in a document (Sd)
        (NEWS-MTSC approach)'''

        entity_types = ["CARDINAL", "DATE", "EVENT", "FAC", "GPE", "LANGUAGE", "LAW",
                        "LOC", "MONEY", "NORP", "ORDINAL", "ORG", "PERCENT",
                        "PERSON", "PRODUCT", "QUANTITY", "TIME", "WORK_OF_ART"]

        entity_type_to_entities = {
            entity_type: [
                [
                    entity_text,
                    positions,
                    label,
                    len(positions)
                ] for entity_text, positions, label in reduce(
                    merge_positions,
                    filter(lambda word: word.label_ == entity_type, article.ents),
                    {}
                ).values()
            ] for entity_type in entity_types
        }

        for entity_type, entities in entity_type_to_entities.items():
            print(f"Entity Type: {entity_type}")
            for entity in entities:
                entity_text, positions, label, num_positions = entity
                if entity_type == 'PERSON':
                    print(f"Entity: {entity_text}")
                    print(f"Positions: {positions}")
                    print(f"Label: {label}")
                    print(f"Number of Positions: {num_positions}")
                    print()

        people_entities = {entity_type: entity_info for entity_type, entity_info in
                           entity_type_to_entities.items() if entity_type == 'PERSON'}

        for entity in entity_type_to_entities[entity_type]:
            entity_text, positions, label, num_positions = entity
            if entity_type not in people_entities:
                people_entities[entity_type] = []
            people_entities[entity_type].append({
                'Entity': entity_text,
                'Positions': positions,
                'Label': label,
                'Number of Positions': num_positions
            })

        self.people_entities = people_entities

    def determine_sentences(self):

        """
        Process the article text, tokenise by sentence and add custom adjustments to the
        tokenization using insert intervals below.
        spaCy was not satisfactory for accurately tokenising for sentence start / end
        characters. Trying textblob instead. TextBlob is a Python library for processing textual
        data that is bulit upon NLTK.

        TextBlob can provide me with the start and end of sentences by using the sentences
        attribute of a TextBlob object. This attribute returns a list of Sentence objects, each
        of which has a start and end property that indicates the index of the first and last
        character of the sentence within the original text."""

        # Process the article text and adjust tokenization
        article_text = self.text_body
        blob = TextBlob(article_text)
        sentences = blob.sentences

        # Determine custom tokenization:
        hyphen_sentences = re.split(r'\n-', article_text)
        # In testing article a new line and '-' hyphen use typically means consider
        # a different sentence.
        hyphen_nl_pos = [pos for pos, char in enumerate(article_text) if
                         article_text[pos:pos + 2] == '\n-']
        extra_split = hyphen_nl_pos

        sentence_bounds = [(int(sentence.start), int(sentence.end)) for sentence
                           in sentences]

        # Insert custom intervals
        updated_list = insert_intervals(sentence_bounds, extra_split)

        self.sentence_bounds = updated_list
        self.num_sentences = len(list(sentence_bounds))
        self.mention_threshold = math.floor(self.num_sentences * MENTION_REQ_PER)
        
        print("Processing article text for custom tokenization...")

        print(f"Original Article Text: {article_text[:100]}...")
        print(f"Number of sentences detected by TextBlob: {len(sentences)}")
        print(f"Hyphen-newline positions (custom splits): {hyphen_nl_pos}")
        print("Sentence bounds before custom adjustments:")
        print(sentence_bounds)
        
        print("Updated sentence bounds after inserting custom intervals:")
        print(updated_list)
        
        print(f"Total number of sentences after adjustments: {self.num_sentences}")
        print(f"Mention threshold (floor of sentences * {MENTION_REQ_PER}): {self.mention_threshold}")

    def entity_cluster_map_consolidation(self):
        """
        Purposes of below code:

        1. Substring Matching: If one entity is a substring of another entity, they
        are considered as candidates for consolidation. For example, if "Rishi" is a
        substring of "Rishi Sunak," the code merges them into the longer version, "Rishi Sunak."

        2. First and Second Name Combination: If there are two entities, one
        representing the first name and the other representing the last name, and
        they share the same coreference cluster, the code attempts to combine them
        into a single entity.

        3. 9th November - add while loop to see if continuing consolidation until no more
        consolidation takes place results in a better consolidation as theorised.

        4. 16th November - resolve instances of 'King\n11:43' which should be King.

        """

        cluster_dict = defaultdict(list)
        for entry in self.entity_to_cluster_mapping:
            entity_name = entry['Entity Name']
            cluster_id = entry['Cluster Info']['Cluster ID']
            cluster_dict[cluster_id].append(entry)

        # While loop to continue consolidation until no more consolidation can be done
        consolidation_done = True
        while consolidation_done:
            consolidation_done = False
            for cluster_id, entries in cluster_dict.items():
                if len(entries) > 1:
                    combined_entry = None
                    for i, entry1 in enumerate(entries):
                        for j, entry2 in enumerate(entries):
                            # print(f"\nBefore update:\nEntry 1: {entry1}\nEntry 2: {entry2}")
                            entry1 = update_entity_name(entry1)
                            entry2 = update_entity_name(entry2)
                            # print(f"After update:\nEntry 1: {entry1}\nEntry 2: {entry2}")
                            
                            if i < j:
                                entity_1_name = entry1['Entity Name']
                                entity_2_name = entry2['Entity Name']

                                cluster_text = entries[0]['Cluster Info']['Cluster Text']
                                combined_entity = combine_entities([entity_1_name,
                                                                    entity_2_name],
                                                                   cluster_text)

                                if is_substring(entity_1_name, entity_2_name):
                                    if len(entity_1_name) > len(entity_2_name):
                                        if entry2 in entries:
                                            entries.remove(entry2)
                                            
                                            print(f"Removing {entity_2_name} as substring of "
                                                  f"{entity_1_name}")
                                        # Coreference cluster will be used anyway -200 to
                                        # indicate merge via removal.
                                        entry1['Num Positions'] = int(MERGE_REMOVAL_INDICATOR)
                                        entry1['Positions'] = int(MERGE_REMOVAL_INDICATOR)

                                    else:
                                        if entry1 in entries:
                                            print(f"Removing {entity_1_name} as substring of "
                                                  f"{entity_2_name}")
                                            entries.remove(entry1)
                                            
                                        # Coreference cluster will be used anyway -200 to
                                        # indicate merge via removal.
                                        entry2['Num Positions'] = int(MERGE_REMOVAL_INDICATOR)
                                        entry2['Positions'] = int(MERGE_REMOVAL_INDICATOR)

                                    consolidation_done = True
                                    # exit inner for loop
                                    break
                                elif combined_entity in cluster_text:
                                    print(f"{entry1['Entity Name']} and {entry2['Entity Name']} "
                                          "exist as a "
                                          "combination in cluster "
                                          "text making a new cluster by combining positions")
                                    combined_entry = {
                                        'Entity Name': combined_entity,
                                        # Coreference cluster will be used anyway -100 to
                                        # indicate merge via combined entity.
                                        'Positions': int(COMBINED_REMOVAL_INDICATOR),
                                        'Label': entry1['Label'],
                                        # Coreference cluster will be used anyway -100 to
                                        # indicate merge via combined entity.
                                        'Num Positions': int(COMBINED_REMOVAL_INDICATOR),
                                        'Cluster Info': entry1['Cluster Info']
                                    }
                                    entries.remove(entry1)
                                    entries.remove(entry2)
                                    consolidation_done = True
                                    # exit inner for loop
                                    break
                        if combined_entry:
                            entries.append(combined_entry)
                        if consolidation_done:
                            # exit outer for loop
                            break

            '''
            Now look across cluster ids (above we stayed within a cluster id) for substrings of
            entity names and merge together cluster ids, positions and text.
            '''
            for cluster_id1, entries1 in cluster_dict.items():
                for cluster_id2, entries2 in cluster_dict.items():
                    if cluster_id1 != cluster_id2:  # Prevents comparing entries within the same
                        # cluster
                        for entry1 in entries1:
                            for entry2 in entries2:
                                # print(f"\nBefore update:\nEntry 1: {entry1}\nEntry 2: {entry2}")
                                entry1 = update_entity_name(entry1)
                                entry2 = update_entity_name(entry2)
                                # print(f"After update:\nEntry 1: {entry1}\nEntry 2: {entry2}")

                                entity_1_name = entry1['Entity Name']
                                entity_2_name = entry2['Entity Name']
                                # Check if an entity name is a substring of another.
                                if entity_1_name in entity_2_name or entity_2_name in entity_1_name:
                                    print(f'cross cluster merge triggered as {entity_1_name} '
                                          'in {entity_2_name} or {entity_2_name} in '
                                          '{entity_1_name}')
                                    print(entity_1_name)
                                    print(entity_2_name)
                                    # Combine cluster IDs with '0000' in between as a strong
                                    # indicator.
                                    combined_cluster_id = f"{entry1['Cluster Info']['Cluster ID']}{COMBINED_CLUSTER_ID_SEPARATOR}{entry2['Cluster Info']['Cluster ID']}"

                                    # Set the new cluster ID
                                    entry1['Cluster Info']['Cluster ID'] = combined_cluster_id
                                    entry2['Cluster Info']['Cluster ID'] = combined_cluster_id

                                    # Append cluster texts and positions to entry 1.
                                    entry1['Cluster Info']['Cluster Text'].extend(
                                        entry2['Cluster Info']['Cluster Text'])
                                    entry1['Cluster Info']['Cluster Positions'].extend(
                                        entry2['Cluster Info']['Cluster Positions'])

                                    if len(entity_2_name) > len(entity_1_name):
                                        entry1['Entity Name'] = entity_2_name
                                    entries2.remove(entry2)
                                    consolidation_done = True

        clustered_entities = [entry for entries in cluster_dict.values() for entry in entries]

        before_len = len(clustered_entities)

        cleaned_entities = []

        # Resolve \n in entity name instances and any below mention threshold.
        for entity in clustered_entities:
            entity_name = entity['Entity Name']
            cleaned_name = re.split(r'\n\d*:', entity_name)[0].strip()

            entity_name = remove_titles(cleaned_name)

            '''Found 'Entity Name': 'Reginald D. Hunter’s' - handle removing 's from last word'''

            # Replaces left / right quotation mark with standard single quotation mark
            entity_name = entity_name.replace('’', "'").replace('‘', "'")

            # Handle the relatively common case of Meghan Markle '  i.e the space then quote mark
            entity_name = entity_name.rstrip("'")

            # Split the entity name into words
            words = entity_name.split()

            # Check if the last word ends with 's, and if so, remove it
            if words and words[-1].endswith("'s"):
                words[-1] = words[-1][:-2]  # Remove 's from the last word

            # Join the words back into the entity name
            cleaned_name = ' '.join(words)

            # Capitalise the first letter of each word and make the rest lowercase
            cleaned_name = ' '.join(word.capitalize() for word in words)

            # Remove spaces @ start and end string
            cleaned_name = cleaned_name.strip()

            # Replace the original entity name with the cleaned name
            entity['Entity Name'] = cleaned_name

            cluster_positions = entity['Cluster Info']['Cluster Positions']
            cluster_id = entity['Cluster Info']['Cluster ID']

            # Count the number of entries in cluster_positions
            num_entries = len(cluster_positions)

            # Check if the number of entries is below the threshold
            if num_entries < self.mention_threshold:
                print(f"Cluster ID {cluster_id} has {num_entries} entries, which is below the "
                      f"threshold of {self.mention_threshold}.")
                # print(f"Removing entity {entity['Entity Name']} with Cluster ID {cluster_id} due "
                #       f"to low mention count:")
                # print(entity['Cluster Info'])
                clustered_entities.remove(entity)
                continue

            # At most, an entity should have the first, middle and last name.
            if len(cleaned_name.split()) > 3:
                print(f"Cluster ID {cluster_id} has {num_entries} entries, which is below the "
                      f"threshold of {self.mention_threshold}.")
                print(
                    f"Removing entity {entity['Entity Name']} with Cluster ID {cluster_id} due "
                    f"to low mention count:")
                print(entity['Cluster Info'])
                clustered_entities.remove(entity)
                continue

            # If the entity is not removed, add it to the cleaned list
            cleaned_entities.append(entity)

        clustered_entities = cleaned_entities
        new_length = len(clustered_entities)

        print(f"number of entities before mention threshold: {before_len}")
        print(f"number of entities after mention threshold: {new_length}")

        clustered_entities = clean_up_substrings(clustered_entities)

        self.clustered_entities = clustered_entities

        # Is this article going to go on the web app? If clustered_entities > 0 then yes so get
        # article parts and insert into database.
        if new_length > 0:
            self.set_database_candidate_true()

    def set_database_candidate_true(self):
        self.database_candidate = True

    def get_average_sentiment_results(self):
        self.sentiment_analyser.average_sentiment_results(self.database_id, self.bounds_sentiment,
                                                          self
                                                          .text_body)

    def save_to_database(self):

        self.image_url = get_preview_image_url(self.url)
        
        print("Would normally save article stats to database now")
        
        print(f"""
                fuzzy_hash={self.linguistic_stats["fuzzy_hash"]},
                word_count={self.linguistic_stats["word_count"]},
                terms_count={self.linguistic_stats["terms_count"]},
                vocd={self.linguistic_stats["vocd"]},
                yulek={self.linguistic_stats["yulek"]},
                simpsond={self.linguistic_stats["simpsond"]},
                the_count={self.linguistic_stats["the_count"]},
                and_count={self.linguistic_stats["and_count"]},
                is_count={self.linguistic_stats["is_count"]},
                of_count={self.linguistic_stats["of_count"]},
                in_count={self.linguistic_stats["in_count"]},
                to_count={self.linguistic_stats["to_count"]},
                it_count={self.linguistic_stats["it_count"]},
                that_count={self.linguistic_stats["that_count"]},
                with_count={self.linguistic_stats["with_count"]},
                """)

    def get_statistics(self):
        self.linguistic_stats = calculate_statistics(self.text_body)



In [61]:
from datetime import datetime
import urllib.robotparser
import trafilatura
import spacy
import time
import logging
import urllib.request
import socket


from fastcoref import FCoref
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser


def can_fetch_url(url_to_check):
    """Determine if the URL can be fetched by all crawlers - adding politeness / adherence to
        robot policy."""
    parsed_url = urlparse(url_to_check)
    base_url = parsed_url.scheme + "://" + parsed_url.netloc

    rules = RobotFileParser()
    try:
        with urllib.request.urlopen(base_url + "/robots.txt", timeout=5) as response:
            rules.parse(response.read().decode('utf-8').splitlines())
        return rules.can_fetch("*", url_to_check)
    except urllib.error.URLError as e:
        print(f"Error accessing robots.txt: {e}")
    except socket.timeout as e:
        print(f"Timeout occurred: {e}")

    # Default to False as if Robot can't be checked then not compliant + the site may timeout.
    return False


def perform_coreference_resolution(article_texts, batch_size=100):
    model = FCoref(device='mps')
    predictions = model.predict(texts=article_texts, max_tokens_in_batch=batch_size)

    # Empty list to store clusters for each article
    article_text_clusters = []

    for prediction in predictions:
        clusters_text = prediction.get_clusters()
        clusters_positions = prediction.get_clusters(as_strings=False)
        combined_clusters = [(text, positions, len(text)) for text, positions in
                             zip(clusters_text, clusters_positions)]
        sorted_combined_clusters = sorted(combined_clusters, key=lambda x: x[2], reverse=True)
        article_text_clusters.append(sorted_combined_clusters)
            
    for i, (clusterText, clusterPos, cluster_text_count) in enumerate(sorted_combined_clusters, 1):
        print(f"Cluster {i} : {clusterText}")
        print(f"Cluster Text Count: {cluster_text_count}")
        print("Positions:", clusterPos)
        

    return article_text_clusters


In [62]:
left_segment = ''
mention_segment = 'He'
right_segment = 'got a round of applause'

# start_time = time.time()
tsc = TargetSentimentClassifier()
sentiment = tsc.infer_from_text(left_segment, mention_segment, right_segment)
print(sentiment[0])
            # elapsed_time = time.time() - start_time

{'class_id': 2, 'class_label': 'positive', 'class_prob': 0.9137029647827148}


In [5]:
tsc = TargetSentimentClassifier()
sentiment = tsc.infer_from_text("The plan devised by  " ,"Russell", " fell apart.")
print(sentiment[0])

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'class_id': 0, 'class_label': 'negative', 'class_prob': 0.6077610850334167}


In [None]:
tsc = TargetSentimentClassifier()

In [8]:

sentiment = tsc.infer_from_text("" , "Peter's" , "excellent plan has been impacted by Paul's "
                                                 "dreadful delivery of it")
print(sentiment[0])

sentiment = tsc.infer_from_text("Peter's excellent plan has been impacted by ", "Paul's",
                                                 "dreadful delivery of it")
print(sentiment[0])

{'class_id': 2, 'class_label': 'positive', 'class_prob': 0.5761538147926331}
{'class_id': 0, 'class_label': 'negative', 'class_prob': 0.9824183583259583}


In [63]:
left_segment = 'The case was back in court on Friday where lawyers for the Biden administration argued that Harry might have made up stories of his drug use to “sell books.” The Heritage Foundation’s Nile Gardiner said, according to the Telegraph, that this was a “ridiculous argument,” saying, “He has never denied anything in his own book... including the extensive widespread drug use.” Speaking outside court after the hearing, Gardiner said it was “highly unlikely” Prince Harry entered as a diplomat as “he had no official role on behalf of British people, his own relationship with the royal family, that was at a low point as the judge himself actually referenced in his remarks.” The Heritage Foundation previously argued the former royal waived his right to privacy when he “sold every aspect of his private life for, in some estimates, over $135 million,” adding that'
mention_segment = 'his'
right_segment = 'right to privacy when he “sold every aspect of his private life for, in some \
estimates, over $135 million,” adding that his claims of his right to privacy have been “met with widespread public ridicule.” During Friday’s court session Harry’s recent interview on Good Morning America in which he said he had considered applying for US citizenship, was mentioned.'

try:
    # start_time = time.time()
    tsc = TargetSentimentClassifier()
    sentiment = tsc.infer_from_text(left_segment, mention_segment, right_segment)
    print(sentiment[0])
                # elapsed_time = time.time() - start_time
except Exception as e:
    print(f"Error during sentiment analysis: [{type(e)}] {e}")

Error during sentiment analysis: [<class 'NewsSentiment.customexceptions.TooLongTextException'>] 


In [64]:
# url =  'https://www.thesun.co.uk/news/25217908/rishi-sunak-slashed-small-boat-arrivals-illegal-migration/'
# headline  = '<b>Rishi Sunak</b> has slashed small boat arrivals &amp; voters worried about illegal migration should back him to sort it'

url = ('https://www.express.co.uk/news/politics/1871347/sir-keir-starmer-labour-polls-populatiry-reform')
headline  = 'Sir <b>Keir Starmer</b>&#39;s popularity drops while some Tory voters vow to vote Reform'
article_list = []
try:
    if can_fetch_url(url):
        downloaded = trafilatura.fetch_url(url)
        # Extract metadata
        metadata = trafilatura.extract_metadata(downloaded)
        # print(metadata.date)

        # Extract publication date
        date_str = metadata.date
        naive_datetime = datetime.strptime(date_str, '%Y-%m-%d')

        # datetime aware to a satisfy model
        publication_date = naive_datetime

        # Extract some useful trafilatura metadata.
        author = metadata.author

        site_name = metadata.sitename
        article_text = trafilatura.extract(downloaded, favour_recall=True,
                                           include_comments=False, include_images=False,
                                           include_tables=False)

        if article_text and len(article_text) > 249:
            article_obj = Article(url, headline, article_text, None,  publication_date,
                                  author, site_name)
            article_list.append(article_obj)
        elif article_text is None:
            print('article text is None')
        elif len(article_text) < 250:
            print(article_text)
            

except Exception as e:
    print(f"Error processing article: {url}")
    print(f"Error message: {str(e)}")
    
article_objects = article_list


In [65]:
article_texts = [article.text_body for article in article_objects]
article_text_clusters = perform_coreference_resolution(article_texts)

print(article_text_clusters)

for article, clusters in zip(article_objects, article_text_clusters):
    article.set_coref_clusters(clusters)

03/02/2024 18:26:45 - INFO - 	 missing_keys: []
03/02/2024 18:26:45 - INFO - 	 unexpected_keys: []
03/02/2024 18:26:45 - INFO - 	 mismatched_keys: []
03/02/2024 18:26:45 - INFO - 	 error_msgs: []
03/02/2024 18:26:45 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M
03/02/2024 18:26:45 - INFO - 	 Tokenize 1 inputs...


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

03/02/2024 18:26:45 - INFO - 	 ***** Running Inference on 1 texts *****


Inference:   0%|          | 0/1 [00:00<?, ?it/s]

Cluster 1 : ["Labour's", 'the Opposition party', 'Labour’s', 'Labour', 'Labour', 'Labour', 'his Party', 'Labour', 'Labour', 'Labour', 'Labour']
Cluster Text Count: 11
Positions: [(78, 86), (146, 166), (230, 238), (373, 379), (439, 445), (963, 969), (1042, 1051), (1240, 1246), (1767, 1773), (1895, 1901), (2163, 2169)]
Cluster 2 : ['Tory', 'the Tories', 'the Conservatives', 'Tory', 'the Conservatives', 'Tory', 'Tory', 'Tory', 'the party’s']
Cluster Text Count: 9
Positions: [(47, 51), (249, 259), (403, 420), (2005, 2009), (2046, 2063), (2107, 2111), (2557, 2561), (2579, 2583), (2712, 2723)]
Cluster 3 : ['Hoyle', 'Speaker Sir Lindsay Hoyle', 'the Speaker', 'Hoyle', 'Sir Lindsay', 'he', 'Sir Lindsay', 'he', 'his']
Cluster Text Count: 9
Positions: [(224, 229), (1064, 1089), (1197, 1208), (1324, 1329), (1734, 1745), (1751, 1753), (1824, 1835), (1866, 1868), (1961, 1964)]
Cluster 4 : ["Sir Keir Starmer's", 'Keir Starmer', 'Sir Keir Starmer', 'his', 'his', 'the Labour leader', 'his', 'Sir Keir'

In [66]:
start_time = time.time()
article.source_ner_people()
print(f"NER People Time: {time.time() - start_time} seconds")

Entity Type: CARDINAL
Entity Type: DATE
Entity Type: EVENT
Entity Type: FAC
Entity Type: GPE
Entity Type: LANGUAGE
Entity Type: LAW
Entity Type: LOC
Entity Type: MONEY
Entity Type: NORP
Entity Type: ORDINAL
Entity Type: ORG
Entity Type: PERCENT
Entity Type: PERSON
Entity: Keir Starmer
Positions: [[4, 16], [168, 180], [578, 590]]
Label: PERSON
Number of Positions: 3

Entity: Lindsay Hoyle
Positions: [[1076, 1089]]
Label: PERSON
Number of Positions: 1

Entity: Hoyle
Positions: [[1324, 1329]]
Label: PERSON
Number of Positions: 1

Entity: Keir
Positions: [[1389, 1393]]
Label: PERSON
Number of Positions: 1

Entity: Penny Mordaunt
Positions: [[1466, 1480]]
Label: PERSON
Number of Positions: 1

Entity: Lindsay
Positions: [[1738, 1745], [1828, 1835]]
Label: PERSON
Number of Positions: 2

Entity: Lee Anderson MP
Positions: [[2527, 2542]]
Label: PERSON
Number of Positions: 1

Entity: Tory
Positions: [[2579, 2583]]
Label: PERSON
Number of Positions: 1

Entity: Sadiq Khan
Positions: [[2618, 2628]]

In [67]:
start_time = time.time()
article.determine_sentences()
print(f"Sentence determined Time: {time.time() - start_time} seconds")

Processing article text for custom tokenization...
Original Article Text: Sir Keir Starmer's popularity drops while some Tory voters vow to vote Reform
Labour's popularity dr...
Number of sentences detected by TextBlob: 22
Hyphen-newline positions (custom splits): []
Sentence bounds before custom adjustments:
[(0, 167), (168, 315), (316, 428), (429, 563), (564, 659), (660, 827), (828, 858), (859, 938), (939, 1164), (1165, 1271), (1272, 1378), (1379, 1614), (1615, 1823), (1824, 1970), (1971, 2089), (2090, 2170), (2171, 2274), (2275, 2440), (2441, 2567), (2568, 2676), (2677, 2794), (2795, 2880)]
Updated sentence bounds after inserting custom intervals:
[(0, 167), (168, 315), (316, 428), (429, 563), (564, 659), (660, 827), (828, 858), (859, 938), (939, 1164), (1165, 1271), (1272, 1378), (1379, 1614), (1615, 1823), (1824, 1970), (1971, 2089), (2090, 2170), (2171, 2274), (2275, 2440), (2441, 2567), (2568, 2676), (2677, 2794), (2795, 2880)]
Total number of sentences after adjustments: 22
Men

In [68]:
start_time = time.time()
article.determine_entity_to_cluster_mapping()
print(f"Entity to cluster map time: {time.time() - start_time} seconds")

Processing clusters for entity: Keir Starmer

Cluster ID: 1, Original Text: ["Labour's", 'the Opposition party', 'Labour’s', 'Labour', 'Labour', 'Labour', 'his Party', 'Labour', 'Labour', 'Labour', 'Labour']
Matching 'Keir' in cluster 1: 0.00% match.
Matching 'Starmer' in cluster 1: 0.00% match.

Cluster ID: 2, Original Text: ['Tory', 'the Tories', 'the Conservatives', 'Tory', 'the Conservatives', 'Tory', 'Tory', 'Tory', 'the party’s']
Matching 'Keir' in cluster 2: 0.00% match.
Matching 'Starmer' in cluster 2: 0.00% match.

Cluster ID: 3, Original Text: ['Hoyle', 'Speaker Sir Lindsay Hoyle', 'the Speaker', 'Hoyle', 'Sir Lindsay', 'he', 'Sir Lindsay', 'he', 'his']
Matching 'Keir' in cluster 3: 0.00% match.
Matching 'Starmer' in cluster 3: 0.00% match.

Cluster ID: 4, Original Text: ["Sir Keir Starmer's", 'Keir Starmer', 'Sir Keir Starmer', 'his', 'his', 'the Labour leader', 'his', 'Sir Keir']
Matching 'Keir' in cluster 4: 80.00% match.
Matching 'Starmer' in cluster 4: 60.00% match.
Acce

In [69]:
start_time = time.time()
article.entity_cluster_map_consolidation()
print(f"Entity cluster map consolidation Time: {time.time() - start_time} seconds")

Removing Keir Starmer's as substring of Keir Starmer's
Removing Hoyle as substring of Lindsay Hoyle
Removing Anderson as substring of Lee Anderson MP
Removing Lindsay as substring of Lindsay Hoyle
number of entities before mention threshold: 4
number of entities after mention threshold: 4
Entity cluster map consolidation Time: 0.0009729862213134766 seconds


In [70]:
article.print_clustered_entities()

{'Entity Name': 'Keir Starmer', 'Positions': -200, 'Label': 'PERSON', 'Num Positions': -200, 'Cluster Info': {'Cluster ID': 4, 'Cluster Text': ["Sir Keir Starmer's", 'Keir Starmer', 'Sir Keir Starmer', 'the Labour leader', 'Sir Keir'], 'Cluster Positions': [(0, 18), (168, 180), (574, 590), (600, 603), (900, 903), (959, 976), (1042, 1045), (1385, 1393)]}}

{'Entity Name': 'Lindsay Hoyle', 'Positions': -200, 'Label': 'PERSON', 'Num Positions': -200, 'Cluster Info': {'Cluster ID': 3, 'Cluster Text': ['Hoyle', 'Speaker Sir Lindsay Hoyle', 'the Speaker', 'Hoyle', 'Sir Lindsay', 'Sir Lindsay'], 'Cluster Positions': [(224, 229), (1064, 1089), (1197, 1208), (1324, 1329), (1734, 1745), (1751, 1753), (1824, 1835), (1866, 1868), (1961, 1964)]}}

{'Entity Name': 'Lee Anderson Mp', 'Positions': -200, 'Label': 'PERSON', 'Num Positions': -200, 'Cluster Info': {'Cluster ID': 5, 'Cluster Text': ['Lee Anderson MP, who lost the Tory whip', 'The former Tory deputy chairman', 'Mr Anderson', 'the MP’s'], 'C

In [71]:
if article.database_candidate:
    # article.save_to_database() already done earlier in similar check now

    if article.database_id != -1:
        for entity_data in article.clustered_entities:
            entity_name = entity_data['Entity Name']
            print("Would now be given a database id for the entity: ", entity_name)
            # entity_db_id = DatabaseUtils.insert_entity(entity_name, article.database_id)
            entity_data['entity_db_id'] = 999

        sentiment_analyser_demo = SentimentAnalyser()
        article.set_sentiment_analyser(sentiment_analyser_demo)

        start_time = time.time()
        article.get_bounds_sentiment()
        print(f"Bounds sentiment time: {time.time() - start_time} seconds")

        start_time = time.time()
        article.get_average_sentiment_results()
        # print(f"Average results time: {time.time() - start_time} seconds")
        # < 0.5 seconds

        print('Article would now be set as processed, similar rejection = False')
        # article.set_db_processed(True, similar_rejection=False)

elif not article.database_candidate:
    # print("Not enough mentions to add")
    print("Article set db processed, similar rejection = false")
    # article.set_db_processed(True, similar_rejection=False)
else:
    print("Article already exists in the database")

Would now be given a database id for the entity:  Keir Starmer
Would now be given a database id for the entity:  Lindsay Hoyle
Would now be given a database id for the entity:  Lee Anderson Mp
Would now be given a database id for the entity:  Tory
[0mKeir Starmer - Mention (0, 18) is within bounds (0, 167)
[0m[94mSir Keir Starmer's[0m popularity drops while some Tory voters vow to vote Reform
Labour's popularity drops following a rocky week in Westminster for the Opposition party.[94m
[92mNewsSentiment Candidateappearance1[0m
[0mKeir Starmer - Mention (168, 180) is within bounds (168, 315)
[0m[94mKeir Starmer[0m denies threatening to withdraw support for Hoyle
Labour’s lead over the Tories has dropped by three points, according to a fresh poll.[94m
[92mNewsSentiment Candidateappearance1[0m
[0mKeir Starmer - Mention (574, 590) is within bounds (564, 659)
[0mMeanwhile [94mSir Keir Starmer[0m recorded his lowest approval rating since May last year at plus two.[94m
[92m