In [3]:
import nltk
import json
from nltk.tokenize import sent_tokenize
import pandas as pd
import spacy
from spacy.matcher import Matcher, PhraseMatcher

In [4]:
# Load the JSON file
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale (JSON text is expected to be UTF-8).
with open('datasets/EXTRACTED_FILINGS.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

In [5]:
# Build a DataFrame from the parsed JSON payload.
df = pd.DataFrame(data)

In [6]:
# Keep only the three text columns used below, then preview the first rows.
content = df.loc[:, ['item_1A', 'item_7', 'item_7A']]
content.head()

Unnamed: 0,item_1A,item_7,item_7A
0,The healthcare products distribution industry ...,Cautionary Note Regarding Forward-Looking Stat...,"We are exposed to market risks, which include ..."
1,The healthcare products distribution industry ...,Cautionary Note Regarding Forward-Looking Stat...,"We are exposed to market risks, which include ..."
2,The healthcare products distribution industry ...,Cautionary Note Regarding Forward-Looking Stat...,"We are exposed to market risks, which include ..."
3,Declining economic conditions could adversely ...,Cautionary Note Regarding Forward-Looking Stat...,"We are exposed to market risks, which include ..."
4,Declining economic conditions could adversely ...,Cautionary Note Regarding Forward-Looking Stat...,"We are exposed to market risks, which include ..."


In [7]:
# Small working subset: the first 50 rows of `content`.
test_content = content.head(50)

# Use spaCy to extract sentences from the text


Performs rule-based matching on the text to extract sentences.

In [8]:
# Spacy installation
# !pip install -U spacy
# !python -m spacy download en_core_web_sm

In [20]:
# import modules
import spacy
# import Matcher
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher

In [42]:
import spacy

# Quick check of the lemma spaCy assigns to a single word; the NER and parser
# components are switched off because only the lemma is inspected here.
load_model = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
words = 'financing'
" ".join(token.lemma_ for token in load_model(words))

'finance'

## newest version of extracting sentences

Next steps: find a way to speed up sentence extraction, and write a function that extracts sentences from a list of texts.

### run on the whole dataset

In [10]:
# old version, input is a dataframe with 3 columns (item_1A, item_7, item_7A)
def extract_sentences_with_metrics(text_data, nlp=None):
    """Tag the sentences of a text DataFrame with the financial metrics they mention.

    Every non-empty string cell of ``text_data`` is split into sentences with
    spaCy, and each sentence is checked against rule-based matchers for
    revenue, net income, EBIT, EPS and operating / investing / financing cash
    flow.

    Parameters
    ----------
    text_data : pandas.DataFrame
        Frame whose columns hold raw text. Columns are processed in order,
        rows top to bottom. Cells that are not non-empty strings (``None``,
        ``NaN``, ``""``) are skipped instead of crashing the pipeline.
    nlp : spacy.language.Language, optional
        Already-loaded pipeline to reuse. It must provide sentence boundaries
        and lemmas. When omitted, ``en_core_web_sm`` is loaded on every call,
        so pass a shared pipeline when calling this repeatedly.

    Returns
    -------
    pandas.DataFrame
        Columns ``Sentence`` and ``Metric``. A sentence yields at most one row
        per metric label, even if the metric is mentioned several times in it.
    """
    if nlp is None:
        # The matchers only need tokens, lemmas and sentence boundaries, so
        # the NER component is switched off to save time.
        nlp = spacy.load('en_core_web_sm', disable=['ner'])

    def build_token_matcher(rules):
        # One Matcher holding several named token patterns.
        built = Matcher(nlp.vocab)
        for key, pattern in rules.items():
            built.add(key, [pattern])
        return built

    def build_phrase_matcher(key, phrases):
        # attr="LOWER" compares lower-cased tokens, so 'ebit' also finds
        # 'EBIT' and 'net income' also finds 'Net income'. The default
        # (exact text) would only match the lower-case spelling.
        built = PhraseMatcher(nlp.vocab, attr="LOWER")
        built.add(key, [nlp.make_doc(phrase) for phrase in phrases])
        return built

    def cash_word():
        # First token of every two-token cash-flow pattern.
        return {'LOWER': {'IN': ['cash', 'flow']}}

    # (metric label, matcher) pairs, checked in this order for every sentence.
    metric_matchers = [
        ("Revenue", build_token_matcher({
            "revenue_match": [{'LOWER': {'IN': ['income', 'proceeds', 'takings', 'receipts', 'sales', 'turnover']}}],
        })),
        ("Net Income", build_phrase_matcher(
            "net_income_match", ('net income', 'profit', 'earnings', 'bottom line'))),
        ("EBIT", build_phrase_matcher(
            "ebit_match", ('ebit', 'earnings before interest and taxes', 'operating profit', 'operating income'))),
        ("EPS", build_phrase_matcher(
            "eps_match", ('eps', 'earnings per share'))),
    ]

    # Cash-flow rules share one matcher; the rule name decides the sub-category.
    # The set predicate is spelled 'IN', the form used in the spaCy docs.
    # NOTE(review): 'cfo' is ambiguous (it also abbreviates "chief financial
    # officer") - confirm that this rule is wanted.
    cash_flow_matcher = build_token_matcher({
        "operating_match": [cash_word(), {'LEMMA': {'IN': ['operate', 'operation']}}],
        "operating_match1": [{'LOWER': {'IN': ['cffo', 'cfo']}}],
        "investing_match": [cash_word(), {'LEMMA': {'IN': ['invest', 'investment']}}],
        "financing_match": [cash_word(), {'LEMMA': {'IN': ['finance', 'funding']}}],
    })
    cash_flow_labels = {
        "operating_match": 'Cash Flow (Operating)',
        "operating_match1": 'Cash Flow (Operating)',
        "investing_match": 'Cash Flow (Investing)',
        "financing_match": 'Cash Flow (Financing)',
    }

    def metrics_in(sentence_doc):
        # Distinct metric labels found in one sentence, in matcher order.
        found = [label for label, matcher in metric_matchers if matcher(sentence_doc)]
        for match_id, _start, _end in cash_flow_matcher(sentence_doc):
            label = cash_flow_labels.get(nlp.vocab.strings[match_id])
            if label is not None and label not in found:
                found.append(label)
        return found

    sentences_metrics = []

    for column in text_data.columns:
        texts = [text for text in text_data[column] if isinstance(text, str) and text.strip()]
        # nlp.pipe parses each text once; the sentences are then reused as
        # stand-alone docs instead of running the pipeline again per sentence.
        for parsed in nlp.pipe(texts):
            for sentence in parsed.sents:
                for metric_name in metrics_in(sentence.as_doc()):
                    sentences_metrics.append({'Sentence': sentence.text, 'Metric': metric_name})

    # Explicit columns keep the frame well-formed even when nothing matched.
    sentences_metrics_df = pd.DataFrame(sentences_metrics, columns=['Sentence', 'Metric'])
    return sentences_metrics_df


In [None]:
def extract_sentences_with_metrics(doc, nlp=None):
    """Tag sentences with the financial metrics they mention.

    Parameters
    ----------
    doc : str, iterable of str, or pandas.DataFrame
        Text to analyse. A single string is treated as one text, an iterable
        (list, Series, ...) as several texts, and a DataFrame is read column
        by column, top to bottom. Items that are not non-empty strings are
        skipped. ``None`` is treated as "no text".
    nlp : spacy.language.Language, optional
        Already-loaded pipeline to reuse. It must provide sentence boundaries
        and lemmas. When omitted, ``en_core_web_sm`` is loaded on every call,
        so pass a shared pipeline when calling this repeatedly.

    Returns
    -------
    pandas.DataFrame
        Columns ``Sentence`` and ``Metric``. A sentence yields at most one row
        per metric label, even if the metric is mentioned several times in it.
    """
    # Normalise the supported input shapes to a flat list of texts.
    if doc is None:
        texts = []
    elif isinstance(doc, str):
        texts = [doc]
    elif isinstance(doc, pd.DataFrame):
        texts = [cell for column in doc.columns for cell in doc[column]]
    else:
        texts = list(doc)
    texts = [text for text in texts if isinstance(text, str) and text.strip()]

    if nlp is None:
        # The matchers only need tokens, lemmas and sentence boundaries, so
        # the NER component is switched off to save time.
        nlp = spacy.load('en_core_web_sm', disable=['ner'])

    def token_rules(rules):
        # One Matcher holding several named token patterns.
        built = Matcher(nlp.vocab)
        for key, pattern in rules.items():
            built.add(key, [pattern])
        return built

    def phrase_rules(key, phrases):
        # attr="LOWER" makes the phrase lookup case-insensitive, so 'eps'
        # also finds 'EPS'. The default compares the exact token text.
        built = PhraseMatcher(nlp.vocab, attr="LOWER")
        built.add(key, [nlp.make_doc(phrase) for phrase in phrases])
        return built

    def cash_word():
        # First token of every two-token cash-flow pattern.
        return {'LOWER': {'IN': ['cash', 'flow']}}

    # Revenue Matcher
    revenue_matcher = token_rules({
        "revenue_match": [{'LOWER': {'IN': ['income', 'proceeds', 'takings', 'receipts', 'sales', 'turnover']}}],
    })

    # Net Income Matcher
    net_income_matcher = phrase_rules(
        "net_income_match", ('net income', 'profit', 'earnings', 'bottom line'))

    # EBIT Matcher
    ebit_matcher = phrase_rules(
        "ebit_match", ('ebit', 'earnings before interest and taxes', 'operating profit', 'operating income'))

    # EPS Matcher
    eps_matcher = phrase_rules("eps_match", ('eps', 'earnings per share'))

    # Cash Flow Matcher: the rule name selects the cash-flow sub-category.
    cash_flow_matcher = token_rules({
        "operating_match": [cash_word(), {'LEMMA': {'IN': ['operate', 'operation']}}],
        "operating_match1": [{'LOWER': {'IN': ['cffo', 'cfo']}}],
        "investing_match": [cash_word(), {'LEMMA': {'IN': ['invest', 'investment']}}],
        "financing_match": [cash_word(), {'LEMMA': {'IN': ['finance', 'funding']}}],
    })
    cash_flow_labels = {
        "operating_match": 'Cash Flow (Operating)',
        "operating_match1": 'Cash Flow (Operating)',
        "investing_match": 'Cash Flow (Investing)',
        "financing_match": 'Cash Flow (Financing)',
    }

    # Matchers that map to a single metric name, in reporting order.
    matcher_to_metric = [
        (revenue_matcher, "Revenue"),
        (net_income_matcher, "Net Income"),
        (ebit_matcher, "EBIT"),
        (eps_matcher, "EPS"),
    ]

    sentences_metrics = []

    # Each text is parsed once; its sentences are then matched as stand-alone
    # docs instead of being pushed through the pipeline a second time.
    for parsed in nlp.pipe(texts):
        for sentence in parsed.sents:
            sentence_doc = sentence.as_doc()

            labels = [metric_name for matcher, metric_name in matcher_to_metric if matcher(sentence_doc)]
            for match_id, _start, _end in cash_flow_matcher(sentence_doc):
                label = cash_flow_labels.get(nlp.vocab.strings[match_id])
                if label is not None and label not in labels:
                    labels.append(label)

            for label in labels:
                sentences_metrics.append({'Sentence': sentence.text, 'Metric': label})

    # Explicit columns keep the frame well-formed even when nothing matched.
    sentences_metrics_df = pd.DataFrame(sentences_metrics, columns=['Sentence', 'Metric'])
    return sentences_metrics_df


In [None]:
# Run the metric extraction on the `test_content` subset.
sentences_metrics_df = extract_sentences_with_metrics(test_content)

In [66]:
# Write the tagged sentences to CSV without the index column.
# NOTE(review): this path starts at the parent directory ('../datasets/...')
# while the JSON input is read from 'datasets/...' - confirm which working
# directory the notebook is meant to run from.
sentences_metrics_df.to_csv('../datasets/intermediary/sentences_metrics.csv', index=False)

### Extract forward-looking statements (FLS)

In [None]:
def extract_fls(doc, date, nlp=None):
    """Return the sentences of ``doc`` that match the forward-looking rules.

    A sentence is kept when it contains
      1. a cue such as "next year" / "following quarter" (pattern 1), or
      2. a verb whose lemma is one of "expect", "anticipate", "plan", ...
         (pattern 2), or
      3. none of the above, but a four-digit number that is greater than
         the reference year ``date`` (pattern 3).

    Parameters
    ----------
    doc : str
        Text to scan. Empty or non-string input yields an empty list.
    date : int, str, or object with a ``year`` attribute
        Reference year. Strings such as ``"2019"`` are converted with
        ``int``; for date-like objects the ``year`` attribute is used.
    nlp : spacy.language.Language, optional
        Already-loaded pipeline to reuse. It must provide sentence
        boundaries, lemmas and part-of-speech tags. When omitted,
        ``en_core_web_sm`` is loaded on every call, so pass a shared
        pipeline when calling this once per document.

    Returns
    -------
    list of str
        Matching sentences in document order.

    Raises
    ------
    TypeError, ValueError
        If ``date`` cannot be converted to an integer year.
    """
    if not isinstance(doc, str) or not doc.strip():
        return []

    reference_year = int(getattr(date, 'year', date))

    if nlp is None:
        # NER is not used by any of the rules below.
        nlp = spacy.load('en_core_web_sm', disable=['ner'])

    # Two separate matchers, so that pattern 3 can be checked separately.
    cue_matcher = Matcher(nlp.vocab)
    year_matcher = Matcher(nlp.vocab)
    pattern1 = [{"TEXT": {"IN": ["next", "subsequent", "following", "upcoming", "incoming", "coming", "succeeding", "carryforward"]}},
                {"TEXT": {"IN": ["month", "quarter", "year", "fiscal", "taxable", "period"]}}]
    pattern2 = [{"LEMMA": {"IN": ["aim", "anticipate", "assume", "commit", "estimate", "expect", "forecast", "foresee", "hope", "intend", "plan", "predict", "project", "seek", "target"]}, "POS": "VERB"}]
    # A four-character token containing four digits in the range 1000-2999.
    pattern3 = [{"TEXT": {"REGEX": "[1-2][0-9][0-9][0-9]"}, "LENGTH": 4}]
    cue_matcher.add('pattern1', [pattern1])
    cue_matcher.add('pattern2', [pattern2])
    year_matcher.add('pattern3', [pattern3])

    fls = []
    for sen in nlp(doc).sents:
        # Patterns 1 and 2 qualify a sentence on their own.
        if cue_matcher(sen):
            fls.append(sen.text)
            continue

        # Otherwise keep the sentence only if it mentions a year after the
        # reference year. The matcher runs once and its offsets, which are
        # relative to the sentence span, are reused.
        mentioned_years = [int(sen[start:end].text) for _match_id, start, end in year_matcher(sen)]
        if mentioned_years and max(mentioned_years) > reference_year:
            fls.append(sen.text)

    return fls

### Run data in batches

In [None]:
import concurrent.futures

# Number of rows handed to each worker call; tune to the available memory.
batch_size = 10


# Define a function for processing a single batch
def process_data(data_point, batch_size=None):
    """Run the metric extraction on one batch and return its result.

    Parameters
    ----------
    data_point : pandas.DataFrame
        Slice of the text columns to analyse.
    batch_size : optional
        Unused. Kept so calls that still pass a batch size keep working.

    Returns
    -------
    pandas.DataFrame
        Whatever ``extract_sentences_with_metrics`` returns for the batch.
    """
    return extract_sentences_with_metrics(data_point)


# Split the text columns into smaller DataFrame batches. The extraction
# function works on DataFrames, so the batches are cut from `content`
# rather than from the raw JSON payload.
data_batches = [content.iloc[i:i + batch_size] for i in range(0, len(content), batch_size)]

# Process batches in parallel
# NOTE(review): worker processes must be able to import/pickle `process_data`.
# With the "spawn" start method (Windows, macOS default) functions defined in
# a notebook cell may not be picklable - move them to a module if this fails.
with concurrent.futures.ProcessPoolExecutor() as executor:
    results = list(executor.map(process_data, data_batches))

# Combine the per-batch frames into one result.
if results:
    batched_sentences_metrics_df = pd.concat(results, ignore_index=True)
else:
    batched_sentences_metrics_df = pd.DataFrame(columns=['Sentence', 'Metric'])


In [12]:
# Test with test_content
# Define rules for matcher
import spacy
from spacy.matcher import Matcher
import pandas as pd

# Load the spaCy language model
nlp = spacy.load('en_core_web_sm')

# Rule name -> (metric label, phrases). Phrases may span several words.
metric_rules = {
    "revenue_match": ('revenue', ['income', 'proceeds', 'takings', 'receipts', 'sales', 'turnover']),
    "net_income_match": ('net income', ['profit', 'earnings', 'loss', 'bottom line', 'net income', 'net loss', 'net profit', 'net earnings']),
    "ebit_match": ('ebit', ['ebit', 'earnings before interest and taxes', 'operating profit', 'operating income']),
    "eps_match": ('eps', ['eps', 'earnings per share']),
}

# Define custom spaCy rules using Matcher
# A token pattern describes one token per dict, so a multi-word phrase such as
# 'net income' is expanded into a sequence of single-token LOWER checks. Put
# inside a single-token set, it could never match.
matcher = Matcher(nlp.vocab)
for rule_name, (_metric, phrases) in metric_rules.items():
    matcher.add(rule_name, [[{'LOWER': word} for word in phrase.split()] for phrase in phrases])

# Initialize an empty list to store matching sentences
matching_sentences = []

# Iterate through each column of the DataFrame
for column in test_content.columns:
    # Skip cells that are not non-empty strings; parse every text only once
    texts = [text for text in test_content[column] if isinstance(text, str) and text.strip()]
    for parsed in nlp.pipe(texts):
        # Iterate through each sentence
        for sentence in parsed.sents:
            matches = matcher(sentence.as_doc())

            # Prefer the longest match where matches overlap, so that
            # 'net income' or 'operating income' is not also counted as
            # revenue because of the word 'income'.
            kept = []
            for match_id, start, end in sorted(matches, key=lambda m: (m[1] - m[2], m[1])):
                if all(end <= kept_start or start >= kept_end for _, kept_start, kept_end in kept):
                    kept.append((match_id, start, end))

            # One row per distinct metric, in order of first appearance
            metrics = []
            for match_id, _start, _end in sorted(kept, key=lambda m: m[1]):
                metric = metric_rules[nlp.vocab.strings[match_id]][0]
                if metric not in metrics:
                    metrics.append(metric)

            for metric in metrics:
                matching_sentences.append({'Column': column, 'Sentence': sentence.text, 'metric': metric})

# Create a new DataFrame with matching sentences
result_df = pd.DataFrame(matching_sentences, columns=['Column', 'Sentence', 'metric'])

# Show the first rows of the resulting DataFrame
result_df.head()


KeyboardInterrupt: 