In [1]:
import nltk
import json
from nltk.tokenize import sent_tokenize
import pandas as pd
import spacy
from spacy.matcher import Matcher, PhraseMatcher

In [2]:
# Load the JSON file
with open('../datasets/EXTRACTED_FILINGS.json', 'r') as json_file:
    data = json.load(json_file)
df = pd.DataFrame(data)

In [3]:
df['year'] = pd.to_numeric(df['year'])
filtered_df = df[df['year']>=2015]
filtered_df.reset_index(drop=True, inplace=True)
filtered_df.head()

Unnamed: 0,cik,company,year,item_1A,item_7,item_7A
0,1000228,HENRY SCHEIN INC,2015,The risks described below could have a materia...,Cautionary Note Regarding Forward-Looking Stat...,We are exposed to market risks as well as chan...
1,1000228,HENRY SCHEIN INC,2016,The risks described below could have a materia...,Cautionary Note Regarding Forward-Looking Stat...,We are exposed to market risks as well as chan...
2,1000228,HENRY SCHEIN INC,2018,The risks described below could have a materia...,Cautionary Note Regarding Forward-Looking Stat...,We are exposed to market risks as well as chan...
3,1000228,HENRY SCHEIN INC,2018,The risks described below could have a materia...,Cautionary Note Regarding Forward-Looking Stat...,We are exposed to market risks as well as chan...
4,1000228,HENRY SCHEIN INC,2019,The risks described below could have a materia...,Cautionary Note Regarding Forward-Looking Stat...,We are exposed to market risks as well as chan...


In [4]:
filtered_df.to_json('../datasets/intermediary/EXTRACTED_FILINGS_2015.json', orient='records')

In [5]:
content = filtered_df[['item_1A']]
test_df = content.iloc[:10]
test_content = test_df['item_1A'][0]

In [6]:
test_content



# Use Spacy to extract sentences from the text 


Performs rule-based matching on the text to extract sentences.

In [8]:
# Spacy installation
# !pip install -U spacy
# !python -m spacy download en_core_web_sm

In [73]:
load_model = spacy.load('en_core_web_sm', disable=['ner', 'parser'])
words = 'profitability'
" ".join([token.lemma_ for token in load_model(words)])

'profitability'

## newest version of extracting sentences

### extract sentences with metrics

In [7]:
def extract_sentences_with_metrics(text_data):
    # input is a string of text data
    # output is a dataframe with two columns: Sentence and Metric

    # Load the spaCy language model
    nlp = spacy.load('en_core_web_sm')

    # Define custom spaCy matchers for financial metrics

    # Revenue Matcher
    revenue_matcher = Matcher(nlp.vocab)
    revenue_matcher.add("revenue_match", [[{'LOWER': {'IN': ['income', 'proceeds', 'takings', 'receipts', 'sales', 'turnover']}}]])

    # Net Income Matcher
    net_income_matcher = PhraseMatcher(nlp.vocab)
    net_income_patterns = [nlp(text) for text in ('net income', 'profit', 'profits','profitability','earnings', 'bottom line')]
    net_income_matcher.add("net_income_match", None, *net_income_patterns)

    # EBIT Matcher
    ebit_matcher = PhraseMatcher(nlp.vocab)
    ebit_patterns = [nlp(text) for text in ('ebit', 'earnings before interest and taxes', 'operating profit', 'operating profits', 'operating income')]
    ebit_matcher.add("ebit_match", None, *ebit_patterns)

    # EPS Matcher
    eps_matcher = PhraseMatcher(nlp.vocab)
    eps_patterns = [nlp(text) for text in ('eps', 'earnings per share')]
    eps_matcher.add("eps_match", None, *eps_patterns)

    # Cash Flow Matchers
    cash_flow_matcher = Matcher(nlp.vocab)
    operating_patterns = [
        {'LOWER': {'in': ['cash', 'flow']}},
        {'LEMMA': {'in': ['operate','operation']}},
    ]

    operating_patterns1 = [
        {'LOWER': {'in': ['cffo','cfo']}},
    ]

    investing_patterns = [
        {'LOWER': {'in': ['cash', 'flow']}},
        {'LEMMA': {'in': ['invest','investment']}},
    ]

    financing_patterns = [
        {'LOWER': {'in': ['cash', 'flow']}},
        {'LEMMA': {'in': ['finance', 'funding']}},
    ]


    cash_flow_matcher.add("operating_match", [operating_patterns])
    cash_flow_matcher.add("operating_match1", [operating_patterns1])
    cash_flow_matcher.add("investing_match", [investing_patterns])
    cash_flow_matcher.add("financing_match", [financing_patterns])

    
    # Define a mapping of matchers to metric names
    matcher_to_metric = {
        revenue_matcher: "Revenue",
        net_income_matcher: "Net Income",
        ebit_matcher: "EBIT",
        eps_matcher: "EPS",
        cash_flow_matcher: "Cash Flow"
    }

    # Initialize nlp object and list for storing matches
    nlp_doc = nlp(text_data)
    sentences_metrics = []
    
    # Split the text into sentences
    sentences = [sentence.text for sentence in nlp_doc.sents]

    # Iterate through each sentence
    for sentence in sentences:
        doc = nlp(sentence)
        matched_metrics = set() # Initialize set for storing matched metrics to avoid duplicates
        # Check for financial metrics matches
        for matcher, metric_name in matcher_to_metric.items():
            matches = matcher(doc)
            if matches:
                for match_id, start, end in matches:
                    if matcher == cash_flow_matcher:
                        for match_id, start, end in matches:
                            if nlp.vocab.strings[match_id] == "operating_match":
                                matched_metrics.add('Cash Flow (Operating)')
                            elif nlp.vocab.strings[match_id] == "operating_match1":
                                matched_metrics.add('Cash Flow (Operating)')
                            elif nlp.vocab.strings[match_id] == "investing_match":
                                matched_metrics.add('Cash Flow (Investing)')
                            elif nlp.vocab.strings[match_id] == "financing_match":
                                matched_metrics.add('Cash Flow (Financing)')
                    else: matched_metrics.add(metric_name)
        # Append the sentence with matched metrics to the result list
        for matched_metric in matched_metrics:
            sentences_metrics.append({'Sentence': sentence, 'Metric': matched_metric})

    # Create a DataFrame with combined results
    sentences_metrics_df = pd.DataFrame(sentences_metrics)
    
    return sentences_metrics_df 


In [66]:
sentences_metrics_df = extract_sentences_with_metrics(test_content)

### clean extra linebreaks

In [8]:
def clean_newlines(sentence_list):
    #takes list as input and gives list as output
    #cleans unnecesary linebreaks etc.
    
    
    for q in range(len(sentence_list)):
        sentence_list[q] = sentence_list[q].strip() 
        sentence_list[q] = sentence_list[q].replace('\n', ' ') 
        sentence_list[q] = sentence_list[q].replace('\r', '') 
        sentence_list[q] = sentence_list[q].replace(' ', ' ') 
        sentence_list[q] = sentence_list[q].replace(' ', ' ')
        sentence_list[q] = sentence_list[q].replace('\xa0',' ')
        sentence_list[q] = sentence_list[q].replace('&nbsp;',' ')
        sentence_list[q] = sentence_list[q].replace('&#160;',' ')
        while '  ' in sentence_list[q]:
            sentence_list[q] = sentence_list[q].replace('  ',' ')
        
    return sentence_list

### extract FLS

In [9]:
def categorize_fls(sentence,date):
    #performs rule-based-matching
    #takes a sentence and a year as input and returns individual sentences that contain a match
    
    # Load the spaCy language model
    nlp = spacy.load('en_core_web_sm')
    
    #initialize as nlp object and prepare list for storing matches
    sen = nlp(sentence)
       
    #two seperate matchers, so that pattern 3 can be checked separately 
    matcher = Matcher(nlp.vocab)
    matcher2 = Matcher(nlp.vocab)
    pattern1 =  [{"TEXT": {"IN": ["next", "subsequent", "following", "upcoming", "incoming", "coming", "succeeding", "carryforward"]}},
            {"TEXT": {"IN": ["month", "quarter", "year", "fiscal", "taxable", "period"]}}]
    pattern2 = [{"LEMMA": {"IN": ["aim", "anticipate", "assume", "commit", "estimate", "expect", "forecast", "foresee", "hope", "intend", "plan", "predict", "project", "seek","target"]},"POS": "VERB"}]
    pattern3 = [{"TEXT": {"REGEX": "[1-2][0-9][0-9][0-9]"}, "LENGTH": 4}]
    matcher.add('pattern1',[pattern1])
    matcher.add('pattern2',[pattern2])
    matcher2.add('pattern3',[pattern3])
    
    #if patterns were found in a sentence, append it to the list
    
    if matcher(sen) != []:
        return "FLS"
        
    #if no matches for patterns 1 or 2 are found, check matches for pattern 3 and check if they are higher than the year provided as input.
    elif matcher2(sen) != []:
        years=[]
        for match_id, start, end in matcher2(sen):
            years.append(int(sen[start:end].text))
        if max(years) > date:
            return "FLS"
    return "Non-FLS"

In [None]:
# apply the categorize_fls function to the dataframe
sentences_metrics_df['FLS'] = sentences_metrics_df.apply(lambda x: categorize_fls(x['Sentence'], 2005), axis=1)
sentences_metrics_df

### build pipeline for extracting FLS with metrics

In [24]:
test_df = filtered_df.iloc[:10]
test_df['item_1A'][1]



In [12]:
def extract_sentences_complete(data):
    # input is a dataframe converted from 10-K filings in JSON format
    # output is a dataframe including the original data 
    # and the extracted sentences, the corresponding metrics and FLS classification
    
    results = []
    items = ['item_1A', 'item_7', 'item_7A']
    for item in items:
        fls_with_metrics_item = pd.DataFrame(columns=['Sentence', 'Metric'])
        for index,row in data.iterrows():
            # extract sentences with metrics from the current row
            sentences_with_metrics = extract_sentences_with_metrics(row[item])
            if sentences_with_metrics.empty:
                continue
            sentences_with_metrics_clean = clean_newlines(sentences_with_metrics['Sentence'])
            sentences_with_metrics['Sentence'] = sentences_with_metrics_clean
            
            # create a dataframe with the current row's data
            sentences_with_metrics['Item'] = item 
            sentences_with_metrics['Year'] = row['year']
            sentences_with_metrics['CIK'] = row['cik']
            sentences_with_metrics['Company'] = row['company']
            
            # append the result for the current row to the item's result DataFrame
            fls_with_metrics_item = pd.concat([fls_with_metrics_item, sentences_with_metrics], ignore_index=True)

            # print progress
            print(f"Finished extracting sentences for {item} of {row['company']} ({row['year']}) - {index+1}/{len(data)}")
        # apply categorize_fls function to the dataframe
        fls_with_metrics_item['FLS'] = fls_with_metrics_item.apply(
            lambda x: categorize_fls(x['Sentence'], x['Year']), axis=1
            )
        # append the result for the current row to the item's result DataFrame
        results.append(fls_with_metrics_item)

        # print progress
        print(f"FINISHED EXTRACTING SENTENCES FOR {item}.")

    # concatenate all dataframes in the list
    fls_with_metrics = pd.concat(results, ignore_index=True)

    # drop possible duplicates
    fls_with_metrics.drop_duplicates(inplace=True)
    return fls_with_metrics

In [13]:
fls_with_metrics = extract_sentences_complete(filtered_df)

Finished extracting sentences for item_1A of HENRY SCHEIN INC (2015) - 1/3470
Finished extracting sentences for item_1A of HENRY SCHEIN INC (2016) - 2/3470
Finished extracting sentences for item_1A of HENRY SCHEIN INC (2018) - 3/3470
Finished extracting sentences for item_1A of HENRY SCHEIN INC (2018) - 4/3470
Finished extracting sentences for item_1A of HENRY SCHEIN INC (2019) - 5/3470
Finished extracting sentences for item_1A of HENRY SCHEIN INC (2020) - 6/3470
Finished extracting sentences for item_1A of HENRY SCHEIN INC (2021) - 7/3470
Finished extracting sentences for item_1A of WATERS CORP /DE/ (2015) - 8/3470
Finished extracting sentences for item_1A of WATERS CORP /DE/ (2016) - 9/3470
Finished extracting sentences for item_1A of WATERS CORP /DE/ (2017) - 10/3470
Finished extracting sentences for item_1A of WATERS CORP /DE/ (2018) - 11/3470
Finished extracting sentences for item_1A of WATERS CORP /DE/ (2019) - 12/3470
Finished extracting sentences for item_1A of WATERS CORP /DE/

In [79]:
test_df.shape

(10, 6)

In [64]:
items = ['item_1A', 'item_7', 'item_7A']
for item in items:
    fls_with_metrics_item = pd.DataFrame(columns=['Sentence', 'Metric'])
    for index,row in filtered_df.iterrows():
        # extract sentences with metrics from the current row
        sentences_with_metrics = extract_sentences_with_metrics(row[item])
        if sentences_with_metrics.empty:
                continue
        sentences_with_metrics_clean = clean_newlines(sentences_with_metrics['Sentence'])
        sentences_with_metrics['Sentence'] = sentences_with_metrics_clean
        
        # create a dataframe with the current row's data
        sentences_with_metrics['Item'] = item 
        sentences_with_metrics['Year'] = row['year']
        sentences_with_metrics['CIK'] = row['cik']
        sentences_with_metrics['Company'] = row['company']
        
        # append the result for the current row to the item's result DataFrame
        fls_with_metrics_item = pd.concat([fls_with_metrics_item, sentences_with_metrics], ignore_index=True)

        company = row['company']
        year = row['year']
        # print progress
        print(f"Finished extracting sentences for {company} {year} - {item}, data no {index}")


Finished extracting sentences for HENRY SCHEIN INC - item_1A
Finished extracting sentences for HENRY SCHEIN INC - item_1A
Finished extracting sentences for HENRY SCHEIN INC - item_1A
Finished extracting sentences for HENRY SCHEIN INC - item_1A
Finished extracting sentences for HENRY SCHEIN INC - item_1A
Finished extracting sentences for HENRY SCHEIN INC - item_1A
Finished extracting sentences for HENRY SCHEIN INC - item_1A
Finished extracting sentences for WATERS CORP /DE/ - item_1A
Finished extracting sentences for WATERS CORP /DE/ - item_1A
Finished extracting sentences for WATERS CORP /DE/ - item_1A
Finished extracting sentences for WATERS CORP /DE/ - item_1A
Finished extracting sentences for WATERS CORP /DE/ - item_1A
Finished extracting sentences for WATERS CORP /DE/ - item_1A
Finished extracting sentences for WATERS CORP /DE/ - item_1A
Finished extracting sentences for ESTEE LAUDER COMPANIES INC - item_1A
Finished extracting sentences for ESTEE LAUDER COMPANIES INC - item_1A
Fini

In [65]:
fls_with_metrics_item.shape

(11994, 6)

In [66]:
# apply categorize_fls function to the dataframe
fls_with_metrics_item['FLS'] = fls_with_metrics_item.apply(
    lambda x: categorize_fls(x['Sentence'], x['Year']), axis=1
    )

# drop possible duplicates
fls_with_metrics_item.drop_duplicates(inplace=True)

In [71]:
fls_with_metrics_item['Item'].unique()

array(['item_7A'], dtype=object)

In [58]:
fls_with_metrics = extract_sentences_complete(filtered_df)
fls_with_metrics

KeyboardInterrupt: 

In [44]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

def process_row(item, row):
    fls_with_metrics_item = pd.DataFrame(columns=['Sentence', 'Metric'])

    sentences_with_metrics = extract_sentences_with_metrics(row[item])
    sentences_with_metrics_clean = clean_newlines(sentences_with_metrics['Sentence'])
    sentences_with_metrics['Sentence'] = sentences_with_metrics_clean

    # Create a dataframe with the current row's data
    sentences_with_metrics['Item'] = item
    sentences_with_metrics['Year'] = row['year']
    sentences_with_metrics['CIK'] = row['cik']
    sentences_with_metrics['Company'] = row['company']

    # Append the result for the current row to the item's result DataFrame
    fls_with_metrics_item = pd.concat([fls_with_metrics_item, sentences_with_metrics], ignore_index=True)

    # Apply categorize_fls function to the dataframe
    fls_with_metrics_item['FLS'] = fls_with_metrics_item.apply(
        lambda x: categorize_fls(x.loc['Sentence'], x.loc['Year']), axis=1
    )

    return fls_with_metrics_item

def extract_sentences_complete(data):
    results = []
    items = ['item_1A', 'item_7', 'item_7A']

    with ThreadPoolExecutor(max_workers=20) as executor:
        futures = []

        for item in items:
            for _, row in data.iterrows():
                futures.append(executor.submit(process_row, item, row))

        for future in futures:
            result = future.result()
            results.append(result)

    # Concatenate all dataframes in the list
    fls_with_metrics = pd.concat(results, ignore_index=True)

    # Drop possible duplicates
    fls_with_metrics.drop_duplicates(inplace=True)

    return fls_with_metrics


In [45]:
fls_with_metrics = extract_sentences_complete(test_df)

In [46]:
fls_with_metrics

Unnamed: 0,Sentence,Metric,Item,Year,CIK,Company,FLS
0,"In the future, we may be unable to compete suc...",Net Income,item_1A,2015.0,1000228,HENRY SCHEIN INC,Non-FLS
1,The failure of manufacturers of products regul...,Revenue,item_1A,2015.0,1000228,HENRY SCHEIN INC,Non-FLS
2,An extended interruption in the supply of our ...,Revenue,item_1A,2015.0,1000228,HENRY SCHEIN INC,Non-FLS
3,Our revenues and profitability depend on our r...,Revenue,item_1A,2015.0,1000228,HENRY SCHEIN INC,Non-FLS
4,Our revenues and profitability depend on our r...,Net Income,item_1A,2015.0,1000228,HENRY SCHEIN INC,Non-FLS
...,...,...,...,...,...,...,...
1515,Assuming a hypothetical adverse change of 10% ...,Revenue,item_7A,2016.0,1000697,WATERS CORP /DE/,FLS
1516,The Company is a global company that operates ...,Revenue,item_7A,2017.0,1000697,WATERS CORP /DE/,Non-FLS
1517,The Company’s foreign currency exchange contra...,Revenue,item_7A,2017.0,1000697,WATERS CORP /DE/,FLS
1518,The Company’s foreign currency exchange contra...,Net Income,item_7A,2017.0,1000697,WATERS CORP /DE/,FLS


### Run data in batches

In [None]:
import concurrent.futures

# Define a function for processing a single data point
def process_data(data_point,batch_size):
    extract_sentences_complete(data_point)
    pass


# Split your dataset into smaller batches
data_batches = [data[i:i+batch_size] for i in range(0, len(data), batch_size)]

# Process data points in parallel
with concurrent.futures.ProcessPoolExecutor() as executor:
    results = list(executor.map(process_data, data_batches))


In [12]:
# Test with test_content 
# Define rules for matcher
import spacy
from spacy.matcher import Matcher
import pandas as pd

# Load the spaCy language model
nlp = spacy.load('en_core_web_sm')

# Define custom spaCy rules using Matcher
matcher = Matcher(nlp.vocab)
matcher.add("revenue_match", [[{'LOWER': {'in': ['income', 'proceeds', 'takings', 'receipts', 'sales', 'turnover']}}]])
matcher.add("net_income_match", [[{'LOWER': {'in': ['profit', 'earnings', 'loss', 'bottom line', 'net income', 'net loss', 'net profit', 'net earnings']}}]])
matcher.add("ebit_match", [[{'LOWER': {'in': ['ebit', 'earnings before interest and taxes', 'operating profit', 'operating income']}}]])
matcher.add("eps_match", [[{'LOWER': {'in': ['eps', 'earnings per share']}}]])
# Initialize an empty list to store matching sentences
matching_sentences = []

# Iterate through each column of the DataFrame
for column in test_content.columns:
    # Iterate through each row in the column
    for idx, text in enumerate(test_content[column]):
        # Split the text into sentences
        sentences = [sentence.text for sentence in nlp(text).sents]
        
        # Iterate through each sentence
        for sentence in sentences:
            doc = nlp(sentence)
            matches = matcher(doc)
            
            # Check if there are any matches
            if matches:
                # Check if 'income' is in the sentence
                if 'net income' in sentence.lower():
                    # Consider 'net income' as a pattern
                    matching_sentences.append({'Column': column, 'Sentence': sentence, 'metric': 'net income'})
                else:
                    # Consider revenue as a pattern
                    matching_sentences.append({'Column': column, 'Sentence': sentence, 'metric': 'revenue'})
                

# Create a new DataFrame with matching sentences
result_df = pd.DataFrame(matching_sentences)

# Print the resulting DataFrame
print(result_df)


KeyboardInterrupt: 