# Información y Programación Financiera, TI, Algoritmos y Ciencia de Datos

Neftalí Valdez

<a href="http://twitter.com/neftalivldz" target="_blank">@neftalivldz</a> | <a href="mailto:nvaldez@tec.mx">nvaldez@tec.mx</a>

Referencias

<a href="https://developers.refinitiv.com/en/article-catalog/article/using-ai-modeling-to-interpret-10-Q-filings
" target="_blank">Artículo Original de Nick Zincone</a> 

<a href="https://pypi.org/project/sec-api/" target="_blank"> SEC API</a> 

<a href="https://huggingface.co/yiyanghkust/finbert-fls" target="_blank">Hugging Face.- Forward Looking Statements</a>


In [4]:
#!pip3 install transformers
#!pip3 install torch
#!conda install -c pytorch torchtext
#!conda install pytorch torchvision -c pytorch
#!pip3 install sec-api

In [56]:
import eikon as ek  # the Eikon Python wrapper package
import numpy as np  # NumPy
import pandas as pd  # pandas
import cufflinks as cf  # Cufflinks
import configparser as cp
import datetime as dt
cf.set_config_file(offline=True)  # set the plotting mode to offline

In [57]:
# NLP package used to aid in text manipulation
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt')

# Machine Learning modules used to prepare and measure text
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import torch

# HTML text processing
from bs4 import BeautifulSoup

# Helper modules
import matplotlib.pyplot as plt
from tqdm.notebook import trange # Progress bar
import pandas as pd

from sec_api import QueryApi

pd.set_option('display.max_colwidth', 60)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/neftalivaldez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [58]:
cfg = cp.ConfigParser()
cfg.read('../refinitiv.cfg')
ek.set_app_key(cfg['eikon']['app_id'])

In [59]:
cfg = cp.ConfigParser()
cfg.read('../secapi.cfg')
sec = cfg['sec']['app_id']


In [7]:
today = dt.date.today()
start = today - dt.timedelta(days=365.2*5)
print(today, start)

2024-04-30 2019-05-01


In [8]:
fields = ['TR.PortfolioConstituentName','TR.PortfolioWeight']
data, err = ek.get_data(['Portfolio(RETO2023_B)'],fields)
ric = data['Instrument'][3]

In [60]:
lista = ['TSLA.O', 'NVDA.O']
ric = lista[1]
ric

'NVDA.O'

In [43]:
data['Instrument']

0               BIMBOA.MX
1                   ATT.L
2             CEMEXCPO.MX
3                  TSLA.O
4    NVDAF162360000.U^F23
Name: Instrument, dtype: string

In [61]:
ric = 'NVDA.O'

In [46]:
#help(ek.get_symbology)

In [62]:
tick = ek.get_symbology(ric, from_symbol_type="RIC", to_symbol_type="ticker")['ticker'][0]
tick

'NVDA'

In [63]:
# Query the Filings service using the Refinitiv Data Library for Python.
#
# Retrieve SEC filings for the specific company - specify the text we want to retrieve
# is defined within the "Management Discussion" section.

queryApi = QueryApi(api_key=sec)

query = {
  "query": { "query_string": {
      "query": "ticker:NVDA AND filedAt:{2022-01-01 TO 2024-04-30} AND formType:\"10-Q\""
    } },
  "from": "0",
  "size": "10",
  "sort": [{ "filedAt": { "order": "desc" } }]
}

filings = queryApi.get_filings(query)

print(filings)

{'total': {'value': 6, 'relation': 'eq'}, 'query': {'from': 0, 'size': 10}, 'filings': [{'id': '28ce272242a825619862ff8229ded641', 'accessionNo': '0001045810-23-000227', 'cik': '1045810', 'ticker': 'NVDA', 'companyName': 'NVIDIA CORP', 'companyNameLong': 'NVIDIA CORP (Filer)', 'formType': '10-Q', 'description': 'Form 10-Q - Quarterly report [Sections 13 or 15(d)]', 'filedAt': '2023-11-21T16:36:29-05:00', 'linkToTxt': 'https://www.sec.gov/Archives/edgar/data/1045810/000104581023000227/0001045810-23-000227.txt', 'linkToHtml': 'https://www.sec.gov/Archives/edgar/data/1045810/000104581023000227/0001045810-23-000227-index.htm', 'linkToXbrl': '', 'linkToFilingDetails': 'https://www.sec.gov/Archives/edgar/data/1045810/000104581023000227/nvda-20231029.htm', 'entities': [{'companyName': 'NVIDIA CORP (Filer)', 'cik': '1045810', 'irsNo': '943177549', 'stateOfIncorporation': 'DE', 'fiscalYearEnd': '0128', 'type': '10-Q', 'act': '34', 'fileNo': '000-23985', 'filmNo': '231428716', 'sic': '3674 Semic

In [64]:
filings['query']

{'from': 0, 'size': 10}

In [65]:
filings['filings']

[{'id': '28ce272242a825619862ff8229ded641',
  'accessionNo': '0001045810-23-000227',
  'cik': '1045810',
  'ticker': 'NVDA',
  'companyName': 'NVIDIA CORP',
  'companyNameLong': 'NVIDIA CORP (Filer)',
  'formType': '10-Q',
  'description': 'Form 10-Q - Quarterly report [Sections 13 or 15(d)]',
  'filedAt': '2023-11-21T16:36:29-05:00',
  'linkToTxt': 'https://www.sec.gov/Archives/edgar/data/1045810/000104581023000227/0001045810-23-000227.txt',
  'linkToHtml': 'https://www.sec.gov/Archives/edgar/data/1045810/000104581023000227/0001045810-23-000227-index.htm',
  'linkToXbrl': '',
  'linkToFilingDetails': 'https://www.sec.gov/Archives/edgar/data/1045810/000104581023000227/nvda-20231029.htm',
  'entities': [{'companyName': 'NVIDIA CORP (Filer)',
    'cik': '1045810',
    'irsNo': '943177549',
    'stateOfIncorporation': 'DE',
    'fiscalYearEnd': '0128',
    'type': '10-Q',
    'act': '34',
    'fileNo': '000-23985',
    'filmNo': '231428716',
    'sic': '3674 Semiconductors &amp; Related D

In [66]:
df = pd.json_normalize(filings['filings'])
df

Unnamed: 0,id,accessionNo,cik,ticker,companyName,companyNameLong,formType,description,filedAt,linkToTxt,linkToHtml,linkToXbrl,linkToFilingDetails,entities,documentFormatFiles,dataFiles,seriesAndClassesContractsInformation,periodOfReport
0,28ce272242a825619862ff8229ded641,0001045810-23-000227,1045810,NVDA,NVIDIA CORP,NVIDIA CORP (Filer),10-Q,Form 10-Q - Quarterly report [Sections 13 or 15(d)],2023-11-21T16:36:29-05:00,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,"[{'companyName': 'NVIDIA CORP (Filer)', 'cik': '1045810'...","[{'sequence': '1', 'description': '10-Q', 'documentUrl':...","[{'sequence': '6', 'description': 'XBRL TAXONOMY EXTENSI...",[],2023-10-29
1,1b43abf35c0422330e884ed73dd3fcaa,0001045810-23-000175,1045810,NVDA,NVIDIA CORP,NVIDIA CORP (Filer),10-Q,Form 10-Q - Quarterly report [Sections 13 or 15(d)],2023-08-25T19:36:34-04:00,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,"[{'companyName': 'NVIDIA CORP (Filer)', 'cik': '1045810'...","[{'sequence': '1', 'description': '10-Q', 'documentUrl':...","[{'sequence': '6', 'description': 'XBRL TAXONOMY EXTENSI...",[],2023-07-30
2,cb64b196a3443a46e8c12ba48bf12fd0,0001045810-23-000093,1045810,NVDA,NVIDIA CORP,NVIDIA CORP (Filer),10-Q,Form 10-Q - Quarterly report [Sections 13 or 15(d)],2023-05-26T16:17:32-04:00,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,"[{'companyName': 'NVIDIA CORP (Filer)', 'cik': '1045810'...","[{'sequence': '1', 'description': '10-Q', 'documentUrl':...","[{'sequence': '6', 'description': 'XBRL TAXONOMY EXTENSI...",[],2023-04-30
3,834fb6492757158b4ca1f857af1ace9e,0001045810-22-000166,1045810,NVDA,NVIDIA CORP,NVIDIA CORP (Filer),10-Q,Form 10-Q - Quarterly report [Sections 13 or 15(d)],2022-11-18T16:35:31-05:00,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,"[{'companyName': 'NVIDIA CORP (Filer)', 'cik': '1045810'...","[{'sequence': '1', 'description': '10-Q', 'documentUrl':...","[{'sequence': '6', 'description': 'XBRL TAXONOMY EXTENSI...",[],2022-10-30
4,afe312b0e853303ba74ad0b226c5b817,0001045810-22-000147,1045810,NVDA,NVIDIA CORP,NVIDIA CORP (Filer),10-Q,Form 10-Q - Quarterly report [Sections 13 or 15(d)],2022-08-31T17:01:35-04:00,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,"[{'companyName': 'NVIDIA CORP (Filer)', 'cik': '1045810'...","[{'sequence': '1', 'description': '10-Q', 'documentUrl':...","[{'sequence': '6', 'description': 'XBRL TAXONOMY EXTENSI...",[],2022-07-31
5,c3252ee80279fcc59376bebfac42c0a3,0001045810-22-000079,1045810,NVDA,NVIDIA CORP,NVIDIA CORP (Filer),10-Q,Form 10-Q - Quarterly report [Sections 13 or 15(d)],2022-05-27T16:21:55-04:00,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,"[{'companyName': 'NVIDIA CORP (Filer)', 'cik': '1045810'...","[{'sequence': '1', 'description': '10-Q', 'documentUrl':...","[{'sequence': '6', 'description': 'XBRL TAXONOMY EXTENSI...",[],2022-05-01


In [67]:
from sec_api import ExtractorApi

extractorApi = ExtractorApi(sec)

#
# 10-Q example
#
# Tesla 10-Q filing
filing_url_10q = "https://www.sec.gov/Archives/edgar/data/1045810/000104581023000227/0001045810-23-000227.txt"

# get the original HTML of section 7 "Management’s Discussion and Analysis of Financial Condition and Results of Operations"
section_html = extractorApi.get_section(filing_url_10q, "part1item2", "html")
beautifulSoupText = BeautifulSoup(section_html, "html.parser").text.replace(u'\xa0', ' ').replace('\r', ' ')
beautifulSoupText


'ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONSForward-Looking Statements This Quarterly Report on Form 10-Q contains forward-looking statements which are based on our management’s beliefs and assumptions and on information currently available to our management. In some cases, you can identify forward-looking statements by terms such as “may,” “will,” “should,” “could,” “goal,” “would,” “expect,” “plan,” “anticipate,” “believe,” “estimate,” “project,” “predict,” “potential” and similar expressions intended to identify forward-looking statements. These statements involve known and unknown risks, uncertainties and other factors, which may cause our actual results, performance, time frames or achievements to be materially different from any future results, performance, time frames or achievements expressed or implied by the forward-looking statements. We discuss many of these risks, uncertainties and other factors in this Quarterly Report on 

In [68]:
# Parse Section Pending
df

Unnamed: 0,id,accessionNo,cik,ticker,companyName,companyNameLong,formType,description,filedAt,linkToTxt,linkToHtml,linkToXbrl,linkToFilingDetails,entities,documentFormatFiles,dataFiles,seriesAndClassesContractsInformation,periodOfReport
0,28ce272242a825619862ff8229ded641,0001045810-23-000227,1045810,NVDA,NVIDIA CORP,NVIDIA CORP (Filer),10-Q,Form 10-Q - Quarterly report [Sections 13 or 15(d)],2023-11-21T16:36:29-05:00,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,"[{'companyName': 'NVIDIA CORP (Filer)', 'cik': '1045810'...","[{'sequence': '1', 'description': '10-Q', 'documentUrl':...","[{'sequence': '6', 'description': 'XBRL TAXONOMY EXTENSI...",[],2023-10-29
1,1b43abf35c0422330e884ed73dd3fcaa,0001045810-23-000175,1045810,NVDA,NVIDIA CORP,NVIDIA CORP (Filer),10-Q,Form 10-Q - Quarterly report [Sections 13 or 15(d)],2023-08-25T19:36:34-04:00,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,"[{'companyName': 'NVIDIA CORP (Filer)', 'cik': '1045810'...","[{'sequence': '1', 'description': '10-Q', 'documentUrl':...","[{'sequence': '6', 'description': 'XBRL TAXONOMY EXTENSI...",[],2023-07-30
2,cb64b196a3443a46e8c12ba48bf12fd0,0001045810-23-000093,1045810,NVDA,NVIDIA CORP,NVIDIA CORP (Filer),10-Q,Form 10-Q - Quarterly report [Sections 13 or 15(d)],2023-05-26T16:17:32-04:00,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,"[{'companyName': 'NVIDIA CORP (Filer)', 'cik': '1045810'...","[{'sequence': '1', 'description': '10-Q', 'documentUrl':...","[{'sequence': '6', 'description': 'XBRL TAXONOMY EXTENSI...",[],2023-04-30
3,834fb6492757158b4ca1f857af1ace9e,0001045810-22-000166,1045810,NVDA,NVIDIA CORP,NVIDIA CORP (Filer),10-Q,Form 10-Q - Quarterly report [Sections 13 or 15(d)],2022-11-18T16:35:31-05:00,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,"[{'companyName': 'NVIDIA CORP (Filer)', 'cik': '1045810'...","[{'sequence': '1', 'description': '10-Q', 'documentUrl':...","[{'sequence': '6', 'description': 'XBRL TAXONOMY EXTENSI...",[],2022-10-30
4,afe312b0e853303ba74ad0b226c5b817,0001045810-22-000147,1045810,NVDA,NVIDIA CORP,NVIDIA CORP (Filer),10-Q,Form 10-Q - Quarterly report [Sections 13 or 15(d)],2022-08-31T17:01:35-04:00,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,"[{'companyName': 'NVIDIA CORP (Filer)', 'cik': '1045810'...","[{'sequence': '1', 'description': '10-Q', 'documentUrl':...","[{'sequence': '6', 'description': 'XBRL TAXONOMY EXTENSI...",[],2022-07-31
5,c3252ee80279fcc59376bebfac42c0a3,0001045810-22-000079,1045810,NVDA,NVIDIA CORP,NVIDIA CORP (Filer),10-Q,Form 10-Q - Quarterly report [Sections 13 or 15(d)],2022-05-27T16:21:55-04:00,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,,https://www.sec.gov/Archives/edgar/data/1045810/00010458...,"[{'companyName': 'NVIDIA CORP (Filer)', 'cik': '1045810'...","[{'sequence': '1', 'description': '10-Q', 'documentUrl':...","[{'sequence': '6', 'description': 'XBRL TAXONOMY EXTENSI...",[],2022-05-01


In [69]:
# Declare our final results table
results = pd.DataFrame()
text = []
dates = []
section = []

# Pull out the filings text for each report
for i in range(0,len(df)):
    extractorApi = ExtractorApi(sec)
    filing_url_10q = df.linkToHtml.values.tolist()[i]

    # get the original HTML of section 7 "Management’s Discussion and Analysis of Financial Condition and Results of Operations"
    section_html = extractorApi.get_section(filing_url_10q, "part1item2", "html")
    beautifulSoupText = BeautifulSoup(section_html, "html.parser").text.replace(u'\xa0', ' ').replace('\r', ' ')
    # Clean the data and capture it for later processing
    text.append(beautifulSoupText)
    dates.append(df.filedAt.values.tolist()[i])
    section.append('ManagementDiscussion')
    

In [72]:
section

['ManagementDiscussion',
 'ManagementDiscussion',
 'ManagementDiscussion',
 'ManagementDiscussion',
 'ManagementDiscussion',
 'ManagementDiscussion']

In [73]:
results['text'] = text
results['FilingDate'] = dates
results['section'] = section

In [74]:
results

Unnamed: 0,text,FilingDate,section
0,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2023-11-21T16:36:29-05:00,ManagementDiscussion
1,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2023-08-25T19:36:34-04:00,ManagementDiscussion
2,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2023-05-26T16:17:32-04:00,ManagementDiscussion
3,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-11-18T16:35:31-05:00,ManagementDiscussion
4,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-08-31T17:01:35-04:00,ManagementDiscussion
5,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-05-27T16:21:55-04:00,ManagementDiscussion


In [23]:
# Load models

In [75]:
# Load the models
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-fls',num_labels=3)

In [76]:
# Download the Pre-trained transformer used to process our raw text
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-fls')
nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)

In [77]:
prediction = nlp("At the same time, our competitors are rapidly developing and deploying cloud-based services for consumers and business customer", top_k=3)
prediction

[{'label': 'Not FLS', 'score': 0.9801842570304871},
 {'label': 'Specific FLS', 'score': 0.010232682339847088},
 {'label': 'Non-specific FLS', 'score': 0.009583091363310814}]

In [78]:
# Sentiment - Download the Pre-trained transformer used to process our raw text
sent_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

In [79]:
# Sentiment - Download the FinBert model used to process our transformed data
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")


In [29]:
# Capture closing prices

In [80]:
ric

'NVDA.O'

In [82]:
# Container to hold the Closing Prices based on the filing date
prices = []

# Walk through the collection of filings and pull out the reported filing date
num_rows = len(results)

for i in range(num_rows):
    date = results.iloc[i]['FilingDate']
    end = dt.datetime.strptime(date, "%Y-%m-%dT%H:%M:%S%z")
    start = dt.datetime.strptime(date, "%Y-%m-%dT%H:%M:%S%z") - dt.timedelta(minutes=1)
    try:
        response = ek.get_timeseries([ric], fields = ['CLOSE'], interval='minute', start_date=start, end_date=end)
        print(response)
        prices.append(response.iloc[0]['CLOSE'])
    except:
        prices.append(None)

if prices:
    results['close'] = prices

NVDA.O                CLOSE
Date                       
2023-11-21 21:36:00  492.64
NVDA.O                CLOSE
Date                       
2023-08-25 23:36:00  459.09
NVDA.O                CLOSE
Date                       
2023-05-26 20:17:00  391.42


2024-04-30 16:42:09,978 P[47088] [MainThread 8663292736] Error with NVDA.O: No data available for the requested date range
2024-04-30 16:42:09,979 P[47088] [MainThread 8663292736] NVDA.O: No data available for the requested date range | 
2024-04-30 16:42:10,337 P[47088] [MainThread 8663292736] Error with NVDA.O: No data available for the requested date range
2024-04-30 16:42:10,338 P[47088] [MainThread 8663292736] NVDA.O: No data available for the requested date range | 
2024-04-30 16:42:10,701 P[47088] [MainThread 8663292736] Error with NVDA.O: No data available for the requested date range
2024-04-30 16:42:10,702 P[47088] [MainThread 8663292736] NVDA.O: No data available for the requested date range | 


In [83]:
results

Unnamed: 0,text,FilingDate,section,close
0,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2023-11-21T16:36:29-05:00,ManagementDiscussion,492.64
1,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2023-08-25T19:36:34-04:00,ManagementDiscussion,459.09
2,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2023-05-26T16:17:32-04:00,ManagementDiscussion,391.42
3,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-11-18T16:35:31-05:00,ManagementDiscussion,
4,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-08-31T17:01:35-04:00,ManagementDiscussion,
5,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-05-27T16:21:55-04:00,ManagementDiscussion,


In [84]:
def n_4(x, y):
    z = x + y + 4
    return print(z)

In [85]:
n_4(1,2)

7


In [86]:
def evaluate(filings):
    # Container to hold the percentages of FLS sentences within each filing
    fls_pct = []

    # Container to hold the sentiment scores
    scores = []
    
    # Walk through the collection of filings and feed into the FinBert models
    num_rows = len(filings)
    for i in range(num_rows):
        # Pull out the "Management Section" text from our filings
        management_section = filings.iloc[i]['text']
        
        # For this section, break it into individual sentences
        sentences = sent_tokenize(management_section)
        
        # Initialize our FLS container
        fls = []
        
        # Define the container to collect stats related to the sentiment scores
        # for all forward-looking statement
        sentiments = torch.Tensor([0,0,0])
        
        # Process each sentence, converting into tokens required by the FinBert model.
        for sentence in sentences:
            # FLS prediction
            prediction = nlp(sentence[:512], top_k=3)[0]['label']

            # Capture FLS statements
            if prediction.startswith("Specific") or prediction.startswith("Non"):
                fls.append(sentence)
                # Tokenize - The FinBert model requires tensor-based tokens as input. For any given
                # sentence, I must ensure the length must does not exceed the models self-imposed limit.
                encoded_input = sent_tokenizer(sentence, return_tensors="pt", truncation=True)
                
                with torch.no_grad():
                    # Run the sentence through the model...
                    output = model(**encoded_input)

                    # The prediction will be in the form of a probability
                    fls_sentiment = torch.nn.functional.softmax(output.logits, dim=-1)
                    
                    # Tally the predictions for each sentence
                    sentiments = sentiments+fls_sentiment

        # Record the percentage of FLS sentences
        fls_pct.append(len(fls)/len(sentences)*100)
        
        # Record the resulting sentiment for 'FLS' sentences within this section
        sentiments = sentiments.divide(len(sentences))
        
        score = model.config.id2label[sentiments.argmax().item()]
        print(f'Filing: {i+1} contains {len(sentences)} sentences of which {len(fls)} are "FLS" with a sentiment of: {sentiments} => {score}')
        scores.append(score)
    
    # Add the measures to our results table
    filings['fls_pct'] = fls_pct
    filings[f'fls_sentiment'] = scores

In [87]:
# Plot the data
def plot(x_axis, title, y1_label, x2_axis, **kwargs):
    plt.style.use('dark_background')
    plt.rcParams['figure.figsize'] = (17,8)
    fig, ax = plt.subplots()
    for label, data in kwargs.items():
        # Special label '_' to plot vertical bar
        if label == '_':
            x = 0
            for s in data:
                plt.axvline(x, ymax=0.25, color=s['color'], label=s['label'], linestyle="--")
                x += 1
        else:
            ax.plot(x_axis, data, label=label)
    ax2 = ax.twinx()
    ax2.plot(x_axis, x2_axis, label="Closing Price", color='red')
    ax.tick_params(labelrotation=90)
    # Remove the spines from the graph - leave the bottom
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)
    # Add a faint grid
    ax.yaxis.grid(True, alpha=0.2)
    ax.xaxis.grid(True, alpha=0.2)
    # Add labels and a title. Note the use of `labelpad` and `pad` to add some
    # extra space between the text and the tick labels.
    ax.set_ylabel(y1_label, labelpad=12, fontsize=14, color='cyan')
    ax.set_title(title, pad=15, fontsize=16, color='cyan')
    if len(kwargs) > 1:
        ax.legend(loc='upper left')
    ax2.legend(loc='upper right')
    fig.tight_layout()

In [88]:
def sentiment_bars(scores):
    bars = []
    frequency = {'negative':0, 'neutral':0, 'positive':0}
    
    for s in scores:
        bar = {'color': '', 'label': ''}
        if s == 'negative':
            color = 'red'
        elif s == 'neutral':
            color = 'yellow'
        else:
            color = 'green'
            
        bar['color'] = color
        bar['label'] = s if frequency[s] == 0 else '_'
        frequency[s] += 1
        bars.append(bar)
        
    return bars

In [89]:
# Evaluate our predictions and compare against the sentement scores based on the 
# FLS sentences.
evaluate(results)

Filing: 1 contains 125 sentences of which 27 are "FLS" with a sentiment of: tensor([[0.0377, 0.1199, 0.0585]]) => negative
Filing: 2 contains 112 sentences of which 17 are "FLS" with a sentiment of: tensor([[0.0418, 0.0727, 0.0373]]) => negative
Filing: 3 contains 103 sentences of which 15 are "FLS" with a sentiment of: tensor([[0.0273, 0.0851, 0.0333]]) => negative
Filing: 4 contains 134 sentences of which 36 are "FLS" with a sentiment of: tensor([[0.0218, 0.1763, 0.0706]]) => negative
Filing: 5 contains 160 sentences of which 37 are "FLS" with a sentiment of: tensor([[0.0206, 0.1514, 0.0593]]) => negative
Filing: 6 contains 123 sentences of which 31 are "FLS" with a sentiment of: tensor([[0.0367, 0.1543, 0.0611]]) => negative


In [90]:
results

Unnamed: 0,text,FilingDate,section,close,fls_pct,fls_sentiment
0,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2023-11-21T16:36:29-05:00,ManagementDiscussion,492.64,21.6,negative
1,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2023-08-25T19:36:34-04:00,ManagementDiscussion,459.09,15.178571,negative
2,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2023-05-26T16:17:32-04:00,ManagementDiscussion,391.42,14.563107,negative
3,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-11-18T16:35:31-05:00,ManagementDiscussion,,26.865672,negative
4,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-08-31T17:01:35-04:00,ManagementDiscussion,,23.125,negative
5,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-05-27T16:21:55-04:00,ManagementDiscussion,,25.203252,negative


In [91]:
# Top 10...
results.head(10)


Unnamed: 0,text,FilingDate,section,close,fls_pct,fls_sentiment
0,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2023-11-21T16:36:29-05:00,ManagementDiscussion,492.64,21.6,negative
1,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2023-08-25T19:36:34-04:00,ManagementDiscussion,459.09,15.178571,negative
2,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2023-05-26T16:17:32-04:00,ManagementDiscussion,391.42,14.563107,negative
3,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-11-18T16:35:31-05:00,ManagementDiscussion,,26.865672,negative
4,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-08-31T17:01:35-04:00,ManagementDiscussion,,23.125,negative
5,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-05-27T16:21:55-04:00,ManagementDiscussion,,25.203252,negative


In [None]:
filings = results
# Container to hold the percentages of FLS sentences within each filing
fls_pct = []

# Container to hold the sentiment scores
scores = []
    
# Walk through the collection of filings and feed into the FinBert models
num_rows = len(filings)
#for i in range(num_rows):
i = 0
# Pull out the "Management Section" text from our filings
management_section = filings.iloc[i]['text']

# For this section, break it into individual sentences
sentences = sent_tokenize(management_section)

# Initialize our FLS container
fls = []

# Define the container to collect stats related to the sentiment scores
# for all forward-looking statement
sentiments = torch.Tensor([0,0,0])

# Process each sentence, converting into tokens required by the FinBert model.
for sentence in sentences:
    #print(sentence)
    # FLS prediction
    prediction = nlp(sentence[:512], top_k=3)[0]['label']
    #print(prediction)

    # Capture FLS statements
    if prediction.startswith("Specific") or prediction.startswith("Non"):
        print(sentence)
        print(prediction)
        fls.append(sentence)
        # Tokenize - The FinBert model requires tensor-based tokens as input. For any given
        # sentence, I must ensure the length must does not exceed the models self-imposed limit.
        encoded_input = sent_tokenizer(sentence, return_tensors="pt", truncation=True)
        #print(encoded_input)
        
        with torch.no_grad():
            # Run the sentence through the model...
            output = model(**encoded_input)

            # The prediction will be in the form of a probability
            fls_sentiment = torch.nn.functional.softmax(output.logits, dim=-1)
                    
            # Tally the predictions for each sentence
            sentiments = sentiments+fls_sentiment
        print(sentiments)
# Record the percentage of FLS sentences
fls_pct.append(len(fls)/len(sentences)*100)
        
# Record the resulting sentiment for 'FLS' sentences within this section
sentiments = sentiments.divide(len(sentences))
        
score = model.config.id2label[sentiments.argmax().item()]
print(f'Filing: {i+1} contains {len(sentences)} sentences of which {len(fls)} are "FLS" with a sentiment of: {sentiments} => {score}')
        

In [93]:
results

Unnamed: 0,text,FilingDate,section,close,fls_pct,fls_sentiment
0,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2023-11-21T16:36:29-05:00,ManagementDiscussion,492.64,21.6,negative
1,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2023-08-25T19:36:34-04:00,ManagementDiscussion,459.09,15.178571,negative
2,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2023-05-26T16:17:32-04:00,ManagementDiscussion,391.42,14.563107,negative
3,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-11-18T16:35:31-05:00,ManagementDiscussion,,26.865672,negative
4,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-08-31T17:01:35-04:00,ManagementDiscussion,,23.125,negative
5,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-05-27T16:21:55-04:00,ManagementDiscussion,,25.203252,negative


In [92]:
# Visualize the results
plot(results['FilingDate'],"Distribution of 10-Q filings for",'% of FLS sentences', results['close'], percent_fls=results['fls_pct'], _=sentiment_bars(results['fls_sentiment']))

SyntaxError: invalid character in identifier (4174397727.py, line 2)