# Información y Programación Financiera, TI, Algoritmos y Ciencia de Datos

Neftalí Valdez

<a href="http://twitter.com/neftalivldz" target="_blank">@neftalivldz</a> | <a href="mailto:nvaldez@tec.mx">nvaldez@tec.mx</a>

Referencias

<a href="https://developers.refinitiv.com/en/article-catalog/article/using-ai-modeling-to-interpret-10-Q-filings
" target="_blank">Artículo Original de Nick Zincone</a> 

<a href="https://pypi.org/project/sec-api/" target="_blank"> SEC API</a> 

<a href="https://huggingface.co/yiyanghkust/finbert-fls" target="_blank">Hugging Face.- Forward Looking Statements</a>


In [4]:
#!pip3 install transformers
#!pip3 install torch
#!conda install -c pytorch torchtext
#!conda install pytorch torchvision -c pytorch
#!pip3 install sec-api

In [9]:
import eikon as ek  # the Eikon Python wrapper package
import numpy as np  # NumPy
import pandas as pd  # pandas
import cufflinks as cf  # Cufflinks
import configparser as cp
import datetime as dt
cf.set_config_file(offline=True)  # set the plotting mode to offline

In [241]:
# NLP package used to aid in text manipulation
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('punkt')

# Machine Learning modules used to prepare and measure text
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import torch

# HTML text processing
from bs4 import BeautifulSoup

# Helper modules
import matplotlib.pyplot as plt
from tqdm.notebook import trange # Progress bar
import pandas as pd

from sec_api import QueryApi

pd.set_option('display.max_colwidth', 60)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/neftalivaldez/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [188]:
cfg = cp.ConfigParser()
cfg.read('../refinitiv.cfg')
ek.set_app_key(cfg['eikon']['app_id'])

In [189]:
cfg = cp.ConfigParser()
cfg.read('../secapi.cfg')
sec = cfg['sec']['app_id']


In [190]:
today = dt.date.today()
start = today - dt.timedelta(days=365.2*5)
print(today, start)

2023-04-20 2018-04-20


In [191]:
fields = ['TR.PortfolioConstituentName','TR.PortfolioWeight']
data, err = ek.get_data(['Portfolio(RETO2023_B)'],fields)
ric = data['Instrument'][3]

In [251]:
ric = 'MSFT.O'

In [252]:
#help(ek.get_symbology)

In [253]:
tick = ek.get_symbology(ric, from_symbol_type="RIC", to_symbol_type="ticker")['ticker'][0]
tick

'MSFT'

In [258]:
# Query the Filings service using the Refinitiv Data Library for Python.
#
# Retrieve SEC filings for the specific company - specify the text we want to retrieve
# is defined within the "Management Discussion" section.

queryApi = QueryApi(api_key=sec)

query = {
  "query": { "query_string": {
      "query": "ticker:MSFT AND filedAt:{2022-01-01 TO 2023-12-31} AND formType:\"10-Q\""
    } },
  "from": "0",
  "size": "10",
  "sort": [{ "filedAt": { "order": "desc" } }]
}

filings = queryApi.get_filings(query)

print(filings)

{'total': {'value': 4, 'relation': 'eq'}, 'query': {'from': 0, 'size': 10}, 'filings': [{'id': '3f455f9e50fd9b413801d596a91f6554', 'accessionNo': '0001564590-23-000733', 'cik': '789019', 'ticker': 'MSFT', 'companyName': 'MICROSOFT CORP', 'companyNameLong': 'MICROSOFT CORP (Filer)', 'formType': '10-Q', 'description': 'Form 10-Q - Quarterly report [Sections 13 or 15(d)]', 'filedAt': '2023-01-24T16:34:20-05:00', 'linkToTxt': 'https://www.sec.gov/Archives/edgar/data/789019/000156459023000733/0001564590-23-000733.txt', 'linkToHtml': 'https://www.sec.gov/Archives/edgar/data/789019/000156459023000733/0001564590-23-000733-index.htm', 'linkToXbrl': '', 'linkToFilingDetails': 'https://www.sec.gov/Archives/edgar/data/789019/000156459023000733/msft-10q_20221231.htm', 'entities': [{'companyName': 'MICROSOFT CORP (Filer)', 'cik': '789019', 'irsNo': '911144442', 'stateOfIncorporation': 'WA', 'fiscalYearEnd': '0630', 'type': '10-Q', 'act': '34', 'fileNo': '001-37845', 'filmNo': '23548555', 'sic': '737

In [259]:
import json
data = json.dumps(filings)
#data

In [260]:
filings['filings']

[{'id': '3f455f9e50fd9b413801d596a91f6554',
  'accessionNo': '0001564590-23-000733',
  'cik': '789019',
  'ticker': 'MSFT',
  'companyName': 'MICROSOFT CORP',
  'companyNameLong': 'MICROSOFT CORP (Filer)',
  'formType': '10-Q',
  'description': 'Form 10-Q - Quarterly report [Sections 13 or 15(d)]',
  'filedAt': '2023-01-24T16:34:20-05:00',
  'linkToTxt': 'https://www.sec.gov/Archives/edgar/data/789019/000156459023000733/0001564590-23-000733.txt',
  'linkToHtml': 'https://www.sec.gov/Archives/edgar/data/789019/000156459023000733/0001564590-23-000733-index.htm',
  'linkToXbrl': '',
  'linkToFilingDetails': 'https://www.sec.gov/Archives/edgar/data/789019/000156459023000733/msft-10q_20221231.htm',
  'entities': [{'companyName': 'MICROSOFT CORP (Filer)',
    'cik': '789019',
    'irsNo': '911144442',
    'stateOfIncorporation': 'WA',
    'fiscalYearEnd': '0630',
    'type': '10-Q',
    'act': '34',
    'fileNo': '001-37845',
    'filmNo': '23548555',
    'sic': '7372 Services-Prepackaged So

In [261]:
df = pd.json_normalize(filings['filings'])
df

Unnamed: 0,id,accessionNo,cik,ticker,companyName,companyNameLong,formType,description,filedAt,linkToTxt,linkToHtml,linkToXbrl,linkToFilingDetails,entities,documentFormatFiles,dataFiles,seriesAndClassesContractsInformation,periodOfReport
0,3f455f9e50fd9b413801d596a91f6554,0001564590-23-000733,789019,MSFT,MICROSOFT CORP,MICROSOFT CORP (Filer),10-Q,Form 10-Q - Quarterly report [Sections 13 or 15(d)],2023-01-24T16:34:20-05:00,https://www.sec.gov/Archives/edgar/data/789019/000156459...,https://www.sec.gov/Archives/edgar/data/789019/000156459...,,https://www.sec.gov/Archives/edgar/data/789019/000156459...,"[{'companyName': 'MICROSOFT CORP (Filer)', 'cik': '78901...","[{'sequence': '1', 'description': '10-Q', 'documentUrl':...","[{'sequence': '7', 'description': 'XBRL TAXONOMY EXTENSI...",[],2022-12-31
1,4a746e26f404299c8bb632a6c864b1fa,0001564590-22-035087,789019,MSFT,MICROSOFT CORP,MICROSOFT CORP (Filer),10-Q,Form 10-Q - Quarterly report [Sections 13 or 15(d)],2022-10-25T16:08:55-04:00,https://www.sec.gov/Archives/edgar/data/789019/000156459...,https://www.sec.gov/Archives/edgar/data/789019/000156459...,,https://www.sec.gov/Archives/edgar/data/789019/000156459...,"[{'companyName': 'MICROSOFT CORP (Filer)', 'cik': '78901...","[{'sequence': '1', 'description': '10-Q', 'documentUrl':...","[{'sequence': '9', 'description': 'XBRL TAXONOMY EXTENSI...",[],2022-09-30
2,51ce4292e8873deb1d7be191dbd49eff,0001564590-22-015675,789019,MSFT,MICROSOFT CORP,MICROSOFT CORP (Filer),10-Q,Form 10-Q - Quarterly report [Sections 13 or 15(d)],2022-04-26T16:08:55-04:00,https://www.sec.gov/Archives/edgar/data/789019/000156459...,https://www.sec.gov/Archives/edgar/data/789019/000156459...,,https://www.sec.gov/Archives/edgar/data/789019/000156459...,"[{'companyName': 'MICROSOFT CORP (Filer)', 'cik': '78901...","[{'sequence': '1', 'description': '10-Q', 'documentUrl':...","[{'sequence': '7', 'description': 'XBRL TAXONOMY EXTENSI...",[],2022-03-31
3,35d48ca58bcc83e681e697225c42c614,0001564590-22-002324,789019,MSFT,MICROSOFT CORP,MICROSOFT CORP (Filer),10-Q,Form 10-Q - Quarterly report [Sections 13 or 15(d)],2022-01-25T16:09:04-05:00,https://www.sec.gov/Archives/edgar/data/789019/000156459...,https://www.sec.gov/Archives/edgar/data/789019/000156459...,,https://www.sec.gov/Archives/edgar/data/789019/000156459...,"[{'companyName': 'MICROSOFT CORP (Filer)', 'cik': '78901...","[{'sequence': '1', 'description': '10-Q', 'documentUrl':...","[{'sequence': '7', 'description': 'XBRL TAXONOMY EXTENSI...",[],2021-12-31


In [262]:
from sec_api import ExtractorApi

extractorApi = ExtractorApi(sec)

#
# 10-Q example
#
# Tesla 10-Q filing
filing_url_10q = "https://www.sec.gov/Archives/edgar/data/1318605/000095017022012936/tsla-ex32_1.htm"

# get the original HTML of section 7 "Management’s Discussion and Analysis of Financial Condition and Results of Operations"
section_html = extractorApi.get_section(filing_url_10q, "part1item2", "html")
beautifulSoupText = BeautifulSoup(section_html, "html.parser").text.replace(u'\xa0', ' ').replace('\r', ' ')
beautifulSoupText


'ITEM 2.MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS The following discussion and analysis should be read in conjunction with the consolidated financial statements and the related notes included elsewhere in this Quarterly Report on Form 10-Q. Overview Our mission is to accelerate the world’s transition to sustainable energy. We design, develop, manufacture, lease and sell high-performance fully electric vehicles, solar energy generation systems and energy storage products. We also offer maintenance, installation, operation, financial and other services related to our products. Additionally, we are increasingly focused on products and services based on artificial intelligence, robotics and automation. In 2022, we have produced 563,987 vehicles and delivered 564,743 vehicles through the second quarter, despite ongoing supply chain challenges and factory shutdowns. We are currently focused on increasing vehicle production and capacity, improving a

In [263]:
# Parse Section Pending


In [264]:
# Declare our final results table
results = pd.DataFrame()
text = []
dates = []
section = []

# Pull out the filings text for each report
for i in range(0,len(df)):
    extractorApi = ExtractorApi(sec)
    filing_url_10q = df.linkToHtml.values.tolist()[i]

    # get the original HTML of section 7 "Management’s Discussion and Analysis of Financial Condition and Results of Operations"
    section_html = extractorApi.get_section(filing_url_10q, "part1item2", "html")
    beautifulSoupText = BeautifulSoup(section_html, "html.parser").text.replace(u'\xa0', ' ').replace('\r', ' ')
    # Clean the data and capture it for later processing
    text.append(beautifulSoupText)
    dates.append(df.filedAt.values.tolist()[i])
    section.append('ManagementDiscussion')
    

In [265]:
results['text'] = text
results['FilingDate'] = dates
results['section'] = section

In [266]:
results

Unnamed: 0,text,FilingDate,section
0,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2023-01-24T16:34:20-05:00,ManagementDiscussion
1,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-10-25T16:08:55-04:00,ManagementDiscussion
2,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-04-26T16:08:55-04:00,ManagementDiscussion
3,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-01-25T16:09:04-05:00,ManagementDiscussion


In [267]:
# Load models

In [268]:
# Load the models
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-fls',num_labels=3)

In [269]:
# Download the Pre-trained transformer used to process our raw text
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-fls')
nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)

In [270]:
prediction = nlp("The future for next years sales will increase by 10 %.", top_k=3)
prediction

[{'label': 'Specific FLS', 'score': 0.8574215769767761},
 {'label': 'Non-specific FLS', 'score': 0.1353205293416977},
 {'label': 'Not FLS', 'score': 0.007257900666445494}]

In [271]:
# Sentiment - Download the Pre-trained transformer used to process our raw text
sent_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

In [272]:
# Sentiment - Download the FinBert model used to process our transformed data
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")


In [273]:
# Capture closing prices

In [274]:
# Container to hold the Closing Prices based on the filing date
prices = []

# Walk through the collection of filings and pull out the reported filing date
num_rows = len(results)

for i in range(num_rows):
    date = results.iloc[i]['FilingDate']
    end = dt.datetime.strptime(date, "%Y-%m-%dT%H:%M:%S%z")
    start = dt.datetime.strptime(date, "%Y-%m-%dT%H:%M:%S%z") - dt.timedelta(minutes=1)
    try:
        response = ek.get_timeseries([ric], fields = ['CLOSE'], interval='minute', start_date=start, end_date=end)
        print(response)
        prices.append(response.iloc[0]['CLOSE'])
    except:
        prices.append(None)

if prices:
    results['close'] = prices

MSFT.O                  CLOSE
Date                         
2023-01-24 21:34:00  251.5114
MSFT.O                CLOSE
Date                       
2022-10-25 20:08:00  247.31
MSFT.O                CLOSE
Date                       
2022-04-26 20:08:00  261.55


2023-04-20 14:36:59,425 P[25364] [MainThread 8769829504] Error with MSFT.O: No data available for the requested date range
2023-04-20 14:36:59,427 P[25364] [MainThread 8769829504] MSFT.O: No data available for the requested date range | 


In [275]:
results

Unnamed: 0,text,FilingDate,section,close
0,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2023-01-24T16:34:20-05:00,ManagementDiscussion,251.5114
1,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-10-25T16:08:55-04:00,ManagementDiscussion,247.31
2,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-04-26T16:08:55-04:00,ManagementDiscussion,261.55
3,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-01-25T16:09:04-05:00,ManagementDiscussion,


In [276]:
def evaluate(filings):
    # Container to hold the percentages of FLS sentences within each filing
    fls_pct = []

    # Container to hold the sentiment scores
    scores = []
    
    # Walk through the collection of filings and feed into the FinBert models
    num_rows = len(filings)
    for i in trange(num_rows):
        # Pull out the "Management Section" text from our filings
        management_section = filings.iloc[i]['section']
        
        # For this section, break it into individual sentences
        sentences = sent_tokenize(management_section)
        
        # Initialize our FLS container
        fls = []
        
        # Define the container to collect stats related to the sentiment scores
        # for all forward-looking statement
        sentiments = torch.Tensor([0,0,0])
        
        # Process each sentence, converting into tokens required by the FinBert model.
        for sentence in sentences:
            # FLS prediction
            prediction = nlp(sentence[:512], top_k=3)[0]['label']

            # Capture FLS statements
            if prediction.startswith("Specific") or prediction.startswith("Non"):
                fls.append(sentence)
                # Tokenize - The FinBert model requires tensor-based tokens as input. For any given
                # sentence, I must ensure the length must does not exceed the models self-imposed limit.
                encoded_input = sent_tokenizer(sentence, return_tensors="pt", truncation=True)
                
                with torch.no_grad():
                    # Run the sentence through the model...
                    output = model(**encoded_input)

                    # The prediction will be in the form of a probability
                    fls_sentiment = torch.nn.functional.softmax(output.logits, dim=-1)
                    
                    # Tally the predictions for each sentence
                    sentiments = sentiments+fls_sentiment

        # Record the percentage of FLS sentences
        fls_pct.append(len(fls)/len(sentences)*100)
        
        # Record the resulting sentiment for 'FLS' sentences within this section
        sentiments = sentiments.divide(len(sentences))
        
        score = model.config.id2label[sentiments.argmax().item()]
        print(f'Filing: {i+1} contains {len(sentences)} sentences of which {len(fls)} are "FLS" with a sentiment of: {sentiments} => {score}')
        scores.append(score)
    
    # Add the measures to our results table
    filings['fls_pct'] = fls_pct
    filings[f'fls_sentiment'] = scores

In [277]:
# Plot the data
def plot(x_axis, title, y1_label, x2_axis, **kwargs):
    plt.style.use('dark_background')
    plt.rcParams['figure.figsize'] = (17,8)
    fig, ax = plt.subplots()
    for label, data in kwargs.items():
        # Special label '_' to plot vertical bar
        if label == '_':
            x = 0
            for s in data:
                plt.axvline(x, ymax=0.25, color=s['color'], label=s['label'], linestyle="--")
                x += 1
        else:
            ax.plot(x_axis, data, label=label)
    ax2 = ax.twinx()
    ax2.plot(x_axis, x2_axis, label="Closing Price", color='red')
    ax.tick_params(labelrotation=90)
    # Remove the spines from the graph - leave the bottom
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.spines['left'].set_visible(False)
    # Add a faint grid
    ax.yaxis.grid(True, alpha=0.2)
    ax.xaxis.grid(True, alpha=0.2)
    # Add labels and a title. Note the use of `labelpad` and `pad` to add some
    # extra space between the text and the tick labels.
    ax.set_ylabel(y1_label, labelpad=12, fontsize=14, color='cyan')
    ax.set_title(title, pad=15, fontsize=16, color='cyan')
    if len(kwargs) > 1:
        ax.legend(loc='upper left')
    ax2.legend(loc='upper right')
    fig.tight_layout()

In [278]:
def sentiment_bars(scores):
    bars = []
    frequency = {'negative':0, 'neutral':0, 'positive':0}
    
    for s in scores:
        bar = {'color': '', 'label': ''}
        if s == 'negative':
            color = 'red'
        elif s == 'neutral':
            color = 'yellow'
        else:
            color = 'green'
            
        bar['color'] = color
        bar['label'] = s if frequency[s] == 0 else '_'
        frequency[s] += 1
        bars.append(bar)
        
    return bars

In [279]:
# Evaluate our predictions and compare against the sentement scores based on the 
# FLS sentences.
evaluate(results)

  0%|          | 0/4 [00:00<?, ?it/s]

Filing: 1 contains 1 sentences of which 0 are "FLS" with a sentiment of: tensor([0., 0., 0.]) => positive
Filing: 2 contains 1 sentences of which 0 are "FLS" with a sentiment of: tensor([0., 0., 0.]) => positive
Filing: 3 contains 1 sentences of which 0 are "FLS" with a sentiment of: tensor([0., 0., 0.]) => positive
Filing: 4 contains 1 sentences of which 0 are "FLS" with a sentiment of: tensor([0., 0., 0.]) => positive


In [280]:
results

Unnamed: 0,text,FilingDate,section,close,fls_pct,fls_sentiment
0,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2023-01-24T16:34:20-05:00,ManagementDiscussion,251.5114,0.0,positive
1,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-10-25T16:08:55-04:00,ManagementDiscussion,247.31,0.0,positive
2,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-04-26T16:08:55-04:00,ManagementDiscussion,261.55,0.0,positive
3,ITEM 2. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIA...,2022-01-25T16:09:04-05:00,ManagementDiscussion,,0.0,positive


In [247]:
# Top 10...
results.head(10)


Unnamed: 0,text,FilingDate,section,close,fls_pct,fls_sentiment
0,ITEM 2.MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL...,2022-10-24T06:08:50-04:00,ManagementDiscussion,207.42,0.0,positive
1,ITEM 2.MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL...,2022-07-25T06:07:25-04:00,ManagementDiscussion,275.666391,0.0,positive
2,ITEM 2.MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL...,2022-04-22T21:36:59-04:00,ManagementDiscussion,,0.0,positive
