In [None]:
Nick Zincone
https://developers.refinitiv.com/en/article-catalog/article/using-ai-modeling-to-interpret-10-Q-filings
https://pypi.org/project/sec-api/

In [1]:
#!pip3 install transformers
#!pip3 install torch
#!conda install -c pytorch torchtext
#!conda install pytorch torchvision -c pytorch
#!pip3 install refinitiv.dataplatform
#!pip3 install refinitiv.data

In [38]:
# NLP package used to aid in text manipulation
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Machine Learning modules used to prepare and measure text
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import torch

# HTML text processing
from bs4 import BeautifulSoup

# Helper modules
import matplotlib.pyplot as plt
from tqdm.notebook import trange # Progress bar
import pandas as pd

# Refinitiv packages to extract filings data and retrieve price data
import refinitiv.data as rd
from refinitiv.data.content import filings

# Convenient modules to simplify API access to Filings
%run ./FilingsQuery.ipynb
%run ./SymbolLookup.ipynb
pd.set_option('display.max_colwidth', 60)

In [3]:
# Open a session to retrieve Filings data from RDP
rd.open_session()

<refinitiv.data.session.Definition object at 0x17cd6a5e0 {name='workspace'}>

In [4]:
# Load the models
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-fls',num_labels=3)

In [5]:
# Download the Pre-trained transformer used to process our raw text
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-fls')


In [9]:
prediction = nlp("The future for next years sales will increase by 10 %.", top_k=3)
prediction

NameError: name 'pytorch' is not defined

In [10]:
# Sentiment - Download the Pre-trained transformer used to process our raw text
sent_tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

Downloading (…)okenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [11]:
# Sentiment - Download the FinBert model used to process our transformed data
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")


Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [39]:
# Use the RIC to define our company identifier
identifier = "MSFT.O" # Tesla

# Data ranges to capture our data sets used to predict sentiment
start = "2010-01-01T00:00:00Z"
end = "2022-10-31T23:59:59Z"

In [40]:
# Query the Filings service using the Refinitiv Data Library for Python.
#
# Retrieve SEC filings for the specific company - specify the text we want to retrieve
# is defined within the "Management Discussion" section.
response = filings.search.Definition(
   query = FilingsQuery().form_type("10-Q")
                         .orgid(SymbolLookup(ric=identifier).convert())
                         .feed(FilingsQuery.Feed.EDGAR)
                         .sections(["ManagementDiscussion"])
                         .start_date(start)
                         .end_date(end)
                         .limit(100)
                         .graphQL
).get_data()

In [41]:
response.data.df

Unnamed: 0,DocumentTitle,Filename,MimeType,Dcn,DocId,FinancialFilingId
0,,,,,,
1,,,,,,
2,,,,,,
3,,,,,,
4,,,,,,
...,...,...,...,...,...,...
181,,,,,,
182,,,,,,
183,,,,,,
184,,,,,,


In [23]:
# Retrieve and report the # of reports based on our query
reports = response.data.raw['data']['FinancialFiling']
organization = reports[0]["FilingOrganization"]['Names']['Name']['OrganizationName'][0]['Name']

In [24]:
# Declare our final results table
results = pd.DataFrame()
dates = []
text = []

# Pull out the filings text for each report
for filing in reversed(reports):
    section = filing['FilingDocument']['Sections']['ManagementDiscussion']['Text']
    filingDate = filing['FilingDocument']['DocumentSummary']['FilingDate']

    if section is not None:
        dates.append(filingDate[:10])
        
        # Process the text. Using BeautifulSoup and other means to clean the data
        beautifulSoupText = BeautifulSoup(section, "html.parser")

        # Clean the data and capture it for later processing
        text.append(beautifulSoupText.prettify(formatter=lambda s: s.replace(u'\xa0', ' ').replace('\r', ' ')))
        results['FilingDate'] = dates
        results['section'] = text

In [26]:
reports

[{'_metadata': {'totalCount': 38,
   'cursor': 'WzE2NjY1Njk2MDAwMDAsIjk3Njc1NzcyMjM1Il0='},
  'FilingOrganization': {'Names': {'Name': {'OrganizationName': [{'Name': 'TESLA, INC.'}]}}},
  'FilingDocument': {'Identifiers': [{'OrganizationId': None, 'Dcn': None}],
   'DocId': None,
   'FinancialFilingId': None,
   'Sections': {'ManagementDiscussion': {'Text': None}},
   'DocumentSummary': {'DocumentTitle': None,
    'FeedName': None,
    'FormType': None,
    'HighLevelCategory': None,
    'MidLevelCategory': None,
    'FilingDate': None,
    'SecAccessionNumber': None,
    'SizeInBytes': None},
   'FilesMetaData': [{'FileName': None, 'MimeType': None},
    {'FileName': None, 'MimeType': None},
    {'FileName': None, 'MimeType': None},
    {'FileName': None, 'MimeType': None},
    {'FileName': None, 'MimeType': None}]}},
 {'_metadata': {'totalCount': 38,
   'cursor': 'WzE2NTg3MDcyMDAwMDAsIjk3NjczNTAxMzc2Il0='},
  'FilingOrganization': {'Names': {'Name': {'OrganizationName': [{'Name': 'TE