In [None]:
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO
import nltk




In [None]:
# Request headers passed to requests.get() for every filing download.
headers = {'User-Agent': 'Independent albertomanzi92@gmail.com'}

# Filing documents to process; the ticker symbol is later taken from the
# file name (the part of the last path segment before the '-').
links = [
    'https://www.sec.gov/Archives/edgar/data/1318605/000095017023001409/tsla-20221231.htm',
    'https://www.sec.gov/Archives/edgar/data/320193/000032019322000108/aapl-20220924.htm',
    'https://www.sec.gov/Archives/edgar/data/51143/000155837023002376/ibm-20221231x10k.htm',
]




In [None]:
def clean_data(links):
    """Download 10-K filings and split each one into its main items.

    For every URL the raw HTML is fetched (using the module-level ``headers``),
    the "Item N" headings are located with a regular expression, and the
    markup between consecutive headings is cut out.

    Sections are looked up by heading label rather than by position: the
    heading pattern also matches "Item 2", so positional slicing would file
    Item 2's markup under 'Item 7' and shift every later section by one.

    Args:
        links: iterable of filing URLs whose file name starts with
            ``<ticker>-`` (e.g. ``.../tsla-20221231.htm``).

    Returns:
        dict mapping ticker -> dict with the keys 'Item 1', 'Item 1A',
        'Item 1B' (plain text) and 'Item 7', 'Item 7A', 'Item 8'
        (BeautifulSoup objects, so tables can still be extracted).

    Raises:
        ValueError: if no ticker can be derived from a URL, or a required
            heading cannot be found in the filing.
        requests.HTTPError: if the download returns an error status.
    """
    # Output key -> (start heading, end heading, convert to plain text?)
    wanted_sections = {
        'Item 1': ('item1', 'item1a', True),
        'Item 1A': ('item1a', 'item1b', True),
        'Item 1B': ('item1b', 'item2', True),
        'Item 7': ('item7', 'item7a', False),
        'Item 7A': ('item7a', 'item8', False),
        'Item 8': ('item8', 'item9', False),
    }
    # Headings appear either right after a closing '>' ("Item 1A") or in
    # upper case ("ITEM 1A"); the look-ahead stops "1" from matching "1A".
    heading_re = re.compile(
        r'(>Item(\s|&#160;|&#xA0;|&nbsp;)(1A|1B|1|2|7A|7|8|9)(?=\.|\s|$))'
        r'|(ITEM\s(1A|1B|1|2|7A|7|8|9)(?=\.|\s|$))'
    )
    # Everything stripped from a lower-cased match to get a label like 'item1a'.
    label_noise = r'&#160;|&#xa0;|&nbsp;|[\s.>]'

    company_data = {}
    for link in links:
        # extract ticker symbol from URL
        ticker_match = re.search(r'/([^/]+)-', link)
        if ticker_match is None:
            raise ValueError(f'Cannot derive a ticker symbol from URL: {link}')
        ticker = ticker_match.group(1)

        response = requests.get(link, headers=headers, timeout=30)
        response.raise_for_status()
        raw_10k = response.text

        found = [(m.group(), m.start(), m.end()) for m in heading_re.finditer(raw_10k)]
        if not found:
            raise ValueError(f'{ticker}: no item headings found in {link}')

        headings = pd.DataFrame(found, columns=['item', 'start', 'end'])
        headings['item'] = (
            headings['item'].str.lower().str.replace(label_noise, '', regex=True)
        )

        # Heuristic kept from the original notebook: the first half of the
        # matches is assumed to be the table of contents, the second half the
        # document body. Within the body the first hit per label is used.
        body = headings.iloc[len(headings) // 2:]
        starts = body.drop_duplicates(subset='item').set_index('item')['start']

        needed = {
            label
            for first, last, _ in wanted_sections.values()
            for label in (first, last)
        }
        missing = sorted(needed.difference(starts.index))
        if missing:
            raise ValueError(f'{ticker}: could not locate headings {missing} in {link}')

        sections = {}
        for name, (first, last, as_text) in wanted_sections.items():
            fragment = raw_10k[int(starts[first]):int(starts[last])]
            soup = BeautifulSoup(fragment, 'lxml')
            sections[name] = soup.get_text() if as_text else soup

        # add the ticker dictionary to the company data dictionary
        company_data[ticker] = sections

    return company_data



In [None]:
# Download and split every filing in `links`; the result maps each ticker to
# its extracted sections and is extended in place by the cells below.
company_data = clean_data(links)

Business Summary

In [None]:
# (tokenizer, model) pairs keyed by checkpoint name, so repeated calls reuse
# the already-loaded weights instead of loading them again for every company.
_SUMMARIZER_CACHE = {}


def business_summary(input, max_length=75, num_beams=5):
    """Summarise a piece of text with a Pegasus model fine-tuned on financial data.

    Args:
        input: text to summarise.
        max_length: ``max_length`` passed to ``model.generate`` (default 75).
        num_beams: ``num_beams`` passed to ``model.generate`` (default 5).

    Returns:
        The decoded summary string (also printed).
    """
    from transformers import PegasusTokenizer, PegasusForConditionalGeneration

    # Load model and tokeniser once per checkpoint
    model_name = "human-centered-summarization/financial-summarization-pegasus"
    if model_name not in _SUMMARIZER_CACHE:
        _SUMMARIZER_CACHE[model_name] = (
            PegasusTokenizer.from_pretrained(model_name),
            PegasusForConditionalGeneration.from_pretrained(model_name),
        )
    tokenizer, model = _SUMMARIZER_CACHE[model_name]

    # truncation=True caps over-long inputs at the tokenizer's maximum length
    # instead of handing the model a sequence longer than it supports.
    input_ids = tokenizer(input, return_tensors="pt", truncation=True).input_ids

    # Generate output
    summary_ids = model.generate(
        input_ids,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True
    )
    # Print the generated summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    print(summary)
    return summary

In [None]:
# Summarise the first 400 characters of each company's Item 1 text and store
# the result next to the extracted sections.
for ticker, sections in company_data.items():
    sections['Business Summary'] = business_summary(sections['Item 1'][0:400])

Core industry/themes

In [None]:
# import themes_df - pandas df containing industries and themes
from themes import themes_df
import torch
from transformers import pipeline


In [None]:
# Use the first CUDA device when torch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Zero-shot classification pipeline shared by run_classifier (bound as its
# default `classifier` argument).
classifier = pipeline(
    "zero-shot-classification",
    model="sileod/deberta-v3-base-tasksource-nli",
    device=device,
    multi_label=True,
)

In [None]:
# Zero-shot label selection for a piece of text.

def run_classifier(input, labels, classifier=classifier, threshold=0.5):
  """Return the leading labels the classifier assigns to ``input``.

  The classifier is called with the text and the candidate labels. The number
  of scores strictly above ``threshold`` decides how many entries are taken
  from the front of the returned label list; at least one label is always
  returned.

  NOTE(review): taking a prefix of ``labels`` assumes the classifier returns
  labels ordered by descending score - confirm against the pipeline docs.
  """
  outcome = classifier(input, labels)
  n_above = sum(1 for score in outcome['scores'] if score > threshold)
  keep = n_above if n_above > 1 else 1
  return outcome['labels'][:keep]


In [None]:
# Walks the Sector -> Industry -> Sub-Industry -> Theme hierarchy level by
# level, only descending into the branches the classifier selects.

def find_themes(input_text, df):
  """Collect the taxonomy rows whose themes match ``input_text``.

  At each level ``run_classifier`` picks labels from the values that exist
  under the branch chosen so far. The returned row is taken from that same
  branch, so a theme name that occurs under several sectors or industries is
  reported with the hierarchy it was actually selected in.

  Args:
    input_text: text to classify.
    df: DataFrame with 'Sector', 'Industry', 'Sub-Industry' and 'Theme'
      columns.

  Returns:
    A list of rows (each a list of the row's column values), one per
    selected theme. Empty when ``df`` has no rows.
  """
  themes = []
  if df.empty:
    # Nothing to classify against; avoids calling the classifier with no labels.
    return themes

  for sector in run_classifier(input_text, df['Sector'].unique()):
    print(sector)
    in_sector = df[df['Sector'] == sector]
    for industry in run_classifier(input_text, in_sector['Industry'].unique()):
      print(industry)
      in_industry = in_sector[in_sector['Industry'] == industry]
      for subindustry in run_classifier(input_text, in_industry['Sub-Industry'].unique()):
        print(subindustry)
        in_subindustry = in_industry[in_industry['Sub-Industry'] == subindustry]
        for theme in run_classifier(input_text, in_subindustry['Theme'].unique()):
          # Look the row up inside the current branch, not the whole frame.
          themes.append(in_subindustry[in_subindustry['Theme'] == theme].values.tolist()[0])
  return themes

In [None]:
# Classify each company's Item 1 text against the themes taxonomy.
for ticker, sections in company_data.items():
    sections['Themes'] = find_themes(sections['Item 1'], themes_df)

Core products/services

In [14]:
#import gliner one-shot named entity recognition model
from gliner import GLiNER

# Entity-recognition model; bound as the default `model` argument of find_prods.
model = GLiNER.from_pretrained("urchade/gliner_base")

# Entity types to extract; find_prods sorts results into these two buckets.
labels = ["products", "services"]



: 

In [None]:
def find_prods(company, model=model, labels=labels):
    """Extract product and service mentions from a company's Item 1 text.

    Every entity predicted by ``model`` is printed as ``text => label`` and
    collected under its label; entities with any other label are ignored.
    Both lists are printed before being returned.

    Args:
        company: mapping with an 'Item 1' entry holding the text to scan.
        model: object exposing ``predict_entities(text, labels)``.
        labels: entity labels passed to the model.

    Returns:
        Tuple ``(products, services)`` of lists of entity texts, in the order
        the model returned them.
    """
    item_1_text = company['Item 1']
    buckets = {'products': [], 'services': []}

    for hit in model.predict_entities(item_1_text, labels):
        mention, kind = hit["text"], hit["label"]
        print(mention, "=>", kind)
        if kind in buckets:
            buckets[kind].append(mention)

    products, services = buckets['products'], buckets['services']
    print(products)
    print(services)
    return products, services

In [None]:
# Store the extracted product and service mentions for every company.
for sections in company_data.values():
    sections['Products'], sections['Services'] = find_prods(sections)

In [None]:
# Spot-check: show the service mentions collected for the 'tsla' filing.
company_data['tsla']['Services']

Customers/market segments

In [None]:
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

In [None]:
def find_topics(company):
    """Fit a zero-shot BERTopic model on the sentences of a company's Item 1.

    The text is split into sentences with ``nltk.sent_tokenize``, the sentence
    count is printed, and a BERTopic model seeded with a fixed list of
    candidate topics is fitted on those sentences.

    Args:
        company: mapping with an 'Item 1' entry holding the text.

    Returns:
        The first element of ``topic_model.fit_transform(...)`` for the
        sentence list.
    """
    item_1_text = company['Item 1']
    seed_topics = [
        "Company overview",
        "Segment information",
        "Products and services",
        "Technology",
        "Sales and marketing",
        "Service and warranty",
        "Manufacturing",
        "Supply chain",
        "Government programs, incentives, and regulations",
        "Competition",
        "Customers",
        "Demographics",
        "Target market",
    ]

    # The model is fitted on individual sentences, not on the whole text.
    sentences = nltk.sent_tokenize(item_1_text)
    print(len(sentences))

    topic_model = BERTopic(
        embedding_model="all-minilm-l6-v2",
        min_topic_size=15,
        zeroshot_topic_list=seed_topics,
        zeroshot_min_similarity=.85,
        representation_model=KeyBERTInspired(),
    )
    assignments, _ = topic_model.fit_transform(sentences)
    return assignments

In [None]:
# Store the topic assignments of each company's Item 1 sentences.
for sections in company_data.values():
    sections['Market segments'] = find_topics(sections)

Financial results summary

In [None]:
def return_financials(company, max_tables=8):
    """Parse the first tables of a company's Item 8 into DataFrames.

    Parsing stops as soon as ``max_tables`` tables have been collected, so
    later tables in the section are never parsed. A ``<table>`` element that
    ``pd.read_html`` rejects with ``ValueError`` is skipped instead of
    aborting the whole company.

    Args:
        company: mapping whose 'Item 8' entry exposes ``find_all('table')``
            (a BeautifulSoup object in this notebook).
        max_tables: maximum number of tables to return (default 8).

    Returns:
        List of at most ``max_tables`` DataFrames, in document order. In each
        one, rows with fewer than ``n_columns // 2`` non-NA values are dropped.
    """
    dfs = []
    for table in company['Item 8'].find_all('table'):
        if len(dfs) >= max_tables:
            break
        try:
            df = pd.read_html(StringIO(str(table)))[0]
        except ValueError:
            # Raised by read_html when it finds no parseable table in the markup.
            continue

        # Drop mostly-empty rows: keep a row only when at least half of the
        # columns (rounded down) hold a value.
        min_populated = df.shape[1] // 2
        dfs.append(df.dropna(thresh=min_populated))
    return dfs

In [None]:
# Store the parsed Item 8 tables for every company.
for sections in company_data.values():
    sections['Financials'] = return_financials(sections)

Final outcome

In [None]:
# Show which fields were collected for each company.
for sections in company_data.values():
    print(sections.keys())