In [6]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import string

import nltk
import ast


from transformers import BertTokenizer, BertModel
import torch

## Final cleaning

In [2]:
# Load the 10-K filings table; the "paragraphs" column is filtered and parsed in later cells.
df = pd.read_csv('data/10_K_with_paragraphs.csv')

In [3]:
# Keep only filings whose "paragraphs" field is not the empty-list string
# and whose "date" value is greater than 2008.
has_paragraphs = df['paragraphs'] != '[]'
after_2008 = df["date"] > 2008
df = df[has_paragraphs & after_2008]

In [4]:
# Count rows per ticker, then keep only tickers that occur more than 3 times.
value_counts = df['ticker'].value_counts()
tickers_to_keep = value_counts.loc[value_counts.gt(3)].index
df = df.loc[df['ticker'].isin(tickers_to_keep)]

In [5]:
# Renumber rows 0..n-1 after filtering; the old index is discarded.
df = df.reset_index(drop=True)

In [6]:
# Preview the cleaned frame (rendered below as the first and last rows).
display(df)

Unnamed: 0,date,filingDate,url,cik,ticker,paragraphs
0,2023,2023-01-31,https://www.sec.gov/Archives/edgar/data/131860...,1318605,TSLA,[' Item 9. Changes in and Disagreements with A...
1,2022,2022-02-07,https://www.sec.gov/Archives/edgar/data/131860...,1318605,TSLA,"[' \xa0 If an emerging growth company, indica..."
2,2021,2021-02-08,https://www.sec.gov/Archives/edgar/data/131860...,1318605,TSLA,"['BUSINESS Overview We design, develop, manufa..."
3,2020,2020-02-13,https://www.sec.gov/Archives/edgar/data/131860...,1318605,TSLA,"['BUSINESS \xa0 Overview We design, develop, m..."
4,2019,2019-02-19,https://www.sec.gov/Archives/edgar/data/131860...,1318605,TSLA,"['BUSINESS \xa0 Overview We design, develop, m..."
...,...,...,...,...,...,...
24818,2023,2023-02-24,https://www.sec.gov/Archives/edgar/data/106868...,1068689,ATDS,['\xa0 The risks described above should not be...
24819,2022,2022-03-31,https://www.sec.gov/Archives/edgar/data/106868...,1068689,ATDS,['\xa0 The risks described above should not be...
24820,2021,2021-03-23,https://www.sec.gov/Archives/edgar/data/106868...,1068689,ATDS,['Item 9. Changes in and Disagreements wit...
24821,2020,2020-04-17,https://www.sec.gov/Archives/edgar/data/106868...,1068689,ATDS,['Item 9. Changes in and Disagreements wit...


In [7]:
# Persist the cleaned frame (without the index column).
df.to_csv('data/10_K_with_relevant_paragraphs.csv', index=False)

## Splitter

In [None]:
def split_dataframe(df, num_parts, base_filename):
    """Write ``df`` to disk as up to ``num_parts`` consecutive CSV chunks.

    Rows are cut in order into chunks of ``ceil(len(df) / num_parts)`` rows;
    the last chunk written may be shorter. Chunk ``k`` (1-based) is saved,
    without its index, to ``f"{base_filename}_part_{k}.csv"`` and a
    ``Saved <filename>`` line is printed for it.

    Chunks that would contain no rows are not written. This happens when
    rounding the chunk size up already covers every row before ``num_parts``
    chunks are reached (e.g. 10 rows into 6 parts gives 5 files of 2 rows),
    and for an empty ``df`` (no files at all).

    Args:
        df: The DataFrame to split.
        num_parts: Maximum number of chunks; must be a positive integer.
        base_filename: Prefix of every output path. It is used verbatim, so
            any extension it contains ends up in the middle of the filename.

    Raises:
        ValueError: If ``num_parts`` is not an integer >= 1.
    """
    if not isinstance(num_parts, (int, np.integer)) or num_parts < 1:
        raise ValueError(f"num_parts must be a positive integer, got {num_parts!r}")

    total_rows = len(df)
    # Integer ceiling division: same chunk size as ceil(total_rows / num_parts)
    # without going through floats.
    rows_per_part = -(-total_rows // num_parts)

    for i in range(num_parts):
        start_row = i * rows_per_part
        if start_row >= total_rows:
            # Every row has already been written; skip the empty trailing chunks.
            break
        df_part = df.iloc[start_row:start_row + rows_per_part]

        # Generate filename for each part
        filename = f"{base_filename}_part_{i+1}.csv"
        df_part.to_csv(filename, index=False)
        print(f"Saved {filename}")

# Write the cleaned frame out in 20 chunks; each file name is the base
# name followed by "_part_<k>.csv".
split_dataframe(
    df,
    num_parts=20,
    base_filename='data/10_K_with_relevant_paragraphs.csv',
)

## Load the model and define the embedding function

In [None]:
# Load the FinBERT model and tokenizer from Hugging Face
# Tokenizer and encoder are both loaded from the same FinBERT checkpoint.
finbert_checkpoint = 'yiyanghkust/finbert-tone'
tokenizer = BertTokenizer.from_pretrained(finbert_checkpoint)
model = BertModel.from_pretrained(finbert_checkpoint)


In [8]:
def _10K_to_vector(paragraphs, batch_size=32):
    """Embed the paragraphs of one 10-K filing and average them into one vector.

    Each paragraph is tokenized (truncated to 512 tokens) with the
    module-level ``tokenizer`` and run through the module-level ``model``.
    A paragraph embedding is the mean of its token states over real tokens
    only: padding positions are excluded via the attention mask, so the
    result does not depend on how long the other paragraphs in the batch
    are. Every paragraph embedding is L2-normalized and the normalized
    vectors are averaged.

    Note: vectors produced by a version of this function that also averaged
    over padded positions are not comparable with the ones returned here.

    Args:
        paragraphs: A list of paragraph strings. A single string is treated
            as a filing with one paragraph.
        batch_size: Maximum number of paragraphs per forward pass; bounds
            peak memory for filings with many paragraphs.

    Returns:
        A 1-D ``numpy.ndarray``: the mean of the unit-length paragraph
        embeddings.

    Raises:
        ValueError: If ``paragraphs`` is empty or ``batch_size`` is < 1.
    """
    if isinstance(paragraphs, str):
        paragraphs = [paragraphs]
    paragraphs = list(paragraphs)
    if not paragraphs:
        raise ValueError("paragraphs must contain at least one paragraph")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    normalized_batches = []
    with torch.no_grad():
        for start in range(0, len(paragraphs), batch_size):
            batch = paragraphs[start:start + batch_size]
            # Tokenize the paragraphs
            inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=512)
            inputs = inputs.to(model.device)
            hidden = model(**inputs).last_hidden_state

            # Masked mean pooling: zero out padding positions, then divide by
            # the number of real tokens in each paragraph.
            mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1)
            embeddings = ((hidden * mask).sum(dim=1) / token_counts).cpu().numpy()

            # L2-normalize each paragraph vector; the floor avoids dividing by zero.
            norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
            normalized_batches.append(embeddings / np.maximum(norms, 1e-12))

    # Average the normalized paragraph vectors into one vector per filing.
    embd_vector = np.concatenate(normalized_batches, axis=0).mean(axis=0)
    return embd_vector

## Run

In [9]:
# Load part 11 of the split files written by split_dataframe.
df = pd.read_csv('data/10_K_with_relevant_paragraphs.csv_part_11.csv')

In [10]:
# Use tqdm to create a progress bar
# Embed every filing in this part; tqdm renders the progress bar.
output_path = "data/10_K_with_vector_11.csv"
checkpoint_every = 50  # rows between intermediate saves

for row in tqdm(range(len(df)), desc="Processing 10_Ks"):
    # "paragraphs" holds the string form of a Python list; parse it back.
    paragraph_list = ast.literal_eval(df.at[row, "paragraphs"])
    embedding = _10K_to_vector(paragraph_list)

    # Store the vector as the string form of a list so it fits one CSV field.
    df.at[row, "vector"] = str(embedding.tolist())
    # Clear the raw text once embedded (presumably to keep the saved file small).
    df.at[row, "paragraphs"] = None

    # Save the whole DataFrame every 50 rows as a checkpoint.
    if (row + 1) % checkpoint_every == 0:
        df.to_csv(output_path, index=False)

# Final save so rows processed after the last checkpoint are written too.
df.to_csv(output_path, index=False)
print("DONE !")

Processing 10_Ks: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1242/1242 [23:42:26<00:00, 68.72s/it]


DONE !


In [11]:
# Load part 12 of the split files written by split_dataframe.
df = pd.read_csv('data/10_K_with_relevant_paragraphs.csv_part_12.csv')

In [12]:
# Use tqdm to create a progress bar
# Embed every filing in this part; tqdm renders the progress bar.
output_path = "data/10_K_with_vector_12.csv"
checkpoint_every = 50  # rows between intermediate saves

for row in tqdm(range(len(df)), desc="Processing 10_Ks"):
    # "paragraphs" holds the string form of a Python list; parse it back.
    paragraph_list = ast.literal_eval(df.at[row, "paragraphs"])
    embedding = _10K_to_vector(paragraph_list)

    # Store the vector as the string form of a list so it fits one CSV field.
    df.at[row, "vector"] = str(embedding.tolist())
    # Clear the raw text once embedded (presumably to keep the saved file small).
    df.at[row, "paragraphs"] = None

    # Save the whole DataFrame every 50 rows as a checkpoint.
    if (row + 1) % checkpoint_every == 0:
        df.to_csv(output_path, index=False)

# Final save so rows processed after the last checkpoint are written too.
df.to_csv(output_path, index=False)
print("DONE !")

Processing 10_Ks: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1242/1242 [23:17:35<00:00, 67.52s/it]


DONE !


In [13]:
# Load part 13 of the split files written by split_dataframe.
df = pd.read_csv('data/10_K_with_relevant_paragraphs.csv_part_13.csv')

In [14]:
# Use tqdm to create a progress bar
# Embed every filing in this part; tqdm renders the progress bar.
output_path = "data/10_K_with_vector_13.csv"
checkpoint_every = 50  # rows between intermediate saves

for row in tqdm(range(len(df)), desc="Processing 10_Ks"):
    # "paragraphs" holds the string form of a Python list; parse it back.
    paragraph_list = ast.literal_eval(df.at[row, "paragraphs"])
    embedding = _10K_to_vector(paragraph_list)

    # Store the vector as the string form of a list so it fits one CSV field.
    df.at[row, "vector"] = str(embedding.tolist())
    # Clear the raw text once embedded (presumably to keep the saved file small).
    df.at[row, "paragraphs"] = None

    # Save the whole DataFrame every 50 rows as a checkpoint.
    if (row + 1) % checkpoint_every == 0:
        df.to_csv(output_path, index=False)

# Final save so rows processed after the last checkpoint are written too.
df.to_csv(output_path, index=False)
print("DONE !")

Processing 10_Ks: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1242/1242 [23:24:56<00:00, 67.87s/it]


DONE !


In [15]:
# Load part 14 of the split files written by split_dataframe.
df = pd.read_csv('data/10_K_with_relevant_paragraphs.csv_part_14.csv')

In [16]:
# Use tqdm to create a progress bar
# Embed every filing in this part; tqdm renders the progress bar.
output_path = "data/10_K_with_vector_14.csv"
checkpoint_every = 50  # rows between intermediate saves

for row in tqdm(range(len(df)), desc="Processing 10_Ks"):
    # "paragraphs" holds the string form of a Python list; parse it back.
    paragraph_list = ast.literal_eval(df.at[row, "paragraphs"])
    embedding = _10K_to_vector(paragraph_list)

    # Store the vector as the string form of a list so it fits one CSV field.
    df.at[row, "vector"] = str(embedding.tolist())
    # Clear the raw text once embedded (presumably to keep the saved file small).
    df.at[row, "paragraphs"] = None

    # Save the whole DataFrame every 50 rows as a checkpoint.
    if (row + 1) % checkpoint_every == 0:
        df.to_csv(output_path, index=False)

# Final save so rows processed after the last checkpoint are written too.
df.to_csv(output_path, index=False)
print("DONE !")

Processing 10_Ks: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1242/1242 [26:30:45<00:00, 76.85s/it]


DONE !


In [17]:
# Load part 15 of the split files written by split_dataframe.
df = pd.read_csv('data/10_K_with_relevant_paragraphs.csv_part_15.csv')

In [18]:
# Use tqdm to create a progress bar
# Embed every filing in this part; tqdm renders the progress bar.
output_path = "data/10_K_with_vector_15.csv"
checkpoint_every = 50  # rows between intermediate saves

for row in tqdm(range(len(df)), desc="Processing 10_Ks"):
    # "paragraphs" holds the string form of a Python list; parse it back.
    paragraph_list = ast.literal_eval(df.at[row, "paragraphs"])
    embedding = _10K_to_vector(paragraph_list)

    # Store the vector as the string form of a list so it fits one CSV field.
    df.at[row, "vector"] = str(embedding.tolist())
    # Clear the raw text once embedded (presumably to keep the saved file small).
    df.at[row, "paragraphs"] = None

    # Save the whole DataFrame every 50 rows as a checkpoint.
    if (row + 1) % checkpoint_every == 0:
        df.to_csv(output_path, index=False)

# Final save so rows processed after the last checkpoint are written too.
df.to_csv(output_path, index=False)
print("DONE !")

Processing 10_Ks: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1242/1242 [29:04:26<00:00, 84.27s/it]


DONE !


In [None]:
# TODO: add cells for the remaining part files that still need to be transformed.