In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import re
from wordfreq import top_n_list
import numpy as np
from nltk.corpus import stopwords
import requests
import requests_random_user_agent
from tqdm.notebook import tqdm
from nltk.stem import PorterStemmer
import pandas as pd
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize
import dask
dask.config.set(scheduler="processes")
from tqdm.dask import TqdmCallback
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import glob
from gensim.test.utils import datapath
import logging
import random
from random import sample
random.seed(1)
from fpdf import FPDF
from IPython.display import clear_output
import time
import pickle
import random

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
%run useful_functions.py

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\natha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


update 14


## TOKENS part

In [4]:
# Load the previously saved stock-names table.
# na_filter=False turns off pandas' own missing-value detection while reading,
# so the only cells that end up as NaN are the empty strings replaced below
# (presumably so that values such as a literal "NA" survive -- TODO confirm).
stocknames = pd.read_csv(
    "../../data/stocknames.csv.gz",
    na_filter=False,
).replace('', np.nan)

In [None]:
# TEST years so far: 2017 2020 2021 2022 2023 2024
# Download every 10-K filing listed in `stocknames`, tokenize each of its
# sentences with `get_tokens`, and save one token file per (year, ticker).
save_path = '../../data/10k_statements_new/tokens/'
# NOTE(review): `years` is not used by the loop in this cell (which iterates
# over an explicit list of years); it is kept because a later cell reads it.
years = np.arange(2022,2025)

# Not referenced again in this cell.
# NOTE(review): presumably consumed by helpers from useful_functions.py --
# confirm before removing.
common_words = top_n_list('en', 100)
stop_words = stopwords.words('english')
alphabet = re.compile('[^a-z]')

delayed_tokens = dask.delayed(get_tokens)

# Retry policy (the printed messages below assume 10 attempts x 120 s = 20 min).
MAX_REQUEST_ATTEMPTS = 10   # download attempts per URL
REQUEST_RETRY_WAIT = 120    # seconds to wait after a failed download
REQUEST_TIMEOUT = 60        # seconds before a silent server counts as a failure
MAX_BATCH_ATTEMPTS = 10     # attempts to download + tokenize one batch
BATCH_RETRY_WAIT = 60       # seconds to wait before rebuilding a failed batch


def _fetch_text(url):
    """Return the body of `url` as text, retrying on network errors.

    Raises:
        RuntimeError: if all MAX_REQUEST_ATTEMPTS attempts fail.
    """
    for _ in range(MAX_REQUEST_ATTEMPTS):
        try:
            return requests.get(url, timeout=REQUEST_TIMEOUT).text
        except requests.exceptions.RequestException:
            # Only network-level errors are retried; KeyboardInterrupt and
            # programming errors propagate so the cell can be interrupted.
            print("request failed")
            time.sleep(REQUEST_RETRY_WAIT)

    print("after 10 attempts (20 min), the server still failed to respond, shutting down program...")
    raise RuntimeError("server did not respond after {} attempts: {}".format(MAX_REQUEST_ATTEMPTS, url))


# Batches whose tokenization failed (one list of delayed tasks per failed attempt).
list_missing_tokens = []
for year in [2021,2022,2023,2017]: #years
    # take the urls of all the 10-K filings for this year, indexed by ticker
    urls = stocknames.filter(like = 'url_{}'.format(year))
    urls.index = stocknames.ticker
    urls = urls.dropna()

    # compute in batches
    len_batch = 250
    # ceiling of the integer division
    nb_batches = -(len(urls) // -len_batch)
    for batch in tqdm(range(nb_batches), desc = 'Tokenizing 10-Ks for {}'.format(year),leave = False):
        start = batch * len_batch
        end = start + len_batch

        for attempt in range(MAX_BATCH_ATTEMPTS):
            # one delayed tokenization task per document of the batch
            allpromises = []
            for ticker, url in urls.iloc[start:end,:].itertuples():
                allpromises.append(delayed_tokens(_fetch_text(url), ticker = ticker))

            there_was_no_error = True
            with TqdmCallback(desc = 'Tokenizing 10-Ks in batch', leave = False):
                try:
                    tokens = dask.compute(allpromises)[0]
                except Exception as error:
                    # Random 6-digit number identifying this failure in the output
                    random_number = random.randint(100000, 999999)
                    print("error with tokens, error #"+str(random_number))
                    print(repr(error))  # show the cause instead of discarding it
                    there_was_no_error = False
                    list_missing_tokens.append(allpromises)

                    #with open(str(year)+'missing_tokens_'+str(random_number)+'.pickle', 'wb') as f:
                    #    pickle.dump(list_missing_tokens, f)

            if not there_was_no_error:
                # download the batch again and retry after a pause
                print("rebuilding the 'allpromises' list, iteration : "+str(attempt))
                time.sleep(BATCH_RETRY_WAIT)
                continue

            # save the tokens; each result is read as a mapping whose first key
            # is the ticker (presumably {ticker: tokens} -- verify in get_tokens)
            for dict_ in tokens:
                ticker = list(dict_.keys())[0]
                tokens_df = pd.DataFrame(dict_[ticker])
                tokens_df.to_csv(save_path+'{}/{}_tokens.csv.gz'.format(year, ticker))
            break
        else:
            # every attempt failed: abort explicitly
            print("impossible to build 'allpromises' after 10 iterations, shutting down program...")
            raise RuntimeError("could not tokenize batch {} of year {}".format(batch, year))

    # pause between two years
    time.sleep(120)

clear_output()
print('Done')

Tokenizing 10-Ks for 2021:   0%|          | 0/19 [00:00<?, ?it/s]

request failed


In [None]:
token_path = '../../data/10k_statements_new/tokens/{}/*'
# Print summary statistics of the paragraphs from the 10-Ks.
# Relies on `years` having been defined by an earlier cell.
stats = []   # one entry per token file, as returned by get_token_stats
index = []   # matching '<ticker>_<year>' labels
for year in years:
    files = glob.glob(token_path.format(year))
    allpromises = []
    for file in files:
        # NOTE(review): the results are handed to dask.compute below, so
        # get_token_stats presumably returns a delayed task -- confirm.
        allpromises.append(get_token_stats(file))
        # Ticker = file name without the '_tokens.csv.gz' suffix. The pattern
        # accepts both '/' and '\' before the file name because glob joins the
        # name with the OS separator (a backslash on Windows).
        ticker = re.findall(r'\d[/\\](.*)_tokens\.csv\.gz', file)[0]
        index.append(ticker+'_'+str(year))

    with TqdmCallback(desc = 'Computing stats on tokens for {}'.format(year), leave = False):
        stats.extend(dask.compute(allpromises)[0])

clear_output()
pd.DataFrame(stats, columns = ['nb_paragraphs', 'avg_paragraph_len'], index = index).dropna().describe()

In [None]:
# Histograms (50 bins) of the two per-document statistics gathered in
# `stats`, ignoring rows with missing values.
(
    pd.DataFrame(
        stats,
        columns=['nb_paragraphs', 'avg_paragraph_len'],
        index=index,
    )
    .dropna()
    .hist(bins=50)
)
plt.show()

In [None]:
#From here on, the code is not currently used, but I still want to keep it saved somewhere

## item 1A

In [None]:
# For every year, download each 10-K, locate its Item 1A with `get_tokens`
# and save one '<year>_limits.csv' file holding a 'start' and a 'stop' row.
save_path = '../../data/10k_statements_new/item_1A_limits/'
years = np.arange(2007,2025)

delayed_tokens = dask.delayed(get_tokens)
for year in years:
    # take the urls of all the 10-K filings for this year, indexed by ticker
    urls = stocknames.filter(like = 'url_{}'.format(year))
    urls.index = stocknames.ticker
    urls = urls.dropna()

    # compute in batches
    len_batch = 250
    # ceiling of the integer division
    nb_batches = -(len(urls) // -len_batch)

    # Per-document frames are collected here and concatenated once per year,
    # instead of growing a DataFrame with pd.concat inside the loop.
    limit_frames = []
    for batch in tqdm(range(nb_batches), desc = 'Finding item 1A of 10-Ks for {}'.format(year),
                     leave = False):
        start = batch * len_batch
        end = start + len_batch

        # get the index of the first and last paragraph of Item 1A of each document
        # (presumably what get_tokens returns with find_item1A_=True -- TODO confirm)
        allpromises = []
        for ticker, url in urls.iloc[start:end,:].itertuples():
            # the timeout keeps a silent server from blocking the cell forever
            text = requests.get(url, timeout = 60).text
            allpromises.append(delayed_tokens(text, ticker = ticker, find_item1A_ = True))

        with TqdmCallback(desc = 'Finding item 1A of 10-Ks in batch', leave = False):
            limits = dask.compute(allpromises)[0]

        limit_frames.extend(pd.DataFrame(l) for l in limits)

    if not limit_frames:
        # no filing for this year: nothing to save (and nothing was downloaded)
        print('No 10-K url found for {}, skipping'.format(year))
        continue

    # save the limits
    limits_df = pd.concat(limit_frames, axis=1)
    limits_df.index = ['start', 'stop']
    limits_df.to_csv(save_path+'{}_limits.csv'.format(year))
    # pause between two years
    time.sleep(90)

clear_output()
print('Done')

In [None]:
# Read the per-year Item 1A limits files and stack them into one frame whose
# row labels are suffixed with the year ('<label>_<year>').
limit_path = '../../data/10k_statements_new/item_1A_limits/{}_limits.csv'
yearly_limits = []
for year in years:
    year_limits = pd.read_csv(limit_path.format(year), index_col = 0)
    year_limits.index = year_limits.index+'_'+str(year)
    yearly_limits.append(year_limits)

# Concatenate once: cheaper than growing a frame inside the loop, and it avoids
# concatenating with an empty DataFrame (deprecated in recent pandas).
limits = pd.concat(yearly_limits, axis = 0) if yearly_limits else pd.DataFrame()

## Paragraph vectors

In [None]:
# Turn every saved token file into paragraph vectors and save one
# '<ticker>_vectors.csv.gz' file per firm-year observation.
token_path = '../../data/10k_statements_new/tokens/{}/*'
save_path = '../../data/10k_statements_new/paragraph_vectors/'
years = np.arange(2007,2025)
# NOTE(review): not referenced again in this cell -- confirm whether
# get_best_model is meant to use it.
model_ = 'model_dbow_v200_e50_w15'

#Save the vectors for all firm-year observations
for year in years:
    files = glob.glob(token_path.format(year))

    # NOTE(review): called once per year with no arguments; if the result does
    # not depend on the year it could be loaded once before the loop -- confirm.
    model = get_best_model()

    for file in tqdm(files, desc = f'Vectorizing paragraphs for {year}', leave = False):
        # Ticker = file name without the '_tokens.csv.gz' suffix. The pattern
        # accepts both '/' and '\' before the file name because glob joins the
        # name with the OS separator (a backslash on Windows).
        ticker = re.findall(r'\d[/\\](.*)_tokens\.csv\.gz', file)[0]
        # transposed so that each column of the frame is one row of the file
        paragraph_tokens = pd.read_csv(file, index_col = 0, dtype = 'object').T

        # one vector per column, tagged '10k0_<position>'
        vects = [
            get_vect(model, paragraph_tokens[col].dropna().values, '10k{}_{}'.format(0,j))
            for j, col in enumerate(paragraph_tokens.columns)
        ]

        vects = pd.DataFrame(vects)
        vects.to_csv(save_path+'{}/{}_vectors.csv.gz'.format(year, ticker), index = False)

clear_output()
print('Done')