<font size="6"> **Corpus Representation: SEC 10-K Filings** </font>

In [1]:
# Enable IPython's autoreload extension in mode 2: all imported modules are
# reloaded before each cell executes, so edits to the project's `src` package
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Execute the shared notebook configuration script inside this kernel.
# NOTE(review): the script is not visible here; presumably it makes the
# project root importable so that the `src.*` imports resolve - confirm.
%run ../nb_config.py

In [3]:
import gzip
import os
import pickle
import pprint
import re

import nltk
import numpy as np
import pandas as pd
import scipy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

from src.nlp_quant import bow_sent

In [4]:
from src import utils
from src.load_data import load_sec10k, io_utils
from src.nlp_quant import bow_sent

In [5]:
# Read the project configuration through the project's own helper.
# NOTE(review): `cfg` is not referenced by any other cell visible in this
# notebook - confirm it is still needed.
cfg = utils.read_conf()

In [6]:
# Interim-data directories. The trailing '' component makes os.path.join end
# every path with a separator, so file names can be appended directly.
INPATH1 = os.path.join(io_utils.interim_path, 'sec_fillings', '')
INPATH2 = os.path.join(INPATH1, 'ten_ks', '')
OUTPATH1 = os.path.join(io_utils.interim_path, 'sec_fillings', '')

# Sanity check: every directory must already exist (one boolean per path).
tuple(os.path.isdir(directory) for directory in (INPATH1, INPATH2, OUTPATH1))

(True, True, True)

In [22]:
# File names used by this notebook (combined with INPATH1 / OUTPATH1).
INFILE1 = 'metadata.pkl'  # pickled metadata, read from INPATH1
OUTFILE1 = 'parsed_sentiment_loughran_mcdonald.csv'  # preprocessed sentiment word list
OUTFILE2 = 'tenks_risk_tfidf_by_sent.pkl'  # TF-IDF results computed per sentiment
OUTFILE3 = 'tenks_risk_doc_len.csv'  # document lengths

# Load Sentiment Vocabularies and NLTK Corpora

In [9]:
# Load the pickled metadata produced upstream.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open(os.path.join(INPATH1, INFILE1), 'rb') as metadata_file:
    metadata = pickle.load(metadata_file)

In [10]:
# Fetch the NLTK resources used below: the English stop-word list and the
# WordNet corpus backing WordNetLemmatizer. Already-present packages are
# reported as up to date.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\manuelalberto.romero\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\manuelalberto.romero\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Package wordnet is already up-to-date!


True

In [19]:
# Token pattern: runs of word characters. A raw string is required here,
# because '\w' in a normal string literal is an invalid escape sequence
# (DeprecationWarning, and SyntaxWarning from Python 3.12 on).
re_word_pattern = re.compile(r'\w+')

# One lemmatizer instance shared by every preprocessing step in the notebook.
wlm = WordNetLemmatizer()

# Stop words are passed through the same lemmatization helper as the corpus
# tokens so both sides are compared in the same normalised form.
lemma_english_stopwords = bow_sent.lemmatize_words(wlm, stopwords.words('english'))

In [12]:
# Load the Loughran-McDonald sentiment word list through the project loader.
# The frame is used below as one boolean column per sentiment plus a 'word'
# column.
sentiment_df = load_sec10k.get_sentiment_loughran_mcdonald()

In [13]:
# Normalise the sentiment words the same way as the 10-K tokens (lower-case,
# then lemmatize) and keep only the first row for each resulting word.
sentiment_df = (
    sentiment_df
    .assign(word=bow_sent.lemmatize_words(wlm, sentiment_df['word'].str.lower()))
    .drop_duplicates('word')
)

In [14]:
# Eyeball five random rows. No random_state is passed, so the rows shown
# differ from run to run.
sentiment_df.sample(5)

Unnamed: 0,negative,positive,uncertainty,litigious,constraining,interesting,word
251,True,False,False,False,False,False,absenteeism
50507,False,False,False,True,False,False,offense
17744,True,False,False,False,False,False,cybercriminals
19600,True,False,False,False,False,False,deprecation
47017,True,False,False,False,False,False,monopolization


In [15]:
# Every column other than 'word' is a boolean sentiment flag.
sentiments = sentiment_df.columns.drop('word')

# Map each sentiment name to the array of words flagged with that sentiment.
sentiments_dict = {
    sentiment: sentiment_df.loc[sentiment_df[sentiment], 'word'].values
    for sentiment in sentiments
}

# Apply NLP Pipeline to 10-K Documents: TF-IDF Representation

Pipeline steps:
1. Tokenization
2. Lemmatization
3. Stop words Removal
4. Representation: TF-IDF
    * Compute a numerical representation of the corpus as a matrix, where each row is a document and each column is a vocabulary token. Each value is a trade-off between TF and IDF. Each document is characterized only by its token frequencies, so no strict semantic relationships are captured.
    * TF: Token Frequency: the frequency of token j in document i.
    * IDF: Inverse Document Frequency: a penalization based on the number of documents in which token j appears.

In [21]:
# Run the project's batched TF-IDF extraction over the 10-K files in INPATH2,
# reusing the lemmatizer, lemmatized stop words and token pattern defined
# earlier, with one vocabulary per sentiment (sentiments_dict).
# Long-running: the recorded progress output shows 34 batches at roughly
# 2-3 minutes each.
tf_idf_by_sent = bow_sent.batch_tfidf(inpath=INPATH2, batch_size=250,
                                      lemmatizer=wlm, stopwords=lemma_english_stopwords, re_word_pattern=re_word_pattern,
                                      vocabs=sentiments_dict)

Extracting tf-idf:   0%|                                                                     | 0/34 [00:00<?, ?batch/s]INFO:src.nlp_quant.bow_sent:Tickers in batch: ['abbv', 'adi', 'aee', 'abt', 'ads', 'adm', 'adp', 'aap', 'abc', 'adbe', 'aapl', 'aal', 'acn', 'adsk']
Extracting tf-idf:   3%|█▋                                                        | 1/34 [02:14<1:14:01, 134.59s/batch]INFO:src.nlp_quant.bow_sent:Tickers in batch: ['aig', 'aet', 'aee', 'afl', 'agn', 'aiv', 'alk', 'alb', 'aep', 'aes', 'akam', 'ajg', 'algn', 'aiz']
Extracting tf-idf:   6%|███▍                                                      | 2/34 [05:45<1:23:55, 157.36s/batch]INFO:src.nlp_quant.bow_sent:Tickers in batch: ['all', 'anss', 'amgn', 'alxn', 'antm', 'amp', 'alk', 'ame', 'alle', 'amat', 'amg', 'amd', 'amzn', 'amt']
Extracting tf-idf:   9%|█████                                                     | 3/34 [08:52<1:25:54, 166.28s/batch]INFO:src.nlp_quant.bow_sent:Tickers in batch: ['ayi', 'are', 'apa', 'apd', '

In [13]:
def batch_doc_len(inpath, batch_size, re_word_pattern=None, max_files=None):
    """Compute the length of every gzip-compressed document in ``inpath``.

    Files are processed in batches; for each batch the index is built with
    ``bow_sent.filenames_to_index`` and the per-batch results are concatenated
    into a single Series.

    Parameters
    ----------
    inpath : str
        Directory containing the gzip-compressed documents.
    batch_size : int
        Maximum number of files per batch. Must be at least 1.
    re_word_pattern : compiled regular expression, optional
        When given, a document's length is the number of non-overlapping
        matches of this pattern in its text (a token count). When ``None``
        (default), the length is the number of characters of the decoded text.
    max_files : int, optional
        Process only the first ``max_files`` files (in sorted order), which is
        handy for quick trial runs. ``None`` (default) processes every file.

    Returns
    -------
    pandas.Series
        Series named ``'doc_len'`` holding one length per document, indexed by
        whatever ``bow_sent.filenames_to_index`` returns for the file names.
        Empty when the directory contains no files.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    in_listdir = sorted(os.listdir(inpath))
    if max_files is not None:
        in_listdir = in_listdir[:max_files]

    # Nothing to read: pd.concat would raise on an empty list of Series.
    if not in_listdir:
        return pd.Series(name='doc_len', dtype='int64')

    # Ceiling division: always at least one batch, and no batch larger than
    # batch_size (flooring yields 0 sections when batch_size > number of files).
    n_batches = int(-(-len(in_listdir) // batch_size))
    in_listdir_batches = np.array_split(in_listdir, n_batches)

    doc_len_df_lst = []
    for batch in tqdm(in_listdir_batches, desc='Computing document lengths', unit='batch'):
        docs_meta = bow_sent.filenames_to_index(batch)
        docs_len_lst = []
        for file in batch:
            with gzip.open(os.path.join(inpath, file), "rb") as f:
                doc = f.read().decode()
            if re_word_pattern is None:
                docs_len_lst.append(len(doc))
            else:
                docs_len_lst.append(len(re_word_pattern.findall(doc)))

        doc_len_df_lst.append(pd.Series(index=docs_meta, data=docs_len_lst, name='doc_len'))

    # Concatenate the per-batch Series (not the last batch's raw length list).
    return pd.concat(doc_len_df_lst)

In [20]:
# NOTE(review): bow_sent is already imported in the import cells at the top of
# the notebook; this re-import is redundant.
from src.nlp_quant import bow_sent

# This calls the implementation in the bow_sent module (which accepts
# re_word_pattern), not the notebook-local batch_doc_len function.
doc_lens = bow_sent.batch_doc_len(inpath=INPATH2, batch_size=1000, re_word_pattern=re_word_pattern)

Extracting tf-idf: 100%|██████████| 8/8 [02:41<00:00, 20.23s/batch]


In [21]:
# Preview the first rows of the document-length result.
doc_lens.head()

ticker  date      
aal     2003-04-15    23454
        2004-02-27    22242
        2005-02-25    23472
        2006-02-24    38997
        2007-02-23    39831
Name: doc_len, dtype: int64

# Write Preprocessed 10Ks

In [31]:
# Persist the preprocessed sentiment word list; the row index is not written.
sentiment_df.to_csv(os.path.join(OUTPATH1, OUTFILE1), index=False)

In [32]:
# Persist the per-sentiment TF-IDF results under OUTPATH1 using the project's
# writer helper.
bow_sent.write_sent_tfidf_dict(path=OUTPATH1, name=OUTFILE2, sent_tfidf_dict=tf_idf_by_sent)

In [23]:
# Persist the document lengths, keeping the index in the CSV.
doc_lens.to_csv(os.path.join(OUTPATH1, OUTFILE3))