## Preprocessing: Tools

#### Main Imports

In [0]:
! pip install pyLDAvis

from IPython.display import clear_output

import pprint
import numpy as np
import nltk
nltk.download('wordnet')
import re

import spacy
spacynlp = spacy.load('en_core_web_sm')

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this

import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Download the NLTK 'stopwords' resource that stopwords.words() reads below.
nltk.download('stopwords')

from nltk.corpus import stopwords
# English stop words held as a set; clean() tests each token for membership in it.
stop = set(stopwords.words('english'))

import string
# The individual characters of string.punctuation; clean() deletes these characters.
exclude = set(string.punctuation)

from nltk.stem.wordnet import WordNetLemmatizer

# Single WordNet lemmatizer instance reused by clean() for every token.
lemma = WordNetLemmatizer()

def clean(doc):
    """Normalize a document into lower-case, lemmatized, space-separated tokens.

    Processing order:
      1. lower-case the text, split on whitespace and drop tokens found in ``stop``;
      2. replace every hyphen with a space;
      3. delete every character found in ``exclude``;
      4. lemmatize each whitespace-separated token with ``lemma`` and re-join.

    Stop-word filtering runs before punctuation is stripped, so tokens are
    compared against ``stop`` with any punctuation still attached.

    Args:
        doc: the raw text (string) to normalize.

    Returns:
        The normalized text as a single space-separated string.
    """
    kept_tokens = []
    for token in doc.lower().split():
        if token not in stop:
            kept_tokens.append(token)
    text = " ".join(kept_tokens).replace("-", " ")

    # Character-level filter: anything listed in `exclude` is removed outright.
    text = "".join(filter(lambda ch: ch not in exclude, text))

    lemmas = [lemma.lemmatize(token) for token in text.split()]
    return " ".join(lemmas)
  
clear_output()

#### Get CIK for a Ticker

In [0]:
from bs4 import BeautifulSoup
import requests
from urllib.parse import urlparse

def get_CIK(ticker):
    """Look up the CIK that SEC EDGAR reports for a ticker symbol.

    Requests the EDGAR company-browse page for ``ticker`` and returns the
    ``value`` attribute of the first element whose ``name`` attribute is
    ``"CIK"``.

    Args:
        ticker: ticker symbol, sent as the ``CIK`` query parameter.

    Returns:
        The ``value`` attribute of the matched element, as found in the page.

    Raises:
        TypeError: the response contains no element named ``CIK``. The type is
            kept from the original failure mode (subscripting ``None``) so
            existing ``except TypeError`` handlers keep working; the message
            now names the ticker and the HTTP status.
        requests.exceptions.RequestException: the request failed or exceeded
            the timeout.
    """
    page = requests.get(
        "https://www.sec.gov/cgi-bin/browse-edgar",
        # Let requests build the query string so the ticker is URL-encoded
        # instead of being concatenated into the URL verbatim.
        params={"CIK": ticker, "owner": "exclude", "action": "getcompany"},
        # Without a timeout requests can wait on an unresponsive server forever.
        timeout=30,
    )
    soup = BeautifulSoup(page.text, "html.parser")
    cik_field = soup.find(attrs={"name": "CIK"})
    if cik_field is None:
        raise TypeError(
            "No CIK field in the EDGAR response for ticker {!r} "
            "(HTTP status {})".format(ticker, page.status_code)
        )
    return cik_field["value"]

## Preprocessing: Bring in Data


#### Read Call Pickle and Build Meta-data and Text DataFrames

In [0]:
# Google Drive folder that the cells below read their inputs from
# (calls pickle, companylist.csv, 2018q4/) and write the Pickles/ outputs to.
data_directory = "/content/gdrive/My Drive/DATS6450 - Labadie - Data/"

# import the call dataset
import pandas as pd
import numpy as np
import pickle
import os

# Mount Google Drive at /content/gdrive so that data_directory resolves.
from google.colab import drive
drive.mount('/content/gdrive')

# Load the processed-calls pickle and split it into its two frames.
# The file is opened unconditionally: a missing file now raises
# FileNotFoundError right here, instead of skipping the load and then failing
# with NameError on `calls` (or silently reusing a stale `calls` from an
# earlier run). The with-block closes the handle even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open(data_directory+"ProcessedCallsPickleUpdated_v2", "rb") as pfile:
    calls = pickle.load(pfile)

df_metadata = calls["metadata"]
df_text = calls["text"]
  
clear_output()

print("Metadata Shape:",df_metadata.shape, "Text Shape:",df_text.shape)

#### List of Companies and Industries/Sectors

In [4]:
# Company reference table; it is joined to the call metadata on its "Symbol" column further down.
company_list = pd.read_csv(data_directory+"companylist.csv")
# Bare last expression so the notebook displays (rows, columns).
company_list.shape

(5285, 9)

#### Financial Statement Data

In [0]:
# Folder holding the 2018q4 financial-statement tables.
directory = data_directory + "2018q4/"

# Read the four tables in the same order as before (sub, tag, pre, num).
# "cik" is forced to object dtype wherever that column is present.
loaded = {}
for table_file in ("sub.txt", "tag.txt", "pre.txt", "num.txt"):
    filename = directory + table_file
    loaded[table_file] = pd.read_table(filename, dtype={"cik": object})

df_sub = loaded["sub.txt"]
df_tag = loaded["tag.txt"]
df_pre = loaded["pre.txt"]
df_num = loaded["num.txt"]

# Default merges: with no `on=` given, pandas joins on the columns the two
# frames have in common.
df_fin_stmt = df_sub.merge(df_pre)
df_fin_stmt = df_fin_stmt.merge(df_num)
df_fin_stmt = df_fin_stmt.merge(df_tag)

# Distinct line items of the rows whose stmt is "IS"
# (presumably the income statement - confirm against the data set's readme).
is_rows = df_fin_stmt["stmt"] == "IS"
item_columns = ["cik", "line", "plabel", "tlabel"]
df_fin_stmt_items = df_fin_stmt.loc[is_rows, item_columns].drop_duplicates()

<br>
<br>
<hr>

## Build Metadata DataFrame

In [0]:
# Left join: keeps every call row and attaches the company_list columns where ticker == Symbol.
# NOTE(review): the result overwrites df_metadata, so re-running this cell merges
# company_list a second time; reload df_metadata from the pickle before re-running.
df_metadata = df_metadata.merge(company_list, left_on=["ticker"], right_on=["Symbol"], how="left")

<br>
<br>
<hr>

## Build Text DataFrames

Add the full text of each call's prepared remarks (operator lines excluded) to the metadata DataFrame as a new `Prepared Remarks Text` column.

In [0]:
# Accumulators declared by the original cell; only corpus_whole_calls is
# filled here, the others are kept so that any later cell using them still runs.
corpus, call_names, paragraph_nums, call_names_whole_calls = [], [], [], []

# Filter df_text ONCE instead of re-scanning the whole frame for every
# metadata row (the original loop cost rows(df_metadata) x rows(df_text)).
remarks_only = df_text[(df_text["Call Section"]=="Prepared Remarks")&
                       (df_text["Speaker"]!="Operator")]

# One space-joined string per call link. groupby keeps the rows of each group
# in their original df_text order, so the joined text matches the loop's output.
remarks_by_link = remarks_only.groupby("Link", sort=False)["Text"].agg(" ".join)

# Align the joined text to the df_metadata rows. A call with no matching
# remarks gets "" - the same value " ".join([]) produced in the original loop.
corpus_whole_calls = df_metadata["Link"].map(remarks_by_link).fillna("").tolist()

df_metadata["Prepared Remarks Text"] = corpus_whole_calls

Create a DataFrame holding the individual paragraphs of the prepared remarks (operator lines excluded).

In [0]:
# Keep the df_text rows that belong to the "Prepared Remarks" section and were
# not spoken by the "Operator".
in_prepared_remarks = df_text["Call Section"] == "Prepared Remarks"
not_operator = df_text["Speaker"] != "Operator"
prepared_remarks = df_text.loc[in_prepared_remarks & not_operator]

## Pickle Key Objects

In [23]:
# Same setup as the data-loading cell earlier in the notebook (path, imports,
# Drive mount), repeated here ahead of the pickling cells.
data_directory = "/content/gdrive/My Drive/DATS6450 - Labadie - Data/"

# import the call dataset
import pandas as pd
import numpy as np
import pickle
import os

# Mount Google Drive at /content/gdrive so that data_directory resolves.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Pickle Metadata DataFrame

In [0]:
# Write df_metadata to Pickles/metadata.pkl. Mode "wb" creates the file or
# truncates an existing one, which replaces the original remove-then-append
# sequence; the with-block closes the handle even if pickle.dump raises.
with open(data_directory+"Pickles/metadata.pkl", "wb") as pfile:
    pickle.dump(df_metadata, pfile)

Pickle Paragraphs DataFrame

In [0]:
# Write prepared_remarks to Pickles/text_paragraph.pkl. Mode "wb" creates the
# file or truncates an existing one, which replaces the original
# remove-then-append sequence; the with-block closes the handle even if
# pickle.dump raises.
with open(data_directory+"Pickles/text_paragraph.pkl", "wb") as pfile:
    pickle.dump(prepared_remarks, pfile)

Pickle Financial Statements

In [0]:
# Write df_fin_stmt to Pickles/finstmts.pkl. Mode "wb" creates the file or
# truncates an existing one, which replaces the original remove-then-append
# sequence; the with-block closes the handle even if pickle.dump raises.
with open(data_directory+"Pickles/finstmts.pkl", "wb") as pfile:
    pickle.dump(df_fin_stmt, pfile)