## Tools to Build Corpus

In [0]:
# Extra domain-specific stop words. Note: the only line that would add these
# to the stop-word set (in the next cell) is currently commented out.
stopword_adds = [
    "chief",
    "officer",
    "financial",
    "result",
    "call",
]

In [74]:
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Fetch the NLTK resources used below: WordNet (for the lemmatizer) and the
# stop-word lists.
for resource in ('wordnet', 'stopwords'):
    nltk.download(resource)

# English stop words that clean() drops from every document.
stop = set(stopwords.words('english'))
#[stop.add(x) for x in stopword_adds]

# Punctuation characters that clean() strips from every document.
exclude = set(string.punctuation)

# Single lemmatizer instance shared by every clean() call.
lemma = WordNetLemmatizer()

def clean(doc):
    """Normalize one document into a space-separated string of lemmas.

    The text is lower-cased, hyphens are turned into spaces, stop words are
    dropped, punctuation is stripped and every remaining token is lemmatized
    with the module-level WordNet lemmatizer.

    Stop words are tested on three forms of each token: as written, with
    leading/trailing punctuation trimmed, and with all punctuation removed.
    Checking only the raw token lets stop words that touch punctuation
    ("the,", "and.") or that are joined by hyphens ("state-of-the-art")
    slip through into the cleaned text.

    Args:
        doc (str): Raw text of a single document or paragraph.

    Returns:
        str: The cleaned tokens joined by single spaces; an empty string when
        nothing survives the filtering.
    """
    punctuation = "".join(exclude)
    kept = []

    # Split hyphenated compounds first so their parts are filtered individually.
    for raw_token in doc.lower().replace("-", " ").split():
        # Trim edge punctuation only, so contractions such as "don't," still
        # match their stop-word entry.
        trimmed = raw_token.strip(punctuation)
        if raw_token in stop or trimmed in stop:
            continue

        token = ''.join(ch for ch in trimmed if ch not in exclude)
        # Skip tokens that were pure punctuation or that reduce to a stop word.
        if not token or token in stop:
            continue

        kept.append(lemma.lemmatize(token))

    return " ".join(kept)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Read Call Pickle and Build Metadata and Text Dictionaries

In [0]:
from IPython.display import clear_output

# import the call dataset
import pandas as pd
import pickle
import os

from google.colab import drive
drive.mount('/content/gdrive')

# Location of the pickled calls on the mounted Google Drive.
calls_pickle_path = "/content/gdrive/My Drive/School/DATS 6450 - NLP/Project/ProcessedCallsPickle"

# Fail loudly when the pickle is missing. Skipping the load silently would
# leave `calls` undefined (NameError below) or, worse, stale from an earlier
# run of this cell.
if not os.path.exists(calls_pickle_path):
    raise FileNotFoundError(calls_pickle_path)

# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# The with-block closes the file even if unpickling fails.
with open(calls_pickle_path, "rb") as pfile:
    calls = pickle.load(pfile)

# Split every call record into two dictionaries that share the keys of `calls`.
metadata = {link: data["metadata"] for link, data in calls.items()}
text = {link: data["text"] for link, data in calls.items()}

clear_output()

## Filter Out Calls Missing Important Metadata

In [76]:
from datetime import datetime
from dateutil.parser import parse

# One row per call, indexed by the keys of `metadata`.
df_metadata = pd.DataFrame.from_dict(metadata, orient="index")
print(df_metadata.shape)

# Columns a call must have to be kept.
key_fields = ['company', 'ticker', 'period', 'period_end_date', 'call date']

# Parse both date columns; values that cannot be parsed become NaT and are
# removed by the dropna() below.
for date_column in ("period_end_date", "call date"):
    df_metadata[date_column] = pd.to_datetime(df_metadata[date_column], errors='coerce')

# Keep only the key fields and discard calls missing any of them.
df_metadata = df_metadata.loc[:, key_fields].dropna()

# Expose the index as a regular column.
df_metadata["Link"] = df_metadata.index.values
print(df_metadata.shape)

(5195, 12)
(5101, 6)


In [77]:
# Number of calls before filtering.
print(len(text))

# Keep only the calls that are still present in df_metadata.
kept_links = df_metadata.index.values
text = {link: text[link] for link in kept_links}

# Number of calls after filtering.
print(len(text))

5195
5101


## Build Cleaned Corpus

#### Clean Each Paragraph of Each Call

In [0]:
# Columns of the combined paragraph-level frame, in output order.
text_columns = ["Link","Speaker","Call Section","Text","Cleaned Text","Call Paragraph Num"]

# Per-call frames are collected here and concatenated once after the loop;
# concatenating inside the loop re-copies the accumulated frame every
# iteration, which is quadratic in the number of calls.
call_frames = []

for key, value in text.items():
    # The text dataframe associated with this call. It is modified in place,
    # so the frames stored in `text` gain the three columns added below.
    call_text = value

    call_text["Link"] = key

    # Cleaned version of each paragraph.
    call_text["Cleaned Text"] = [clean(paragraph) for paragraph in call_text["Text"]]

    # The frame's own index, as a string, identifies the paragraph within the call.
    call_text["Call Paragraph Num"] = call_text.index.values.astype(str)

    call_frames.append(call_text[text_columns])

# pd.concat raises on an empty list, so fall back to an empty frame that has
# the expected columns.
if call_frames:
    df_text = pd.concat(call_frames)
else:
    df_text = pd.DataFrame(columns=text_columns)

In [79]:
# Attach each call's metadata to its paragraphs ("Link" is the only column the
# two frames share), then index every row by "<ticker> <period> <paragraph num>".
df = (
    df_text.merge(df_metadata, on="Link")
    .assign(**{
        "Paragraph Identifier": lambda merged: (
            merged["ticker"] + " " + merged["period"] + " " + merged["Call Paragraph Num"]
        )
    })
    .set_index("Paragraph Identifier")
)
df.shape

(640132, 11)

## Save Call Pickle if Necessary

In [0]:
# Bundle the filtered metadata and the merged paragraph-level frame for pickling.
calls_updated = {
    "metadata": df_metadata,
    "text": df,
}

In [0]:
# Destination of the updated pickle on the mounted Google Drive.
updated_pickle_path = "/content/gdrive/My Drive/School/DATS 6450 - NLP/Project/ProcessedCallsPickleUpdated_v2"

# "wb" creates the file or truncates an existing one, so no separate remove
# step is needed; the with-block closes the handle even if pickling fails.
with open(updated_pickle_path, "wb") as pfile:
    pickle.dump(calls_updated, pfile)