<a href="https://colab.research.google.com/github/mikelabadie/Earnings_Call_Transcripts/blob/master/analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Upload Call Pickle if Necessary

In [0]:
from google.colab import files

# Open Colab's browser upload dialog; the result is keyed by uploaded file name.
uploaded = files.upload()

# Echo each received file and the length of its content as a quick sanity check.
for fn, payload in uploaded.items():
    print('User uploaded file "{name}" with length {length} bytes'.format(name=fn, length=len(payload)))

Saving ProcessedCallsPickle to ProcessedCallsPickle
User uploaded file "ProcessedCallsPickle" with length 263075028 bytes


## Read Call Pickle and Build Meta-data and Text DataFrames

In [0]:
# import the call dataset
import pandas as pd
import pickle
import os

from google.colab import drive
# Mount Google Drive so the processed-call pickle is reachable under /content/gdrive.
drive.mount('/content/gdrive')

# Single definition of the pickle location (previously repeated as a literal).
calls_pickle_path = "/content/gdrive/My Drive/School/DATS 6450 - NLP/Code/Project/ProcessedCallsPickle"

if not os.path.exists(calls_pickle_path):
    # Fail here with an explicit message. Silently skipping the load would either
    # raise a NameError on `calls` below or reuse a stale `calls` left in the kernel.
    raise FileNotFoundError("Processed call pickle not found at: " + calls_pickle_path)

# NOTE: pickle.load can execute arbitrary code embedded in the file;
# only load pickles that you produced yourself.
# The context manager closes the file even if unpickling fails.
with open(calls_pickle_path, "rb") as pfile:
    calls = pickle.load(pfile)

# `calls` maps a call link to a record holding at least "metadata" and "text".
# One metadata row per call, indexed by link.
metadata = {link: data["metadata"] for link, data in calls.items()}
df_metadata = pd.DataFrame.from_dict(metadata, orient="index")

# Per-call text object, keyed by the same link as df_metadata's index.
text = {link: data["text"] for link, data in calls.items()}

## Tools to Build Corpus

In [0]:
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Fetch the NLTK resources used below: WordNet (lemmatizer) and the stopword lists.
nltk.download('wordnet')
nltk.download('stopwords')

# Module-level lookups shared by clean(): English stopwords, punctuation characters,
# and a single reusable lemmatizer instance.
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(doc):
    """Normalize one document string for vectorization.

    Steps, in order:
      1. lowercase the text and split it on whitespace,
      2. drop every token found in the module-level ``stop`` set,
      3. delete every character found in the module-level ``exclude`` set,
      4. lemmatize each remaining whitespace-separated token with ``lemma``.

    Stopwords are matched before punctuation is removed, so a stopword with
    punctuation attached (e.g. ``"the,"``) is not filtered out in step 2.

    Returns the processed tokens joined by single spaces.
    """
    kept_tokens = [token for token in doc.lower().split() if token not in stop]
    without_stopwords = " ".join(kept_tokens)
    without_punctuation = "".join(filter(lambda ch: ch not in exclude, without_stopwords))
    lemmas = map(lemma.lemmatize, without_punctuation.split())
    return " ".join(lemmas)

from gensim import corpora
import gensim

## Build Cleaned Corpus

In [0]:
# Only calls whose ticker, period and call date are all present are used.
calls_to_use = df_metadata.dropna(subset=["ticker", "period", "call date"])

# `corpus` and `names` are filled in lockstep: names[i] labels corpus[i].
corpus, names = [], []
for link, call_meta in calls_to_use.iterrows():
    # Label of the form "<ticker> <period>".
    names.append(call_meta["ticker"] + " " + call_meta["period"])

    # Keep only rows from the "Prepared Remarks" section not spoken by the Operator.
    transcript = text.get(link)
    in_prepared_remarks = transcript["Call Section"] == "Prepared Remarks"
    not_operator = transcript["Speaker"] != "Operator"
    remark_texts = transcript.loc[in_prepared_remarks & not_operator, "Text"]

    # Clean every passage and merge them into one document per call.
    corpus.append(" ".join(clean(passage) for passage in remark_texts))

## Word Vectors

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

def _vocabulary_terms(fitted_vectorizer):
    """Return the fitted vectorizer's terms in column order.

    scikit-learn deprecated ``get_feature_names()`` in 1.0 and removed it in 1.2
    in favour of ``get_feature_names_out()``; fall back to the old name only on
    releases that predate the new one.
    """
    if hasattr(fitted_vectorizer, "get_feature_names_out"):
        return fitted_vectorizer.get_feature_names_out()
    return fitted_vectorizer.get_feature_names()


# count vectorizer: raw term counts per call
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# toarray() densifies the sparse matrix so it can be wrapped in a DataFrame
df = pd.DataFrame(X.toarray(), columns=_vocabulary_terms(vectorizer))
# transpose to terms x calls and label each column with its "<ticker> <period>" name
cnt_vect = df.T
cnt_vect.columns = names

# tf-idf vectorizer: same layout, TF-IDF weights instead of raw counts
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
df = pd.DataFrame(X.toarray(), columns=_vocabulary_terms(vectorizer))
tfidf_vect = df.T
tfidf_vect.columns = names
# calls x terms view, convenient for looking up one term across all calls
tfidf_vectT = tfidf_vect.T

## Analysis of Word Vectors

In [12]:
# TF-IDF weight of the term "trump" for every call, largest first.
print(tfidf_vectT.loc[:, "trump"].sort_values(ascending=False))

X Q3 2018        0.073439
UHAL Q2 2019     0.057219
DPLO Q2 2018     0.052207
ELF Q3 2018      0.049803
LMT Q4 2017      0.044921
LCII Q3 2018     0.044157
GBX Q2 2018      0.041901
REVG Q4 2017     0.040668
ULH Q4 2018      0.039593
STMP Q3 2018     0.039032
IIIN Q1 2019     0.036888
EBIX Q4 2018     0.034870
IIIN Q4 2018     0.034385
CMC Q3 2018      0.034175
HOMB Q3 2018     0.030315
AXTI Q3 2018     0.029397
GLDD Q4 2018     0.028459
SD Q4 2018       0.025309
NUE Q3 2018      0.025288
DPLO Q3 2018     0.023847
CACI Q1 2019     0.023441
STLD Q4 2017     0.022615
HR Q4 2018       0.022097
ASTE Q4 2018     0.020890
ARCB Q3 2018     0.020110
TGNA Q3 2018     0.019222
AGO Q4 2018      0.018506
AMBC Q4 2018     0.018134
LCII Q4 2018     0.018086
INGN Q4 2018     0.017111
                   ...   
CTO Q4 2018      0.000000
CUB Q1 2019      0.000000
WCG Q4 2018      0.000000
CMI Q4 2018      0.000000
DASTY Q4 2018    0.000000
ECHO Q4 2018     0.000000
ENTA Q1 2019     0.000000
PLUS Q3 2019

## Save Call Pickle if Necessary

In [0]:
# Overwrite the pickle on the mounted Drive with the current in-memory `calls`.
# "wb" truncates any existing file, so no separate exists/remove step is needed,
# and unlike "ab" it can never append a second pickle behind an old one.
# The context manager closes the handle even if pickle.dump raises.
with open("/content/gdrive/My Drive/School/DATS 6450 - NLP/Code/Project/ProcessedCallsPickle", "wb") as pfile:
    pickle.dump(calls, pfile)

**TODO / ideas:**

- Paragraph similarity across calls
- Text reuse: Croft
- News in Essence: CMU (lady)
- Plot 1 -> Plot 2 -> Plot 3