In [1]:
import numpy as np
import pandas as pd
from nltk.sentiment import vader

In [2]:
call_data = pd.read_csv('data/earningscall_fraud.csv')

print(call_data.describe())

        Unnamed: 0         PRES  TURN_AT_TALK          CEO    WORDCOUNT  \
count  1114.000000  1114.000000   1114.000000  1114.000000  1114.000000   
mean    556.500000     0.581688     15.447935     0.793537    18.205566   
std     321.728405     0.493504     19.025057     0.404949     8.940149   
min       0.000000     0.000000      2.000000     0.000000     1.000000   
25%     278.250000     0.000000      3.000000     1.000000    12.000000   
50%     556.500000     1.000000      4.000000     1.000000    17.000000   
75%     834.750000     1.000000     24.000000     1.000000    23.000000   
max    1113.000000     1.000000     93.000000     1.000000    62.000000   

       Restatement Topic        FRAUD  
count        1114.000000  1114.000000  
mean            0.176840     0.352783  
std             0.381705     0.636108  
min             0.000000     0.000000  
25%             0.000000     0.000000  
50%             0.000000     0.000000  
75%             0.000000     1.000000  
max 

In [3]:
# percent of fraud
print(call_data['Restatement Topic'].mean())

0.17684021543985637


In [6]:
from gensim.parsing.preprocessing import preprocess_string
from gensim import corpora

call_data['clean_text'] = call_data['Sentence'].apply(preprocess_string)
print(call_data.loc[1, ['Sentence', 'clean_text']])

Sentence      Welcome to Northwest Pipe's conference call an...
clean_text    [welcom, northwest, pipe, confer, announc, ear...
Name: 1, dtype: object


In [7]:
dictionary = corpora.Dictionary(call_data['clean_text'])
print(dictionary)

Dictionary(1116 unique tokens: ['diann', 'thank', 'announc', 'confer', 'earn']...)


In [8]:
bow_corpus = [dictionary.doc2bow(text) for text in call_data['clean_text']]

In [9]:
from gensim import models

lda_10 = models.LdaModel(bow_corpus, num_topics=10, id2word=dictionary)

In [10]:
for topic in lda_10.show_topics():
    print("Topic", topic[0], ":", topic[1])

Topic 0 : 0.018*"look" + 0.018*"expect" + 0.017*"project" + 0.015*"like" + 0.014*"quarter" + 0.013*"demand" + 0.012*"impact" + 0.012*"time" + 0.011*"signific" + 0.010*"date"
Topic 1 : 0.034*"cost" + 0.026*"steel" + 0.020*"quarter" + 0.020*"price" + 0.016*"product" + 0.015*"busi" + 0.015*"chang" + 0.013*"higher" + 0.012*"result" + 0.010*"materi"
Topic 2 : 0.051*"think" + 0.033*"million" + 0.030*"quarter" + 0.018*"time" + 0.016*"fourth" + 0.015*"cost" + 0.014*"year" + 0.014*"abl" + 0.013*"increas" + 0.011*"expens"
Topic 3 : 0.027*"quarter" + 0.022*"project" + 0.018*"littl" + 0.018*"water" + 0.015*"price" + 0.014*"bit" + 0.013*"expect" + 0.013*"think" + 0.012*"fourth" + 0.012*"go"
Topic 4 : 0.021*"cost" + 0.016*"steel" + 0.014*"share" + 0.014*"market" + 0.013*"lower" + 0.012*"year" + 0.012*"sell" + 0.012*"margin" + 0.012*"price" + 0.011*"million"
Topic 5 : 0.036*"quarter" + 0.031*"product" + 0.028*"million" + 0.016*"project" + 0.015*"market" + 0.014*"tubular" + 0.013*"steel" + 0.013*"busi

In [11]:
print('Perplexity: ', lda_10.log_perplexity(bow_corpus))

Perplexity:  -6.905256194913186


In [12]:
from gensim.models import CoherenceModel
coherence_model_lda = CoherenceModel(model=lda_10, texts=call_data['clean_text'], dictionary=dictionary, coherence='u_mass')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  -6.729851155165351


In [13]:
stopwords = []
with open('data/stoplist.txt', 'r') as f:
    stopwords = f.read().splitlines()

In [14]:
def remove_stopwords(text):
    """ preprocess string and remove words from custom stopword list. """
    result = []

    for word in preprocess_string(text):
        if word not in stopwords:
            result.append(word)
    return result

call_data['clean_newstop'] = call_data['Sentence'].apply(remove_stopwords)

In [15]:
new_dictionary = corpora.Dictionary(call_data['clean_newstop'])
print(new_dictionary)

new_corpus = [new_dictionary.doc2bow(text) for text in call_data['clean_newstop']]

Dictionary(1001 unique tokens: ['diann', 'announc', 'confer', 'earn', 'northwest']...)


In [16]:
lda_new = models.LdaModel(new_corpus, num_topics=10, id2word=new_dictionary)

for topic in lda_new.show_topics():
    print("Topic", topic[0], ":", topic[1])

Topic 0 : 0.045*"steel" + 0.021*"ton" + 0.014*"overal" + 0.014*"probabl" + 0.014*"strong" + 0.014*"project" + 0.012*"backlog" + 0.012*"lower" + 0.010*"continu" + 0.010*"month"
Topic 1 : 0.019*"signific" + 0.015*"steel" + 0.015*"project" + 0.013*"higher" + 0.013*"improv" + 0.011*"tubular" + 0.011*"probabl" + 0.011*"continu" + 0.011*"date" + 0.009*"construct"
Topic 2 : 0.031*"result" + 0.027*"compar" + 0.024*"steel" + 0.021*"water" + 0.018*"transmiss" + 0.016*"materi" + 0.013*"group" + 0.011*"project" + 0.011*"caus" + 0.011*"rang"
Topic 3 : 0.034*"backlog" + 0.022*"certainli" + 0.020*"project" + 0.020*"tubular" + 0.015*"brian" + 0.015*"month" + 0.014*"steel" + 0.013*"opportun" + 0.013*"fund" + 0.013*"bid"
Topic 4 : 0.022*"tubular" + 0.017*"share" + 0.015*"project" + 0.015*"ye" + 0.014*"energi" + 0.012*"water" + 0.012*"steel" + 0.010*"lower" + 0.010*"result" + 0.009*"agenc"
Topic 5 : 0.039*"group" + 0.027*"tubular" + 0.021*"gross" + 0.020*"profit" + 0.019*"activ" + 0.018*"compar" + 0.016*

In [17]:
for doc in new_corpus[0:9]:
    print(lda_new.get_document_topics(doc))

[(0, 0.050003044), (1, 0.050003044), (2, 0.050003044), (3, 0.050003044), (4, 0.050003044), (5, 0.050003044), (6, 0.050003044), (7, 0.54997265), (8, 0.050003044), (9, 0.050003044)]
[(0, 0.014286468), (1, 0.014286224), (2, 0.014286485), (3, 0.014286732), (4, 0.014288322), (5, 0.014287145), (6, 0.87141764), (7, 0.014287321), (8, 0.014286282), (9, 0.014287413)]
[(3, 0.93076515)]
[(0, 0.0111124), (1, 0.0111117475), (2, 0.01111558), (3, 0.011112106), (4, 0.8999831), (5, 0.011113911), (6, 0.011111682), (7, 0.011114508), (8, 0.011111579), (9, 0.011113446)]
[(2, 0.92499626)]
[(0, 0.016669285), (1, 0.84996015), (2, 0.0166755), (3, 0.016674267), (4, 0.016669672), (5, 0.0166708), (6, 0.016669162), (7, 0.016670417), (8, 0.01667107), (9, 0.016669702)]
[(0, 0.1), (1, 0.1), (2, 0.1), (3, 0.1), (4, 0.1), (5, 0.1), (6, 0.1), (7, 0.1), (8, 0.1), (9, 0.1)]
[(0, 0.020003922), (1, 0.020002587), (2, 0.020010136), (3, 0.02000251), (4, 0.020003138), (5, 0.020003296), (6, 0.020001251), (7, 0.020003738), (8, 0.0

In [18]:
print('Perplexity: ', lda_new.log_perplexity(new_corpus))

Perplexity:  -7.189317498732393


In [19]:
print('Perplexity: ', lda_new.log_perplexity(new_corpus))

Perplexity:  -7.189775884634407


In [20]:
from gensim.matutils import corpus2csc
all_topics = lda_new.get_document_topics(new_corpus, minimum_probability=0.0)
all_topics_csr = corpus2csc(all_topics)
all_topics_numpy = all_topics_csr.T.toarray()
all_topics_df = pd.DataFrame(all_topics_numpy)

classification_df = pd.concat([call_data, all_topics_df], axis=1)

In [21]:
classification_df.describe()

Unnamed: 0.1,Unnamed: 0,PRES,TURN_AT_TALK,CEO,WORDCOUNT,Restatement Topic,FRAUD,0,1,2,3,4,5,6,7,8,9
count,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0
mean,556.5,0.581688,15.447935,0.793537,18.205566,0.17684,0.352783,0.090717,0.083317,0.103588,0.119739,0.092642,0.122571,0.092226,0.085626,0.079249,0.130325
std,321.728405,0.493504,19.025057,0.404949,8.940149,0.381705,0.636108,0.212972,0.199048,0.230908,0.247129,0.21579,0.252661,0.213028,0.206871,0.193141,0.26141
min,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.005264,0.005264,0.005264,0.005264,0.005264,0.005264,0.005264,0.005265,0.005264,0.005265
25%,278.25,0.0,3.0,1.0,12.0,0.0,0.0,0.014287,0.014287,0.014288,0.014288,0.014288,0.014288,0.014287,0.014288,0.014287,0.014288
50%,556.5,1.0,4.0,1.0,17.0,0.0,0.0,0.020003,0.020003,0.020005,0.020007,0.020004,0.020006,0.020003,0.020004,0.020003,0.02001
75%,834.75,1.0,24.0,1.0,23.0,0.0,1.0,0.03334,0.033339,0.033347,0.050002,0.033342,0.050003,0.033341,0.033339,0.033338,0.050003
max,1113.0,1.0,93.0,1.0,62.0,1.0,2.0,0.935702,0.943745,0.939987,0.939993,0.94999,0.949983,0.930756,0.952621,0.935704,0.939993


In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_validate

n_splits = 5

pred_vars = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9,]


scoring = ['accuracy', 'neg_log_loss', 'f1', 'roc_auc']
rf_base = RandomForestClassifier()
cv_rf = cross_validate(rf_base, classification_df[pred_vars], classification_df['Restatement Topic'], cv=StratifiedShuffleSplit(n_splits), scoring=scoring)
print(cv_rf)
print("Mean Accuracy:", cv_rf['test_accuracy'].mean())
print("Mean F1:", cv_rf['test_f1'].mean())
print("Mean ROC:", cv_rf['test_roc_auc'].mean())
print("Mean Log Loss:", cv_rf['test_neg_log_loss'].mean())

{'fit_time': array([0.32897711, 0.33165193, 0.32615852, 0.28246951, 0.27108026]), 'score_time': array([0.03889537, 0.03291225, 0.03089428, 0.0298543 , 0.02801657]), 'test_accuracy': array([0.84821429, 0.83928571, 0.79464286, 0.82142857, 0.76785714]), 'test_neg_log_loss': array([-1.0115674 , -0.40003523, -0.52848207, -0.76320408, -0.52950749]), 'test_f1': array([0.32      , 0.35714286, 0.20689655, 0.28571429, 0.13333333]), 'test_roc_auc': array([0.64076087, 0.72146739, 0.60516304, 0.63804348, 0.53777174])}
Mean Accuracy: 0.8142857142857144
Mean F1: 0.26061740558292285
Mean ROC: 0.6286413043478262
Mean Log Loss: -0.6465592533130089


# Tasks

1. Try fitting LDA with just 5 topics instead of 10. How does this affect human interpretability, perplexity, coherence, and classification performance?

2. Try fitting LDA with 15 topics. How does this affect human interpretability, perplexity, coherence, and classification performance?

3. In addition to Random Forest, try another classifier of your choosing. How does this compare to the Random Forest?

# Optional Tasks

1. Run sentiment analysis on this data. Does adding that to a classifier improve performance?

2. See the section below on weighted word counts. Does using tf-idf improve human interpretability?

# Task 1

Try fitting LDA with just 5 topics instead of 10. How does this affect human interpretability, perplexity, coherence, and classification performance?

In [23]:
lda_new_2 = models.LdaModel(new_corpus, num_topics=5, id2word=new_dictionary)

for topic in lda_new_2.show_topics():
    print("Topic", topic[0], ":", topic[1])

Topic 0 : 0.018*"continu" + 0.015*"gener" + 0.013*"activ" + 0.013*"tubular" + 0.012*"share" + 0.011*"pipe" + 0.011*"month" + 0.009*"effect" + 0.009*"opportun" + 0.009*"facil"
Topic 1 : 0.020*"certainli" + 0.014*"compar" + 0.013*"continu" + 0.013*"group" + 0.012*"tubular" + 0.011*"strong" + 0.010*"share" + 0.009*"water" + 0.009*"rang" + 0.009*"transmiss"
Topic 2 : 0.027*"project" + 0.024*"water" + 0.019*"result" + 0.018*"transmiss" + 0.015*"group" + 0.014*"compar" + 0.014*"tubular" + 0.013*"steel" + 0.012*"signific" + 0.012*"improv"
Topic 3 : 0.032*"steel" + 0.025*"project" + 0.014*"water" + 0.014*"higher" + 0.013*"continu" + 0.011*"issu" + 0.010*"result" + 0.010*"fund" + 0.009*"tubular" + 0.009*"work"
Topic 4 : 0.020*"steel" + 0.017*"project" + 0.016*"bid" + 0.013*"job" + 0.012*"probabl" + 0.010*"half" + 0.010*"backlog" + 0.010*"order" + 0.010*"book" + 0.009*"strong"


In [45]:
for doc in new_corpus[0:5]:
    print(lda_new_2.get_document_topics(doc))

[(0, 0.100018024), (1, 0.10001767), (2, 0.10001352), (3, 0.10001732), (4, 0.5999335)]
[(0, 0.8846291), (1, 0.028796803), (2, 0.02869406), (3, 0.029197399), (4, 0.028682638)]
[(0, 0.015563075), (1, 0.9380681), (2, 0.015483566), (3, 0.015487208), (4, 0.015398054)]
[(0, 0.022292644), (1, 0.022265011), (2, 0.91069597), (3, 0.0224084), (4, 0.022337934)]
[(0, 0.01676031), (1, 0.016768405), (2, 0.017041635), (3, 0.9326322), (4, 0.016797446)]


In [25]:
print('Perplexity: ', lda_new_2.log_perplexity(new_corpus))
print('Perplexity: ', lda_new_2.log_perplexity(new_corpus))

Perplexity:  -6.915479806325692
Perplexity:  -6.915510124540565


In [26]:
all_topics = lda_new_2.get_document_topics(new_corpus, minimum_probability=0.0)
all_topics_csr = corpus2csc(all_topics)
all_topics_numpy = all_topics_csr.T.toarray()
all_topics_df = pd.DataFrame(all_topics_numpy)

classification_df = pd.concat([call_data, all_topics_df], axis=1)

In [27]:
classification_df.describe()

Unnamed: 0.1,Unnamed: 0,PRES,TURN_AT_TALK,CEO,WORDCOUNT,Restatement Topic,FRAUD,0,1,2,3,4
count,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0
mean,556.5,0.581688,15.447935,0.793537,18.205566,0.17684,0.352783,0.192191,0.189635,0.262094,0.181297,0.174783
std,321.728405,0.493504,19.025057,0.404949,8.940149,0.381705,0.636108,0.292925,0.288929,0.336938,0.283981,0.275034
min,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.011254,0.010628,0.01095,0.010602,0.010593
25%,278.25,0.0,3.0,1.0,12.0,0.0,0.0,0.029156,0.029162,0.033711,0.029444,0.028844
50%,556.5,1.0,4.0,1.0,17.0,0.0,0.0,0.05019,0.050116,0.066689,0.05021,0.050009
75%,834.75,1.0,24.0,1.0,23.0,0.0,1.0,0.103083,0.103502,0.589466,0.101337,0.1008
max,1113.0,1.0,93.0,1.0,62.0,1.0,2.0,0.956876,0.952416,0.954606,0.952467,0.948753


# Task 2

Try fitting LDA with 15 topics. How does this affect human interpretability, perplexity, coherence, and classification performance?

In [46]:
lda_new_3 = models.LdaModel(new_corpus, num_topics=15, id2word=new_dictionary)
print(len(lda_new_3.show_topics()))
for topic in lda_new_3.show_topics():
    #print()
    print("Topic", topic[0], ":", topic[1])

10
Topic 0 : 0.026*"steel" + 0.020*"materi" + 0.017*"work" + 0.013*"statement" + 0.013*"rule" + 0.012*"total" + 0.010*"pipelin" + 0.010*"season" + 0.010*"final" + 0.010*"project"
Topic 5 : 0.019*"construct" + 0.019*"job" + 0.019*"ahead" + 0.016*"opportun" + 0.016*"profit" + 0.016*"gross" + 0.016*"compar" + 0.013*"record" + 0.013*"certainli" + 0.013*"project"
Topic 14 : 0.016*"spend" + 0.016*"group" + 0.016*"compar" + 0.014*"demand" + 0.014*"tubular" + 0.014*"earn" + 0.014*"pipe" + 0.012*"econom" + 0.012*"activ" + 0.012*"higher"
Topic 7 : 0.037*"share" + 0.017*"facil" + 0.016*"certainli" + 0.013*"believ" + 0.013*"ye" + 0.013*"dilut" + 0.013*"project" + 0.013*"improv" + 0.011*"effect" + 0.010*"volum"
Topic 8 : 0.024*"high" + 0.019*"continu" + 0.018*"gener" + 0.017*"cash" + 0.017*"record" + 0.014*"flow" + 0.014*"financ" + 0.013*"half" + 0.013*"stronger" + 0.011*"capac"
Topic 3 : 0.032*"project" + 0.022*"brian" + 0.021*"result" + 0.017*"financi" + 0.015*"stephani" + 0.013*"backlog" + 0.012

In [39]:
for doc in new_corpus[0:15]:
    print(lda_new_3.get_document_topics(doc))

[]
[(59, 0.20173945), (64, 0.20173071), (65, 0.40174204)]
[(74, 0.9237454)]
[(95, 0.88985485)]
[(85, 0.917391)]
[(71, 0.16811103), (74, 0.33478427), (95, 0.33478236)]
[]
[(8, 0.40173888), (112, 0.40173793)]
[(53, 0.40173823), (95, 0.2017382), (100, 0.20173827)]
[(15, 0.41099524), (41, 0.22673966), (104, 0.1674804)]
[(0, 0.20173413), (4, 0.20173587), (7, 0.20173953), (100, 0.20174143)]
[(79, 0.75217265)]
[(101, 0.87608624)]
[(43, 0.14409594), (53, 0.14409423), (57, 0.36274603), (82, 0.21117201)]
[(12, 0.2017379), (79, 0.6017388)]


In [40]:
print('Perplexity: ', lda_new_3.log_perplexity(new_corpus))
print('Perplexity: ', lda_new_3.log_perplexity(new_corpus))

Perplexity:  -17.58528391814902
Perplexity:  -17.591051012004588


In [41]:
all_topics = lda_new_3.get_document_topics(new_corpus, minimum_probability=0.0)
all_topics_csr = corpus2csc(all_topics)
all_topics_numpy = all_topics_csr.T.toarray()
all_topics_df = pd.DataFrame(all_topics_numpy)

classification_df = pd.concat([call_data, all_topics_df], axis=1)

In [42]:
classification_df.describe()

Unnamed: 0.1,Unnamed: 0,PRES,TURN_AT_TALK,CEO,WORDCOUNT,Restatement Topic,FRAUD,0,1,2,...,105,106,107,108,109,110,111,112,113,114
count,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,...,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0,1114.0
mean,556.5,0.581688,15.447935,0.793537,18.205566,0.17684,0.352783,0.008647,0.010351,0.007017,...,0.005381,0.002491,0.009703,0.009925,0.004486,0.01075,0.002491,0.010878,0.01151,0.002491
std,321.728405,0.493504,19.025057,0.404949,8.940149,0.381705,0.636108,0.063327,0.068459,0.043343,...,0.042019,0.001988,0.059322,0.058568,0.032857,0.062102,0.001988,0.064986,0.071829,0.001988
min,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.000458,0.000458,0.000458,...,0.000458,0.000458,0.000458,0.000458,0.000458,0.000458,0.000458,0.000458,0.000458,0.000458
25%,278.25,0.0,3.0,1.0,12.0,0.0,0.0,0.001242,0.001242,0.001242,...,0.001242,0.001242,0.001242,0.001242,0.001242,0.001242,0.001242,0.001242,0.001242,0.001242
50%,556.5,1.0,4.0,1.0,17.0,0.0,0.0,0.001739,0.001739,0.001739,...,0.001739,0.001739,0.001739,0.001739,0.001739,0.001739,0.001739,0.001739,0.001739,0.001739
75%,834.75,1.0,24.0,1.0,23.0,0.0,1.0,0.002899,0.002899,0.002899,...,0.002899,0.002899,0.002899,0.002899,0.002899,0.002899,0.002899,0.002899,0.002899,0.002899
max,1113.0,1.0,93.0,1.0,62.0,1.0,2.0,0.929192,0.929192,0.900869,...,0.944927,0.008696,0.938043,0.876086,0.876087,0.900869,0.008696,0.889855,0.909881,0.008696


# Task 3

In addition to Random Forest, try another classifier of your choosing. How does this compare to the Random Forest?