# Import Packages

In [123]:
import numpy as np
import pandas as pd
from gensim.models import TfidfModel, LsiModel
from gensim.models import LdaMulticore
from gensim import matutils
from sklearn.cluster import KMeans

from collections import defaultdict
from nltk.corpus import stopwords
import nltk
#nltk.download('stopwords')

import re
from gensim import corpora

import pyLDAvis.gensim as gensimvis
import pyLDAvis
import string
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')

from gensim.models import CoherenceModel
from gensim.utils import ClippedCorpus
import tqdm

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/KattPaint/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/KattPaint/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Import Data

In [263]:
data = pd.read_csv('../data/beige_books_1970_2024_with_sentiment.csv')
data = data.drop(['Unnamed: 0'], axis=1)
data.time_index = pd.to_datetime(data.time_index)
data = data[data.time_index <= '2024-03-01']

data.sentence = data.sentence.str.lower()
data.sentence = data.sentence.str.translate(str.maketrans('', '', string.punctuation))
data.head()

Unnamed: 0,time_index,region,sentence,label,score
0,1970-05-01,at,the mood of our directors varies from pessimis...,neutral,0.845975
1,1970-05-01,at,if any consensus exists it is that business ac...,negative,0.957333
2,1970-05-01,at,many major economic indices should drift downw...,negative,0.961576
3,1970-05-01,at,in the pessimistic vein a leading department s...,negative,0.922643
4,1970-05-01,at,the store reported that labor costs were up 8 ...,negative,0.905306


In [265]:
# remove sentences that contain less than 3 words 
sent_list = data.sentence.str.split()
words_per_sent = sent_list.str.len()

data = data[words_per_sent > 3]

# LDA

Code leveraged from https://github.com/gaurikatyagi/Natural-Language-Processing/tree/master and https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0

In [276]:
#randomly sample 25% of data to reduce runtime 
sample = data.sample(frac=0.25)

In [277]:
# lemmatize words 
lemma = nltk.wordnet.WordNetLemmatizer()
texts = sample["sentence"].apply(lambda text: lemma.lemmatize(text)) 

In [278]:
stop_words = stopwords.words("english")

def tokenize(text):
    text_wordlist = []
    for x in re.split(r"([.,!?\s]+)", text):
        if x and x not in [".", " "] and x.lower() not in stop_words:
            text_wordlist.append(x)
    return(text_wordlist)

# remove stop words from sentences 
texts = texts.apply(lambda text: tokenize(text))

In [279]:
# remove adjectives 
def remove_adj(tokens):
  adjective_tags = ["JJ", "JJR", "JJS"]
  #tokens = nltk.word_tokenize(sentence)
  tags = nltk.pos_tag(tokens)
  for i in range(len(tags)):
    word = [word for word,pos in tags if (pos not in adjective_tags)]
  return word

texts = list(texts.apply(lambda text: remove_adj(text))) # final data structure needs to be list 

In [280]:
texts[:2]

[['credit',
  'standards',
  'estate',
  'loans',
  'remained',
  'mostly',
  'past',
  'three',
  'months'],
 ['hotel', 'renovations', 'continue', 'hotels', 'added']]

In [281]:
# create dictionary of words
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [116]:
# create baseline LDA model
lda_model = LdaMulticore(corpus=corpus,
                            id2word=dictionary,
                            num_topics=30, 
                            random_state=100,
                            chunksize=100,
                            passes=10,
                            per_word_topics=True)

In [159]:
# print first topic - topics are a combination of weighted key words 
lda_model.print_topics(0)

[(21,
  '0.009*"sales" + 0.008*"district" + 0.008*"prices" + 0.007*"demand" + 0.007*"activity" + 0.007*"reported" + 0.006*"contacts" + 0.006*"year" + 0.005*"construction" + 0.005*"continued"'),
 (22,
  '0.011*"district" + 0.010*"sales" + 0.009*"prices" + 0.008*"reported" + 0.007*"activity" + 0.006*"demand" + 0.005*"continued" + 0.005*"report" + 0.005*"year" + 0.005*"percent"'),
 (24,
  '0.012*"sales" + 0.009*"district" + 0.009*"reported" + 0.008*"activity" + 0.007*"demand" + 0.007*"prices" + 0.006*"contacts" + 0.006*"year" + 0.006*"construction" + 0.005*"levels"'),
 (25,
  '0.009*"sales" + 0.008*"prices" + 0.008*"activity" + 0.008*"district" + 0.007*"demand" + 0.007*"reported" + 0.006*"contacts" + 0.006*"growth" + 0.005*"construction" + 0.005*"report"'),
 (1,
  '0.012*"sales" + 0.009*"reported" + 0.008*"district" + 0.008*"activity" + 0.006*"prices" + 0.006*"demand" + 0.006*"contacts" + 0.006*"year" + 0.005*"report" + 0.005*"percent"'),
 (15,
  '0.009*"sales" + 0.008*"prices" + 0.008*"d

In [118]:
# calcualte coherence 
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Baseline Coherence Score: ', coherence_lda)

Baseline Coherence Score:  0.36579323976313377


In [135]:
## hyperparameter tuning 
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=a,
                                           eta=b)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

grid = {}
grid['Validation_Set'] = {}

# Topics range
min_topics = 5
max_topics = 35
step_size = 5
topics_range = range(min_topics, max_topics, step_size)

# Alpha parameter
alpha = [0.01, 0.5, 1]
alpha.append('symmetric')

# Beta parameter
beta = [0.01, 0.5, 1]
beta.append('symmetric')

# Validation sets
num_of_sents = len(corpus)
corpus_sets = [ClippedCorpus(corpus, int(num_of_sents*0.75)), corpus]

corpus_title = ['75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

In [136]:
# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=dictionary, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    pd.DataFrame(model_results).to_csv('../data/lda_tuning_results.csv', index=False)
    pbar.close()

  2%|█▏                                                                              | 28/1800 [28:43<30:18:17, 61.57s/it]
100%|█████████████████████████████████████████████████████████████████████████████████| 192/192 [4:50:16<00:00, 90.71s/it]


In [164]:
# explore results 
df = pd.DataFrame(model_results)
df.sort_values('Coherence', ascending=False)

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
162,100% Corpus,25,0.01,1,0.503097
174,100% Corpus,25,symmetric,1,0.498752
66,75% Corpus,25,0.01,1,0.496440
94,75% Corpus,30,symmetric,1,0.491296
114,100% Corpus,10,0.01,1,0.491070
...,...,...,...,...,...
185,100% Corpus,30,1,0.5,0.269872
89,75% Corpus,30,1,0.5,0.254353
186,100% Corpus,30,1,1,0.243486
74,75% Corpus,25,1,1,0.242655


In [165]:
# create tuned LDA model
lda_model = LdaMulticore(corpus=corpus,
                            id2word=dictionary,
                            num_topics=10, 
                            random_state=100,
                            chunksize=100,
                            passes=10,
                            per_word_topics=True, alpha=0.01, eta=1)

In [166]:
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.44721869016133803


In [167]:
# print first topic - topics are a combination of weighted key words 
lda_model.print_topics(0)

[(1,
  '0.015*"contact" + 0.013*"tourism" + 0.011*"one" + 0.010*"travel" + 0.010*"homes" + 0.010*"area" + 0.009*"reported" + 0.008*"rates" + 0.008*"said" + 0.008*"buyers"'),
 (0,
  '0.054*"activity" + 0.036*"district" + 0.032*"construction" + 0.022*"estate" + 0.019*"manufacturing" + 0.018*"continued" + 0.016*"sales" + 0.015*"report" + 0.015*"remained" + 0.014*"since"'),
 (6,
  '0.037*"prices" + 0.024*"contacts" + 0.021*"increases" + 0.019*"costs" + 0.018*"firms" + 0.018*"labor" + 0.017*"reported" + 0.016*"price" + 0.015*"wage" + 0.015*"workers"'),
 (3,
  '0.061*"services" + 0.030*"demand" + 0.022*"firms" + 0.020*"reported" + 0.018*"conditions" + 0.016*"district" + 0.013*"service" + 0.013*"transportation" + 0.011*"growth" + 0.011*"information"'),
 (4,
  '0.038*"rates" + 0.033*"loan" + 0.033*"demand" + 0.029*"loans" + 0.022*"banks" + 0.020*"credit" + 0.017*"lending" + 0.015*"bankers" + 0.015*"consumer" + 0.015*"interest"'),
 (5,
  '0.042*"year" + 0.039*"percent" + 0.029*"sales" + 0.024*"

# Visualization

In [168]:
# visualize the topics 
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis_data)

In [282]:
# create corpus on full text 
# lemmatize words 
lemma = nltk.wordnet.WordNetLemmatizer()
texts = data["sentence"].apply(lambda text: lemma.lemmatize(text)) 

stop_words = stopwords.words("english")

def tokenize(text):
    text_wordlist = []
    for x in re.split(r"([.,!?\s]+)", text):
        if x and x not in [".", " "] and x.lower() not in stop_words:
            text_wordlist.append(x)
    return(text_wordlist)

# remove stop words from sentences 
texts = texts.apply(lambda text: tokenize(text))

# remove adjectives 
def remove_adj(tokens):
  adjective_tags = ["JJ", "JJR", "JJS"]
  #tokens = nltk.word_tokenize(sentence)
  tags = nltk.pos_tag(tokens)
  for i in range(len(tags)):
    word = [word for word,pos in tags if (pos not in adjective_tags)]
  return word

texts = list(texts.apply(lambda text: remove_adj(text))) # final data structure needs to be list 

In [283]:
# create corpus using previous dictionary 
corpus = [dictionary.doc2bow(text) for text in texts]

In [284]:
# assigning topics to sentences 
assign = []
print(len(corpus))
for i in range(len(corpus)):
    topics = lda_model.get_document_topics(corpus[i], minimum_probability=0.5)
    if topics == []:
        topics = [(99, 0)]
    assign.extend(topics)

len(assign)

322319


322319

In [285]:
topics_english = {0:'mortgages', 1:'percent', 2:'consumer_goods', 3:'wages', 4:'credit', 5:'agriculture', 6:'capital', 7:'energy', 8:'travel', 9:'services', 99:'none'}
topics = [topic[0] for topic in assign]

In [291]:
# join topics to data 
data['topic'] = topics 
data['topic'] = data['topic'].map(topics_english)

In [292]:
data.head()

Unnamed: 0,time_index,region,sentence,label,score,topic
0,1970-05-01,at,the mood of our directors varies from pessimis...,neutral,0.845975,consumer_goods
1,1970-05-01,at,if any consensus exists it is that business ac...,negative,0.957333,percent
2,1970-05-01,at,many major economic indices should drift downw...,negative,0.961576,percent
3,1970-05-01,at,in the pessimistic vein a leading department s...,negative,0.922643,none
4,1970-05-01,at,the store reported that labor costs were up 8 ...,negative,0.905306,credit


In [293]:
data.to_csv('../data/final_data.csv', encoding='utf-8', index=False)