# Import Packages

In [None]:
# One-time environment setup, left disabled. The import cell below needs these
# third-party packages: numpy, pandas, gensim, scikit-learn, nltk, pyLDAvis.
# NOTE(review): the command below names no packages; prefer `%pip install <pkg>==<version>`
# so the install targets this kernel's environment — confirm versions before enabling.
#!python3.11 -m pip install

In [10]:
import re
import string
from collections import defaultdict

import nltk
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim as gensimvis
from gensim import corpora
from gensim import matutils
from gensim.models import TfidfModel, LsiModel
from gensim.models.ldamodel import LdaModel
from nltk.corpus import stopwords
from sklearn.cluster import KMeans

#nltk.download('stopwords')

# Import Data

In [54]:
# Load the sentence-level Beige Book data and normalise the text.
# `string` comes from the import cell at the top of the notebook.
# Translation table that deletes every ASCII punctuation character.
PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Columns are addressed by key rather than by attribute: attribute assignment
# (`data.sentence = ...`) silently creates a plain Python attribute instead of
# a column if the name is ever misspelled or missing from the CSV.
data = (
    pd.read_csv('../data/beige_books_1970_2024.csv')
    .drop(columns=['Unnamed: 0'])
    .assign(
        time_index=lambda df: pd.to_datetime(df['time_index']),
        # lower-case first, then strip punctuation
        sentence=lambda df: df['sentence'].str.lower().str.translate(PUNCTUATION_TABLE),
    )
)
data.head()

Unnamed: 0,time_index,region,sentence
0,1970-05-01,at,the mood of our directors varies from pessimis...
1,1970-05-01,at,if any consensus exists it is that business ac...
2,1970-05-01,at,many major economic indices should drift downw...
3,1970-05-01,at,in the pessimistic vein a leading department s...
4,1970-05-01,at,the store reported that labor costs were up 8 ...


# LDA

Code adapted from https://github.com/gaurikatyagi/Natural-Language-Processing/tree/master

In [94]:
# Fit on a single region only: this cuts runtime and keeps the model from
# learning regions as topics.
is_target_region = data["region"] == 'at'
data_su = data[is_target_region]

In [95]:
# English stop words, stored as a set: tokenize() tests every token for
# membership, and a set lookup is constant-time where a list is scanned linearly.
stop_words = set(stopwords.words("english"))

def tokenize(text, excluded_words=None):
    """Split a sentence into word tokens, dropping separators and stop words.

    Parameters
    ----------
    text : str
        Sentence to tokenize. Non-string values (``None``, or the float NaN
        pandas uses for missing text) yield an empty list instead of raising.
    excluded_words : container of str, optional
        Lower-cased words to drop. When omitted, the notebook-level
        ``stop_words`` is used.

    Returns
    -------
    list of str
        The remaining tokens, in their original order and original casing.
    """
    if not isinstance(text, str):
        return []
    if excluded_words is None:
        excluded_words = stop_words

    # The pattern has no capture group, so re.split discards the separators.
    # With a capture group they are returned as items, and runs such as "  ",
    # ", " or "?" would end up in the vocabulary as if they were words.
    return [
        token
        for token in re.split(r"[.,!?\s]+", text)
        # `token` is empty when the text starts or ends with a separator
        if token and token.lower() not in excluded_words
    ]

# tokenize every sentence of the selected region, dropping stop words
texts = [tokenize(sentence) for sentence in data_su["sentence"]]

In [96]:
# build the token dictionary, then encode each tokenized sentence as a bag of words
dictionary = corpora.Dictionary(documents=texts)
corpus = list(map(dictionary.doc2bow, texts))

In [97]:
# fit LDA model
NUM_TOPICS = 15  # 50 was good
NUM_PASSES = 50  # lower number of passes to reduce run time
LDA_SEED = 42    # fixed seed: without it every run yields different topics

# NOTE: `topics` holds the fitted LdaModel itself, not a list of topics.
topics = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=NUM_TOPICS,
    alpha='auto',
    passes=NUM_PASSES,
    random_state=LDA_SEED,
)

In [98]:
# print 10 of the fitted topics; the leading number is only a running counter,
# the actual topic id is the first element of each printed tuple
for position, topic in enumerate(topics.print_topics(10), start=1):
    print('%d: %s\n' % (position, topic))

1: (11, '0.062*"one" + 0.042*"near" + 0.037*"small" + 0.037*"food" + 0.026*"greater" + 0.026*"weaker" + 0.025*"would" + 0.019*"used" + 0.018*"better" + 0.018*"drilling"')

2: (6, '0.079*"overall" + 0.061*"workers" + 0.061*"supply" + 0.037*"shortages" + 0.036*"rose" + 0.033*"positions" + 0.031*"gulf" + 0.029*"coast" + 0.024*"along" + 0.023*"available"')

3: (0, '0.069*"sector" + 0.045*"pressures" + 0.039*"especially" + 0.037*"pressure" + 0.037*"wages" + 0.035*"average" + 0.025*"margins" + 0.023*"occupancy" + 0.022*"customers" + 0.018*"weakened"')

4: (12, '0.106*"labor" + 0.098*"markets" + 0.063*"employment" + 0.044*"majority" + 0.040*"hiring" + 0.027*"december" + 0.026*"tight" + 0.023*"november" + 0.019*"limited" + 0.019*"significantly"')

5: (9, '0.099*"conditions" + 0.065*"market" + 0.053*"areas" + 0.053*"improved" + 0.050*"housing" + 0.041*"oil" + 0.038*"agriculture" + 0.035*"gas" + 0.030*"multifamily" + 0.028*"natural"')

6: (5, '0.159*"sales" + 0.039*"consumer" + 0.037*"spending" 

In [99]:
# Bag-of-words vector of the first modelled document and its source sentence.
# The sentence is read positionally from data_su, the frame the corpus was
# built from, so the pair stays aligned whichever region is selected.
print(corpus[0])
print(data_su["sentence"].iloc[0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]
the mood of our directors varies from pessimism to optimism


In [100]:
# topic distribution for the first document in the corpus, with per-word topics
first_doc_bow = corpus[0]
display(topics.get_document_topics(first_doc_bow, per_word_topics=True))

([(0, 0.02741488),
  (1, 0.04273574),
  (2, 0.14912473),
  (3, 0.03628825),
  (4, 0.06148374),
  (5, 0.058540307),
  (6, 0.026557159),
  (7, 0.03449119),
  (8, 0.038064912),
  (9, 0.03419075),
  (10, 0.064228505),
  (11, 0.07463598),
  (12, 0.028204093),
  (13, 0.09331698),
  (14, 0.23072283)],
 [(0, [14]), (1, [14]), (2, [2]), (3, [11]), (4, [13])],
 [(0, [(14, 0.999485)]),
  (1, [(14, 0.9939756)]),
  (2, [(2, 0.9997152)]),
  (3, [(11, 0.9968155)]),
  (4, [(13, 0.998405)])])

In [102]:
# prepare the pyLDAvis data for the fitted model and render it inline
vis_data = gensimvis.prepare(topic_model=topics, corpus=corpus, dictionary=dictionary)
pyLDAvis.display(vis_data)

Iteration notes
- Need to lowercase the text and remove punctuation.
- The clusters are picking up on locations; try fitting on the national summary instead.
- 30 topics seemed to be too many.