Monthly:

Thanks to https://medium.com/analytics-vidhya/topic-modeling-using-gensim-lda-in-python-48eaa2344920 for inspiration!

In [None]:
import pandas as pd
from nltk.corpus import stopwords
from gensim.utils import simple_preprocess
import gensim
from nltk.stem import WordNetLemmatizer
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from tqdm import tqdm
import pyLDAvis
import pyLDAvis.gensim

import nltk
# Fetch the WordNet data that nltk's WordNetLemmatizer (imported above) relies on.
# NOTE(review): stopwords.words('english') is also used in this notebook and
# needs the NLTK 'stopwords' corpus — confirm it is already installed.
nltk.download('wordnet')

In [None]:
from pathlib import Path

# Input locations are assembled with pathlib instead of backslash-separated
# string literals. The previous literals contained '\d' and '\p', which are
# not valid escape sequences (Python warns about them), and they only resolved
# on Windows. These paths resolve to the same files on Windows and also work
# on POSIX systems.
data_dir = Path('..') / 'data'

# Per-date portfolio dictionary (decile / ticker lists are read from it later).
# NOTE: unpickling executes arbitrary code — only load pickle files you trust.
strategy = pd.read_pickle(data_dir / 'portfolio_decile' / 'portfolio_dict_GHR-bigram-tf-idf.pkl')

# Transcript dataset; the 'date', 'ticker' and 'transcript' columns are used below.
data = pd.read_csv(data_dir / 'final_dataset.csv')

# NLTK's English stop-word list (a plain Python list, extended in the next cell).
stop_words = stopwords.words('english')

In [None]:
# Extra stop words appended in place to NLTK's English list.
stop_words += [
    'from', 'subject', 're', 'edu', 'use', '--', 'motley', 'ladies', 'gentlemen', 'year',
    'million', 'thousand', 'think', 'call', 'quarter', 'analyst', 'officer', 'like', 'month', 'rate',
    'one', 'time', 'u', 'well', 'would', 'really', 'first', 'thank', 'see', 'going',
    'kind', 'look', 'study', 'thanks', 'also', 'last', 'operator', 'question', 'results', 'term',
    'billion', 'good', 'know', 'patient', 'third', 'second', 'get', 'back', 'lot', 'thing',
    'today', 'right', 'trial', 'two',
]

In [None]:
# Parse the 'date' column and keep only the calendar date of each row
# (the values become datetime.date objects).
data['date'] = pd.to_datetime(data['date']).dt.date

# Keys of the strategy mapping; the monthly loop further down iterates over them.
dates = strategy.keys()
# Echo the keys as this cell's output.
strategy.keys()
# data

In [None]:
def preprocess(transcript_string):
    """Turn one raw transcript into a flat list of lemmatized tokens.

    Steps, in order:
      1. split the text into sentences on ". " (a naive split);
      2. tokenize every sentence with gensim's ``simple_preprocess``
         (``deacc=True``);
      3. drop tokens found in the module-level ``stop_words`` list;
      4. fit bigram and trigram ``Phrases`` models on this transcript alone
         and apply them to its sentences;
      5. lemmatize every token with ``WordNetLemmatizer`` and flatten the
         sentences into a single list;
      6. drop the tokens ``'u'`` and ``'t'``.

    Args:
        transcript_string: Full text of a single transcript (a ``str``).

    Returns:
        list[str]: All remaining tokens/phrases of the transcript, in order.
    """
    sentences = transcript_string.split(". ")  # could be done smarter

    # One token list per sentence.
    data_words = [
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in sentences
    ]

    # Membership tests against a set are O(1); the list version was scanned
    # once per token. The set is rebuilt per call so that later changes to
    # ``stop_words`` are always picked up.
    stop_set = set(stop_words)
    # The tokens already come from simple_preprocess, so they are filtered
    # directly instead of being stringified and tokenized a second time.
    data_words_nostops = [
        [word for word in sentence_tokens if word not in stop_set]
        for sentence_tokens in data_words
    ]

    # Phrase detection, trained on the sentences of this transcript only.
    bigram = gensim.models.Phrases(data_words_nostops, min_count=5, threshold=20) # higher threshold fewer phrases.
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram = gensim.models.Phrases(bigram[data_words_nostops], threshold=20)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    data_words_bigrams = [bigram_mod[sentence_tokens] for sentence_tokens in data_words_nostops]
    # NOTE: bigram_mod is applied a second time here, exactly as before.
    data_words_trigrams = [
        trigram_mod[bigram_mod[sentence_tokens]] for sentence_tokens in data_words_bigrams
    ]

    # Basic lemmatization, flattening the per-sentence lists into one stream.
    lemmatizer = WordNetLemmatizer()
    lemmas = (
        lemmatizer.lemmatize(word)
        for sentence_tokens in data_words_trigrams
        for word in sentence_tokens
    )

    # Single pass instead of two separate list rebuilds.
    # Presumably 'u' and 't' are lemmatization leftovers — TODO confirm.
    dropped_tokens = {'u', 't'}
    return [token for token in lemmas if token not in dropped_tokens]

Since interpreting a separate set of topics for every month takes too long, we instead run LDA once on the pooled top and bottom two deciles, without considering the month in which each transcript was released:

In [None]:
def generate_lda_corpus(transcripts):
    """Build the gensim dictionary and bag-of-words corpus for a set of transcripts.

    Args:
        transcripts: DataFrame with a 'transcript' column holding the raw
            text of each transcript.

    Returns:
        tuple: ``(id2word, texts, corpus)`` where ``texts`` is the list of
        preprocessed token lists (one per row), ``id2word`` is the
        ``corpora.Dictionary`` built from them and ``corpus`` is the list of
        ``doc2bow`` vectors, one per entry of ``texts``.
    """
    # Iterate over the column values rather than looking rows up by index
    # label: after pd.concat the index can contain repeated labels, in which
    # case transcripts['transcript'][label] returns a Series instead of one
    # string and preprocess() fails.
    texts = [preprocess(transcript) for transcript in tqdm(transcripts['transcript'])]

    # Token <-> integer id mapping
    id2word = corpora.Dictionary(texts)
    # Bag-of-words (token id, count) representation of every transcript
    corpus = [id2word.doc2bow(text) for text in texts]
    return id2word, texts, corpus

def train_lda(id2w, texts, corpus, perf=False, num_topics=3):
    """Fit a gensim LDA model on a prepared corpus.

    Args:
        id2w: gensim dictionary mapping token ids to tokens.
        texts: Tokenized documents; only used for the coherence score when
            ``perf`` is true.
        corpus: Bag-of-words corpus matching ``id2w``.
        perf: When true, also compute evaluation metrics (slow).
        num_topics: Number of topics to fit. Defaults to 3, the value that
            was previously hard-coded.

    Returns:
        tuple: NOTE that the shape depends on ``perf``:
            * ``perf=False`` -> ``(lda_model, corpus, id2w)``
            * ``perf=True``  -> ``(lda_model, perplexity_lda, coherence_lda)``
    """
    # Hyperparameters taken from the Medium article credited at the top of the notebook.
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2w,
        num_topics=num_topics,
        random_state=100,
        update_every=1,
        chunksize=50,
        passes=15,
        alpha='symmetric',
        eta=0.000000001,
        per_word_topics=True,
    )

    # print(lda_model.print_topics()) # run to get topics
    if perf: # kills performance
        # log_perplexity returns the per-word likelihood bound, so a HIGHER
        # (less negative) value is better; the perplexity itself is
        # 2 ** (-bound), for which lower is better.
        perplexity_lda = lda_model.log_perplexity(corpus)
        coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2w, coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        return (lda_model, perplexity_lda, coherence_lda)

    return lda_model, corpus, id2w

In [None]:
A_lda_pos_topics = {}
A_lda_neg_topics = {}

all_top_2deciles = []
all_bottom_2deciles = []

# Loop-invariant: the date column only needs to become datetime64 once so it
# can be compared against Timestamps below.
data['date'] = pd.to_datetime(data['date'])

for date in dates:
    # Collect the transcripts of the top and bottom two deciles that were
    # released in the month leading up to `date`.

    # Window: [date - 1 month, date], both ends inclusive, at midnight.
    dt_date = pd.to_datetime(pd.to_datetime(date).date())
    last_date = dt_date - pd.DateOffset(months=1)

    deciles = strategy[date]['decile']
    tickers = strategy[date]['ticker']

    # Number of stocks in deciles 1-2 ("top") and 9-10 ("bottom").
    n_stocks_in_top = deciles.count(1) + deciles.count(2)
    n_stocks_in_bottom = deciles.count(9) + deciles.count(10)

    # The slicing assumes the ticker list is ordered by decile (top first) —
    # TODO confirm against the code that builds the strategy pickle.
    top_2deciles_stocks = tickers[0:n_stocks_in_top]
    # tickers[-n:] would return the WHOLE list when n == 0, so the start
    # position is computed explicitly.
    bottom_start = max(len(tickers) - n_stocks_in_bottom, 0)
    bottom_2deciles_stocks = tickers[bottom_start:]

    # The date mask is shared by both selections.
    in_window = (data['date'] >= last_date) & (data['date'] <= dt_date)

    top_2deciles_transcripts = data[in_window & data['ticker'].isin(top_2deciles_stocks)]
    bottom_2deciles_transcripts = data[in_window & data['ticker'].isin(bottom_2deciles_stocks)]

    all_top_2deciles.append(top_2deciles_transcripts)
    all_bottom_2deciles.append(bottom_2deciles_transcripts)

# Combine the per-date selections. Row labels from `data` are kept, so a row
# that falls into two overlapping windows appears twice with the same label.
all_top_2deciles = pd.concat(all_top_2deciles)
all_bottom_2deciles = pd.concat(all_bottom_2deciles)

# Build dictionary, token lists and bag-of-words corpus for each side:
pos_id2word, pos_texts, pos_corpus = generate_lda_corpus(all_top_2deciles)
neg_id2word, neg_texts, neg_corpus = generate_lda_corpus(all_bottom_2deciles)

In [None]:
# Display the pooled top-two-decile transcripts for a visual sanity check.
all_top_2deciles

In [None]:
# Fit one LDA model per side. With the default perf=False, train_lda returns
# the corpus and dictionary it was given, so those names are simply rebound.
pos_lda_model, pos_corpus, pos_id2word = train_lda(pos_id2word, pos_texts, pos_corpus)
neg_lda_model, neg_corpus, neg_id2word = train_lda(neg_id2word, neg_texts, neg_corpus)

In [None]:
# Topics of the model fitted on the top two deciles.
pos_lda_model.show_topics()

In [None]:
# Topics of the model fitted on the bottom two deciles.
neg_lda_model.show_topics()

Next, we plot word clouds for the topics:

In [None]:
# Function to generate word clouds for each topic
import matplotlib.pyplot as plt
from wordcloud import WordCloud

import random

# def random_reds(word, font_size, position, orientation, random_state=None, **kwargs):
#     r = random.randint(150, 255)
    
#     return f"rgb({r}, 0, 0)"  # Generate colors from dark to bright red

def plot_wordclouds(lda_model, num_topics=None, topn=10):
    """Draw one word cloud per LDA topic.

    Args:
        lda_model: Fitted gensim LDA model.
        num_topics: How many topics to plot, starting at topic 0. Defaults to
            every topic of the model (``lda_model.num_topics``).
        topn: Number of highest-weighted terms shown per cloud. Defaults to
            10, which is what ``show_topic`` used implicitly before.

    Returns:
        None. Each topic is rendered in its own 10x10 matplotlib figure.
    """
    if num_topics is None:
        num_topics = lda_model.num_topics

    for i in range(num_topics):
        # (term, weight) pairs -> {term: weight}, the form WordCloud expects
        term_weights = dict(lda_model.show_topic(i, topn=topn))

        # Generate the word cloud for the i-th topic
        wc = WordCloud(width=800, height=800, background_color='white')
        wc.generate_from_frequencies(term_weights)

        # Plotting (titles are 1-based, topic ids are 0-based)
        plt.figure(figsize=(10, 10))
        plt.imshow(wc, interpolation='bilinear')
        plt.title(f'Topic {i + 1}', fontsize=20)
        plt.axis("off")
        plt.show()

# Word clouds for the top-decile model only; the bottom-decile model is not plotted here.
plot_wordclouds(pos_lda_model, num_topics=3)

To make sense of the topics, we run the following visualization (not in report):

In [None]:
# Interactive pyLDAvis view of the top-decile model, rendered inline in the notebook.
# NOTE(review): newer pyLDAvis releases expose this module as
# pyLDAvis.gensim_models — confirm against the installed version.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(pos_lda_model, pos_corpus, pos_id2word)
vis

In [None]:
# Interactive pyLDAvis view of the bottom-decile model; `vis` is overwritten.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(neg_lda_model, neg_corpus, neg_id2word)
vis