In [1]:
import numpy as np
import pandas as pd

import spacy
nlp = spacy.load('en_core_web_sm')

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
df_videos_cleaned_v6 = pd.read_csv('../Data/df_videos_cleaned_v6.csv')

### All the functions (Need to put these into a separate .py file)

In [3]:
def remove_custom_stopwords(df):
    '''
    Input: Cleaned dataframe
    Output: Dataframe with custom stopwords removed
    '''
    def final_preprocessing(cleaned_text):
        preprocessed_text_11 = eval(cleaned_text)
       
        nlp.Defaults.stop_words |= {'uh','yeah','man','um','oh','guy'}
        stopwords = nlp.Defaults.stop_words
        
        preprocessed_text_12 = [(word.lower(), pos) for word, pos in preprocessed_text_11 
                                    if word.lower() not in stopwords] 
        
        return preprocessed_text_12
    
    df['Transcript'] = df['Transcript'].apply(final_preprocessing)
            
    return df  

In [4]:
def document_term_matrix(df, vectorizer):
    '''
    Input: Cleaned dataframe (after removing custom stopwords) and type of vectorizer
    Output: Document-term matrix
    '''
    # Take the words out of the (word, POS) tuple, vectorize, and fit-transform into a matrix
    word_list = [[word[0] for word in doc] for doc in df['Transcript']]
    vec = vectorizer(tokenizer=lambda doc:doc, lowercase=False, min_df=2, max_df=0.5)
    matrix = vec.fit_transform(word_list).toarray()
        
    return matrix, vec.get_feature_names()

In [5]:
def topic_model(matrix, model, num_topics, num_words):
    '''
    Input: Document-term matrix, type of topic model, number of topics, and number of words is each topic
    Output: a list of lists containing topic words
    '''
    if model == NMF:
        model = model(num_topics)
    elif model == LatentDirichletAllocation:
        model = model(n_components=num_topics)
        
    doc_topic = model.fit_transform(matrix)
    topic_word = model.components_
    
    words = document_term_matrix(df_videos_cleaned_v7, CountVectorizer)[1]
    t_model = topic_word.argsort(axis=1)[:, -1:-(num_words+1):-1]
    top_topic_words = [[words[i] for i in topic] for topic in t_model]
        
    return top_topic_words

### Remove custom stopwords

In [6]:
df_videos_cleaned_v7 = remove_custom_stopwords(df_videos_cleaned_v6)

### Creating document-term matrices

#### CountVectorizer

In [7]:
X_cv = document_term_matrix(df_videos_cleaned_v7, CountVectorizer)[0]

#### TfidfVectorizer

In [8]:
X_tfidf = document_term_matrix(df_videos_cleaned_v7, TfidfVectorizer)[0]

### Topic modeling - Entire corpus

#### Non-negative matrix factorization (NMF), CountVectorizer

In [9]:
topic_model(X_cv, NMF, 8, 6)

[['question', 'sort', 'world', 'maybe', 'industry', 'capital'],
 ['dividend', 'equal', 'plus', 'constant', 'model', 'flow'],
 ['flow', 'billion', 'debt', 'free', 'revenue', 'ebitda'],
 ['portfolio', 'dividend', 'etf', 'fund', 'yield', 'income'],
 ['trade', 'fundamental', 'analysis', 'support', 'news', 'level'],
 ['ratio', 'pe', 'current', 'profit', 'book', 'equity'],
 ['option', 'leap', 'trade', 'month', 'risk', 'longterm'],
 ['graham', 'fund', 'buffett', 'asset', 'book', 'analysis']]

#### Non-negative matrix factorization (NMF), TfidfVectorizer

In [10]:
topic_model(X_tfidf, NMF, 8, 6)



[['question', 'book', 'sort', 'risk', 'buffett', 'maybe'],
 ['dividend', 'yield', 'portfolio', 'income', 'increase', 'cent'],
 ['flow', 'billion', 'debt', 'ebitda', 'revenue', 'free'],
 ['moat', 'advantage', 'mode', 'competitive', 'economic', 'brand'],
 ['analysis', 'fundamental', 'technical', 'trader', 'ratio', 'chart'],
 ['ratio', 'model', 'discount', 'formula', 'calculate', 'flow'],
 ['music', 'foreign', 'applause', 'bye', 'thank', 'backbone'],
 ['etf', 'fund', 'portfolio', 'index', 'vanguard', 'mutual']]

#### Latent Dirichlet Allocation (LDA), CountVectorizer

In [11]:
topic_model(X_tfidf, LatentDirichletAllocation, 8, 6)

[['dividend', 'portfolio', 'flow', 'ratio', 'fund', 'billion'],
 ['apex', 'roblox', 'tally', 'ebooks', 'asean', 'erp'],
 ['music', 'applause', 'browne', 'mack', 'atman', 'spear'],
 ['oakmark', 'sprout', 'corning', 'obstacles', 'fab', 'nygren'],
 ['akamai', 'coinbase', 'cranberry', 'tattooed', 'ipof', 'emotors'],
 ['coal', 'keeper', 'ground', 'bury', 'rook', 'extract'],
 ['foreign', 'music', 'splunk', 'bulldog', 'rodent', 'meticulous'],
 ['hcmc', 'momo', 'dorothy', "c'm", 'negra', 'samba']]

#### Latent Dirichlet Allocation (LDA), TfidfVectorizer

In [12]:
topic_model(X_tfidf, LatentDirichletAllocation, 8, 6)

[['dividend', 'portfolio', 'flow', 'ratio', 'fund', 'billion'],
 ['hcmc', 'roblox', 'indistinguishable', 'hifi', 'misquote', 'flightsafety'],
 ['gyration', 'coal', 'ebooks', 'erp', 'fab', 'kramer'],
 ['apex', 'corning', 'shovels', 'rebirth', 'rias', 'tenured'],
 ['oakmark', 'strait', 'bulldog', 'nygren', 'autozone', 'bismarck'],
 ['music', 'foreign', 'applause', 'asean', 'paddle', 'thank'],
 ['groupon', 'momo', 'proterra', 'rodent', 'keeper', 'ipof'],
 ['lilu', 'diffident', 'martina', 'akamai', 'snicker', 'amit']]

### Topic modeling - Adjectives

#### Non-negative matrix factorization, CountVectorizer

#### Non-negative matrix factorization, TfidfVectorizer

#### Latent Dirichlet Allocation (LDA) - CountVectorizer

### Topic modeling - Nouns

#### Non-negative matrix factorization, CountVectorizer

#### Non-negative matrix factorization, TfidfVectorizer

#### Latent Dirichlet Allocation (LDA) - CountVectorizer