In [1]:
import ast
import warnings

import numpy as np
import pandas as pd

import spacy
nlp = spacy.load('en_core_web_sm')

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

In [2]:
# Load the cleaned video dataset; the path is relative to the notebook's working directory.
# NOTE(review): 'Transcript' appears to hold stringified (word, POS) lists, because
# remove_custom_stopwords parses each cell before filtering it -- confirm against the CSV.
df_videos_cleaned_v6 = pd.read_csv('../Data/df_videos_cleaned_v6.csv')

### Helper functions (TODO: move these into a separate .py module)

In [3]:
def remove_custom_stopwords(df):
    '''
    Drop spaCy's default stop words, plus a few custom filler words, from every transcript.

    Input: Cleaned dataframe whose 'Transcript' column holds (word, POS) pairs, either as
           the string repr of a list (as read back from CSV) or as an already-parsed list
    Output: The same dataframe object, with 'Transcript' replaced by a list of
            (lowercased word, POS) tuples whose word is not a stop word

    Side effects: the custom words are added to the shared nlp.Defaults.stop_words set,
    and df itself is modified in place (the returned frame is the one passed in).
    '''
    # Register the custom stop words once per call rather than once per row.
    nlp.Defaults.stop_words |= {'uh','yeah','man','um','oh','guy','maybe','bye'}
    stopwords = nlp.Defaults.stop_words

    def final_preprocessing(cleaned_text):
        # literal_eval only accepts Python literals, so a malformed or hostile CSV cell
        # cannot execute arbitrary code the way eval() would.
        # Cells that are already parsed pass through, so re-running this step is safe.
        if isinstance(cleaned_text, str):
            tagged_words = ast.literal_eval(cleaned_text)
        else:
            tagged_words = cleaned_text

        return [(word.lower(), pos) for word, pos in tagged_words
                if word.lower() not in stopwords]

    df['Transcript'] = df['Transcript'].apply(final_preprocessing)

    return df

In [4]:
def document_term_matrix(df, vectorizer):
    '''
    Build a dense document-term matrix from the pre-tokenized transcripts.

    Input: Cleaned dataframe (after removing custom stopwords) and type of vectorizer
           (a vectorizer class such as CountVectorizer or TfidfVectorizer, not an instance)
    Output: Tuple of (document-term matrix as a dense array, list of feature names whose
            order matches the matrix columns)
    '''
    # Take the words out of the (word, POS) tuples; documents are already tokenized.
    word_list = [[tagged[0] for tagged in doc] for doc in df['Transcript']]

    # The identity tokenizer hands the token lists straight through. token_pattern=None
    # tells the vectorizer the regex is intentionally unused, which avoids its warning.
    vec = vectorizer(tokenizer=lambda doc: doc, token_pattern=None,
                     lowercase=False, min_df=2, max_df=0.5)
    matrix = vec.fit_transform(word_list).toarray()

    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement and
    # fall back for older releases. Always return a plain list, as the old API did.
    if hasattr(vec, 'get_feature_names_out'):
        feature_names = list(vec.get_feature_names_out())
    else:
        feature_names = list(vec.get_feature_names())

    return matrix, feature_names

In [5]:
def topic_model(matrix, model, num_topics, num_words, feature_names=None, random_state=None):
    '''
    Fit a topic model on a document-term matrix and list the top words of every topic.

    Input: Document-term matrix, type of topic model (a class such as NMF or
           LatentDirichletAllocation), number of topics, and number of words in each topic.
           feature_names (optional): the vocabulary matching the columns of matrix, i.e.
           the second value returned by document_term_matrix for the same dataframe.
           Pass it whenever matrix was not built from df_videos_cleaned_v7.
           random_state (optional): forwarded to the model for reproducible topics.
    Output: a list of lists containing topic words, highest weight first
    '''
    model_kwargs = {'n_components': num_topics}
    if random_state is not None:
        model_kwargs['random_state'] = random_state
    fitted = model(**model_kwargs)

    # Only the fitted components are needed; the document-topic matrix is discarded.
    fitted.fit_transform(matrix)
    topic_word = fitted.components_

    if feature_names is None:
        # Backward-compatible fallback: the vocabulary of the full corpus. This is only
        # correct when matrix was built from df_videos_cleaned_v7 with the same settings.
        feature_names = document_term_matrix(df_videos_cleaned_v7, CountVectorizer)[1]

    if len(feature_names) != topic_word.shape[1]:
        # Column indices would be resolved against the wrong vocabulary, which silently
        # yields unrelated words. Surface that instead of returning plausible nonsense.
        warnings.warn(
            'topic_model: vocabulary has %d terms but the matrix has %d columns; '
            'pass feature_names from the document_term_matrix call that built this matrix'
            % (len(feature_names), topic_word.shape[1])
        )

    # Indices of the num_words largest weights per topic, in descending weight order.
    top_indices = topic_word.argsort(axis=1)[:, -1:-(num_words+1):-1]
    top_topic_words = [[feature_names[i] for i in topic] for topic in top_indices]

    return top_topic_words

In [6]:
def corpus_of_adjectives(df):
    '''
    Keep only the tokens tagged 'ADJ' in every transcript.

    Input: Cleaned dataframe (after removing custom stopwords)
    Output: The same dataframe, with 'Transcript' reduced to its (lowercased word, 'ADJ') pairs
    '''
    def keep_adjectives(tagged_tokens):
        kept = []
        for token, tag in tagged_tokens:
            if tag == 'ADJ':
                kept.append((token.lower(), tag))
        return kept

    # Overwrites the column on the frame that was passed in.
    df['Transcript'] = df['Transcript'].apply(keep_adjectives)
    return df

In [7]:
def corpus_of_nouns(df):
    '''
    Keep only the tokens tagged 'NOUN' in every transcript.

    Input: Cleaned dataframe (after removing custom stopwords)
    Output: The same dataframe, with 'Transcript' reduced to its (lowercased word, 'NOUN') pairs
    '''
    def keep_nouns(tagged_tokens):
        kept = []
        for token, tag in tagged_tokens:
            if tag == 'NOUN':
                kept.append((token.lower(), tag))
        return kept

    # Overwrites the column on the frame that was passed in.
    df['Transcript'] = df['Transcript'].apply(keep_nouns)
    return df

### Remove custom stopwords

In [8]:
# remove_custom_stopwords assigns the filtered column back onto the frame it receives and
# returns that same object, so df_videos_cleaned_v6 and df_videos_cleaned_v7 are aliases.
df_videos_cleaned_v7 = remove_custom_stopwords(df_videos_cleaned_v6)

### Creating document-term matrices

#### CountVectorizer

In [9]:
# document_term_matrix returns (matrix, feature names); only the matrix is kept here.
X_cv = document_term_matrix(df_videos_cleaned_v7, CountVectorizer)[0]

#### TfidfVectorizer

In [10]:
# document_term_matrix returns (matrix, feature names); only the matrix is kept here.
X_tfidf = document_term_matrix(df_videos_cleaned_v7, TfidfVectorizer)[0]

### Topic modeling - Entire corpus

#### Non-negative matrix factorization (NMF), CountVectorizer

In [11]:
# 8 topics, top 6 words per topic.
topic_model(X_cv, NMF, 8, 6)

[['question', 'sort', 'world', 'industry', 'capital', 'tell'],
 ['dividend', 'equal', 'plus', 'constant', 'model', 'flow'],
 ['flow', 'billion', 'debt', 'free', 'revenue', 'ebitda'],
 ['portfolio', 'dividend', 'etf', 'fund', 'yield', 'income'],
 ['trade', 'fundamental', 'analysis', 'support', 'news', 'level'],
 ['ratio', 'pe', 'current', 'profit', 'book', 'equity'],
 ['option', 'leap', 'trade', 'risk', 'month', 'longterm'],
 ['graham', 'fund', 'buffett', 'book', 'asset', 'analysis']]

#### Non-negative matrix factorization (NMF), TfidfVectorizer

In [12]:
# 8 topics, top 6 words per topic.
# NOTE(review): topic_model looks the words up in a CountVectorizer vocabulary built from
# df_videos_cleaned_v7; presumably that matches the TF-IDF column order -- TODO confirm.
topic_model(X_tfidf, NMF, 8, 6)



[['question', 'book', 'sort', 'risk', 'buffett', 'tesla'],
 ['dividend', 'yield', 'portfolio', 'income', 'increase', 'cent'],
 ['flow', 'billion', 'debt', 'ebitda', 'revenue', 'free'],
 ['moat', 'advantage', 'mode', 'competitive', 'economic', 'brand'],
 ['analysis', 'fundamental', 'technical', 'trader', 'ratio', 'chart'],
 ['ratio', 'model', 'discount', 'formula', 'calculate', 'flow'],
 ['music', 'foreign', 'applause', 'thank', 'backbone', 'raider'],
 ['etf', 'fund', 'portfolio', 'index', 'vanguard', 'mutual']]

#### Latent Dirichlet Allocation (LDA), CountVectorizer

In [13]:
# 8 topics, top 6 words per topic.
# NOTE(review): topic_model passes no random_state, so these topics may vary between runs.
topic_model(X_cv, LatentDirichletAllocation, 8, 6)

[['portfolio', 'etf', 'account', 'month', 'thousand', 'channel'],
 ['flow', 'billion', 'revenue', 'debt', 'free', 'ratio'],
 ['question', 'risk', 'sort', 'capital', 'fund', 'management'],
 ['book', 'intrinsic', 'valuation', 'graham', 'asset', 'profit'],
 ['fund', 'learn', 'buffett', 'tell', 'lose', 'question'],
 ['fundamental', 'analysis', 'trade', 'ratio', 'tesla', 'trading'],
 ['dividend', 'yield', 'model', 'income', 'flow', 'plus'],
 ['moat', 'product', 'advantage', 'cost', 'brand', 'competitive']]

### Topic modeling - Adjectives

In [14]:
# corpus_of_adjectives / corpus_of_nouns overwrite 'Transcript' on the frame they are given,
# so each gets its own copy and df_videos_cleaned_v7 keeps the full token lists.
df_videos_cleaned_v7_adj = df_videos_cleaned_v7.copy()
df_videos_cleaned_v7_noun = df_videos_cleaned_v7.copy()

In [15]:
# corpus_of_adjectives returns the frame it was passed, filtered in place, so
# df_videos_cleaned_adj and df_videos_cleaned_v7_adj refer to the same object.
df_videos_cleaned_adj = corpus_of_adjectives(df_videos_cleaned_v7_adj)

In [16]:
# Only the matrix (index 0) is kept; the adjective feature names (index 1) are discarded.
X_cv_adj = document_term_matrix(df_videos_cleaned_adj, CountVectorizer)[0]

In [17]:
# Only the matrix (index 0) is kept; the adjective feature names (index 1) are discarded.
X_tfidf_adj = document_term_matrix(df_videos_cleaned_adj, TfidfVectorizer)[0]

#### Non-negative matrix factorization, CountVectorizer

In [18]:
# NOTE(review): topic_model resolves column indices against the CountVectorizer vocabulary
# of df_videos_cleaned_v7 (the full corpus), not the adjective-only vocabulary behind
# X_cv_adj, so the words listed in the output are not the adjective features.
topic_model(X_cv_adj, NMF, 8, 6)

[['carpet', 'advent', 'admiralty', 'cheese', 'airspace', 'conver'],
 ['brokendown', 'buoy', 'airbnbs', 'brandon', 'calibration', 'analytically'],
 ['arrogant', 'alumnus', 'campus', 'chapman', 'allergy', 'bert'],
 ['avalanche', 'cofounded', 'arcg', 'brownforman', 'cheese', 'admiralty'],
 ['chose', 'binding', 'advent', 'arrange', 'adjunct', 'banner'],
 ['analytically', 'comb', 'brandon', 'adjunct', 'ackman', 'buffet'],
 ['automat', 'adjunct', 'boundless', 'branch', 'brandon', 'bert'],
 ['climbs', 'binding', 'arcg', 'allocation', 'boundless', 'adjunct']]

#### Non-negative matrix factorization, TfidfVectorizer

In [19]:
# NOTE(review): topic_model resolves column indices against the CountVectorizer vocabulary
# of df_videos_cleaned_v7 (the full corpus), not the adjective-only vocabulary behind
# X_tfidf_adj, so the words listed in the output are not the adjective features.
topic_model(X_tfidf_adj, NMF, 8, 6)

[['carpet', 'advent', 'chose', 'binding', 'admiralty', 'cheese'],
 ['avalanche', 'cofounded', 'adolescent', 'cheese', 'arcg', 'buoy'],
 ['alumnus', 'arrogant', 'campus', 'allergy', 'brokendown', 'camaro'],
 ['allocation', 'arcg', 'coat', 'climbs', 'contraction', 'beneficiaries'],
 ['automat', 'boundless', 'adjunct', 'climbs', 'bueller', 'branch'],
 ['bert', 'conver', 'assume', 'analytically', 'commentary', 'bigticket'],
 ['analytically', 'brandon', 'comb', 'adjunct', 'branch', 'ackman'],
 ['auditor', 'chose', 'brazil', 'awkward', 'candlestick', 'accurate']]

#### Latent Dirichlet Allocation (LDA), CountVectorizer

In [20]:
# NOTE(review): topic_model resolves column indices against the CountVectorizer vocabulary
# of df_videos_cleaned_v7 (the full corpus), not the adjective-only vocabulary behind
# X_cv_adj, so the words listed in the output are not the adjective features.
topic_model(X_cv_adj, LatentDirichletAllocation, 8, 6)

[['chose', 'brokendown', 'cheese', 'avalanche', 'brownforman', 'calculus'],
 ['brokendown', 'arrogant', 'campus', 'alumnus', 'chapman', 'allergy'],
 ['admiralty', 'advent', 'airspace', 'carpet', 'conver', 'conversely'],
 ['automat', 'analytically', 'adjunct', 'brandon', 'bert', 'boundless'],
 ['auditor', 'binding', 'calibration', 'argentine', 'bumps', 'ck'],
 ['adjunct', 'becky', 'box', 'child', 'carpet', 'ackman'],
 ['binding', 'chose', 'awkward', 'advent', 'captain', 'airspace'],
 ['avalanche', 'allocation', 'arcg', 'cofounded', 'climbs', 'bert']]