In [1]:
import numpy as np
import pandas as pd
import pickle

import spacy
## Shared spaCy English pipeline; remove_custom_stopwords reads and extends nlp.Defaults.stop_words
nlp = spacy.load('en_core_web_sm')

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

%load_ext autoreload
%autoreload 2

In [2]:
## Load the cleaned video dataframe produced by the previous notebook stage.
## NOTE: pickle.load can execute arbitrary code -- only load pickles created by this project.
## The context manager closes the file handle, which the bare open() call never did.
with open('../Data/df_videos_cleaned_v7.pickle', 'rb') as f_video_data:
    df_videos_cleaned_v7 = pickle.load(f_video_data)

### Helper functions (TODO: move these into a separate .py module)

In [3]:
def remove_custom_stopwords(df):
    '''
    Input: Cleaned dataframe whose 'Transcript' column holds sequences of (word, POS) tuples
    Output: New dataframe in which every transcript word is lower-cased and any word found in
            spaCy's default stopwords (extended with the custom ones below) is dropped.
            The dataframe passed in is left unmodified.
    '''
    ## Extend spaCy's default stopword set once per call rather than once per row
    nlp.Defaults.stop_words |= {'uh','yeah','man','um','oh','guy','maybe','bye'}
    stopwords = nlp.Defaults.stop_words

    def final_preprocessing(cleaned_text):
        ## Lower-case each word once, then keep the (word, POS) pairs that are not stopwords
        lowered_pairs = ((word.lower(), pos) for word, pos in cleaned_text)
        return [(word, pos) for word, pos in lowered_pairs if word not in stopwords]

    ## assign() builds a new dataframe, so the caller's frame keeps its original transcripts
    return df.assign(Transcript=df['Transcript'].apply(final_preprocessing))

In [4]:
def document_term_matrix(df, vectorizer, min_df=2, max_df=0.3):
    '''
    Input: Cleaned dataframe (after removing custom stopwords) and type of vectorizer
           (a class such as CountVectorizer or TfidfVectorizer, not an instance).
           min_df / max_df are forwarded to the vectorizer; the defaults are the values
           this notebook has always used.
    Output: Tuple of (dense document-term matrix, list of feature names in column order)
    '''
    ## Take the words out of the (word, POS) tuples; documents are already tokenized,
    ## so the vectorizer gets an identity tokenizer and must not lower-case again
    word_list = [[word[0] for word in doc] for doc in df['Transcript']]
    vec = vectorizer(tokenizer=lambda doc:doc, lowercase=False, min_df=min_df, max_df=max_df)
    matrix = vec.fit_transform(word_list).toarray()

    ## get_feature_names() was removed in scikit-learn 1.2; prefer the replacement and
    ## fall back for older installations. A list is returned in both cases.
    if hasattr(vec, 'get_feature_names_out'):
        feature_names = list(vec.get_feature_names_out())
    else:
        feature_names = vec.get_feature_names()

    return matrix, feature_names

In [5]:
def topic_model(matrix, model, num_topics, num_words, feature_names=None, random_state=None):
    '''
    Input: Document-term matrix, type of topic model (a class such as NMF or
           LatentDirichletAllocation), number of topics, and number of words in each topic.
           feature_names (optional): vocabulary in the same order as the columns of matrix,
           i.e. the second value returned by document_term_matrix for the same dataframe.
           random_state (optional): forwarded to the model for reproducible topics.
    Output: Tuple of (list of lists containing the top words of each topic, doc_topic matrix)
    Raises: ValueError if feature_names is given and its length differs from the number of
            columns in matrix.
    '''
    import warnings

    ## Creates an instance of the requested model (NMF, LDA, or any class taking n_components)
    params = {'n_components': num_topics}
    if random_state is not None:
        params['random_state'] = random_state
    estimator = model(**params)

    ## Fit_transform (matrix factorization for NMF) the doc_word matrix to get doc_topic and topic_word matrices
    doc_topic = estimator.fit_transform(matrix)
    topic_word = estimator.components_

    ## Resolve the vocabulary used to turn column indices back into words
    if feature_names is not None:
        if len(feature_names) != topic_word.shape[1]:
            raise ValueError('feature_names has %d entries but the matrix has %d columns'
                             % (len(feature_names), topic_word.shape[1]))
        words = feature_names
    else:
        ## Legacy fallback: vocabulary of the full corpus held in the notebook global
        ## df_videos_cleaned_v8. It is only correct for matrices built from that dataframe.
        words = document_term_matrix(df_videos_cleaned_v8, CountVectorizer)[1]
        if len(words) != topic_word.shape[1]:
            warnings.warn('topic_model: the matrix has %d columns but the default vocabulary has %d '
                          'words; the returned topic words do not correspond to this matrix. '
                          'Pass feature_names explicitly.' % (topic_word.shape[1], len(words)))

    ## Retrieves the top words in each topic (column indices sorted by descending weight)
    t_model = topic_word.argsort(axis=1)[:, -1:-(num_words+1):-1]
    top_topic_words = [[words[i] for i in topic] for topic in t_model]

    return top_topic_words, doc_topic

In [6]:
def corpus_of_adjectives(df):
    '''
    Input: Cleaned dataframe (after removing custom stopwords) whose 'Transcript' column
           holds sequences of (word, POS) tuples
    Output: New dataframe whose transcripts keep only the tuples tagged 'ADJ', with the
            word lower-cased. The dataframe passed in is left unmodified.
    '''
    def adjectives(cleaned_text):
        ## Keep only the pairs whose POS tag is exactly 'ADJ'
        return [(word.lower(), pos) for word, pos in cleaned_text if pos == 'ADJ']

    ## assign() builds a new dataframe, so callers no longer need to copy before calling
    return df.assign(Transcript=df['Transcript'].apply(adjectives))

In [7]:
def topic_assignment(df, doc_topic=None, topic_keys=None):
    '''
    Input: Cleaned dataframe (after removing custom stopwords).
           doc_topic (optional): document-topic matrix with one row per dataframe row. When
           omitted, NMF with 6 topics is refitted on the notebook global X_tfidf.
           topic_keys (optional): dict mapping topic index to topic name. Indices missing
           from the dict are labelled ''.
    Output: New dataframe with 'Topic' and 'Topic Coefficient' columns added. The dataframe
            passed in is left unmodified.
    Raises: ValueError if doc_topic does not have one row per dataframe row.
    '''
    ## Legacy default: refit the 6-topic NMF model on the global TF-IDF matrix
    if doc_topic is None:
        doc_topic = topic_model(X_tfidf, NMF, 6, 7)[1]
    doc_topic = np.asarray(doc_topic)

    if len(doc_topic) != len(df):
        raise ValueError('doc_topic has %d rows but the dataframe has %d rows'
                         % (len(doc_topic), len(df)))

    ## Default names describe the topics of the 6-topic NMF/TF-IDF run in this notebook;
    ## two of the six topics are both labelled 'Valuation'
    if topic_keys is None:
        topic_keys = {0:'General', 1:'Valuation', 2:'Competitive Moats', 3:'Passive Investing',
                      4:'Valuation', 5:'Technology Stocks'}

    ## Highest coefficient per video (row) and the index of the topic it belongs to
    topic_coeff = np.round(doc_topic.max(axis=1), 3)
    topic = doc_topic.argmax(axis=1)

    ## Map topic indices to topic names
    topic_name = [topic_keys.get(int(topic_index), '') for topic_index in topic]

    ## Add the Topic and Topic Coefficient columns to a new dataframe
    return df.assign(**{'Topic': topic_name, 'Topic Coefficient': topic_coeff})

In [8]:
def pickle_df(df_str, df=None, data_dir='../Data/'):
    '''
    Input: Name of a dataframe in a string format. The name is also used as the file name.
           df (optional): the object to pickle. When omitted, the object is looked up by
           name among the globals of the module/notebook that defines this function.
           data_dir (optional): target folder, defaults to the Data folder.
    Output: Pickle the dataframe into <data_dir>/<df_str>.pickle
    Raises: ValueError if df_str is not a plain identifier; NameError if df is omitted and
            no global with that name exists.
    '''
    import os

    ## The name used to be passed to eval(); only plain identifiers are accepted now so an
    ## arbitrary expression can neither be executed nor end up in the file name
    if not df_str.isidentifier():
        raise ValueError('df_str must be a plain variable name, got %r' % (df_str,))

    if df is None:
        try:
            df = globals()[df_str]
        except KeyError:
            raise NameError("name '%s' is not defined" % df_str) from None

    with open(os.path.join(data_dir, df_str + '.pickle'), 'wb') as f_video_data:
        pickle.dump(df, f_video_data)

### Remove custom stopwords

In [9]:
## v8 = v7 transcripts lower-cased, with spaCy default and custom stopwords removed
df_videos_cleaned_v8 = remove_custom_stopwords(df_videos_cleaned_v7)

### Creating document-term matrices

#### CountVectorizer

In [10]:
## [0] keeps the document-term matrix and discards the feature-name list
X_cv = document_term_matrix(df_videos_cleaned_v8, CountVectorizer)[0]

#### TfidfVectorizer

In [11]:
## Same corpus as X_cv, vectorized with TfidfVectorizer; [0] keeps only the matrix
X_tfidf = document_term_matrix(df_videos_cleaned_v8, TfidfVectorizer)[0]

### Topic modeling - Entire corpus

#### Non-negative matrix factorization (NMF), CountVectorizer

In [12]:
## 6 topics, top 7 words per topic; [0] keeps the topic word lists and drops the doc_topic matrix
topics_nmf_cv = topic_model(X_cv, NMF, 6, 7)[0]
topics_nmf_cv

[['sort', 'buffett', 'cheap', 'moat', 'multiple', 'team', 'life'],
 ['option', 'leap', 'cover', 'decay', 'spread', 'view', 'ge'],
 ['equal', 'divide', 'discount', 'constant', 'present', 'zero', 'minus'],
 ['graham', 'buffett', 'security', 'ben', 'street', 'warren', 'intrinsic'],
 ['etf', 'holding', 'index', 'tax', 'sector', 'expense', 'goal'],
 ['bank', 'report', 'news', 'chart', 'support', 'data', 'economy']]

#### Non-negative matrix factorization (NMF), TfidfVectorizer

In [13]:
## Same settings as the CountVectorizer run (6 topics, 7 words each), fitted on the TF-IDF matrix
topics_nmf_tfidf = topic_model(X_tfidf, NMF, 6, 7)[0]
topics_nmf_tfidf

[['buffett', 'sort', 'warren', 'technical', 'bank', 'trader', 'index'],
 ['constant',
  'formula',
  'divide',
  'discount',
  'present',
  'equal',
  'calculate'],
 ['moat', 'mode', 'competitive', 'economic', 'brand', 'competitor', 'castle'],
 ['etf', 'holding', 'index', 'vanguard', 'expense', 'johnson', 'etfs'],
 ['ebitda', 'enterprise', 'multiple', 'irr', 'forecast', 'statement', 'da'],
 ['tesla', 'apple', 'pe', 'amazon', 'facebook', 'car', 'vehicle']]

#### Latent Dirichlet Allocation (LDA), CountVectorizer

In [14]:
## LDA on the count matrix, 6 topics with 7 words each; no random_state is set for this fit
topics_lda_cv = topic_model(X_cv, LatentDirichletAllocation, 6, 7)[0]
topics_lda_cv

[['moat', 'customer', 'tesla', 'brand', 'apple', 'competitive', 'mode'],
 ['bank', 'economy', 'news', 'report', 'country', 'inflation', 'index'],
 ['equal',
  'divide',
  'discount',
  'formula',
  'present',
  'calculate',
  'constant'],
 ['buffett', 'sort', 'warren', 'graham', 'cheap', 'berkshire', 'write'],
 ['pe', 'statement', 'multiple', 'bank', 'sheet', 'ebitda', 'negative'],
 ['etf', 'index', 'chart', 'tax', 'sector', 'holding', 'goal']]

### Topic modeling - Adjectives

In [15]:
## Work on a copy so the adjective filtering in the next cell does not touch df_videos_cleaned_v8
df_videos_cleaned_v8_adj = df_videos_cleaned_v8.copy()

In [16]:
## Keep only the (word, POS) pairs tagged 'ADJ' in each transcript
df_videos_cleaned_adj = corpus_of_adjectives(df_videos_cleaned_v8_adj)

In [17]:
## Count matrix over the adjective-only corpus; the matching feature names ([1]) are discarded here
X_cv_adj = document_term_matrix(df_videos_cleaned_adj, CountVectorizer)[0]

In [18]:
## TF-IDF matrix over the adjective-only corpus; the matching feature names ([1]) are discarded here
X_tfidf_adj = document_term_matrix(df_videos_cleaned_adj, TfidfVectorizer)[0]

#### Non-negative matrix factorization (NMF), CountVectorizer

In [19]:
## NOTE(review): this call resolves word indices against the vocabulary of df_videos_cleaned_v8,
## not the adjective-only corpus, so the words shown are not the adjective features -- TODO fix
topics_nmf_cv_adj = topic_model(X_cv_adj, NMF, 6, 6)[0]
topics_nmf_cv_adj

[['airtel', 'boundaries', 'badge', 'allocation', 'brokendown', 'conversely'],
 ['broken', 'buoy', 'branding', 'airbus', 'cancel', 'california'],
 ['arrive', 'alumnus', 'camry', 'allergy', 'breakfast', 'atr'],
 ['berserk', 'boundaries', 'camry', 'branding', 'alum', 'association'],
 ['coffin', 'arc', 'branding', 'brainwash', 'brownfield', 'calendar'],
 ['bowman', 'beck', 'bjp', 'brokendown', 'acknowledge', 'blackberry']]

#### Non-negative matrix factorization (NMF), TfidfVectorizer

In [20]:
## NOTE(review): this call resolves word indices against the vocabulary of df_videos_cleaned_v8,
## not the adjective-only corpus, so the words shown are not the adjective features -- TODO fix
topics_nmf_tfidf_adj = topic_model(X_tfidf_adj, NMF, 6, 6)[0]
topics_nmf_tfidf_adj

[['beck', 'brokendown', 'airtel', 'broken', 'bowman', 'burdensome'],
 ['alumnus', 'arrive', 'camry', 'broken', 'allergy', 'cambria'],
 ['berserk', 'association', 'commensurate', 'camry', 'alum', 'completion'],
 ['allocation', 'arc', 'coastal', 'contractor', 'beneficial', 'aqr'],
 ['coffin', 'adolescent', 'ag', 'buoy', 'broken', 'brownfield'],
 ['branding',
  'brainwash',
  'boundaries',
  'budweiser',
  'calendar',
  'acknowledge']]

#### Latent Dirichlet Allocation (LDA), CountVectorizer

In [21]:
## NOTE(review): this call resolves word indices against the vocabulary of df_videos_cleaned_v8,
## not the adjective-only corpus, so the words shown are not the adjective features -- TODO fix
topics_lda_cv_adj = topic_model(X_cv_adj, LatentDirichletAllocation, 6, 6)[0]
topics_lda_cv_adj

[['broken', 'coffin', 'buoy', 'california', 'airbus', 'ag'],
 ['airtel', 'allocation', 'arc', 'boundaries', 'asleep', 'badge'],
 ['awfully', 'airtel', 'badge', 'captive', 'canina', 'allocation'],
 ['branding', 'brainwash', 'berserk', 'calendar', 'association', 'budweiser'],
 ['beck', 'brokendown', 'bowman', 'brownfield', 'blackberry', 'buffer'],
 ['arrive', 'camry', 'alumnus', 'allergy', 'berserk', 'broken']]

### Assigning topics and coefficients to videos

In [22]:
## Labels each video with its highest-weight topic; this call refits NMF on X_tfidf inside topic_assignment
df_videos_cleaned_v9 = topic_assignment(df_videos_cleaned_v8)

### Pickle the modified dataframe 

In [23]:
## Writes ../Data/df_videos_cleaned_v9.pickle
pickle_df('df_videos_cleaned_v9')