In [32]:
import numpy as np
import pandas as pd
import pickle

import spacy
nlp = spacy.load('en_core_web_sm')

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

## Display up to 1000 rows and up to 200 characters per column when showing dataframes
pd.set_option("display.max_rows", 1000)
pd.set_option("max_colwidth", 200)


%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
## Load the cleaned video dataframe (version 7). The context manager closes the file
## handle as soon as the object is read. Only unpickle files from a trusted source.
with open('../Data/df_videos_cleaned_v7.pickle', 'rb') as f_videos_v7:
    df_videos_cleaned_v7 = pickle.load(f_videos_v7)

### Helper functions (TODO: move these into a separate .py module)

In [3]:
def remove_custom_stopwords(df):
    '''
    Input: Cleaned dataframe whose 'Transcript' column holds, for each row,
           an iterable of (word, POS) tuples
    Output: New dataframe in which every transcript word is lower-cased and all
            stopwords (spaCy defaults plus the custom ones below) are dropped.
            The dataframe passed in is not modified.
    Side effect: the custom stopwords are added to nlp.Defaults.stop_words.
    '''
    ## Register the custom stopwords once per call rather than once per row
    nlp.Defaults.stop_words |= {'uh','yeah','man','um','oh','guy','maybe','bye','hey', 'sort'}
    stopwords = nlp.Defaults.stop_words

    def final_preprocessing(cleaned_text):
        ## Keep the POS tag, lower-case the word, skip anything in the stopword set
        return [(word.lower(), pos) for word, pos in cleaned_text
                    if word.lower() not in stopwords]

    ## assign() returns a new frame, so re-running the calling cell never alters its input
    return df.assign(Transcript=df['Transcript'].apply(final_preprocessing))

In [4]:
def document_term_matrix(df, vectorizer):
    '''
    Input: Cleaned dataframe (after removing custom stopwords) and type of vectorizer
           (the vectorizer class itself, e.g. CountVectorizer, not an instance)
    Output: Tuple of (document-term matrix as a dense array,
            list of the feature names labelling its columns)
    '''
    ## Take the words out of the (word, POS) tuples
    word_list = [[token[0] for token in doc] for doc in df['Transcript']]

    ## The documents are already tokenized, so the tokenizer just hands each list back
    ## and lowercasing is switched off. min_df / max_df are passed straight to the vectorizer.
    vec = vectorizer(tokenizer=lambda doc: doc, lowercase=False, min_df=2, max_df=0.3)
    matrix = vec.fit_transform(word_list).toarray()

    ## get_feature_names() was removed in scikit-learn 1.2; prefer its replacement when it
    ## exists and fall back to the old name otherwise. Always hand back a plain list.
    if hasattr(vec, 'get_feature_names_out'):
        feature_names = list(vec.get_feature_names_out())
    else:
        feature_names = list(vec.get_feature_names())

    return matrix, feature_names

In [5]:
def topic_model(matrix, model, num_topics, num_words, feature_names=None, random_state=None):
    '''
    Input: Document-term matrix, type of topic model (the class itself, e.g. NMF or
           LatentDirichletAllocation), number of topics, and number of words in each topic.
           Optional:
             feature_names - the words labelling the columns of `matrix` (second value
                             returned by document_term_matrix for the same corpus). When
                             omitted, the vocabulary of df_videos_cleaned_v9 under
                             CountVectorizer is used.
             random_state  - forwarded to the model when given, for repeatable fits.
    Output: Tuple of (a list of lists containing the top words of each topic,
            the document-topic matrix)
    '''
    ## Create an instance of the topic model; any class accepting n_components works
    model_kwargs = {'n_components': num_topics}
    if random_state is not None:
        model_kwargs['random_state'] = random_state
    fitted_model = model(**model_kwargs)

    ## Fit_transform (matrix factorization for NMF) the doc_word matrix to get doc_topic and topic_word matrices
    doc_topic = fitted_model.fit_transform(matrix)
    topic_word = fitted_model.components_

    ## Work out which word each column of topic_word stands for
    if feature_names is None:
        import warnings

        feature_names = document_term_matrix(df_videos_cleaned_v9, CountVectorizer)[1]
        ## A different number of columns means the matrix was built from another corpus,
        ## so the words looked up below would not be the ones the model actually used.
        if len(feature_names) != topic_word.shape[1]:
            warnings.warn(
                'topic_model: the matrix has %d columns but the default vocabulary '
                '(df_videos_cleaned_v9) has %d words; pass feature_names= for the corpus '
                'the matrix was built from, otherwise the topic words are wrong.'
                % (topic_word.shape[1], len(feature_names))
            )

    ## Retrieves the top words in each topic (column indices sorted by descending weight)
    top_indices = topic_word.argsort(axis=1)[:, -1:-(num_words+1):-1]
    top_topic_words = [[feature_names[i] for i in topic] for topic in top_indices]

    return top_topic_words, doc_topic

In [6]:
def corpus_of_adjectives(df):
    '''
    Input: Cleaned dataframe (after removing custom stopwords) whose 'Transcript'
           column holds, for each row, an iterable of (word, POS) tuples
    Output: New dataframe with only adjectives (POS tag 'ADJ', words lower-cased)
            in the transcript corpus. The dataframe passed in is not modified.
    '''
    def adjectives(cleaned_text):
        ## Keep only the tuples tagged as adjectives
        return [(word.lower(), pos) for word, pos in cleaned_text
                    if pos == 'ADJ']

    ## assign() returns a new frame, so the caller's transcripts stay intact
    return df.assign(Transcript=df['Transcript'].apply(adjectives))

In [7]:
def topic_assignment(df, doc_topic=None, topic_keys=None):
    '''
    Input: Cleaned dataframe (after removing custom stopwords).
           Optional:
             doc_topic  - document-topic matrix with one row per row of df. When omitted,
                          a 12-topic NMF is fitted on X_tfidf through topic_model.
             topic_keys - dict mapping topic index to topic name. When omitted, the
                          12 hand-written names below are used.
    Output: New dataframe with topic and topic coefficient added.
            The dataframe passed in is not modified.

    NOTE(review): the default names are keyed by topic index, so they are only right if
    the fitted model orders its topics the way it did when the names were written - confirm
    after every refit.
    '''
    if doc_topic is None:
        doc_topic = topic_model(X_tfidf, NMF, 12, 7)[1]
    doc_topic = np.asarray(doc_topic)

    if topic_keys is None:
        ## Map topic indices to topic names
        topic_keys = {0:'Value Investing', 1:'Valuation', 2:'Economic Moats', 3:'Passive Investing',
                      4:'Valuation (Case Studies)', 5:'Technology Stocks', 6:'General', 7:'Value Investing', 8:'Fundamental vs. Technical Analysis',
                      9:'Electric Vehicle Stocks', 10:'Value Investing', 11:'Dividend Investing'}

    ## For each video (row): the highest coefficient and the index of the topic it belongs to
    topic_coeff = doc_topic.max(axis=1).round(3)
    topic = doc_topic.argmax(axis=1).tolist()

    ## Indices without a name are labelled with an empty string
    topic_name = [topic_keys.get(topic_index, '') for topic_index in topic]

    ## Add the Topic and Topic Coefficient columns to a copy of the input
    df_with_topics = df.copy()
    df_with_topics['Topic'] = topic_name
    df_with_topics['Topic Coefficient'] = topic_coeff

    return df_with_topics

In [8]:
def pickle_df(df_str, df=None):
    '''
    Input: Name of a dataframe in a string format. Optionally the dataframe itself;
           when omitted, the global variable called df_str is pickled.
    Output: Pickle the dataframe into the Data folder as '../Data/<df_str>.pickle'
    Raises: NameError if df is omitted and no global variable is called df_str
    '''
    ## Resolve the object before opening the file so a wrong name cannot leave an
    ## empty pickle behind. A plain namespace lookup replaces eval() on the string.
    if df is None:
        try:
            df = globals()[df_str]
        except KeyError:
            raise NameError("name %r is not defined" % df_str) from None

    with open('../Data/'+ df_str +'.pickle', 'wb') as f_video_data:
        pickle.dump(df, f_video_data)

### Remove custom stopwords

In [9]:
## Version 8: transcripts with spaCy's default stopwords and the custom stopwords removed
df_videos_cleaned_v8 = remove_custom_stopwords(df=df_videos_cleaned_v7)

In [10]:
## Save version 8 to ../Data/df_videos_cleaned_v8.pickle
pickle_df(df_str='df_videos_cleaned_v8')

### Creating document-term matrices

In [11]:
## Load the cleaned video dataframe (version 9). The context manager closes the file
## handle as soon as the object is read. Only unpickle files from a trusted source.
## NOTE(review): no cell visible in this notebook writes df_videos_cleaned_v9.pickle (only
## v8 and v10 are pickled here) - confirm which step produces it.
with open('../Data/df_videos_cleaned_v9.pickle', 'rb') as f_videos_v9:
    df_videos_cleaned_v9 = pickle.load(f_videos_v9)

#### CountVectorizer

In [12]:
## Raw term counts for the full corpus (only the matrix is kept, not the vocabulary)
X_cv = document_term_matrix(df=df_videos_cleaned_v9, vectorizer=CountVectorizer)[0]

#### TfidfVectorizer

In [13]:
## TF-IDF weights for the full corpus (only the matrix is kept, not the vocabulary)
X_tfidf = document_term_matrix(df=df_videos_cleaned_v9, vectorizer=TfidfVectorizer)[0]

### Topic modeling - Entire corpus

#### Non-negative matrix factorization (NMF), CountVectorizer

In [14]:
## 12 NMF topics on the count matrix, showing the 7 highest-weighted words of each
topics_nmf_cv = topic_model(matrix=X_cv, model=NMF, num_topics=12, num_words=7)[0]
topics_nmf_cv

[['life', 'team', 'cheap', 'stuff', 'problem', 'answer', 'berkshire'],
 ['option', 'leap', 'decay', 'cover', 'spread', 'view', 'ge'],
 ['equal', 'divide', 'constant', 'discount', 'present', 'zero', 'minus'],
 ['graham', 'security', 'ben', 'street', 'intrinsic', 'wall', 'benjamin'],
 ['etf', 'holding', 'index', 'sector', 'expense', 'vanguard', 'individual'],
 ['bank', 'loan', 'credit', 'sector', 'percentage', 'loss', 'deposit'],
 ['unit', 'purchase', 'method', 'inventory', 'batch', 'system', 'blue'],
 ['moat', 'brand', 'customer', 'competitive', 'network', 'economic', 'mode'],
 ['tesla', 'apple', 'stuff', 'youtube', 'car', 'amazon', 'drop'],
 ['multiple', 'ebitda', 'pe', 'enterprise', 'statement', 'sheet', 'forecast'],
 ['report', 'news', 'release', 'economy', 'data', 'event', 'economic'],
 ['tax', 'roth', 'ira', 'retire', 'live', 'estate', 'goal']]

#### Non-negative matrix factorization (NMF), TfidfVectorizer

In [15]:
## 12 NMF topics on the TF-IDF matrix, showing the 7 highest-weighted words of each
topics_nmf_tfidf = topic_model(matrix=X_tfidf, model=NMF, num_topics=12, num_words=7)[0]
topics_nmf_tfidf

[['index', 'life', 'cheap', 'warren', 'bond', 'manager', 'answer'],
 ['constant', 'formula', 'equal', 'divide', 'present', 'minus', 'calculate'],
 ['moat', 'mode', 'competitive', 'economic', 'competitor', 'brand', 'castle'],
 ['etf', 'holding', 'vanguard', 'index', 'expense', 'etfs', 'sector'],
 ['ebitda', 'enterprise', 'multiple', 'irr', 'da', 'forecast', 'decade'],
 ['apple', 'cent', 'facebook', 'microsoft', 'amazon', 'iphone', 'google'],
 ['foreign', 'applause', 'raider', 'backbone', 'nifty', 'twice', 'prior'],
 ['intrinsic',
  'discount',
  'method',
  'calculate',
  'graham',
  'safety',
  'estimate'],
 ['technical', 'trader', 'chart', 'trend', 'volume', 'shortterm', 'statement'],
 ['tesla', 'car', 'vehicle', 'elon', 'battery', 'neo', 'electric'],
 ['pe', 'eps', 'divide', 'multiple', 'peg', 'amazon', 'metric'],
 ['bank', 'loan', 'sector', 'rupee', 'statement', 'deposit', 'liability']]

#### Latent Dirichlet Allocation (LDA), CountVectorizer

In [16]:
## 6 LDA topics on the count matrix, showing the 7 highest-weighted words of each
topics_lda_cv = topic_model(matrix=X_cv, model=LatentDirichletAllocation, num_topics=6, num_words=7)[0]
topics_lda_cv

[['multiple', 'cheap', 'graham', 'warren', 'ebitda', 'pe', 'intrinsic'],
 ['etf', 'index', 'tax', 'sector', 'goal', 'holding', 'individual'],
 ['discount', 'divide', 'calculate', 'equal', 'formula', 'present', 'cent'],
 ['bank', 'economy', 'country', 'sector', 'loan', 'unit', 'report'],
 ['moat',
  'competitive',
  'brand',
  'customer',
  'mode',
  'economic',
  'competitor'],
 ['tesla', 'apple', 'game', 'quarter', 'drop', 'customer', 'car']]

### Topic modeling - Adjectives

In [17]:
## Separate copy of version 9 for the adjective-only analysis
df_videos_cleaned_v9_adj = df_videos_cleaned_v9.copy(deep=True)

In [18]:
## Keep only the adjectives of every transcript
df_videos_cleaned_adj = corpus_of_adjectives(df=df_videos_cleaned_v9_adj)

In [19]:
## Raw term counts for the adjective-only corpus (only the matrix is kept, not the vocabulary)
X_cv_adj = document_term_matrix(df=df_videos_cleaned_adj, vectorizer=CountVectorizer)[0]

In [20]:
## TF-IDF weights for the adjective-only corpus (only the matrix is kept, not the vocabulary)
X_tfidf_adj = document_term_matrix(df=df_videos_cleaned_adj, vectorizer=TfidfVectorizer)[0]

#### Non-negative matrix factorization (NMF), CountVectorizer

In [21]:
## 6 NMF topics on the adjective count matrix, showing 6 words per topic.
## NOTE(review): this call does not say which vocabulary belongs to X_cv_adj; topic_model
## takes its word list from df_videos_cleaned_v9 (the full corpus), so the words shown
## below are probably not the adjectives these topics were built from - verify.
topics_nmf_cv_adj = topic_model(matrix=X_cv_adj, model=NMF, num_topics=6, num_words=6)[0]
topics_nmf_cv_adj

[['brownforman', 'bahamas', 'beget', 'allow', 'cardinal', 'airline'],
 ['brownfield', 'break', 'burt', 'airline', 'cannula', 'caltech'],
 ['amalgam', 'candor', 'brew', 'break', 'acl', 'allianz'],
 ['boy', 'aj', 'break', 'brazilian', 'bulk', 'bookkeeper'],
 ['article', 'allianz', 'attain', 'candor', 'cannula', 'behaviorally'],
 ['beth', 'candor', 'assured', 'alumnus', 'alexis', 'commissionfree']]

#### Non-negative matrix factorization (NMF), TfidfVectorizer

In [22]:
## 6 NMF topics on the adjective TF-IDF matrix, showing 6 words per topic.
## NOTE(review): this call does not say which vocabulary belongs to X_tfidf_adj; topic_model
## takes its word list from df_videos_cleaned_v9 (the full corpus), so the words shown
## below are probably not the adjectives these topics were built from - verify.
topics_nmf_tfidf_adj = topic_model(matrix=X_tfidf_adj, model=NMF, num_topics=6, num_words=6)[0]
topics_nmf_tfidf_adj

[['beget', 'brownforman', 'brownfield', 'bracelet', 'aj', 'bus'],
 ['amalgam', 'candor', 'article', 'brownfield', 'allianz', 'canberra'],
 ['beth', 'assured', 'commissionfree', 'candor', 'alumnus', 'compose'],
 ['allow', 'archaic', 'cocaine', 'contributes', 'beraldo', 'arab'],
 ['coil', 'adore', 'ageappropriate', 'burt', 'aum', 'airline'],
 ['break', 'boy', 'brazilian', 'bulk', 'caltech', 'acl']]

#### Latent Dirichlet Allocation (LDA), CountVectorizer

In [23]:
## 6 LDA topics on the adjective count matrix, showing 6 words per topic.
## NOTE(review): this call does not say which vocabulary belongs to X_cv_adj; topic_model
## takes its word list from df_videos_cleaned_v9 (the full corpus), so the words shown
## below are probably not the adjectives these topics were built from - verify.
topics_lda_cv_adj = topic_model(matrix=X_cv_adj, model=LatentDirichletAllocation, num_topics=6, num_words=6)[0]
topics_lda_cv_adj

[['archaic', 'break', 'allow', 'boy', 'bookkeeper', 'assured'],
 ['brazilian', 'caltech', 'aum', 'anima', 'axiom', 'chump'],
 ['aj', 'convince', 'bahamas', 'clue', 'commissionfree', 'airline'],
 ['article', 'candor', 'brownfield', 'amalgam', 'beth', 'break'],
 ['beget', 'brownfield', 'coil', 'bulletin', 'bracelet', 'brownforman'],
 ['boy', 'axiom', 'cambria', 'beth', 'aftermath', 'catering']]

### Assigning topics and coefficients to videos

In [24]:
## Version 10: version 9 plus the 'Topic' and 'Topic Coefficient' columns
df_videos_cleaned_v10 = topic_assignment(df=df_videos_cleaned_v9)

In [25]:
## Number of videos assigned to each topic name
df_videos_cleaned_v10['Topic'].value_counts()

Value Investing                       745
Dividend Investing                    212
Technology Stocks                     173
Economic Moats                        129
Passive Investing                     121
Valuation                             113
Fundamental vs. Technical Analysis    112
Electric Vehicle Stocks                84
Valuation (Case Studies)               64
General                                25
Name: Topic, dtype: int64

### Reassigning topics for videos under 'Value Investing'

In [40]:
## Videos labelled 'Value Investing', ordered from the lowest to the highest topic coefficient
df_videos_cleaned_v10_value = (
    df_videos_cleaned_v10
    .loc[lambda frame: frame['Topic'] == 'Value Investing']
    .sort_values('Topic Coefficient')
)
df_videos_cleaned_v10_value[['Title']]

Unnamed: 0,Title
1083,UNBOXING | The Intelligent Investor: The Definitive Book on Value Investing.
1355,A GOOD STOCK FOR LONG TERM INVESTMENT
970,Value Investing Guide Singapore | 8 financial ratios that Value Investors must know
94,How to select a stock for investing | Introduction to fundamental analysis.
86,Top 5 reasons why COAL INDIA is Falling | CA Rachana Ranade
873,Guy Spier: How Value Investing has changed -- new strategies of successful value investors (1)
187,Fundamental Analysis on Keeper DAO: Money Making Machine ($ROOK)
994,Phil Goldstein: The Bulldog's passion for value investing
1458,Value Stocks vs. Growth Stocks: Which Way Should You Invest?
1408,AGTC Stock PRICE TARGET & ANALYSIS - ANALYST SEE 400% GROWTH (Applied Genetic Technologies)


### Pickle the modified dataframe 

In [26]:
## Save version 10 to ../Data/df_videos_cleaned_v10.pickle
pickle_df(df_str='df_videos_cleaned_v10')