In [1]:
import numpy as np
import pandas as pd

import spacy
nlp = spacy.load('en_core_web_sm')

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

%load_ext autoreload
%autoreload 2

In [2]:
# Load the cleaned video dataframe. remove_custom_stopwords below parses each
# 'Transcript' cell from text, so the column is expected to hold strings here.
df_videos_cleaned_v6 = pd.read_csv('../Data/df_videos_cleaned_v6.csv')

### Helper functions (TODO: move these into a separate .py module)

In [3]:
def remove_custom_stopwords(df):
    '''
    Drop spaCy's default stopwords plus a few transcript-specific filler words.

    Input: Cleaned dataframe whose 'Transcript' column holds, per row, either the
           string repr of a list of (word, POS) tuples (as read back from CSV) or
           the already-parsed list itself
    Output: The same dataframe object (modified in place and returned) with
            'Transcript' replaced by a list of (lowercased word, POS) tuples,
            stopwords removed
    '''
    # Local import keeps the function self-contained.
    from ast import literal_eval

    # Register the custom stopwords once instead of once per row. As before, this
    # extends the shared stopword set on nlp.Defaults.
    nlp.Defaults.stop_words |= {'uh','yeah','man','um','oh','guy','maybe','bye'}
    stopwords = nlp.Defaults.stop_words

    def final_preprocessing(cleaned_text):
        # literal_eval parses the list-of-tuples literal without executing
        # arbitrary code, which eval() would do on whatever the CSV contains.
        # Rows that are already parsed are passed through so that re-running
        # this step on the same dataframe does not raise.
        if isinstance(cleaned_text, str):
            tagged_words = literal_eval(cleaned_text)
        else:
            tagged_words = cleaned_text

        return [(word.lower(), pos) for word, pos in tagged_words
                    if word.lower() not in stopwords]

    df['Transcript'] = df['Transcript'].apply(final_preprocessing)

    return df

In [4]:
def document_term_matrix(df, vectorizer):
    '''
    Input: Cleaned dataframe (after removing custom stopwords) and type of vectorizer
           (a vectorizer class such as CountVectorizer or TfidfVectorizer, not an instance)
    Output: Tuple of (dense document-term matrix, list of feature names in column order)
    '''
    # Take the words out of the (word, POS) tuples; each document becomes a list of tokens.
    word_list = [[word[0] for word in doc] for doc in df['Transcript']]

    # The documents are already tokenized, so the tokenizer is the identity and
    # lowercasing is disabled (it would expect raw strings).
    vec = vectorizer(tokenizer=lambda doc: doc, lowercase=False, min_df=2, max_df=0.3)
    matrix = vec.fit_transform(word_list).toarray()

    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # and fall back to the old method on versions that predate it.
    if hasattr(vec, 'get_feature_names_out'):
        feature_names = [str(name) for name in vec.get_feature_names_out()]
    else:
        feature_names = vec.get_feature_names()

    return matrix, feature_names

In [5]:
def topic_model(matrix, model, num_topics, num_words, feature_names=None, random_state=None):
    '''
    Input: Document-term matrix, type of topic model (a class accepting n_components,
           e.g. NMF or LatentDirichletAllocation), number of topics, and number of
           words in each topic
           feature_names (optional): vocabulary in the same column order as the matrix,
           i.e. the second value returned by document_term_matrix for that matrix
           random_state (optional): forwarded to the model for reproducible fits
    Output: Tuple of (list of lists containing the top words of each topic,
            document-topic matrix returned by fit_transform)
    '''
    # Both supported models take n_components as their first parameter, so one
    # keyword call covers them (and any other class with that parameter).
    model_kwargs = {'n_components': num_topics}
    if random_state is not None:
        model_kwargs['random_state'] = random_state
    fitted_model = model(**model_kwargs)

    doc_topic = fitted_model.fit_transform(matrix)
    topic_word = fitted_model.components_

    if feature_names is None:
        # Legacy behaviour: label the components with the CountVectorizer vocabulary
        # of the notebook-level df_videos_cleaned_v7. That is only correct when
        # the matrix was built from that same corpus.
        import warnings

        words = document_term_matrix(df_videos_cleaned_v7, CountVectorizer)[1]
        if len(words) != topic_word.shape[1]:
            warnings.warn(
                'topic_model: matrix has {} features but the default vocabulary has {}; '
                'the returned topic words will not match the matrix columns. '
                'Pass feature_names for this matrix.'.format(topic_word.shape[1], len(words))
            )
    else:
        words = list(feature_names)
        if len(words) != topic_word.shape[1]:
            raise ValueError(
                'feature_names has {} entries but the matrix has {} features'.format(
                    len(words), topic_word.shape[1])
            )

    # argsort is ascending, so walk each row backwards from the end to take the
    # num_words highest-weighted columns.
    t_model = topic_word.argsort(axis=1)[:, -1:-(num_words+1):-1]
    top_topic_words = [[words[i] for i in topic] for topic in t_model]

    return top_topic_words, doc_topic

In [6]:
def corpus_of_adjectives(df):
    '''
    Input: Dataframe whose 'Transcript' rows are lists of (word, POS) tuples
           (i.e. after removing custom stopwords)
    Output: The same dataframe object, with each 'Transcript' row reduced to the
            tuples tagged 'ADJ' (words lowercased)
    '''
    def keep_adjectives(tagged_words):
        # Walk the tagged tokens and collect only those tagged as adjectives.
        kept = []
        for token, tag in tagged_words:
            if tag != 'ADJ':
                continue
            kept.append((token.lower(), tag))
        return kept

    # Overwrites the column on the dataframe that was passed in.
    adjective_rows = df['Transcript'].apply(keep_adjectives)
    df['Transcript'] = adjective_rows

    return df

In [7]:
def topic_assignment(df, doc_topic=None):
    '''
    Input: Cleaned dataframe (after removing custom stopwords)
           doc_topic (optional): document-topic matrix with one row per dataframe row,
           e.g. the second value returned by topic_model. When omitted, a 6-topic NMF
           is fitted on the notebook-level X_tfidf, as this function always did.
    Output: The same dataframe object with 'Topic' (label of the highest-weighted topic)
            and 'Topic Coefficient' (that weight, rounded to 3 decimals) added
    '''
    if doc_topic is None:
        # Legacy path: relies on the notebook-level X_tfidf and refits the model.
        doc_topic = topic_model(X_tfidf, NMF, 6, 7)[1]
    doc_topic = np.asarray(doc_topic)

    # Rows are matched to documents by position, so the counts must agree.
    if doc_topic.shape[0] != len(df):
        raise ValueError(
            'doc_topic has {} rows but the dataframe has {}'.format(doc_topic.shape[0], len(df))
        )

    # Labels are tied to the component order of one particular NMF fit; a topic
    # index outside this mapping is labelled with an empty string.
    topic_keys = {0:'General', 1:'Valuation', 2:'Competitive Moats', 3:'Passive Investing',
                  4:'Financial statement Analysis', 5:'Technology stocks'}

    topic = doc_topic.argmax(axis=1)

    df['Topic'] = [topic_keys.get(int(topic_index), '') for topic_index in topic]
    df['Topic Coefficient'] = np.round(doc_topic.max(axis=1), 3)

    return df

In [8]:
def df_to_csv(df_str, df=None):
    '''
    Input: Name of a dataframe in a string format
           df (optional): the dataframe to save; when omitted, the dataframe is
           looked up by name among the global variables
    Output: CSV file of the dataframe saved into the Data folder as <df_str>.csv
            (written without the index)
    '''
    if df is None:
        # Look the name up directly instead of eval()-ing the string, which
        # would execute any expression it was given.
        try:
            df = globals()[df_str]
        except KeyError:
            # Same exception type the eval()-based lookup raised for an unknown name.
            raise NameError("name '{}' is not defined".format(df_str)) from None

    df.to_csv('../Data/{}.csv'.format(df_str), index=False)

### Remove custom stopwords

In [9]:
# NOTE: remove_custom_stopwords assigns to df['Transcript'] on its argument and
# returns it, so df_videos_cleaned_v6 and df_videos_cleaned_v7 are the same object.
df_videos_cleaned_v7 = remove_custom_stopwords(df_videos_cleaned_v6)

### Creating document-term matrices

#### CountVectorizer

In [10]:
# [0] keeps only the dense count matrix; the feature-name list is discarded here.
X_cv = document_term_matrix(df_videos_cleaned_v7, CountVectorizer)[0]

#### TfidfVectorizer

In [11]:
# [0] keeps only the dense tf-idf matrix; topic_assignment later reads X_tfidf as a global.
X_tfidf = document_term_matrix(df_videos_cleaned_v7, TfidfVectorizer)[0]

### Topic modeling - Entire corpus

#### Non-negative matrix factorization (NMF), CountVectorizer

In [12]:
# 6 topics, 7 top words per topic; [0] selects the per-topic word lists.
topics_nmf_cv = topic_model(X_cv, NMF, 6, 7)[0]
topics_nmf_cv

[['sort', 'buffett', 'cheap', 'multiple', 'moat', 'team', 'answer'],
 ['option', 'leap', 'decay', 'cover', 'spread', 'view', 'ge'],
 ['equal', 'divide', 'discount', 'constant', 'present', 'zero', 'minus'],
 ['graham', 'buffett', 'security', 'ben', 'street', 'intrinsic', 'warren'],
 ['bank', 'support', 'news', 'report', 'resistance', 'break', 'chart'],
 ['etf', 'holding', 'index', 'tax', 'sector', 'expense', 'goal']]

#### Non-negative matrix factorization (NMF), TfidfVectorizer

In [13]:
# Same settings as the count-based run, on the tf-idf matrix. The topic labels
# hard-coded in topic_assignment use the same model and sizes (NMF, 6, 7 on X_tfidf).
topics_nmf_tfidf = topic_model(X_tfidf, NMF, 6, 7)[0]
topics_nmf_tfidf

[['buffett', 'sort', 'technical', 'warren', 'bank', 'trader', 'index'],
 ['constant',
  'formula',
  'divide',
  'discount',
  'present',
  'equal',
  'calculate'],
 ['moat', 'mode', 'competitive', 'economic', 'brand', 'competitor', 'castle'],
 ['etf', 'holding', 'index', 'vanguard', 'expense', 'johnson', 'etfs'],
 ['ebitda', 'enterprise', 'multiple', 'irr', 'statement', 'forecast', 'da'],
 ['tesla', 'apple', 'pe', 'amazon', 'car', 'vehicle', 'facebook']]

#### Latent Dirichlet Allocation (LDA), CountVectorizer

In [14]:
# LDA on the count matrix: 6 topics, 7 top words per topic.
# NOTE(review): no random_state is set anywhere, so this output may differ between runs — confirm.
topics_lda_cv = topic_model(X_cv, LatentDirichletAllocation, 6, 7)[0]
topics_lda_cv

[['discount',
  'divide',
  'equal',
  'calculate',
  'formula',
  'present',
  'constant'],
 ['brand', 'game', 'sort', 'china', 'team', 'customer', 'moat'],
 ['pe', 'bank', 'statement', 'sheet', 'multiple', 'apple', 'ebitda'],
 ['etf', 'tesla', 'index', 'tax', 'sector', 'holding', 'goal'],
 ['buffett', 'sort', 'warren', 'graham', 'cheap', 'berkshire', 'write'],
 ['moat', 'trader', 'economic', 'technical', 'chart', 'trend', 'news']]

### Topic modeling - Adjectives

In [15]:
# Work on a copy: corpus_of_adjectives overwrites the 'Transcript' column of its argument.
df_videos_cleaned_v7_adj = df_videos_cleaned_v7.copy()

In [16]:
# corpus_of_adjectives returns the dataframe it was given, so df_videos_cleaned_adj
# and df_videos_cleaned_v7_adj are the same object.
df_videos_cleaned_adj = corpus_of_adjectives(df_videos_cleaned_v7_adj)

In [17]:
# Count matrix over the adjective-only transcripts; its vocabulary (discarded by [0])
# is separate from the full-corpus one.
X_cv_adj = document_term_matrix(df_videos_cleaned_adj, CountVectorizer)[0]

In [18]:
# Tf-idf matrix over the adjective-only transcripts; the feature names are discarded by [0].
X_tfidf_adj = document_term_matrix(df_videos_cleaned_adj, TfidfVectorizer)[0]

#### Non-negative matrix factorization (NMF), CountVectorizer

In [19]:
# NOTE(review): as called here, topic_model labels components with the CountVectorizer
# vocabulary of df_videos_cleaned_v7 (the full corpus), not the adjective-only vocabulary
# X_cv_adj was built from, so the words shown below likely do not correspond to this
# matrix's columns — TODO confirm and align the vocabulary.
topics_nmf_cv_adj = topic_model(X_cv_adj, NMF, 6, 6)[0]
topics_nmf_cv_adj

[['airtel', 'bourbon', 'badmind', 'allocation', 'convex', 'captured'],
 ['brokerages', 'brandy', 'bureau', 'airbus', 'candid', 'callaghan'],
 ['arrogance', 'alumnus', 'canadianbased', 'allergy', 'breaks', 'att'],
 ['bertrand', 'brandy', 'bourbon', 'canadianbased', 'alum', 'assumed'],
 ['cognitive', 'browse', 'arcanum', 'buffy', 'brandy', 'calibrate'],
 ['boxer', 'bedell', 'brokerdealer', 'blab', 'acknowledge', 'blackness']]

#### Non-negative matrix factorization (NMF), TfidfVectorizer

In [20]:
# NOTE(review): as called here, topic_model labels components with the CountVectorizer
# vocabulary of df_videos_cleaned_v7 (the full corpus), not the adjective-only vocabulary
# X_tfidf_adj was built from, so the words shown below likely do not correspond to this
# matrix's columns — TODO confirm and align the vocabulary.
topics_nmf_tfidf_adj = topic_model(X_tfidf_adj, NMF, 6, 6)[0]
topics_nmf_tfidf_adj

[['bedell', 'brokerdealer', 'airtel', 'brokerages', 'boxer', 'burg'],
 ['alumnus', 'arrogance', 'canadianbased', 'brokerages', 'allergy', 'cameron'],
 ['bertrand', 'assumed', 'commerce', 'canadianbased', 'alum', 'compliant'],
 ['allocation', 'arcanum', 'coauthored', 'contradictory', 'benefited', 'aqui'],
 ['cognitive', 'adolescent', 'ag', 'bureau', 'brokerages', 'browse'],
 ['brandy', 'brand', 'bourbon', 'buffer', 'calibrate', 'acknowledge']]

#### Latent Dirichlet Allocation (LDA), CountVectorizer

In [21]:
# NOTE(review): as called here, topic_model labels components with the CountVectorizer
# vocabulary of df_videos_cleaned_v7 (the full corpus), not the adjective-only vocabulary
# X_cv_adj was built from, so the words shown below likely do not correspond to this
# matrix's columns — TODO confirm and align the vocabulary.
topics_lda_cv_adj = topic_model(X_cv_adj, LatentDirichletAllocation, 6, 6)[0]
topics_lda_cv_adj

[['arrogance',
  'alumnus',
  'canadianbased',
  'bertrand',
  'assumed',
  'brokerages'],
 ['bourbon', 'airtel', 'brandy', 'bertrand', 'buffer', 'brand'],
 ['allocation', 'brokerages', 'arcanum', 'allergy', 'bureau', 'coauthored'],
 ['boxer', 'airtel', 'convex', 'badmind', 'clout', 'airbus'],
 ['bedell', 'buffy', 'brokerdealer', 'burger', 'blackness', 'burg'],
 ['cognitive', 'brand', 'calibrate', 'brandy', 'bonfire', 'browse']]

### Assigning topics and coefficients to videos

In [22]:
# NOTE: as called here, topic_assignment fits a fresh 6-topic NMF on the global X_tfidf
# rather than reusing the fit displayed above. It adds the columns to its argument and
# returns it, so df_videos_cleaned_v8 is the same object as df_videos_cleaned_v7.
df_videos_cleaned_v8 = topic_assignment(df_videos_cleaned_v7)

### Save the modified dataframe into a CSV file

In [23]:
# The string is both the variable name to look up and the CSV file name under ../Data/.
df_to_csv('df_videos_cleaned_v8')