In [1]:
import numpy as np
import pandas as pd
import pickle
from datetime import date
from IPython.display import clear_output

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

%load_ext autoreload
%autoreload 2

In [2]:
df_videos_cleaned_v10 = pickle.load(open('../Data/df_videos_cleaned_v10.pickle', 'rb')).reset_index(drop=True)

### All the functions (Need to put these into a separate .py file)

In [3]:
def initial_recommender(df):
    '''
    Input: Final dataframe of video data, user input on topic, duration, and upload date
    Output: Top five recommendations
    '''
    ## Take user inputs
    print('Which topics would you like to learn about?')
    topic = input()
    
    print('\n')
    print('What is the upper limit of the duration of videos (in minutes)?')
    duration = int(input())
    
    print('\n')
    print('How recent do you want the videos to be (in months since upload date)?')
    upload_date = int(input())
    today = date.today()
    df['Months Since Upload'] = df['Upload Date'].apply(lambda x:((today - x).days)*12/365)

    clear_output()
    
    ## Define a new variable to store the preferred videos. Copy the contents of df to filtered videos
    df_videos_filtered = df.copy()

    ## Return top five videos based on topic coefficient (how relevant the videos are to the user's topic)
    df_videos_filtered = df_videos_filtered[(df_videos_filtered['Topic']==topic) & 
                                      (df_videos_filtered['Duration']<duration) &
                                      (df_videos_filtered['Months Since Upload']<upload_date)]
    df_videos_filtered = df_videos_filtered.sort_values('Topic Coefficient', ascending=False)
    
    return df_videos_filtered.head(), df_videos_filtered

In [4]:
def follow_up_recommender(video_id, df_videos_filtered, vectorizer, similarity_metric):
    '''
    Input: Video ID of a user's liked video, and the dataframe of the filtered videos generated from the initial recommender
    Output: Top five follow-up recommendations using content-based recommender system
    '''
    ## Fit and transform the transcript into a document-term matrix
    word_list = [[word[0] for word in doc] for doc in df_videos_filtered['Transcript']]
    vec = vectorizer(tokenizer=lambda doc:doc, lowercase=False)
    matrix = vec.fit_transform(word_list).toarray()
    
    ## Generate a similarity matrix
    similarity_matrix = similarity_metric(matrix, matrix)
    
    ## Create a series of indices for the videos (rows)  
    df_videos_filtered = df_videos_filtered.reset_index(drop=True)
    indices = pd.Series(df_videos_filtered['Video_ID'])
    
    ## Get the index of the user's liked video
    idx = indices[indices == video_id].index[0]
    
    ## Create a series with the similarity scores in descending order and grab indices
    score_series = pd.Series(similarity_matrix[idx]).sort_values(ascending=False)
    similarity_indices = list(score_series.index)
    
    ## Drop videos that were in the original recommendation
    similarity_indices = [index for index in similarity_indices if index not in list(df_videos_filtered[:5].index)]
    top_5_indices = similarity_indices[:5]
    
    ## Populate a dataframe of the recommended videos
    df_videos_follow_up_recs = df_videos_filtered.iloc[top_5_indices]
    
    return df_videos_follow_up_recs

### Initial knowledge-based recommender

In [5]:
df_videos_recs_test1, df_videos_filtered_test1 = initial_recommender(df_videos_cleaned_v10)

In [6]:
df_videos_recs_test1

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
1120,5MBalQDK5IA,FUNDAMENTAL IN VALUE INVESTING - Economic Moat...,2020-05-26,8.433333,No views,0,Investing is only complex when you are not wel...,"[(welcome, NOUN), (module, PROPN), (module, PR...",Economic Moats,0.338,9.863014
1089,OHCpUcerz1Q,Economic Moat (Competitive Advantage): Warren ...,2020-11-22,14.816667,53 views,2,What is an Economic Moat and how does an inves...,"[(people, NOUN), (prefer, VERB), (buy, VERB), ...",Economic Moats,0.335,3.945205
1150,sHhlvdjiY2g,How Do Economic Moats Add Value? (It Matters),2020-06-24,7.933333,26 views,0,What is Economic Moat? Where did the term come...,"[(today, NOUN), (gon, VERB), (na, PROPN), (tal...",Economic Moats,0.313,8.909589
1155,TV4bxhc2Azw,5M Mondays - Moat - The 5 Moats,2020-09-28,11.216667,3 views,0,What are the 5 Moats and why are they importan...,"[(happy, ADJ), (mm, PROPN), (monday, PROPN), (...",Economic Moats,0.303,5.753425
1066,_WpbrpYjvHk,Investing Basics: Economic Moats Explained,2020-11-15,17.633333,58 views,5,In this video I explain the investing basics o...,"[(welcome, ADJ), (financial, ADJ), (interest, ...",Economic Moats,0.284,4.175342


In [7]:
df_videos_cleaned_v10.to_csv('../App/df_videos_cleaned_v10.csv',index=False)

### Follow-up content-based recommender (after the initial knowledge-based recommender takes care of the cold start problem)

#### TfidfVectorizer, Cosine similarity

In [8]:
X_tfidf_cosine = follow_up_recommender(df_videos_recs_test1['Video_ID'].iloc[0], df_videos_filtered_test1, TfidfVectorizer, cosine_similarity)
X_tfidf_cosine

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
18,F18O45yBtZQ,FUNDAMENTAL IN VALUE INVESTING - Economic Moat...,2020-05-29,9.783333,No views,0,Investing is only complex when you are not wel...,"[(cost, NOUN), (advantage, NOUN), (company, NO...",Economic Moats,0.206,9.764384
13,dZItart8asY,What Is An Economic Moat?,2020-11-20,7.416667,20 views,4,Be More Productive: https://skl.sh/33u3Qbl Kno...,"[(economic, ADJ), (mode, NOUN), (longterm, PRO...",Economic Moats,0.235,4.010959
10,Z9H162gzzCg,Market Beating Stocks With Economic Moats (Com...,2020-09-24,10.483333,"3,528 views",146,One of the most important things we look for i...,"[(important, ADJ), (thing, NOUN), (look, VERB)...",Economic Moats,0.253,5.884932
14,JgHWx_-_4K0,What is an Economic Moat? | Warren Buffett & C...,2020-07-28,5.916667,314 views,19,"""The term economic moat, popularized by Warren...","[(hello, INTJ), (welcome, VERB), (financial, A...",Economic Moats,0.229,7.791781
16,s37ifMv5894,What is Economic Moat? | Warren Buffett Invest...,2020-10-16,4.933333,392 views,30,Hello everyone. Warren Buffett believes that i...,"[(hello, INTJ), (welcome, ADJ), (channel, NOUN...",Economic Moats,0.213,5.161644


#### TfidfVectorizer, Euclidean distance 

In [9]:
X_tfidf_euclidean = follow_up_recommender(df_videos_recs_test1['Video_ID'].iloc[0], df_videos_filtered_test1, TfidfVectorizer, euclidean_distances)
X_tfidf_euclidean

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
47,gdAKtVYELTs,Mohnish Pabrai on how cloning is not in confli...,2020-07-07,3.283333,965 views,32,"[Join YAPSS Membership, For Early Access to Ne...","[(music, PROPN), (buffett, PROPN), (buffett, P...",Economic Moats,0.039,8.482192
40,wWjv2bIKYDQ,What are MOATS in business? - Tushar Kansal | ...,2020-12-21,3.933333,5 views,0,What are MOATS in business?\n\nEnjoyed the Vid...,"[(music, NOUN), (modes, VERB), (business, NOUN...",Economic Moats,0.095,2.991781
26,ozrk3Q_vUvU,Charlie Munger on Traditional Moats. | [C:C.M ...,2020-06-23,1.666667,"2,787 views",72,"In this episode, Charlie Munger was asked that...","[(music, NOUN), (common, ADJ), (sentiment, NOU...",Economic Moats,0.156,8.942466
48,SSqhCAgUacQ,Warren Buffet: Durable Competitive Advantage (...,2020-09-23,7.166667,132 views,15,Enjoyed this video? \n\nCheck out more videos\...,"[(music, PROPN), (video, PROPN), (warren, PROP...",Economic Moats,0.0,5.917808
43,iU1LJlTH8w4,Warren Buffett On Distinct Competitive Advanta...,2020-05-09,3.2,30 views,0,➖➖➖➖➖➖➖➖➖➖➖➖➖➖➖➖➖➖➖➖➖➖➖\n\nFollow Me on these ...,"[(welcome, INTJ), (know, VERB), (tommy, PROPN)...",Economic Moats,0.056,10.421918


#### CountVectorizer, Cosine similarity

In [10]:
X_cv_cosine = follow_up_recommender(df_videos_recs_test1['Video_ID'].iloc[0], df_videos_filtered_test1, CountVectorizer, cosine_similarity)
X_cv_cosine

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
13,dZItart8asY,What Is An Economic Moat?,2020-11-20,7.416667,20 views,4,Be More Productive: https://skl.sh/33u3Qbl Kno...,"[(economic, ADJ), (mode, NOUN), (longterm, PRO...",Economic Moats,0.235,4.010959
10,Z9H162gzzCg,Market Beating Stocks With Economic Moats (Com...,2020-09-24,10.483333,"3,528 views",146,One of the most important things we look for i...,"[(important, ADJ), (thing, NOUN), (look, VERB)...",Economic Moats,0.253,5.884932
18,F18O45yBtZQ,FUNDAMENTAL IN VALUE INVESTING - Economic Moat...,2020-05-29,9.783333,No views,0,Investing is only complex when you are not wel...,"[(cost, NOUN), (advantage, NOUN), (company, NO...",Economic Moats,0.206,9.764384
14,JgHWx_-_4K0,What is an Economic Moat? | Warren Buffett & C...,2020-07-28,5.916667,314 views,19,"""The term economic moat, popularized by Warren...","[(hello, INTJ), (welcome, VERB), (financial, A...",Economic Moats,0.229,7.791781
16,s37ifMv5894,What is Economic Moat? | Warren Buffett Invest...,2020-10-16,4.933333,392 views,30,Hello everyone. Warren Buffett believes that i...,"[(hello, INTJ), (welcome, ADJ), (channel, NOUN...",Economic Moats,0.213,5.161644


#### CountVectorizer, Euclidean distance 

In [11]:
X_cv_euclidean = follow_up_recommender(df_videos_recs_test1['Video_ID'].iloc[0], df_videos_filtered_test1, CountVectorizer, euclidean_distances)
X_cv_euclidean

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
46,cOlPoEAnN6E,How To Pick Dividend Growth Stocks With Good a...,2020-05-21,14.85,"4,297 views",310,"In this video, I want to share with my step by...","[(pick, VERB), (dividend, NOUN), (growth, NOUN...",Economic Moats,0.046,10.027397
44,a7-LRi-x4B8,Step 2: Moat (Cost Advantage),2020-08-25,8.6,2 views,1,🔥 Step 2: Moat (Cost Advantage) 🔥 \n\nThe 2nd...,"[(welcome, NOUN), (today, NOUN), (video, PROPN...",Economic Moats,0.056,6.871233
35,2creBZPaizI,Warren Buffett's Wonderful Companies: Strong E...,2020-07-14,15.266667,583 views,33,Charlie Tian lays down Warren Buffett's fairly...,"[(fellow, ADJ), (investor, NOUN), (charlie, PR...",Economic Moats,0.123,8.252055
39,oFB3h1QkfwQ,How to Analyse Business Models! | #1 Cost Adva...,2021-02-23,11.45,31 views,6,H goes through how to use find cost advantage ...,"[(everybody, PRON), (welcome, VERB), (invests,...",Economic Moats,0.1,0.887671
19,deYamGLc154,How to Bulletproof a Business Acquisition (Hin...,2020-03-30,10.666667,"1,688 views",83,Free M&A Training: https://jasonpaulrogers.com...,"[(jason, PROPN), (rogers, PROPN), (video, PROP...",Economic Moats,0.204,11.736986


#### *Conclusion: Using CountVectorizer and cosine similarity gives the best recommendations, based on my domain knowledge in stock investing.  Tfidf and CV produce similar results for cosine similarity, but Tfidf gets slightly better results in terms of topic relevance*