In [1]:
import numpy as np
import pandas as pd
import pickle
from datetime import date
from IPython.display import clear_output

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

%load_ext autoreload
%autoreload 2

In [2]:
df_videos_cleaned_v9 = pickle.load(open('../Data/df_videos_cleaned_v9.pickle', 'rb'))

### All the functions (Need to put these into a separate .py file)

In [3]:
def initial_recommender(df):
    '''
    Input: Final dataframe of video data, user input on topic, duration, and upload date
    Output: Top five recommendations
    '''
    ## Take user inputs
    print('Which topics would you like to learn about?')
    topic = input().title()
    
    print('\n')
    print('What is the upper limit of the duration of videos (in minutes)?')
    duration = int(input())
    
    print('\n')
    print('How recent do you want the videos to be (in months since upload date)?')
    upload_date = int(input())
    today = date.today()
    df['Months Since Upload'] = df['Upload Date'].apply(lambda x:((today - x).days)*12/365)

    clear_output()
    
    ## Define a new variable to store the preferred videos. Copy the contents of df to filtered videos
    df_videos_filtered = df.copy()

    ## Return top five videos based on topic coefficient (how relevant the videos are to the user's topic)
    df_videos_filtered = df_videos_filtered[(df_videos_filtered['Topic']==topic) & 
                                      (df_videos_filtered['Duration']<duration) &
                                      (df_videos_filtered['Months Since Upload']<upload_date)]
    df_videos_filtered = df_videos_filtered.sort_values('Topic Coefficient', ascending=False)
    
    return df_videos_filtered.head(), df_videos_filtered

In [4]:
def follow_up_recommender(video_id, df_videos_filtered, vectorizer, similarity_metric):
    '''
    Input: Video ID of a user's liked video, and the dataframe of the filtered videos generated from the initial recommender
    Output: Top five follow-up recommendations using content-based recommender system
    '''
    ## Fit and transform the transcript into a document-term matrix
    word_list = [[word[0] for word in doc] for doc in df_videos_filtered['Transcript']]
    vec = vectorizer(tokenizer=lambda doc:doc, lowercase=False)
    matrix = vec.fit_transform(word_list).toarray()
    
    ## Generate a similarity matrix
    similarity_matrix = similarity_metric(matrix, matrix)
    
    ## Create a series of indices for the videos (rows)  
    df_videos_filtered = df_videos_filtered.reset_index(drop=True)
    indices = pd.Series(df_videos_filtered['Video_ID'])
    
    ## Get the index of the user's liked video
    idx = indices[indices == video_id].index[0]
    
    ## Create a series with the similarity scores in descending order and grab indices
    score_series = pd.Series(similarity_matrix[idx]).sort_values(ascending=False)
    similarity_indices = list(score_series.index)
    
    ## Drop videos that were in the original recommendation
    similarity_indices = [index for index in similarity_indices if index not in list(df_videos_filtered[:5].index)]
    top_5_indices = similarity_indices[:5]
    
    ## Populate a dataframe of the recommended videos
    df_videos_follow_up_recs = df_videos_filtered.iloc[top_5_indices]
    
    return df_videos_follow_up_recs

### Initial knowledge-based recommender

In [5]:
df_videos_recs_test1, df_videos_filtered_test1 = initial_recommender(df_videos_cleaned_v9)

In [6]:
df_videos_recs_test1

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
357,7MCuwYTTdxE,YIS Unit 3: Lesson 2 - Whatever Floats Your Moat,2019-09-02,18.333333,202 views,4,Join the largest High School Investing organiz...,"[(music, NOUN), (welcome, VERB), (young, ADJ),...",Competitive Moats,0.338,18.312329
324,OHCpUcerz1Q,Economic Moat (Competitive Advantage): Warren ...,2020-11-22,14.816667,53 views,2,What is an Economic Moat and how does an inves...,"[(people, NOUN), (prefer, VERB), (buy, VERB), ...",Competitive Moats,0.336,3.616438
362,5MBalQDK5IA,FUNDAMENTAL IN VALUE INVESTING - Economic Moat...,2020-05-26,8.433333,No views,0,Investing is only complex when you are not wel...,"[(hey, INTJ), (welcome, NOUN), (module, PROPN)...",Competitive Moats,0.335,9.534247
264,foiHBYfld2c,How to Pick a Strong Moat | Economic Moats Exp...,2018-12-10,12.483333,"3,024 views",141,★ ★ MY COURSES ★ ★\n\nStock Market Investing f...,"[(hey, INTJ), (welcome, VERB), (ozzie, PROPN),...",Competitive Moats,0.326,27.057534
325,WQiX7a_z--4,The Morningstar Economic Moat Rating,2016-05-02,1.616667,499 views,2,The Morningstar Economic Moat Rating represent...,"[(morningstar, PROPN), (economic, ADJ), (moat,...",Competitive Moats,0.322,58.356164


In [7]:
df_videos_cleaned_v9.to_csv('../App/df_videos_cleaned_v9.csv',index=False)

### Follow-up content-based recommender (after the initial knowledge-based recommender takes care of the cold start problem)

#### TfidfVectorizer, Cosine similarity

In [8]:
X_tfidf_cosine = follow_up_recommender('OHCpUcerz1Q', df_videos_filtered_test1, TfidfVectorizer, cosine_similarity)
X_tfidf_cosine

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
14,IA-dcUF5HL0,What is an Economic MOAT?,2020-09-16,3.933333,15 views,1,How to quickly identify a companies economic m...,"[(today, NOUN), (video, PROPN), (teach, AUX), ...",Competitive Moats,0.282,5.819178
20,thckfWlaVcQ,How to Pick a Stock to Buy - Part 2 - Moats,2019-04-22,5.533333,752 views,37,Moats are competitive advantages that a compan...,"[(hello, INTJ), (welcome, ADJ), (channel, NOUN...",Competitive Moats,0.257,22.684932
7,sHhlvdjiY2g,How Do Economic Moats Add Value? (It Matters),2020-06-24,7.933333,26 views,0,What is Economic Moat? Where did the term come...,"[(today, NOUN), (gon, VERB), (na, PROPN), (tal...",Competitive Moats,0.317,8.580822
12,kz4m-Q-Zpp8,Economic Moats To Look For - Investing Warren ...,2018-02-18,2.016667,"2,131 views",79,In this video I talk about the key ingredient ...,"[(hey, INTJ), (welcome, VERB), (ozzie, NOUN), ...",Competitive Moats,0.294,36.756164
36,F18O45yBtZQ,FUNDAMENTAL IN VALUE INVESTING - Economic Moat...,2020-05-29,9.783333,No views,0,Investing is only complex when you are not wel...,"[(cost, NOUN), (advantage, NOUN), (company, NO...",Competitive Moats,0.208,9.435616


#### TfidfVectorizer, Euclidean distance 

In [9]:
X_tfidf_euclidean = follow_up_recommender('OHCpUcerz1Q', df_videos_filtered_test1, TfidfVectorizer, euclidean_distances)
X_tfidf_euclidean

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
98,-NllUDR8b-M,Coca Cola's 5 moats,2016-07-26,4.433333,"1,376 views",14,This episode explores Coca Cola's 5 moats.,"[(wanted, ADJ), (business, NOUN), (moat, PROPN...",Competitive Moats,0.055,55.561644
58,ozrk3Q_vUvU,Charlie Munger on Traditional Moats. | [C:C.M ...,2020-06-23,1.666667,"2,787 views",72,"In this episode, Charlie Munger was asked that...","[(music, NOUN), (common, ADJ), (sentiment, NOU...",Competitive Moats,0.157,8.613699
69,BH5TnaczIvw,Warren Buffett vs Elon Musk: Moat Building or ...,2018-07-31,2.0,"1,501 views",9,"(May 5, 2018) Warren #Buffett and Charlie #Mun...","[(elon, NOUN), (musk, NOUN), (week, NOUN), (te...",Competitive Moats,0.126,31.39726
102,gdAKtVYELTs,Mohnish Pabrai on how cloning is not in confli...,2020-07-07,4.333333,965 views,32,"[Join YAPSS Membership, For Early Access to Ne...","[(music, PROPN), (buffett, PROPN), (buffett, P...",Competitive Moats,0.04,8.153425
88,ZYh3Y9LXoNo,Warren Buffett: What is a Moat and Why is it I...,2020-05-03,7.216667,25 views,0,"Warren Buffett is an American investor, busine...","[(area, NOUN), (work, NOUN), (good, ADJ), (mor...",Competitive Moats,0.069,10.290411


#### CountVectorizer, Cosine similarity

In [10]:
X_cv_cosine = follow_up_recommender('OHCpUcerz1Q', df_videos_filtered_test1, CountVectorizer, cosine_similarity)
X_cv_cosine

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
14,IA-dcUF5HL0,What is an Economic MOAT?,2020-09-16,3.933333,15 views,1,How to quickly identify a companies economic m...,"[(today, NOUN), (video, PROPN), (teach, AUX), ...",Competitive Moats,0.282,5.819178
7,sHhlvdjiY2g,How Do Economic Moats Add Value? (It Matters),2020-06-24,7.933333,26 views,0,What is Economic Moat? Where did the term come...,"[(today, NOUN), (gon, VERB), (na, PROPN), (tal...",Competitive Moats,0.317,8.580822
12,kz4m-Q-Zpp8,Economic Moats To Look For - Investing Warren ...,2018-02-18,2.016667,"2,131 views",79,In this video I talk about the key ingredient ...,"[(hey, INTJ), (welcome, VERB), (ozzie, NOUN), ...",Competitive Moats,0.294,36.756164
31,GDgQBwQBaI8,5 Economic Moats in the Stock Market📈 | Why Mo...,2017-09-13,5.966667,873 views,40,SUBSCRIBE: http://bit.ly/FinancialinfoYT \nRob...,"[(upbeat, ADJ), (music, NOUN), (hello, INTJ), ...",Competitive Moats,0.226,41.950685
15,mUI912y74GY,How to Identify an Economic Moat - Low Risk St...,2018-02-06,6.25,"2,027 views",83,Do you want to know for sure if a business is ...,"[(hey, INTJ), (hamish, PROPN), (buddy, NOUN), ...",Competitive Moats,0.281,37.150685


#### CountVectorizer, Euclidean distance 

In [11]:
X_cv_euclidean = follow_up_recommender('OHCpUcerz1Q', df_videos_filtered_test1, CountVectorizer, euclidean_distances)
X_cv_euclidean

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
90,KVncb-LuVSU,Dividend Growth Investing: How I Select Stocks,2017-04-05,14.516667,"5,877 views",178,I'm a dividend growth investor building a mach...,"[(invest, VERB), (cash, NOUN), (flow, NOUN), (...",Competitive Moats,0.065,47.243836
9,fYacJ2Hz7BA,Dividend growth investing in companies with an...,2020-01-05,5.666667,"4,223 views",182,What is an economic moat and why is it importa...,"[(everybody, PRON), (welcome, VERB), (channel,...",Competitive Moats,0.305,14.20274
86,Uul7NnPCIk4,Long Term Investing! Modern Day Long Term Inve...,2017-01-02,14.983333,"70,936 views",1.6K,--~--\nThis video covers Long term investing a...,"[(longterm, PROPN), (invest, VERB), (strategy,...",Competitive Moats,0.086,50.30137
83,oFB3h1QkfwQ,How to Analyse Business Models! | #1 Cost Adva...,2021-02-23,11.45,31 views,6,H goes through how to use find cost advantage ...,"[(hey, INTJ), (everybody, PRON), (welcome, VER...",Competitive Moats,0.097,0.558904
84,x090-livIfM,Beginners Guide to Value Investing (2021),2020-07-10,11.283333,141 views,15,In this video you will learn what value invest...,"[(today, NOUN), (gon, VERB), (na, PROPN), (dis...",Competitive Moats,0.087,8.054795


#### *Conclusion: Using TfidfVectorizer and cosine similarity gives the best recommendations, based on my domain knowledge in stock investing.  Tfidf and CV produce similar results for cosine similarity, but Tfidf gets slightly better results in terms of topic relevance*