In [1]:
import numpy as np
import pandas as pd
import pickle
from datetime import date
from IPython.display import clear_output

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

%load_ext autoreload
%autoreload 2

In [2]:
df_videos_cleaned_v9 = pickle.load(open('../Data/df_videos_cleaned_v9.pickle', 'rb'))

In [84]:
df_videos_cleaned_v9.to_csv('../App/df_videos_cleaned_v9.csv',index=False)

### All the functions (Need to put these into a separate .py file)

In [3]:
def initial_recommender(df):
    '''
    Input: Final dataframe of video data, user input on topic, duration, and upload date
    Output: Top five recommendations
    '''
    ## Take user inputs
    print('Which topics would you like to learn about?')
    topic = input().title()
    
    print('\n')
    print('What is the upper limit of the duration of videos (in minutes)?')
    duration = int(input())
    
    print('\n')
    print('How recent do you want the videos to be (in months since upload date)?')
    upload_date = int(input())
    today = date.today()
    df['Months Since Upload'] = df['Upload Date'].apply(lambda x:((today - x).days)*12/365)

    clear_output()
    
    ## Define a new variable to store the preferred videos. Copy the contents of df to filtered videos
    df_videos_filtered = df.copy()

    ## Return top five videos based on topic coefficient (how relevant the videos are to the user's topic)
    df_videos_filtered = df_videos_filtered[(df_videos_filtered['Topic']==topic) & 
                                      (df_videos_filtered['Duration']<duration) &
                                      (df_videos_filtered['Months Since Upload']<upload_date)]
    df_videos_filtered = df_videos_filtered.sort_values('Topic Coefficient', ascending=False)
    
    return df_videos_filtered.head(), df_videos_filtered

In [63]:
def follow_up_recommender(video_id, df_videos_filtered, vectorizer, similarity_metric):
    '''
    Input: Video ID of a user's liked video, and the dataframe of the filtered videos generated from the initial recommender
    Output: Top five follow-up recommendations using content-based recommender system
    '''
    ## Fit and transform the transcript into a document-term matrix
    word_list = [[word[0] for word in doc] for doc in df_videos_filtered['Transcript']]
    vec = vectorizer(tokenizer=lambda doc:doc, lowercase=False)
    matrix = vec.fit_transform(word_list).toarray()
    
    ## Generate a similarity matrix
    similarity_matrix = similarity_metric(matrix, matrix)
    
    ## Create a series of indices for the videos (rows)  
    df_videos_filtered = df_videos_filtered.reset_index(drop=True)
    indices = pd.Series(df_videos_filtered['Video_ID'])
    
    ## Get the index of the user's liked video
    idx = indices[indices == video_id].index[0]
    
    ## Create a series with the similarity scores in descending order and grab indices
    score_series = pd.Series(similarity_matrix[idx]).sort_values(ascending=False)
    similarity_indices = list(score_series.index)
    
    ## Drop videos that were in the original recommendation
    similarity_indices = [index for index in similarity_indices if index not in list(df_videos_filtered[:5].index)]
    top_5_indices = similarity_indices[:5]
    
    ## Populate a dataframe of the recommended videos
    df_videos_follow_up_recs = df_videos_filtered.iloc[top_5_indices]
    
    return df_videos_follow_up_recs

### Initial knowledge-based recommender

In [66]:
df_videos_recs_test1, df_videos_filtered_test1 = initial_recommender(df_videos_cleaned_v9)

In [97]:
df_videos_recs_test1  

2jLngtFzh8I
Pomp Podcast #412: Peter Doyle on Modern Value Investing
Z52LyI9j1rI
TIP332: Long-Term Investing w/ Tom Gayner
X-3KpqHr_kY
Is Warren Buffett's 'Value Investing' Dead?
mR_iGHZ91AY
Long Term Investing vs Momentum Trading in the Current Market
Rd4fEU3W49w
LONG TERM STOCK INVESTING SECRETS


### Follow-up content-based recommender (after the initial knowledge-based recommender takes care of the cold start problem)

#### TfidfVectorizer, Cosine similarity

In [80]:
X_tfidf_cosine = follow_up_recommender('mR_iGHZ91AY', df_videos_filtered_test1, TfidfVectorizer, cosine_similarity)
X_tfidf_cosine

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
7,zqgRCerwSfQ,Free Cash Flow Plus Growth: Reverse Engineerin...,2020-11-03,5.666667,"1,844 views",72,QuickFS Link: https://quickfs.net/?via=focused...,"[(music, NOUN), (welcome, ADJ), (welcome, ADJ)...",General,0.094,4.208219
202,nOrUZMxv5Z0,Podcast: Spot Emerging Trends With A Long-Term...,2020-03-07,3.433333,344 views,4,How can you leverage short-term thinking to ca...,"[(okay, INTJ), (ross, PROPN), (let, VERB), (lo...",General,0.038,12.131507
9,tad2l_wwnGM,Peter Lynch: How To Invest With Stocks At High...,2021-01-09,2.066667,"222,284 views",5K,In this video Peter Lynch discusses how to inv...,"[(peter, PROPN), (lynch, PROPN), (smart, ADJ),...",General,0.087,2.005479
6,I6ued1LIdbM,Howard Marks & Joel Greenblatt on Value Investing,2020-10-21,2.0,"17,955 views",492,Howard Marks and Joel Greenblatt talk about va...,"[(think, VERB), (reason, NOUN), (people, NOUN)...",General,0.095,4.635616
64,hqe9fdmBp0w,"Portfolio manager weighs in on Gamestop, short...",2021-01-29,9.233333,"3,114 views",39,Yahoo Finance’s Akiko Fujita and Zack Guzman d...,"[(begin, VERB), (today, NOUN), (discussion, NO...",General,0.064,1.347945


#### TfidfVectorizer, Euclidean distance 

In [81]:
X_tfidf_euclidean = follow_up_recommender('mR_iGHZ91AY', df_videos_filtered_test1, TfidfVectorizer, euclidean_distances)
X_tfidf_euclidean

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
259,IhNXFgfLnl0,How to Select Best Stocks for long Term Invest...,2021-01-01,8.333333,"4,341 views",256,📊 | Tamil Stock Market | \n\n🎯 OPEN ZERODHA DE...,"[(foreign, ADJ), (thank, VERB)]",General,0.009,2.268493
264,La0PZz3GvHE,Stock Market Prediction AI: Your Economic Moat...,2019-09-26,1.5,76 views,5,Read Full Article Here: https://iknowfirst.com...,"[(music, NOUN), (applause, NOUN), (music, NOUN...",General,0.005,17.490411
261,_tU-mHHaoWQ,introduction to fundamental Analysis ।। Class 1,2021-02-13,5.9,10 views,0,introduction to fundamental Analysis।।\nLearn ...,"[(music, NOUN), (applause, NOUN), (music, NOUN...",General,0.009,0.854795
263,c9NEhHYC53E,Havells India Ltd. | Fundamental Analysis | #S...,2020-09-19,1.933333,41 views,6,A Small Initiative to make awareness in share ...,"[(music, NOUN), (music, NOUN), (applause, NOUN...",General,0.005,5.687671
265,nMPkvDLU0yg,Fundamental vs. Technical Analysis: Which One ...,2020-07-04,6.85,29 views,1,Most beginner investors ask the best way to pi...,"[(music, NOUN), (applause, NOUN), (music, NOUN...",General,0.005,8.219178


#### CountVectorizer, Cosine similarity

In [82]:
X_cv_cosine = follow_up_recommender('mR_iGHZ91AY', df_videos_filtered_test1, CountVectorizer, cosine_similarity)
X_cv_cosine

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
202,nOrUZMxv5Z0,Podcast: Spot Emerging Trends With A Long-Term...,2020-03-07,3.433333,344 views,4,How can you leverage short-term thinking to ca...,"[(okay, INTJ), (ross, PROPN), (let, VERB), (lo...",General,0.038,12.131507
64,hqe9fdmBp0w,"Portfolio manager weighs in on Gamestop, short...",2021-01-29,9.233333,"3,114 views",39,Yahoo Finance’s Akiko Fujita and Zack Guzman d...,"[(begin, VERB), (today, NOUN), (discussion, NO...",General,0.064,1.347945
6,I6ued1LIdbM,Howard Marks & Joel Greenblatt on Value Investing,2020-10-21,2.0,"17,955 views",492,Howard Marks and Joel Greenblatt talk about va...,"[(think, VERB), (reason, NOUN), (people, NOUN)...",General,0.095,4.635616
7,zqgRCerwSfQ,Free Cash Flow Plus Growth: Reverse Engineerin...,2020-11-03,5.666667,"1,844 views",72,QuickFS Link: https://quickfs.net/?via=focused...,"[(music, NOUN), (welcome, ADJ), (welcome, ADJ)...",General,0.094,4.208219
245,sKueO5LQoV8,Best Long-term Investments to Buy NOW! - Sport...,2020-10-03,9.333333,"3,866 views",101,My website: https://sportscardsinvest.com/?nr=...,"[(today, NOUN), (bring, VERB), (brand, NOUN), ...",General,0.026,5.227397


#### CountVectorizer, Euclidean distance 

In [83]:
X_cv_euclidean = follow_up_recommender('mR_iGHZ91AY', df_videos_filtered_test1, CountVectorizer, euclidean_distances)
X_cv_euclidean

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
7,zqgRCerwSfQ,Free Cash Flow Plus Growth: Reverse Engineerin...,2020-11-03,5.666667,"1,844 views",72,QuickFS Link: https://quickfs.net/?via=focused...,"[(music, NOUN), (welcome, ADJ), (welcome, ADJ)...",General,0.094,4.208219
8,9sWecuhjzOI,"My March 2021 BNPL Game Plan For ZIP, Afterpay...",2021-03-03,2.75,"5,247 views",190,In this video I unpack my March 2021 game plan...,"[(hey, INTJ), (welcome, INTJ), (phil, ADJ), (m...",General,0.088,0.263014
103,X_7s1g9Uut0,Top 7 BANK Stocks to Watch! - Q3 Analysis 2020...,2020-09-06,2.016667,"9,504 views",700,What are the top bank stocks to watch in 2020 ...,"[(long, ADV), (wait, VERB), (finally, ADV), (c...",General,0.057,6.115068
98,Urnbzz0feHA,Why Mark Yusko Always Comes Back to Value Inve...,2019-10-10,1.533333,"20,543 views",511,A.I. and machine learning are changing the fac...,"[(mark, PROPN), (yusko, PROPN), (trade, NOUN),...",General,0.058,17.030137
59,s-V4-rrAYnE,DIVIDEND GROWTH INVESTING (How To Drive 446% S...,2019-07-25,1.05,"14,333 views",650,Today's video answers three important subscrib...,"[(hey, INTJ), (dividend, NOUN), (invest, VERB)...",General,0.064,19.561644


#### *Conclusion: Using TfidfVectorizer and cosine similarity gives the best recommendations, based on my domain knowledge in stock investing.  Tfidf and CV produce similar results for cosine similarity, but Tfidf gets slightly better results in terms of topic relevance*