In [1]:
import numpy as np
import pandas as pd
import pickle
from datetime import date
from IPython.display import clear_output

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

%load_ext autoreload
%autoreload 2

In [2]:
df_videos_cleaned_v9 = pickle.load(open('../Data/df_videos_cleaned_v9.pickle', 'rb'))

### All the functions (Need to put these into a separate .py file)

In [3]:
def initial_recommender(df):
    '''
    Input: Final dataframe of video data, user input on topic, duration, and upload date
    Output: Top five recommendations
    '''
    ## Take user inputs
    print('Which topics would you like to learn about?')
    topic = input().title()
    
    print('\n')
    print('What is the upper limit of the duration of videos (in minutes)?')
    duration = int(input())
    
    print('\n')
    print('How recent do you want the videos to be (in months since upload date)?')
    upload_date = int(input())
    today = date.today()
    df['Months Since Upload'] = df['Upload Date'].apply(lambda x:((today - x).days)*12/365)

    clear_output()
    
    ## Define a new variable to store the preferred videos. Copy the contents of df to filtered videos
    df_videos_filtered = df.copy()

    ## Return top five videos based on topic coefficient (how relevant the videos are to the user's topic)
    df_videos_filtered = df_videos_filtered[(df_videos_filtered['Topic']==topic) & 
                                      (df_videos_filtered['Duration']<duration) &
                                      (df_videos_filtered['Months Since Upload']<upload_date)]
    df_videos_filtered = df_videos_filtered.sort_values('Topic Coefficient', ascending=False)
    
    return df_videos_filtered.head(), df_videos_filtered

In [4]:
def follow_up_recommender(video_id, df_videos_filtered, vectorizer, similarity_metric):
    '''
    Input: Video ID of a user's liked video, and the dataframe of the filtered videos generated from the initial recommender
    Output: Top five follow-up recommendations using content-based recommender system
    '''
    ## Fit and transform the transcript into a document-term matrix
    word_list = [[word[0] for word in doc] for doc in df_videos_filtered['Transcript']]
    vec = vectorizer(tokenizer=lambda doc:doc, lowercase=False)
    matrix = vec.fit_transform(word_list).toarray()
    
    ## Generate a similarity matrix
    similarity_matrix = similarity_metric(matrix, matrix)
    
    ## Create a series of indices for the videos (rows)  
    df_videos_filtered = df_videos_filtered.reset_index(drop=True)
    indices = pd.Series(df_videos_filtered['Video_ID'])
    
    ## Get the index of the user's liked video
    idx = indices[indices == video_id].index[0]
    
    ## Create a series with the similarity scores in descending order and grab indices
    score_series = pd.Series(similarity_matrix[idx]).sort_values(ascending=False)
    similarity_indices = list(score_series.index)
    
    ## Drop videos that were in the original recommendation
    similarity_indices = [index for index in similarity_indices if index not in list(df_videos_filtered[:5].index)]
    top_5_indices = similarity_indices[:5]
    
    ## Populate a dataframe of the recommended videos
    df_videos_follow_up_recs = df_videos_filtered.iloc[top_5_indices]
    
    return df_videos_follow_up_recs

### Initial knowledge-based recommender

In [5]:
df_videos_recs_test1, df_videos_filtered_test1 = initial_recommender(df_videos_cleaned_v9)

In [6]:
df_videos_recs_test1

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
893,2jLngtFzh8I,Pomp Podcast #412: Peter Doyle on Modern Value...,2020-10-21,4.666667,"8,710 views",239,This is an episode of The Pomp Podcast with ho...,"[(right, INTJ), (bang, NOUN), (bang, NOUN), (s...",General,0.113,4.70137
1155,Z52LyI9j1rI,TIP332: Long-Term Investing w/ Tom Gayner,2021-01-18,2.75,"3,127 views",100,"In today’s episode, we sit down with legendary...","[(listen, VERB), (tip, NOUN), (today, NOUN), (...",General,0.112,1.775342
989,X-3KpqHr_kY,Is Warren Buffett's 'Value Investing' Dead?,2020-06-03,4.666667,"32,316 views",865,"Is value investing dead? Ben Graham, and now W...","[(hey, INTJ), (welcome, NOUN), (channel, NOUN)...",General,0.105,9.30411
1237,mR_iGHZ91AY,Long Term Investing vs Momentum Trading in the...,2020-04-15,1.05,"9,508 views",222,"The stock market has fallen, and bounced. It's...","[(hey, INTJ), (welcome, NOUN), (channel, NOUN)...",General,0.102,10.915068
855,Iv0zmTmKHYg,Value Investing I: The Back Story!,2020-10-23,19.85,"24,518 views",846,"Many investors, when asked to describe their i...","[(hi, INTJ), (welcome, ADJ), (session, NOUN), ...",General,0.102,4.635616


In [7]:
df_videos_cleaned_v9.to_csv('../App/df_videos_cleaned_v9.csv',index=False)

### Follow-up content-based recommender (after the initial knowledge-based recommender takes care of the cold start problem)

#### TfidfVectorizer, Cosine similarity

In [9]:
X_tfidf_cosine = follow_up_recommender('X-3KpqHr_kY', df_videos_filtered_test1, TfidfVectorizer, cosine_similarity)
X_tfidf_cosine

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
92,LaGhq3xNUpk,Growth Vs Value Investing | WHICH IS BEST?,2021-02-28,6.95,45 views,7,"A lot of people like myself wonder, what's bet...","[(growth, NOUN), (versus, ADP), (value, NOUN),...",General,0.063,0.427397
67,2GFwfNOKWJM,Value Investing Proper Definition - Is Value I...,2020-11-14,15.666667,"30,879 views",1.4K,"Is Value Investing, Dead Or Alive, Something F...","[(good, ADJ), (day, NOUN), (fell, VERB), (inve...",General,0.069,3.912329
10,IxbgNEfNRvc,Value Investing is Dead in 2020,2020-10-16,17.55,"13,346 views",691,In this video I breakdown whether or not Value...,"[(hey, INTJ), (welcome, INTJ), (create, VERB),...",General,0.095,4.865753
20,VbTX_WQaYkU,What is Value Investing,2020-09-07,14.783333,262 views,20,What is Value Investing?\n\n🔬Join my INVESTMEN...,"[(video, PROPN), (value, NOUN), (invest, VERB)...",General,0.086,6.147945
41,y1bXFsUtgjk,Value Investing,2021-02-21,8.45,81 views,6,Have you ever heard of Warren Buffett ? He’s w...,"[(music, NOUN), (hey, INTJ), (today, NOUN), (t...",General,0.077,0.657534


#### TfidfVectorizer, Euclidean distance 

In [10]:
X_tfidf_euclidean = follow_up_recommender('X-3KpqHr_kY', df_videos_filtered_test1, TfidfVectorizer, euclidean_distances)
X_tfidf_euclidean

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
278,IhNXFgfLnl0,How to Select Best Stocks for long Term Invest...,2021-01-01,8.333333,"4,341 views",256,📊 | Tamil Stock Market | \n\n🎯 OPEN ZERODHA DE...,"[(foreign, ADJ), (thank, VERB)]",General,0.009,2.334247
277,1q0ifWYGRzQ,how to do fundamental analysis of stock | Basi...,2020-08-31,14.816667,181 views,10,Hi this video is about\n\nhow to do fundamenta...,"[(foreign, ADJ), (foreign, ADJ), (foreign, ADJ...",General,0.009,6.378082
276,D4wdMRvRZE4,Stock valuation Problem / Without loss | Malay...,2020-11-14,12.283333,160 views,13,You want to see this for being able to solve p...,"[(music, NOUN), (foreign, ADJ), (music, NOUN),...",General,0.01,3.912329
282,86QNcnrEpak,How to Pick Stock Like Warren Buffet in Hindi?,2020-05-18,11.333333,431 views,17,this video provide the information about how w...,"[(music, NOUN), (music, NOUN), (music, NOUN), ...",General,0.009,9.830137
283,eUbP-yBKK7U,My Growth Portfolio | 0$ - 100K Investing Goal...,2021-02-26,13.55,967 views,38,Investing in US and Canadian stock market | My...,"[(music, NOUN), (music, NOUN), (music, NOUN), ...",General,0.009,0.493151


#### CountVectorizer, Cosine similarity

In [11]:
X_cv_cosine = follow_up_recommender('X-3KpqHr_kY', df_videos_filtered_test1, CountVectorizer, cosine_similarity)
X_cv_cosine

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
63,Jf_XvpYTDSA,Best Way Of Investing: Value Vs Growth | Bill ...,2020-09-04,13.8,"1,365 views",24,"On this segment, ET NOW’s Ajaya Sharma is in c...","[(know, VERB), (big, ADJ), (cut, NOUN), (come,...",General,0.069,6.246575
20,VbTX_WQaYkU,What is Value Investing,2020-09-07,14.783333,262 views,20,What is Value Investing?\n\n🔬Join my INVESTMEN...,"[(video, PROPN), (value, NOUN), (invest, VERB)...",General,0.086,6.147945
92,LaGhq3xNUpk,Growth Vs Value Investing | WHICH IS BEST?,2021-02-28,6.95,45 views,7,"A lot of people like myself wonder, what's bet...","[(growth, NOUN), (versus, ADP), (value, NOUN),...",General,0.063,0.427397
25,lcqW9pKeZVs,Why Value Investing is outdated? Warren Buffet...,2020-05-08,12.166667,"66,643 views",5.9K,Warren Buffett just had his Berkshire Hathaway...,"[(hi, INTJ), (friend, NOUN), (warren, PROPN), ...",General,0.083,10.158904
10,IxbgNEfNRvc,Value Investing is Dead in 2020,2020-10-16,17.55,"13,346 views",691,In this video I breakdown whether or not Value...,"[(hey, INTJ), (welcome, INTJ), (create, VERB),...",General,0.095,4.865753


#### CountVectorizer, Euclidean distance 

In [12]:
X_cv_euclidean = follow_up_recommender('X-3KpqHr_kY', df_videos_filtered_test1, CountVectorizer, euclidean_distances)
X_cv_euclidean

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
12,zqgRCerwSfQ,Free Cash Flow Plus Growth: Reverse Engineerin...,2020-11-03,5.666667,"1,844 views",72,QuickFS Link: https://quickfs.net/?via=focused...,"[(music, NOUN), (welcome, ADJ), (welcome, ADJ)...",General,0.094,4.273973
16,9sWecuhjzOI,"My March 2021 BNPL Game Plan For ZIP, Afterpay...",2021-03-03,2.75,"5,247 views",190,In this video I unpack my March 2021 game plan...,"[(hey, INTJ), (welcome, INTJ), (phil, ADJ), (m...",General,0.088,0.328767
73,i5mT4uVFCDI,One of my Long Term Investments ROCKETS 70% | ...,2021-03-03,17.05,121 views,8,"In this video, I will be talking about my thou...","[(music, NOUN), (welcome, ADJ), (start, NOUN),...",General,0.067,0.328767
128,X_7s1g9Uut0,Top 7 BANK Stocks to Watch! - Q3 Analysis 2020...,2020-09-06,2.016667,"9,504 views",700,What are the top bank stocks to watch in 2020 ...,"[(long, ADV), (wait, VERB), (finally, ADV), (c...",General,0.057,6.180822
34,lLx8q0SJkkE,The Best Long Term Trading Strategy (Original ...,2020-08-05,1.05,"27,666 views",1K,For more info contact info@ifundtraders.com. O...,"[(right, ADV), (welcome, INTJ), (welcome, VERB...",General,0.079,7.232877


#### *Conclusion: Using TfidfVectorizer and cosine similarity gives the best recommendations, based on my domain knowledge in stock investing.  Tfidf and CV produce similar results for cosine similarity, but Tfidf gets slightly better results in terms of topic relevance*