In [1]:
import numpy as np
import pandas as pd
import pickle
from datetime import date
from IPython.display import clear_output

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

%load_ext autoreload
%autoreload 2

In [2]:
df_videos_cleaned_v9 = pickle.load(open('../Data/df_videos_cleaned_v9.pickle', 'rb'))

### All the functions (Need to put these into a separate .py file)

In [3]:
def initial_recommender(df):
    '''
    Input: Final dataframe of video data, user input on topic, duration, and upload date
    Output: Top five recommendations
    '''
    ## Take user inputs
    print('Which topics would you like to learn about?')
    topic = input().title()
    
    print('\n')
    print('What is the upper limit of the duration of videos (in minutes)?')
    duration = int(input())
    
    print('\n')
    print('How recent do you want the videos to be (in months since upload date)?')
    upload_date = int(input())
    today = date.today()
    df['Months Since Upload'] = df['Upload Date'].apply(lambda x:((today - x).days)*12/365)

    clear_output()
    
    ## Define a new variable to store the preferred videos. Copy the contents of df to filtered videos
    df_videos_filtered = df.copy()

    ## Return top five videos based on topic coefficient (how relevant the videos are to the user's topic)
    df_videos_filtered = df_videos_filtered[(df_videos_filtered['Topic']==topic) & 
                                      (df_videos_filtered['Duration']<duration) &
                                      (df_videos_filtered['Months Since Upload']<upload_date)]
    df_videos_filtered = df_videos_filtered.sort_values('Topic Coefficient', ascending=False)
    
    return df_videos_filtered.head(), df_videos_filtered

In [4]:
def follow_up_recommender(video_id, df_videos_filtered, vectorizer, similarity_metric):
    '''
    Input: Video ID of a user's liked video, and the dataframe of the filtered videos generated from the initial recommender
    Output: Top five follow-up recommendations using content-based recommender system
    '''
    ## Fit and transform the transcript into a document-term matrix
    word_list = [[word[0] for word in doc] for doc in df_videos_filtered['Transcript']]
    vec = vectorizer(tokenizer=lambda doc:doc, lowercase=False)
    matrix = vec.fit_transform(word_list).toarray()
    
    ## Generate a similarity matrix
    similarity_matrix = similarity_metric(matrix, matrix)
    
    ## Create a series of indices for the videos (rows)  
    df_videos_filtered = df_videos_filtered.reset_index(drop=True)
    indices = pd.Series(df_videos_filtered['Video_ID'])
    
    ## Get the index of the user's liked video
    idx = indices[indices == video_id].index[0]
    
    ## Create a series with the similarity scores in descending order and grab indices
    score_series = pd.Series(similarity_matrix[idx]).sort_values(ascending=False)
    similarity_indices = list(score_series.index)
    
    ## Drop videos that were in the original recommendation
    similarity_indices = [index for index in similarity_indices if index not in list(df_videos_filtered[:5].index)]
    top_5_indices = similarity_indices[:5]
    
    ## Populate a dataframe of the recommended videos
    df_videos_follow_up_recs = df_videos_filtered.iloc[top_5_indices]
    
    return df_videos_follow_up_recs

### Initial knowledge-based recommender

In [5]:
df_videos_recs_test1, df_videos_filtered_test1 = initial_recommender(df_videos_cleaned_v9)

In [6]:
df_videos_recs_test1

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
989,i7WDGREdwlM,Is Value Investing Outdated? | Reaction to @Ch...,2020-05-14,28.116667,"3,332 views",216,Follow me on Instagram:\nhttps://www.instagram...,"[(hey, INTJ), (tom, PROPN), (invest, VERB), (t...",General,0.12,9.994521
178,nN861I9nLJM,How can you invest like Warren Buffett? What i...,2020-11-08,20.416667,16 views,3,GRAB MY FREE KNOW WHERE TO INVEST REPORT! CLI...,"[(hello, INTJ), (welcome, INTJ), (know, VERB),...",General,0.114,4.142466
1220,_sYnjYPFNgw,LONG TERM STOCK MARKET INVESTING FOR 2020,2020-05-06,29.683333,"10,617 views",533,LONG TERM STOCK MARKET INVESTING FOR 2020\n\nH...,"[(hi, INTJ), (video, PROPN), (gon, NOUN), (na,...",General,0.112,10.257534
986,X-3KpqHr_kY,Is Warren Buffett's 'Value Investing' Dead?,2020-06-03,17.016667,"32,316 views",865,"Is value investing dead? Ben Graham, and now W...","[(hey, INTJ), (welcome, NOUN), (channel, NOUN)...",General,0.105,9.336986
1248,eXg3kvWJsDE,HOW DO FUND MANAGERS PICK STOCKS FOR LONG TERM...,2020-06-28,22.116667,"16,213 views",520,HOW DO FUND MANAGERS PICK STOCKS FOR LONG TERM...,"[(look, VERB), (history, NOUN), (philippine, A...",General,0.105,8.515068


In [7]:
df_videos_cleaned_v9.to_csv('../App/df_videos_cleaned_v9.csv',index=False)

### Follow-up content-based recommender (after the initial knowledge-based recommender takes care of the cold start problem)

#### TfidfVectorizer, Cosine similarity

In [8]:
X_tfidf_cosine = follow_up_recommender('nN861I9nLJM', df_videos_filtered_test1, TfidfVectorizer, cosine_similarity)
X_tfidf_cosine

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
29,VbTX_WQaYkU,What is Value Investing,2020-09-07,14.783333,262 views,20,What is Value Investing?\n\n🔬Join my INVESTMEN...,"[(video, PROPN), (value, NOUN), (invest, VERB)...",General,0.086,6.180822
10,K4EVbLmQjfY,What's the difference between Trading and Inve...,2020-09-03,6.433333,192 views,14,Welcome back! \n\nThe most common question ask...,"[(music, NOUN), (welcome, NOUN), (video, PROPN...",General,0.096,6.312329
124,yTqOuLH3Qpw,061 Understanding Technical Analysis And Funda...,2020-09-12,11.116667,16 views,0,BEST Investing APP https://m1.finance/P_U-E2g_...,"[(invest, VERB), (stock, NOUN), (market, NOUN)...",General,0.06,6.016438
27,MIR3BXUtrak,Why Growth Investing Is Harder Than You Think ...,2020-10-18,19.383333,206 views,12,Growth investing has seemed easy in the past d...,"[(welcome, INTJ), (return, NOUN), (focus, NOUN...",General,0.086,4.832877
146,0J0E5Ki0gBA,WHAT IS VALUE INVESTING? | The Art of Value In...,2020-06-08,5.75,338 views,20,Subscribe to my channel! https://bit.ly/subscr...,"[(look, VERB), (find, VERB), (value, NOUN), (i...",General,0.057,9.172603


#### TfidfVectorizer, Euclidean distance 

In [9]:
X_tfidf_euclidean = follow_up_recommender('nN861I9nLJM', df_videos_filtered_test1, TfidfVectorizer, euclidean_distances)
X_tfidf_euclidean

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
310,c9NEhHYC53E,Havells India Ltd. | Fundamental Analysis | #S...,2020-09-19,1.933333,41 views,6,A Small Initiative to make awareness in share ...,"[(music, NOUN), (music, NOUN), (applause, NOUN...",General,0.004,5.786301
315,dBb_pbQwULo,How to select a stock for investing | Introduc...,2021-01-20,5.75,42 views,7,This video tells you the process of stock sele...,"[(music, NOUN), (music, NOUN), (music, NOUN)]",General,0.0,1.742466
314,rtxSDyASUKQ,UNBOXING | The Intelligent Investor: The Defin...,2021-03-03,1.466667,11 views,1,FULL TITLE: The Intelligent Investor: The Defi...,"[(music, NOUN), (music, NOUN), (music, NOUN), ...",General,0.0,0.361644
312,nMPkvDLU0yg,Fundamental vs. Technical Analysis: Which One ...,2020-07-04,6.85,29 views,1,Most beginner investors ask the best way to pi...,"[(music, NOUN), (applause, NOUN), (music, NOUN...",General,0.004,8.317808
300,TLYJ8_EApnk,BRITANNIA Stock fundamental and technical anal...,2021-02-27,9.1,87 views,4,"Welcome to our youtube channel ""earn with stoc...","[(music, NOUN), (music, NOUN), (foreign, ADJ),...",General,0.011,0.493151


#### CountVectorizer, Cosine similarity

In [10]:
X_cv_cosine = follow_up_recommender('nN861I9nLJM', df_videos_filtered_test1, CountVectorizer, cosine_similarity)
X_cv_cosine

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
86,yBNi5fYMIfM,Are these stocks undervalued? | Warren Buffett...,2020-03-31,17.116667,173 views,7,In today's episode of StockerFinance we talk a...,"[(modern, PROPN), (warren, PROPN), (buffett, P...",General,0.069,11.441096
159,K_15cfTklu0,Multibagger Stocks - Must Buy Stocks For 2021 ...,2020-11-18,18.283333,"44,429 views",870,Join the channel Membership To Get Access To M...,"[(music, NOUN), (hello, INTJ), (friends, NOUN)...",General,0.055,3.813699
124,yTqOuLH3Qpw,061 Understanding Technical Analysis And Funda...,2020-09-12,11.116667,16 views,0,BEST Investing APP https://m1.finance/P_U-E2g_...,"[(invest, VERB), (stock, NOUN), (market, NOUN)...",General,0.06,6.016438
146,0J0E5Ki0gBA,WHAT IS VALUE INVESTING? | The Art of Value In...,2020-06-08,5.75,338 views,20,Subscribe to my channel! https://bit.ly/subscr...,"[(look, VERB), (find, VERB), (value, NOUN), (i...",General,0.057,9.172603
24,tad2l_wwnGM,Peter Lynch: How To Invest With Stocks At High...,2021-01-09,2.066667,"222,284 views",5K,In this video Peter Lynch discusses how to inv...,"[(peter, PROPN), (lynch, PROPN), (smart, ADJ),...",General,0.087,2.10411


#### CountVectorizer, Euclidean distance 

In [11]:
X_cv_euclidean = follow_up_recommender('nN861I9nLJM', df_videos_filtered_test1, CountVectorizer, euclidean_distances)
X_cv_euclidean

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
14,zqgRCerwSfQ,Free Cash Flow Plus Growth: Reverse Engineerin...,2020-11-03,5.666667,"1,844 views",72,QuickFS Link: https://quickfs.net/?via=focused...,"[(music, NOUN), (welcome, ADJ), (welcome, ADJ)...",General,0.094,4.306849
22,9sWecuhjzOI,"My March 2021 BNPL Game Plan For ZIP, Afterpay...",2021-03-03,2.75,"5,247 views",190,In this video I unpack my March 2021 game plan...,"[(hey, INTJ), (welcome, INTJ), (phil, ADJ), (m...",General,0.088,0.361644
15,VIFBMx3VLx4,From trading on Emotions to Fundamental Invest...,2021-02-26,20.633333,6 views,0,I first started investing back in 2017. It was...,"[(right, INTJ), (folk, NOUN), (member, NOUN), ...",General,0.094,0.526027
277,uEclt5D59hw,Dividend Growth Investing - The Power of Compo...,2020-11-16,22.15,"1,005 views",54,This video shows how the power of compounding ...,"[(hello, INTJ), (today, NOUN), (video, PROPN),...",General,0.031,3.879452
114,cH2You1fy0k,Top 5 BANK Stocks to Watch! - Q2 Analysis 2020...,2020-06-07,23.8,"20,056 views",1K,What are the top 5 bank stocks to watch in 202...,"[(hey, INTJ), (saver, PROPN), (investor, NOUN)...",General,0.062,9.205479


#### *Conclusion: Using TfidfVectorizer and cosine similarity gives the best recommendations, based on my domain knowledge in stock investing.  Tfidf and CV produce similar results for cosine similarity, but Tfidf gets slightly better results in terms of topic relevance*