In [1]:
import numpy as np
import pandas as pd
import pickle
from datetime import date
from IPython.display import clear_output

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

%load_ext autoreload
%autoreload 2

In [2]:
df_videos_cleaned_v10 = pickle.load(open('../Data/df_videos_cleaned_v10.pickle', 'rb'))

In [3]:
df_videos_cleaned_v10

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient
1083,rtxSDyASUKQ,UNBOXING | The Intelligent Investor: The Defin...,2021-03-03,1.466667,11 views,1,FULL TITLE: The Intelligent Investor: The Defi...,"[(music, NOUN), (music, NOUN), (music, NOUN), ...",Value Investing,0.000
94,dBb_pbQwULo,How to select a stock for investing | Introduc...,2021-01-20,5.750000,42 views,7,This video tells you the process of stock sele...,"[(music, NOUN), (music, NOUN), (music, NOUN)]",Fundamental Vs. Technical Analysis,0.000
1355,L2ZrCb5hj9w,A GOOD STOCK FOR LONG TERM INVESTMENT,2020-07-23,10.666667,"7,643 views",391,UPSTOX FREE DEMAT ACCOUNT (DISCOUNT BROKER)\nh...,"[(music, PROPN), (video, PROPN)]",Long-term Investing,0.000
970,w96XTaqxltw,Value Investing Guide Singapore | 8 financial ...,2017-09-22,1.100000,"1,115 views",9,Value Investing Singapore Guide: https://www.d...,"[(music, NOUN)]",Value Investing,0.000
86,3pGTy3aNGRs,Top 5 reasons why COAL INDIA is Falling | CA R...,2020-10-07,8.900000,"193,341 views",9.4K,A detailed analysis of the recent developments...,"[(coal, NOUN), (extract, NOUN), (ground, NOUN)...",General,0.003
...,...,...,...,...,...,...,...,...,...,...
1769,gqpfvyFl_Jk,Growth Vs. Value ETFs?,2019-04-12,3.800000,"7,133 views",126,The difference between the two is surprisingly...,"[(welcome, INTJ), (etf, PROPN), (comm, PROPN),...",Passive Investing,0.087
1774,4iSh1svBWmA,Is It The End Of Growth Stocks? - Behavioral V...,2021-01-15,3.233333,264 views,LIKE,"In 2000, the sign of the investing times was c...","[(end, NOUN), (growth, NOUN), (stock, NOUN), (...",Electric Vehicle Stocks,0.109
1775,dw9iRBXKqD0,Investors are starting to recognize a lot of E...,2021-02-26,4.433333,"1,211 views",20,#ESGinvesting #climatechange #YahooFinance\nYa...,"[(president, PROPN), (biden, PROPN), (focus, N...",Passive Investing,0.027
1776,i8WO9cjlQA0,Should You BUY BUZZ ETF? (High Growth ETF),2021-03-03,6.950000,"5,671 views",115,In this video we are looking at the new ETF BU...,"[(alrighty, PROPN), (welcome, ADJ), (episode, ...",Passive Investing,0.241


### All the functions (Need to put these into a separate .py file)

In [4]:
def initial_recommender(df):
    '''
    Input: Final dataframe of video data, user input on topic, duration, and upload date
    Output: Top five recommendations
    '''
    ## Take user inputs
    print('Which topics would you like to learn about?')
    topic = input().title()
    
    print('\n')
    print('What is the upper limit of the duration of videos (in minutes)?')
    duration = int(input())
    
    print('\n')
    print('How recent do you want the videos to be (in months since upload date)?')
    upload_date = int(input())
    today = date.today()
    df['Months Since Upload'] = df['Upload Date'].apply(lambda x:((today - x).days)*12/365)

    clear_output()
    
    ## Define a new variable to store the preferred videos. Copy the contents of df to filtered videos
    df_videos_filtered = df.copy()

    ## Return top five videos based on topic coefficient (how relevant the videos are to the user's topic)
    df_videos_filtered = df_videos_filtered[(df_videos_filtered['Topic']==topic) & 
                                      (df_videos_filtered['Duration']<duration) &
                                      (df_videos_filtered['Months Since Upload']<upload_date)]
    df_videos_filtered = df_videos_filtered.sort_values('Topic Coefficient', ascending=False)
    
    return df_videos_filtered.head(), df_videos_filtered

In [5]:
def follow_up_recommender(video_id, df_videos_filtered, vectorizer, similarity_metric):
    '''
    Input: Video ID of a user's liked video, and the dataframe of the filtered videos generated from the initial recommender
    Output: Top five follow-up recommendations using content-based recommender system
    '''
    ## Fit and transform the transcript into a document-term matrix
    word_list = [[word[0] for word in doc] for doc in df_videos_filtered['Transcript']]
    vec = vectorizer(tokenizer=lambda doc:doc, lowercase=False)
    matrix = vec.fit_transform(word_list).toarray()
    
    ## Generate a similarity matrix
    similarity_matrix = similarity_metric(matrix, matrix)
    
    ## Create a series of indices for the videos (rows)  
    df_videos_filtered = df_videos_filtered.reset_index(drop=True)
    indices = pd.Series(df_videos_filtered['Video_ID'])
    
    ## Get the index of the user's liked video
    idx = indices[indices == video_id].index[0]
    
    ## Create a series with the similarity scores in descending order and grab indices
    score_series = pd.Series(similarity_matrix[idx]).sort_values(ascending=False)
    similarity_indices = list(score_series.index)
    
    ## Drop videos that were in the original recommendation
    similarity_indices = [index for index in similarity_indices if index not in list(df_videos_filtered[:5].index)]
    top_5_indices = similarity_indices[:5]
    
    ## Populate a dataframe of the recommended videos
    df_videos_follow_up_recs = df_videos_filtered.iloc[top_5_indices]
    
    return df_videos_follow_up_recs

### Initial knowledge-based recommender

In [6]:
df_videos_recs_test1, df_videos_filtered_test1 = initial_recommender(df_videos_cleaned_v10)

In [7]:
df_videos_recs_test1

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
852,rbd_ylJkD0g,Growth Investing vs Value Investing In 2020 | ...,2020-07-20,12.65,661 views,49,Invest with eToro:\n\nhttps://etoro.tw/3hcqqdo,"[(today, NOUN), (talk, VERB), (different, ADJ)...",Growth Investing Vs. Value Investing,0.151,7.857534
1436,ViNvWPGsfP0,What is a Growth Stock - Growth Stock Explaine...,2018-06-19,2.75,"8,341 views",171,Growth Stock Investing is investing in a stock...,"[(growth, NOUN), (stock, NOUN), (stock, NOUN),...",Growth Investing Vs. Value Investing,0.143,32.909589
1616,jwBAInK32qU,Growth vs. Value Investing in 2021 (BEST INVES...,2021-02-07,11.3,68 views,9,"In the investing world, there is a never endin...","[(coke, PROPN), (pepsi, PROPN), (adidas, PROPN...",Growth Investing Vs. Value Investing,0.108,1.216438
1661,LaGhq3xNUpk,Growth Vs Value Investing | WHICH IS BEST?,2021-02-28,6.95,45 views,7,"A lot of people like myself wonder, what's bet...","[(growth, NOUN), (versus, ADP), (value, NOUN),...",Growth Investing Vs. Value Investing,0.085,0.526027
1488,MIR3BXUtrak,Why Growth Investing Is Harder Than You Think ...,2020-10-18,19.383333,206 views,12,Growth investing has seemed easy in the past d...,"[(welcome, INTJ), (return, NOUN), (focus, NOUN...",Growth Investing Vs. Value Investing,0.081,4.89863


In [8]:
df_videos_cleaned_v10.to_csv('../App/df_videos_cleaned_v10.csv',index=False)

### Follow-up content-based recommender (after the initial knowledge-based recommender takes care of the cold start problem)

#### TfidfVectorizer, Cosine similarity

In [10]:
X_tfidf_cosine = follow_up_recommender('ViNvWPGsfP0', df_videos_filtered_test1, TfidfVectorizer, cosine_similarity)
X_tfidf_cosine

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
6,JGJC5wlJTHU,Growth Stocks vs Value Stocks || How to START ...,2020-06-18,7.083333,"1,285 views",120,Growth Stocks vs Value Stocks || How to START ...,"[(music, NOUN), (welcome, NOUN), (average, ADJ...",Growth Investing Vs. Value Investing,0.079,8.909589
9,cW1Djgz092g,Growth Investing vs Value Investing vs Income ...,2018-12-05,8.25,163 views,12,Today we discuss growth investing vs value inv...,"[(music, PROPN), (brynjar, PROPN), (meal, NOUN...",Growth Investing Vs. Value Investing,0.069,27.353425
20,AovFMi0I3_E,R.I.P Value Investing? Value Investing vs Grow...,2020-06-03,12.45,"3,821 views",250,🔵 Get My FREE Investing Guide! 👉 http://bit.l...,"[(hi, INTJ), (welcome, INTJ), (invest, VERB), ...",Growth Investing Vs. Value Investing,0.046,9.40274
17,RB-CHLDEgkk,Don't Buy Growth Stocks! (WARNING) || The Trut...,2020-08-13,8.033333,"3,772 views",224,🔵 Get My FREE Investing Guide! 👉 http://bit.l...,"[(hi, INTJ), (welcome, INTJ), (invest, VERB), ...",Growth Investing Vs. Value Investing,0.052,7.068493
10,w4BlPd2-Ezw,What is Growth Investing,2019-04-05,4.016667,454 views,9,Growth Investing is an investment strategy tha...,"[(music, NOUN), (music, NOUN), (growth, NOUN),...",Growth Investing Vs. Value Investing,0.067,23.375342


#### TfidfVectorizer, Euclidean distance 

In [11]:
X_tfidf_euclidean = follow_up_recommender('ViNvWPGsfP0', df_videos_filtered_test1, TfidfVectorizer, euclidean_distances)
X_tfidf_euclidean

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
36,TIlBvMhNXz0,"Look for growth opportunities in health care, ...",2020-10-22,1.85,"3,836 views",40,"Doug Sandler, Riverfront Investment Group's he...","[(know, PROPN), (average, ADJ), (return, NOUN)...",Growth Investing Vs. Value Investing,0.028,4.767123
38,v_BK3pj7Cvs,Value Stocks vs. Growth Stocks: Which Way Shou...,2018-11-29,2.466667,"1,750 views",16,One of the most common nicknames for your reti...,"[(robert, PROPN), (brokamp, PROPN), (far, ADV)...",Growth Investing Vs. Value Investing,0.016,27.550685
11,QzLrCQ_yOSo,Jim Rogers Latest interview 2020: Value Vs Gro...,2020-09-25,7.216667,905 views,47,⭐ ⭐ DON'T FORGET TO SUBSCRIBE!!! For More inv...,"[(think, VERB), (investor, NOUN), (point, NOUN...",Growth Investing Vs. Value Investing,0.066,5.654795
19,Zvw8WUaHXgw,Why Does Insight Focus on Growth Investing?,2019-01-15,1.716667,84 views,0,"Explore Insight Partners' ""Scale-Up Hacks"" pla...","[(growth, NOUN), (investor, NOUN), (think, VER...",Growth Investing Vs. Value Investing,0.046,26.005479
24,otx-wAKI_pc,Value Is Seen as More Serious Than Growth Inve...,2019-07-11,5.866667,"4,248 views",48,"Jul.10 -- James Anderson, Baillie Gifford part...","[(james, PROPN), (honor, NOUN), (guest, NOUN),...",Growth Investing Vs. Value Investing,0.04,20.186301


#### CountVectorizer, Cosine similarity

In [12]:
X_cv_cosine = follow_up_recommender('ViNvWPGsfP0', df_videos_filtered_test1, CountVectorizer, cosine_similarity)
X_cv_cosine

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
9,cW1Djgz092g,Growth Investing vs Value Investing vs Income ...,2018-12-05,8.25,163 views,12,Today we discuss growth investing vs value inv...,"[(music, PROPN), (brynjar, PROPN), (meal, NOUN...",Growth Investing Vs. Value Investing,0.069,27.353425
17,RB-CHLDEgkk,Don't Buy Growth Stocks! (WARNING) || The Trut...,2020-08-13,8.033333,"3,772 views",224,🔵 Get My FREE Investing Guide! 👉 http://bit.l...,"[(hi, INTJ), (welcome, INTJ), (invest, VERB), ...",Growth Investing Vs. Value Investing,0.052,7.068493
37,FEL6pUqB34E,Breaking Views: Is Growth Investing Healthier ...,2020-03-23,2.766667,40 views,0,"""You may have noticed that growth stocks are d...","[(music, NOUN), (notice, VERB), (growth, NOUN)...",Growth Investing Vs. Value Investing,0.025,11.769863
8,mVe1oGzKpJw,Value Versus Growth Investing - Which is Best?,2019-09-09,9.166667,"11,189 views",461,Value investing usually beats growth investing...,"[(different, ADJ), (style, NOUN), (invest, VER...",Growth Investing Vs. Value Investing,0.071,18.213699
20,AovFMi0I3_E,R.I.P Value Investing? Value Investing vs Grow...,2020-06-03,12.45,"3,821 views",250,🔵 Get My FREE Investing Guide! 👉 http://bit.l...,"[(hi, INTJ), (welcome, INTJ), (invest, VERB), ...",Growth Investing Vs. Value Investing,0.046,9.40274


#### CountVectorizer, Euclidean distance 

In [13]:
X_cv_euclidean = follow_up_recommender('ViNvWPGsfP0', df_videos_filtered_test1, CountVectorizer, euclidean_distances)
X_cv_euclidean

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
14,YGEe9Df2XSs,Best Way to Invest Money: Value Investing vs G...,2019-11-21,29.65,"52,819 views",1.7K,Value vs growth investing is a debate practica...,"[(chris, PROPN), (hill, PROPN), (coming, VERB)...",Growth Investing Vs. Value Investing,0.054,15.813699
18,NPJgpY6T7lQ,WHAT IS GROWTH INVESTING? | HOW TO SELECT GROW...,2019-09-10,16.933333,"5,017 views",217,STOXKART FREE DEMAT ACCOUNT OPENING (PROMOCODE...,"[(music, NOUN), (applause, NOUN), (music, NOUN...",Growth Investing Vs. Value Investing,0.047,18.180822
20,AovFMi0I3_E,R.I.P Value Investing? Value Investing vs Grow...,2020-06-03,12.45,"3,821 views",250,🔵 Get My FREE Investing Guide! 👉 http://bit.l...,"[(hi, INTJ), (welcome, INTJ), (invest, VERB), ...",Growth Investing Vs. Value Investing,0.046,9.40274
5,jQEaRQyChaM,Growth vs. Value Investing - What's Best?,2019-04-23,17.666667,423 views,32,Growth versus value investing – what’s best?\n...,"[(tom, PROPN), (invest, VERB), (tom, PROPN), (...",Growth Investing Vs. Value Investing,0.08,22.783562
12,dDg6QX_Avu0,"Howard Marks on growth investing; says, 'have ...",2021-02-02,19.233333,"4,352 views",115,Subscribe To ET Now For Latest Updates On Stoc...,"[(music, NOUN), (applause, NOUN), (extraordina...",Growth Investing Vs. Value Investing,0.066,1.380822


#### *Conclusion: Using TfidfVectorizer and cosine similarity gives the best recommendations, based on my domain knowledge in stock investing.  Tfidf and CV produce similar results for cosine similarity, but Tfidf gets slightly better results in terms of topic relevance*