In [1]:
import numpy as np
import pandas as pd
import pickle
from datetime import date
from IPython.display import clear_output

from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

%load_ext autoreload
%autoreload 2

In [2]:
df_videos_cleaned_v10 = pickle.load(open('../Data/df_videos_cleaned_v10.pickle', 'rb')).reset_index(drop=True)

### All the functions (Need to put these into a separate .py file)

In [3]:
def initial_recommender(df):
    '''
    Input: Final dataframe of video data, user input on topic, duration, and upload date
    Output: Top five recommendations
    '''
    ## Take user inputs
    print('Which topics would you like to learn about?')
    topic = input()
    
    print('\n')
    print('What is the upper limit of the duration of videos (in minutes)?')
    duration = int(input())
    
    print('\n')
    print('How recent do you want the videos to be (in months since upload date)?')
    upload_date = int(input())
    today = date.today()
    df['Months Since Upload'] = df['Upload Date'].apply(lambda x:((today - x).days)*12/365)

    clear_output()
    
    ## Define a new variable to store the preferred videos. Copy the contents of df to filtered videos
    df_videos_filtered = df.copy()

    ## Return top five videos based on topic coefficient (how relevant the videos are to the user's topic)
    df_videos_filtered = df_videos_filtered[(df_videos_filtered['Topic']==topic) & 
                                      (df_videos_filtered['Duration']<duration) &
                                      (df_videos_filtered['Months Since Upload']<upload_date)]
    df_videos_filtered = df_videos_filtered.sort_values('Topic Coefficient', ascending=False)
    
    return df_videos_filtered.head(), df_videos_filtered

In [4]:
def follow_up_recommender(video_id, df_videos_filtered, vectorizer, similarity_metric):
    '''
    Input: Video ID of a user's liked video, and the dataframe of the filtered videos generated from the initial recommender
    Output: Top five follow-up recommendations using content-based recommender system
    '''
    ## Fit and transform the transcript into a document-term matrix
    word_list = [[word[0] for word in doc] for doc in df_videos_filtered['Transcript']]
    vec = vectorizer(tokenizer=lambda doc:doc, lowercase=False)
    matrix = vec.fit_transform(word_list).toarray()
    
    ## Generate a similarity matrix
    similarity_matrix = similarity_metric(matrix, matrix)
    
    ## Create a series of indices for the videos (rows)  
    df_videos_filtered = df_videos_filtered.reset_index(drop=True)
    indices = pd.Series(df_videos_filtered['Video_ID'])
    
    ## Get the index of the user's liked video
    idx = indices[indices == video_id].index[0]
    
    ## Create a series with the similarity scores in descending order and grab indices
    score_series = pd.Series(similarity_matrix[idx]).sort_values(ascending=False)
    similarity_indices = list(score_series.index)
    
    ## Drop videos that were in the original recommendation
    similarity_indices = [index for index in similarity_indices if index not in list(df_videos_filtered[:5].index)]
    top_5_indices = similarity_indices[:5]
    
    ## Populate a dataframe of the recommended videos
    df_videos_follow_up_recs = df_videos_filtered.iloc[top_5_indices]
    
    return df_videos_follow_up_recs

### Initial knowledge-based recommender

In [5]:
df_videos_recs_test1, df_videos_filtered_test1 = initial_recommender(df_videos_cleaned_v10)

In [6]:
df_videos_recs_test1

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
1759,eUbP-yBKK7U,My Growth Portfolio | 0$ - 100K Investing Goal...,2021-02-26,13.55,967 views,38,Investing in US and Canadian stock market | My...,"[(music, NOUN), (music, NOUN), (music, NOUN), ...",General,0.508,0.624658
1162,86QNcnrEpak,How to Pick Stock Like Warren Buffet in Hindi?,2020-05-18,11.333333,431 views,17,this video provide the information about how w...,"[(music, NOUN), (music, NOUN), (music, NOUN), ...",General,0.508,9.961644
1450,nK6-oSubcpA,TRADING vs INVESTING | Which is Better and Why?,2021-01-08,17.183333,"213,378 views",12K,TRADING vs INVESTING in Hindi\nWhich is better...,"[(music, NOUN), (option, NOUN), (music, NOUN),...",General,0.494,2.235616
1001,u_IhJ2OWXYA,Global Matters: Fundamentals of Offshore Inves...,2020-04-20,21.533333,928 views,20,,"[(music, NOUN), (applause, NOUN), (good, ADJ),...",General,0.099,10.882192
1347,Q9egMIbTo0E,Meb Faber On What Low Bond Yields Mean For Sto...,2021-01-14,13.483333,229 views,13,Meb Faber analyzes the connection between stoc...,"[(let, VERB), (talk, NOUN), (connection, NOUN)...",General,0.091,2.038356


In [7]:
df_videos_cleaned_v10.to_csv('../App/df_videos_cleaned_v10.csv',index=False)

### Follow-up content-based recommender (after the initial knowledge-based recommender takes care of the cold start problem)

#### TfidfVectorizer, Cosine similarity

In [8]:
X_tfidf_cosine = follow_up_recommender('Q9egMIbTo0E', df_videos_filtered_test1, TfidfVectorizer, cosine_similarity)
X_tfidf_cosine

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
173,mvTyt7qdjvA,WHY STOCKS ARE CRASHING NOW (THE TRUTH),2021-03-04,12.766667,"44,786 views",3.9K,Application form to apply & try and get in my ...,"[(howdy, PROPN), (folk, NOUN), (holy, PROPN), ...",General,0.0,0.427397
177,hTVt2o_txMI,STOCK MARKET CRASH GETTING WORSE,2021-03-03,11.816667,"55,360 views",3.4K,Application form to apply & try and get in my ...,"[(whoa, PROPN), (holy, PROPN), (smoke, NOUN), ...",General,0.0,0.460274
167,nHO37zXMiew,Why Stocks Keep Dropping + What I am Doing,2021-03-04,13.483333,"26,343 views",2K,"Gain Access To My Community of Over 1,200 Inve...","[(hello, INTJ), (market, NOUN), (look, VERB), ...",General,0.0,0.427397
180,FSlT1g106_w,Stock Market Crash Ahead or Inflation & Stocks...,2021-02-13,21.15,"58,110 views",4K,Stock market crash ahead or inflation ahead? T...,"[(good, ADJ), (day, NOUN), (fellow, NOUN), (in...",General,0.0,1.052055
18,hqe9fdmBp0w,"Portfolio manager weighs in on Gamestop, short...",2021-01-29,9.233333,"3,114 views",39,Yahoo Finance’s Akiko Fujita and Zack Guzman d...,"[(begin, VERB), (today, NOUN), (discussion, NO...",General,0.0,1.545205


#### TfidfVectorizer, Euclidean distance 

In [9]:
X_tfidf_euclidean = follow_up_recommender('Q9egMIbTo0E', df_videos_filtered_test1, TfidfVectorizer, euclidean_distances)
X_tfidf_euclidean

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
133,3pGTy3aNGRs,Top 5 reasons why COAL INDIA is Falling | CA R...,2020-10-07,8.9,"193,341 views",9.4K,A detailed analysis of the recent developments...,"[(coal, NOUN), (extract, NOUN), (ground, NOUN)...",General,0.0,5.293151
52,1gNCpbsAwAo,"💰Low Risk, High Return Investment | INVOICE DI...",2020-10-15,17.333333,"587,035 views",23K,☑️Open TradeCred Account [code: TC0245]: https...,"[(music, NOUN), (small, ADJ), (business, NOUN)...",General,0.0,5.030137
80,AbJFRmNY-sE,Should You Look At Investing In CCL Products F...,2021-01-04,4.65,"5,173 views",218,"CCL Product’s revenues, EBIDTA and PAT have gr...","[(music, NOUN), (incorporate, VERB), (ccl, PRO...",General,0.0,2.367123
114,QNXvPHjkz_E,Titan 2020 - Company Analysis | Financial rati...,2020-10-21,10.433333,584 views,20,TITAN - company analysis | Financial ratios an...,"[(music, PROPN), (video, PROPN), (detailed, AD...",General,0.0,4.832877
5,7RuyY6eno80,Top 6 Stocks to Buy Now for Value Investing wi...,2020-10-12,13.033333,"7,344 views",430,#topstockstobuynow #portfoliostocks #toppicks2...,"[(music, NOUN), (music, NOUN), (foreign, ADJ),...",General,0.045,5.128767


#### CountVectorizer, Cosine similarity

In [10]:
X_cv_cosine = follow_up_recommender('Q9egMIbTo0E', df_videos_filtered_test1, CountVectorizer, cosine_similarity)
X_cv_cosine

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
177,hTVt2o_txMI,STOCK MARKET CRASH GETTING WORSE,2021-03-03,11.816667,"55,360 views",3.4K,Application form to apply & try and get in my ...,"[(whoa, PROPN), (holy, PROPN), (smoke, NOUN), ...",General,0.0,0.460274
173,mvTyt7qdjvA,WHY STOCKS ARE CRASHING NOW (THE TRUTH),2021-03-04,12.766667,"44,786 views",3.9K,Application form to apply & try and get in my ...,"[(howdy, PROPN), (folk, NOUN), (holy, PROPN), ...",General,0.0,0.427397
18,hqe9fdmBp0w,"Portfolio manager weighs in on Gamestop, short...",2021-01-29,9.233333,"3,114 views",39,Yahoo Finance’s Akiko Fujita and Zack Guzman d...,"[(begin, VERB), (today, NOUN), (discussion, NO...",General,0.0,1.545205
180,FSlT1g106_w,Stock Market Crash Ahead or Inflation & Stocks...,2021-02-13,21.15,"58,110 views",4K,Stock market crash ahead or inflation ahead? T...,"[(good, ADJ), (day, NOUN), (fellow, NOUN), (in...",General,0.0,1.052055
167,nHO37zXMiew,Why Stocks Keep Dropping + What I am Doing,2021-03-04,13.483333,"26,343 views",2K,"Gain Access To My Community of Over 1,200 Inve...","[(hello, INTJ), (market, NOUN), (look, VERB), ...",General,0.0,0.427397


#### CountVectorizer, Euclidean distance 

In [11]:
X_cv_euclidean = follow_up_recommender('Q9egMIbTo0E', df_videos_filtered_test1, CountVectorizer, euclidean_distances)
X_cv_euclidean

Unnamed: 0,Video_ID,Title,Upload Date,Duration,Views,Number of Likes,Description,Transcript,Topic,Topic Coefficient,Months Since Upload
64,9sWecuhjzOI,"My March 2021 BNPL Game Plan For ZIP, Afterpay...",2021-03-03,2.75,"5,247 views",190,In this video I unpack my March 2021 game plan...,"[(welcome, INTJ), (phil, ADJ), (morale, NOUN),...",General,0.0,0.460274
132,VIFBMx3VLx4,From trading on Emotions to Fundamental Invest...,2021-02-26,20.633333,6 views,0,I first started investing back in 2017. It was...,"[(right, INTJ), (folk, NOUN), (member, NOUN), ...",General,0.0,0.624658
101,cH2You1fy0k,Top 5 BANK Stocks to Watch! - Q2 Analysis 2020...,2020-06-07,23.8,"20,056 views",1K,What are the top 5 bank stocks to watch in 202...,"[(saver, PROPN), (investor, NOUN), (hope, NOUN...",General,0.0,9.30411
72,sJ0FacRT-Io,ZIP Shares Have Arrived On The F1 Track 📈🚀 $27...,2021-02-18,28.6,"6,622 views",270,In this video I unpack why ZIP shares have ral...,"[(welcome, INTJ), (phil, PROPN), (geraldo, PRO...",General,0.0,0.887671
68,LlP3wpp01Go,10 Significantly Undervalued ASX Rapid Growth ...,2021-02-13,2.75,"6,634 views",207,In this video I unpack ten significantly under...,"[(welcome, INTJ), (phil, PROPN), (meraldo, PRO...",General,0.0,1.052055


#### *Conclusion: Using TfidfVectorizer and cosine similarity gives the best recommendations, based on my domain knowledge in stock investing.  Tfidf and CV produce similar results for cosine similarity, but Tfidf gets slightly better results in terms of topic relevance*