In [26]:
# Importing data paths
import sys
sys.path.insert(1, '/Users/piyush/Desktop/dsml_Portfolio/podcast/final_push')
from config import PODCASTS_DATABASE_PATH_RAW, USER_REVIEWS_DATABASE_PATH_RAW, PODCASTS_DATABASE_PATH_PROCESSED, USER_REVIEWS_DATABASE_PATH_PROCESSED
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, manhattan_distances, euclidean_distances
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import re
from gensim import models
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
%matplotlib inline
from gensim.models import FastText as ft
from IPython.display import Image

In [27]:
import nltk
import ssl

# Certificate verification is switched off only while the NLTK corpora are
# fetched (presumably to work around a local certificate-store problem --
# confirm), and the previous HTTPS context factory is restored afterwards so
# later network calls made from this kernel are verified again.
_previous_https_context = None
try:
    _create_unverified_https_context = ssl._create_unverified_context
    _previous_https_context = ssl._create_default_https_context
except AttributeError:
    # This Python has no such hooks in ssl: nothing to patch, nothing to restore.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

try:
    for _nltk_package in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
        nltk.download(_nltk_package)
finally:
    # Undo the global override even if a download raised.
    if _previous_https_context is not None:
        ssl._create_default_https_context = _previous_https_context

[nltk_data] Downloading package punkt to /Users/piyush/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/piyush/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/piyush/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/piyush/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [28]:
# Column labels for the two raw CSV files (both are read with header=None)
columns_pod = ['id','name','url','studio','category','episode_count','avg_rating','total_ratings','description']
columns_users = ['id','podcasts_id','username','review_title','review','rating','date']

# Podcast catalogue: read, then label the columns
df_podcasts = pd.read_csv(PODCASTS_DATABASE_PATH_RAW, header=None)
df_podcasts.columns = columns_pod

# User reviews: read, label the columns, then discard the review 'id' column
df_users = pd.read_csv(USER_REVIEWS_DATABASE_PATH_RAW, header=None)
df_users.columns = columns_users
df_users = df_users.drop(columns='id')

In [31]:
# Inspect the loaded podcast table (bare expression, so the notebook renders it)
df_podcasts

Unnamed: 0,id,name,url,studio,category,episode_count,avg_rating,total_ratings,description
0,id1647910854,Rachel Maddow Presents: Ultra,https://podcasts.apple.com/us/podcast/rachel-m...,MSNBC,News,5,4.8,14900,Sitting members of Congress aiding and abettin...
1,id1464919521,Dateline NBC,https://podcasts.apple.com/us/podcast/dateline...,Dateline NBC,True Crime,456,4.5,29000,"Current and classic episodes, featuring compel..."
2,id1200361736,The Daily,https://podcasts.apple.com/us/podcast/the-dail...,The New York Times,News,1669,4.4,88100,This is what the news should sound like. The b...
3,id1322200189,Crime Junkie,https://podcasts.apple.com/us/podcast/crime-ju...,audiochuck,True Crime,282,4.8,326400,If you can never get enough true crime... Cong...
4,id1379959217,Morbid,https://podcasts.apple.com/us/podcast/morbid/i...,Wondery,Comedy,447,4.5,76000,"It’s a lighthearted nightmare in here, weirdos..."
...,...,...,...,...,...,...,...,...,...
2358,id1582491051,Strawberry Spring,https://podcasts.apple.com/us/podcast/strawber...,,Fiction,12,4.2,1300,"Based on the short story by Stephen King, Stra..."
2359,id1493193473,Intent: The Tex McIver Case,https://podcasts.apple.com/us/podcast/intent-t...,VAULT Studios,True Crime,18,4.5,2400,"""The Officer's Wife"" is now ""Intent: The Tex M..."
2360,id1518866635,Request Pending,https://podcasts.apple.com/us/podcast/request-...,The Drag,News,10,4.6,27,"During this time of social isolation, The Drag..."
2361,id1551162705,The Opportunist,https://podcasts.apple.com/us/podcast/the-oppo...,,True Crime,50,4.8,7700,The Opportunist tells true stories of regular ...


In [33]:
# Text used for similarity: the description followed by the category.
# Missing values are replaced with '' first: NaN + str yields NaN, and the
# subsequent `.lower()` call on such a row would raise AttributeError.
df_podcasts['tags'] = (
    df_podcasts['description'].fillna('') + ' ' + df_podcasts['category'].fillna('')
)

In [35]:
# Lowercase each tag string and strip every character that is neither a word
# character nor whitespace, in one pass over the column.
df_podcasts['tags_parsed'] = df_podcasts['tags'].apply(
    lambda tag: re.sub(r'[^\w\s]', '', tag.lower())
)


In [54]:
# The stop-word set and the lemmatizer are created on first use and then shared
# by every call; the function is applied row by row, so rebuilding them per call
# repeats the same work for each row.
_PREPROCESS_CACHE = {}


def _get_preprocess_resources():
    """Return the cached ``(stopwords, lemmatizer)`` pair, creating it on first use."""
    if not _PREPROCESS_CACHE:
        # frozenset: membership tests are constant-time instead of scanning a list
        _PREPROCESS_CACHE['stopwords'] = frozenset(nltk.corpus.stopwords.words("english"))
        _PREPROCESS_CACHE['lemmatizer'] = nltk.stem.wordnet.WordNetLemmatizer()
    return _PREPROCESS_CACHE['stopwords'], _PREPROCESS_CACHE['lemmatizer']


def get_preprocessed_text(text):
    """Normalise a piece of text for vectorisation.

    Steps, in order:
      1. convert ``text`` to ``str``, lowercase it and strip surrounding whitespace;
      2. delete every character that is neither a word character nor whitespace;
      3. split on whitespace and drop NLTK English stop words;
      4. lemmatise each remaining token with NLTK's WordNet lemmatizer.

    Parameters
    ----------
    text : any
        Value to clean; it is passed through ``str()`` first, so non-string
        input (e.g. ``None``) is processed as its string representation.

    Returns
    -------
    str
        The surviving lemmatised tokens joined by single spaces ('' if none).
    """
    cleaned = re.sub(r'[^\w\s]', '', str(text).lower().strip())
    stopwords, lemmatizer = _get_preprocess_resources()

    # Punctuation is removed before the stop-word check, so tokens are compared
    # in their stripped form (e.g. "don't" has already become "dont").
    tokens = [
        lemmatizer.lemmatize(word)
        for word in cleaned.split()
        if word not in stopwords
    ]
    return " ".join(tokens)

def find_cosine_similarity(cosine_sim_matrix, index, n):
    """Return the ``n`` podcasts with the highest similarity to row ``index``.

    Parameters
    ----------
    cosine_sim_matrix : indexable
        ``cosine_sim_matrix[index]`` must yield one score per podcast, in the
        same row order as the module-level ``df_podcasts`` frame.
    index : int
        Row position of the query podcast.
    n : int
        Maximum number of matches to return; values below 1 give an empty list.

    Returns
    -------
    list of dict
        ``{'value': <podcast name>, 'score': <score rounded to 2 decimals>}``,
        ordered from most to least similar. Names are read from the global
        ``df_podcasts``.
    """
    # Exclude the query row by position. Skipping "whatever sorts first" is not
    # reliable: a tied score (e.g. an identical description at a lower row) or a
    # zero self-similarity would leave the query podcast in its own results.
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_sim_matrix[index])
        if position != index
    ]

    # Highest similarity first; the sort is stable, so ties keep row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    return [
        {'value': df_podcasts.iloc[position]['name'], 'score': round(score, 2)}
        for position, score in candidates[:max(n, 0)]
    ]




def find_euclidean_distances(sim_matrix, index, n):
    """Return the ``n`` podcasts with the smallest distance to row ``index``.

    Parameters
    ----------
    sim_matrix : indexable
        Despite the name, a distance matrix: ``sim_matrix[index]`` must yield
        one distance per podcast, in the same row order as the module-level
        ``df_podcasts`` frame.
    index : int
        Row position of the query podcast.
    n : int
        Maximum number of matches to return; values below 1 give an empty list.

    Returns
    -------
    list of dict
        ``{'value': <podcast name>, 'score': <distance rounded to 2 decimals>}``,
        ordered from nearest to farthest. Names are read from the global
        ``df_podcasts``.
    """
    # Exclude the query row by position. Skipping "whatever sorts first" breaks
    # when another row is also at distance 0 and sits at a lower position: the
    # query podcast would then be recommended to itself.
    candidates = [
        (position, distance)
        for position, distance in enumerate(sim_matrix[index])
        if position != index
    ]

    # Smallest distance first; the sort is stable, so ties keep row order.
    candidates.sort(key=lambda pair: pair[1])

    return [
        {'value': df_podcasts.iloc[position]['name'], 'score': round(distance, 2)}
        for position, distance in candidates[:max(n, 0)]
    ]



def find_manhattan_distance(sim_matrix, index, n):
    """Return the ``n`` podcasts with the smallest distance to row ``index``.

    Parameters
    ----------
    sim_matrix : indexable
        Despite the name, a distance matrix: ``sim_matrix[index]`` must yield
        one distance per podcast, in the same row order as the module-level
        ``df_podcasts`` frame.
    index : int
        Row position of the query podcast.
    n : int
        Maximum number of matches to return; values below 1 give an empty list.

    Returns
    -------
    list of dict
        ``{'value': <podcast name>, 'score': <distance rounded to 2 decimals>}``,
        ordered from nearest to farthest. Names are read from the global
        ``df_podcasts``.
    """
    # Exclude the query row by position instead of dropping the first sorted
    # entry: with a tie at distance 0 the first entry may be a different
    # podcast, which would leave the query itself in the results.
    candidates = [
        (position, distance)
        for position, distance in enumerate(sim_matrix[index])
        if position != index
    ]

    # Smallest distance first; the sort is stable, so ties keep row order.
    candidates.sort(key=lambda pair: pair[1])

    return [
        {'value': df_podcasts.iloc[position]['name'], 'score': round(distance, 2)}
        for position, distance in candidates[:max(n, 0)]
    ]

In [39]:
 # Run every tag string through get_preprocessed_text (stop-word removal + lemmatisation)
 df_podcasts['tags_parsed'] = df_podcasts['tags_parsed'].apply(get_preprocessed_text)

In [41]:
# Plain Python list of the cleaned tag strings, one entry per podcast row
tags_list = df_podcasts['tags_parsed'].tolist()

In [43]:
# TF-IDF vectorizer: word-level 1- to 3-grams, English stop words removed,
# vocabulary limited to 1000 features
tfidf_vec = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    stop_words='english',
    max_features=1000,
)

In [45]:
# Fit the vectorizer on all tag strings and encode each podcast as one TF-IDF row
tfidf_matrix = tfidf_vec.fit_transform(tags_list)
# Pairwise cosine similarity between every pair of rows of the TF-IDF matrix
similarity_matrix = cosine_similarity(tfidf_matrix)

In [51]:
# Comparing similarity to get the top matches using TF-IDF

def get_recommendation_tfidf(podcast_id, df, similarity, n):
    """Recommend the ``n`` podcasts closest to a given one in TF-IDF space.

    Parameters
    ----------
    podcast_id : str
        Despite the name, this is matched against the ``name`` column of
        ``df``. If several rows share the name, the first one is used.
    df : pandas.DataFrame
        Frame with a ``name`` column. The index label of the matched row is
        used as the row number into the TF-IDF matrix built from the global
        ``tags_list``, so the two must line up
        (assumes ``df`` has the default 0..N-1 index -- TODO confirm).
    similarity : str
        ``"cosine"`` or ``"manhattan"``; any other value falls back to
        Euclidean distance.
    n : int
        Number of recommendations requested.

    Returns
    -------
    list of dict
        Whatever the matching ``find_*`` helper returns
        (``{'value': name, 'score': score}`` entries).

    Raises
    ------
    IndexError
        If no row of ``df`` has ``name == podcast_id``.
    """
    matches = df.loc[df['name'] == podcast_id].index
    if len(matches) == 0:
        # Same exception type the bare `[0]` lookup used to raise, but with a
        # message that says what was actually wrong.
        raise IndexError(
            "No podcast named {!r} found in the 'name' column".format(podcast_id)
        )
    index = matches[0]

    # The shared vectorizer is re-fitted on the global tag corpus on every call.
    tfidf_matrix = tfidf_vec.fit_transform(tags_list)

    if similarity == "cosine":
        sim_matrix = cosine_similarity(tfidf_matrix)
        products = find_cosine_similarity(sim_matrix, index, n)
    elif similarity == "manhattan":
        sim_matrix = manhattan_distances(tfidf_matrix)
        products = find_manhattan_distance(sim_matrix, index, n)
    else:
        # Any unrecognised metric name ends up here (Euclidean distance).
        sim_matrix = euclidean_distances(tfidf_matrix)
        products = find_euclidean_distances(sim_matrix, index, n)

    return products



In [56]:
# Query podcast; get_recommendation_tfidf compares this value against the 'name' column
podcast_id = 'Lex Fridman Podcast'

In [57]:
# Request 13 recommendations for the query podcast using cosine similarity
get_recommendation_tfidf(podcast_id, df_podcasts, similarity = 'cosine',n = 13)

[{'value': 'Overpriced JPEGs', 'score': 0.56},
 {'value': 'Android Central Podcast', 'score': 0.48},
 {'value': 'Your Undivided Attention', 'score': 0.46},
 {'value': 'EV News Daily', 'score': 0.4},
 {'value': "Aarthi and Sriram's Good Time Show", 'score': 0.38},
 {'value': 'Into the Impossible With Brian Keating', 'score': 0.37},
 {'value': 'Underserved', 'score': 0.36},
 {'value': 'Lew Later', 'score': 0.35},
 {'value': 'Nature Guys', 'score': 0.34},
 {'value': '9to5Mac Happy Hour', 'score': 0.34},
 {'value': 'This Machine Kills', 'score': 0.33},
 {'value': 'Acquired', 'score': 0.32},
 {'value': 'Hacking Humans', 'score': 0.3}]