In [1]:
import spotipy 
from spotipy.oauth2 import SpotifyClientCredentials
import getpass
import numpy as np
import pandas as pd
import random
import re
import nltk 
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
podcast_collection=pd.read_csv('podcast_collection.csv')

In [3]:
podcast_collection.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11665 entries, 0 to 11664
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         11665 non-null  object
 1   description  11502 non-null  object
 2   explicit     11665 non-null  bool  
 3   language     11665 non-null  object
 4   url          11665 non-null  object
 5   topic        11665 non-null  object
dtypes: bool(1), object(5)
memory usage: 467.2+ KB


In [4]:
podcast_collection.head(1)

Unnamed: 0,name,description,explicit,language,url,topic
0,Moment 45 - Why 80% Of Relationships Don't Wor...,"In these ‘Moment’ episodes of my podcast, I’ll...",False,en-US,https://open.spotify.com/episode/2ysqVdxX6rkzO...,relationship


In [5]:
podcast_collection['topic'].unique()

array(['relationship', 'technology', 'history', 'food', 'music', 'lgbt',
       'health fitness', 'science', 'education', 'book', 'spirituality'],
      dtype=object)

In [6]:
#Check null values in 'description': NaN 163 rows
podcast_collection[podcast_collection.isnull().any(axis=1)]

Unnamed: 0,name,description,explicit,language,url,topic
20,Ellum & Captain Puffy Reveal Relationship Secr...,,False,en-GB,https://open.spotify.com/episode/3toVDqZQJqWML...,relationship
129,Ask Ashley (Rocky Relationship With Mom),,False,en,https://open.spotify.com/episode/1axNUEH8pI0lB...,relationship
289,"Shortest Relationship, New Waiting By The Phon...",,False,en,https://open.spotify.com/episode/3Ep19mc3JsaI2...,relationship
352,Episode 148: We Go Deep Into Relationships,,False,en,https://open.spotify.com/episode/36DGuLYcRkM2r...,relationship
422,Relationships Can 10X Your Business Growth,,False,en,https://open.spotify.com/episode/3Wq2kOfuNdbQR...,relationship
...,...,...,...,...,...,...
11538,Episode 15: Freemasonry and Spirituality,,False,en-US,https://open.spotify.com/episode/7dfuUgi5pisYH...,spirituality
11567,"False Flag Ww3 , Spirituality.",,False,en,https://open.spotify.com/episode/6W1uLNAKBi0Nl...,spirituality
11598,Episode 005 - Nick Yeh discusses religion and ...,,False,en-US,https://open.spotify.com/episode/7nUaVIpGjOuKv...,spirituality
11603,Case Dorman & Spirituality in the Workplace - ...,,False,en,https://open.spotify.com/episode/17GtZ8zRRdfWI...,spirituality


In [7]:
# Drop rows with a null description, as they cannot be used in the content analysis
podcast_collection.dropna(subset=['description'],inplace=True)

In [8]:
# All cleaned: no Nan values 
podcast_collection[podcast_collection.isnull().any(axis=1)]

Unnamed: 0,name,description,explicit,language,url,topic


In [9]:
podcast_collection.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11502 entries, 0 to 11664
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         11502 non-null  object
 1   description  11502 non-null  object
 2   explicit     11502 non-null  bool  
 3   language     11502 non-null  object
 4   url          11502 non-null  object
 5   topic        11502 non-null  object
dtypes: bool(1), object(5)
memory usage: 550.4+ KB


In [10]:
podcast_collection.shape

(11502, 6)

In [11]:
# Search for and eliminate duplicates: one duplicate row is dropped (11502 -> 11501 rows)
data_dedupe=podcast_collection.drop_duplicates()
data_dedupe.shape

(11501, 6)

In [12]:
# One duplicate row was dropped into data_dedupe, so that is the frame whose
# index needs renumbering. Resetting podcast_collection here would be thrown
# away, because podcast_collection is reassigned from data_dedupe right after.
data_dedupe = data_dedupe.reset_index(drop=True)

In [13]:
podcast_collection=data_dedupe

In [14]:
podcast_collection.shape

(11501, 6)

In [15]:
podcast_collection['language'].unique()

array(['en-US', 'en', 'en-AU', 'en-GB', 'id', 'de', 'en-ZA', 'nl-NL',
       'en-CA', 'ml', 'fil', 'hi', 'ar', 'en-IE', 'ms', 'no', 'aa', 'tl',
       'pt', 'en-PH', 'es', 'kn', 'nl', 'en-NZ', 'de-DE', 'hu', 'fi',
       'te', 'pt-BR', 'it', 'th', 'sv-SE', 'ta', 'ja', 'fr', 'en-UK',
       'es-MX', 'de-CH', 'es-ES', 'cs', 'pl', 'nl-BE', 'es-EC', 'aii',
       'tr', 'es-CO', 'unite', 'it-IT', 'sv', 'und', 'zh', 'fr-FR', 'vi',
       'ru', 'es-AR', 'zh-TW', 'pt-PT', 'sk', 'no-NO', 'ja-JP', 'de-AT',
       'da', 'el', 'en-EN', 'es-CL', 'ne', 'ti', 'es-PE', 'es-LA',
       'fr-CA', 'da-DK', 'et', 'zh-CN', 'gu', 'es-PR', 'sl', 'az', 'lv',
       'ug', 'lt', 'zh-Hant', 'ca', 'cy', 'lb-LB', 'ko', 'he', 'mr',
       'ro-RO', 'nb', 'my', 'ru-RU', 'hu-HU', 'es-CR', 'el-GR', 'bg',
       'zu', 'af', 'pl-PL'], dtype=object)

In [16]:
## filter by english language
podcast_collection=podcast_collection[podcast_collection.language.str.startswith('en')]

In [17]:
podcast_collection['language'].unique()

array(['en-US', 'en', 'en-AU', 'en-GB', 'en-ZA', 'en-CA', 'en-IE',
       'en-PH', 'en-NZ', 'en-UK', 'en-EN'], dtype=object)

In [18]:
podcast_collection.shape

(9770, 6)

In [19]:
# Drop "language" as it won't impact the content similarity analysis
podcast_collection=podcast_collection.drop(['language'],axis=1)

In [20]:
podcast_collection.reset_index(drop=True, inplace=True)

In [21]:
podcast_collection.to_csv('podcast_collection_final.csv',index=False,header=True)

In [22]:
podcast_collection.tail(1)

Unnamed: 0,name,description,explicit,url,topic
9769,"""Church Misogyny: A Privileged Perspective"" Ho...",Watch the entire episode at patreon.com/tfcatl...,False,https://open.spotify.com/episode/6cYoO0JpTQLom...,spirituality


In [23]:
podcast_collection['topic'].unique()

array(['relationship', 'technology', 'history', 'food', 'music', 'lgbt',
       'health fitness', 'science', 'education', 'book', 'spirituality'],
      dtype=object)

In [24]:
# Create a new column with name and description in preparation for the cosine similarity
podcast_collection['name + description'] = podcast_collection['name']+ ' ' +podcast_collection['description']

In [25]:
podcast_collection.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9770 entries, 0 to 9769
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   name                9770 non-null   object
 1   description         9770 non-null   object
 2   explicit            9770 non-null   bool  
 3   url                 9770 non-null   object
 4   topic               9770 non-null   object
 5   name + description  9770 non-null   object
dtypes: bool(1), object(5)
memory usage: 391.3+ KB


In [26]:
stop_words=stopwords.words('english')

In [27]:
# Preprocessing: clean the dataset to facilitate the similarity analysis
def cleaning(data):
    """Normalise one text so it can be fed to the TF-IDF vectoriser.

    Steps, in order: remove straight apostrophes, lowercase, tokenise with
    ``word_tokenize``, keep purely alphabetic tokens, drop tokens found in the
    module-level ``stop_words`` list, lemmatise what is left with
    ``WordNetLemmatizer``.

    Parameters
    ----------
    data : str
        Raw text (here: episode name plus description).

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    # Build the helpers once per call instead of once per token: a set gives
    # constant-time stop-word lookups and one lemmatizer is reused throughout.
    stop_word_set = set(stop_words)
    lemmatizer = WordNetLemmatizer()

    # NOTE(review): apostrophes are stripped before tokenising, so contractions
    # become e.g. "dont" and are no longer matched by apostrophe-bearing
    # stop-word entries (the recorded output keeps "dont") - confirm intended.
    tokens = word_tokenize(data.replace("'", "").lower())

    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in stop_word_set
    ]
    return " ".join(lemmas)

In [28]:
podcast_collection['name']=podcast_collection['name'].str.strip()

In [29]:
podcast_collection['name + description'] = podcast_collection['name + description'].apply(cleaning)

In [30]:
podcast_collection['name + description']=podcast_collection['name + description'].str.strip()

In [31]:
pd.set_option('display.max_colwidth', None)

In [32]:
podcast_collection['name + description'][1]

'mistake make love relationship expectation current relationship wish partner like thing like enjoy thing enjoy share passion want partner carve path along many relationship idealism partner need like like love love hate hate pretext show love toxic strip away partner identity freedom episode purpose jay shetty tell u common mistake make dont help relationship grow strengthen copy think like monk today clicking link http mistake make mistake make someone mistake someone could mistake stay long scared seen bad mistake trade time partner instead creating new mistake want like mistake take time understand way mistake talk many people person relationship mistake mad sacrifice made ask mistake know difference ownership partnership relationshiplike show please leave u review even one sentence help post screenshot listening instagram tag u thank personally'

In [33]:
podcast_collection.tail(1)

Unnamed: 0,name,description,explicit,url,topic,name + description
9769,"""Church Misogyny: A Privileged Perspective"" Holy Smokes: Cigars and Spirituality S4 E6","Watch the entire episode at patreon.com/tfcatl. Listen on your favorite podcast platform. The Faith Community is a disruptive ministry built upon five keys: Greatest Commandment Theology, Anti-Racism, Women's Equity, LGBTQ+ Affirmation, and Bible Criticism. We are for the unchurched, the over-churched, and the under-churched; the weary and the wounded; the seeking and the cynical. For more inspiring content like this, check out our other platforms: Custom Face Masks: www.thefaithcommunity.org/store Facebook: https://www.facebook.com/TFCATL Store: https://www.spreadshirt.com/user/faithcommunityatl Instagram: @tfcatl, @holysmokesmovement Patreon: https://www.patreon.com/Tfcatl Website: https://www.thefaithcommunity.org",False,https://open.spotify.com/episode/6cYoO0JpTQLomZBvHG9B23,spirituality,church misogyny privileged perspective holy smoke cigar spirituality watch entire episode listen favorite podcast platform faith community disruptive ministry built upon five key greatest commandment theology woman equity affirmation bible criticism unchurched weary wounded seeking cynical inspiring content like check platform custom face mask facebook http store http instagram tfcatl holysmokesmovement patreon http website http


In [34]:
# Fit TF-IDF on the cleaned "name + description" text: one matrix row per episode.
tf = TfidfVectorizer()
tf_matrix = tf.fit_transform(podcast_collection['name + description'])
# Pairwise cosine similarity between every pair of rows. The result is a dense
# square array (9770 x 9770 in the recorded run, i.e. roughly 760 MB assuming
# float64 - TODO confirm dtype), so it grows quadratically with the collection.
tf_cosine_sim = cosine_similarity(tf_matrix)

In [35]:
tf_cosine_sim

array([[1.        , 0.10672878, 0.08383457, ..., 0.02347535, 0.06615352,
        0.04948533],
       [0.10672878, 1.        , 0.09572376, ..., 0.03691321, 0.01569781,
        0.01993186],
       [0.08383457, 0.09572376, 1.        , ..., 0.02274845, 0.01750033,
        0.        ],
       ...,
       [0.02347535, 0.03691321, 0.02274845, ..., 1.        , 0.02265303,
        0.01443174],
       [0.06615352, 0.01569781, 0.01750033, ..., 0.02265303, 1.        ,
        0.08820411],
       [0.04948533, 0.01993186, 0.        , ..., 0.01443174, 0.08820411,
        1.        ]])

In [36]:
tf_cosine_sim.shape

(9770, 9770)

In [None]:
def episodes_rec(name, number):
    """Print the ``number`` episodes most similar to the episode titled ``name``.

    Finds ``name`` in the module-level ``podcast_collection`` frame, reads that
    episode's row of the module-level ``tf_cosine_sim`` matrix and prints the
    best-scoring *other* episodes (rank, row position, title, URL), highest
    similarity first.

    Parameters
    ----------
    name : str
        Exact episode title as stored in ``podcast_collection['name']``.
        If several rows share the title, the first one is used.
    number : int
        How many recommendations to print; values <= 0 print none.

    Returns
    -------
    None
        Results are printed, not returned.

    Raises
    ------
    IndexError
        If no episode has the title ``name``.
    """
    # tf_cosine_sim is addressed by row position, so look up the position of
    # the title rather than its index label (the two only coincide while the
    # frame carries a freshly reset RangeIndex).
    matches = np.flatnonzero((podcast_collection['name'] == name).to_numpy())
    if matches.size == 0:
        raise IndexError('No episode titled ' + repr(name) + ' in podcast_collection')
    query_pos = int(matches[0])

    scores = np.asarray(tf_cosine_sim[query_pos])
    # Stable descending order: equal scores keep their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Exclude the queried episode explicitly instead of assuming it sorts
    # first; with tied scores (e.g. duplicated texts) it may not.
    ranked = ranked[ranked != query_pos][:max(number, 0)]

    print('The ' + str(number) + ' most recommended episodes to ' + name + ' are:\n')
    for rank, pos in enumerate(ranked, start=1):
        episode = podcast_collection.iloc[int(pos)]
        print(rank, int(pos), episode['name'], '\n', episode['url'])
        print()
# Interactive recommender: ask for a topic, show five random titles of that
# topic, then let the user paste up to five titles and print the closest
# episode for each one.
podcast_list=list(podcast_collection['topic'].unique())
# strip() so stray spaces around the typed topic are not reported as a misspelling.
podcast= input("Welcome to our podcast recommender:)Please enter a topic you might want to know more about:\nhealth fitness,\ntechnology,\nhistory,\nfood,\nmusic,\nbook,\nrelationship,\nlgbt,\nscience,\neducation,\nspirituality\n").strip().lower()
result=''  # not read in this cell; kept in case later cells rely on it
if len(podcast)==0:
    print('You forgot to input a topic, do you want to try again? If yes, press return')
elif podcast not in podcast_list:
    print('You input a wrong spelling topic, do you want to try again? If yes, press return')
else:
    b=podcast_collection[podcast_collection['topic']==podcast].sample(5)
    print('We selected the following 5 titles of the topic you liked:',b[['name']])
    # First prompt asks for a title, the remaining four offer another try.
    prompts = ['Please copy and paste the title that stimulates your curiosity: ']
    prompts += ['Do you want to try with an another title?: '] * 4
    for prompt in prompts:
        # Stored titles were stripped earlier, so strip the pasted text too.
        name = input(prompt).strip()
        try:
            # episodes_rec prints its own output and returns None, so it is
            # called directly rather than wrapped in print().
            episodes_rec(name, 1)
        except IndexError:
            # Raised when the pasted title matches no episode.
            print('No episode with that exact title was found, please copy the title again.')

Welcome to our podcast recommender:)Please enter a topic you might want to know more about:
health fitness,
technology,
history,
food,
music,
book,
relationship,
lgbt,
science,
education,
spirituality
spirituality
We selected the following 5 titles of the topic you liked:                                                                                                                                                                                       name
8925                                                                                                                                Instagram, Facebook, & TikTok are banning spirituality
9444                                                                                                                                               3 Tips for People "New" to Spirituality
9429                                                                                                                                  The Catholic Spirituality of Die