In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import unidecode
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
def clean_text(text):
    """Normalise a free-text description for tokenisation.

    The text is transliterated to ASCII, newlines become spaces, URLs and
    "sponsored by <word>" phrases are blanked out, every character that is
    not a letter, digit or whitespace is replaced by a space, and the result
    is lowercased and trimmed at both ends.

    Args:
        text: Raw description. Missing values (``None``, NaN, ``pd.NA``) are
            treated as an empty description; any other non-string value is
            converted with ``str()``.

    Returns:
        The cleaned, lowercase string. Runs of internal spaces left behind by
        the substitutions are not collapsed.
    """
    if not isinstance(text, str):
        # A missing description would otherwise blow up inside unidecode when
        # this function is mapped over a DataFrame column.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ''
        text = str(text)

    text = unidecode.unidecode(text)
    text = text.replace('\n', ' ')
    # Remove URLs ("https..." is already covered by the "http" alternative).
    text = re.sub(r'http\S+|www\S+', ' ', text)
    # Remove "Sponsored by" phrases (only the single word after "by" is dropped).
    text = re.sub(r'(?i)sponsored\sby\s\w+', ' ', text)
    # Remove special characters and symbols (excluding spaces)
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    # Convert to lowercase and trim whitespace at the beginning and end
    return text.lower().strip()

In [3]:
# Load the podcast DataFrame from a local pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files that come
# from a trusted source.
df_podcast = pd.read_pickle('../data/podcast_df_040423.pkl')

In [4]:
# Preview the first rows of the frame that was just loaded.
df_podcast.head()

Unnamed: 0,title,producer,genre,description,num_episodes,avg_rating,num_reviews,link,episode_descriptions,itunes_id,rating,user
0,One Strange Thing: Paranormal & True-Weird Mys...,One Strange Thing,History,"Paranormal, unexplainable, and uncanny stories...",105,4.6,499.0,https://podcasts.apple.com/us/podcast/one-stra...,[In celebration of our new premium format—two ...,1526579247,5,RobinFerris
1,One Strange Thing: Paranormal & True-Weird Mys...,One Strange Thing,History,"Paranormal, unexplainable, and uncanny stories...",105,4.6,499.0,https://podcasts.apple.com/us/podcast/one-stra...,[In celebration of our new premium format—two ...,1526579247,1,Pops.99
2,One Strange Thing: Paranormal & True-Weird Mys...,One Strange Thing,History,"Paranormal, unexplainable, and uncanny stories...",105,4.6,499.0,https://podcasts.apple.com/us/podcast/one-stra...,[In celebration of our new premium format—two ...,1526579247,5,ReddEye81
3,One Strange Thing: Paranormal & True-Weird Mys...,One Strange Thing,History,"Paranormal, unexplainable, and uncanny stories...",105,4.6,499.0,https://podcasts.apple.com/us/podcast/one-stra...,[In celebration of our new premium format—two ...,1526579247,2,Keyta7777
4,One Strange Thing: Paranormal & True-Weird Mys...,One Strange Thing,History,"Paranormal, unexplainable, and uncanny stories...",105,4.6,499.0,https://podcasts.apple.com/us/podcast/one-stra...,[In celebration of our new premium format—two ...,1526579247,4,Okkupent


In [5]:
# Keep only the first row seen for each itunes_id, renumber the rows from 0,
# and replace the description column with its cleaned version.
df_podcast_clean = (
    df_podcast
    .drop_duplicates(subset=["itunes_id"], keep="first")
    .reset_index(drop=True)
    .assign(description=lambda frame: frame['description'].apply(clean_text))
)

In [6]:
# Preview the de-duplicated frame with cleaned descriptions.
df_podcast_clean.head()

Unnamed: 0,title,producer,genre,description,num_episodes,avg_rating,num_reviews,link,episode_descriptions,itunes_id,rating,user
0,One Strange Thing: Paranormal & True-Weird Mys...,One Strange Thing,History,paranormal unexplainable and uncanny stories...,105,4.6,499.0,https://podcasts.apple.com/us/podcast/one-stra...,[In celebration of our new premium format—two ...,1526579247,5,RobinFerris
1,BibleProject,BibleProject Podcast,Religion & Spirituality,the creators of bibleproject have in depth con...,352,4.9,15000.0,https://podcasts.apple.com/us/podcast/biblepro...,"[David was Israel’s greatest king, but even he...",1050832450,5,nina52475
2,The Domonique Foxworth Show,ESPN,Sports,with episodes every tuesday and thursday durin...,70,4.9,1100.0,https://podcasts.apple.com/us/podcast/the-domo...,"[Domonique, Charlie, and Ashley Foxworth along...",1642566714,5,nick ndd as mm name
3,Hacking Humans,CyberWire Inc.,Technology,deception influence and social engineering i...,415,4.7,255.0,https://podcasts.apple.com/us/podcast/hacking-...,"[Kathleen Smith, CMO from ClearedJobs.Net sits...",1391915810,5,FreshDoughnuts
4,Leader Up,AMSC,Government,leader up a podcast by the army management st...,52,5.0,14.0,https://podcasts.apple.com/us/podcast/leader-u...,[MSC's Mr. David Howey meets with CSM Jason C....,1378682853,5,Nolikeynewudatey


In [7]:
# Show the full cleaned description of the third row (position 2).
df_podcast_clean['description'].iloc[2]

'with episodes every tuesday and thursday during the football season  espn and andscape contributor domonique foxworth debuts his new podcast featuring his unique perspectives on football  the personalities surrounding it  and just about anything else he finds interesting or thinks you might  you ll'

In [8]:
# Tokenise each cleaned description, drop English stop words, and store the
# surviving tokens as one space-joined string per podcast.
#
# The corpus reader is imported under its own alias here so that binding the
# word list to `stopwords` (name kept for any later cell that reads it) no
# longer overwrites the only handle on the reader; previously re-running this
# cell failed because `stopwords` had already become a plain list.
from nltk.corpus import stopwords as stopwords_corpus

wordnet_lemmatizer = WordNetLemmatizer()  # used by cosine_similarity_T to lemmatise queries
stopwords = stopwords_corpus.words('english')
stopword_set = frozenset(stopwords)  # constant-time membership test per token

df_podcast_clean['description_tokens'] = (
    df_podcast_clean['description']
    .apply(word_tokenize)
    .apply(lambda tokens: " ".join(tok for tok in tokens if tok not in stopword_set))
)
# NOTE(review): this cell used to build a lemmatised `description_lemma`
# column and drop it again without using it, so the stored tokens are NOT
# lemmatised, whereas cosine_similarity_T lemmatises the query. That dead
# computation was removed; confirm whether documents should be lemmatised too.

In [9]:
# Preview the frame again, now with the description_tokens column.
df_podcast_clean.head()

Unnamed: 0,title,producer,genre,description,num_episodes,avg_rating,num_reviews,link,episode_descriptions,itunes_id,rating,user,description_tokens
0,One Strange Thing: Paranormal & True-Weird Mys...,One Strange Thing,History,paranormal unexplainable and uncanny stories...,105,4.6,499.0,https://podcasts.apple.com/us/podcast/one-stra...,[In celebration of our new premium format—two ...,1526579247,5,RobinFerris,paranormal unexplainable uncanny stories ficti...
1,BibleProject,BibleProject Podcast,Religion & Spirituality,the creators of bibleproject have in depth con...,352,4.9,15000.0,https://podcasts.apple.com/us/podcast/biblepro...,"[David was Israel’s greatest king, but even he...",1050832450,5,nina52475,creators bibleproject depth conversations bibl...
2,The Domonique Foxworth Show,ESPN,Sports,with episodes every tuesday and thursday durin...,70,4.9,1100.0,https://podcasts.apple.com/us/podcast/the-domo...,"[Domonique, Charlie, and Ashley Foxworth along...",1642566714,5,nick ndd as mm name,episodes every tuesday thursday football seaso...
3,Hacking Humans,CyberWire Inc.,Technology,deception influence and social engineering i...,415,4.7,255.0,https://podcasts.apple.com/us/podcast/hacking-...,"[Kathleen Smith, CMO from ClearedJobs.Net sits...",1391915810,5,FreshDoughnuts,deception influence social engineering world c...
4,Leader Up,AMSC,Government,leader up a podcast by the army management st...,52,5.0,14.0,https://podcasts.apple.com/us/podcast/leader-u...,[MSC's Mr. David Howey meets with CSM Jason C....,1378682853,5,Nolikeynewudatey,leader podcast army management staff college a...


In [10]:
# Write the frame, including the description_tokens column, to a pickle file.
df_podcast_clean.to_pickle('../data/podcast_df_tokens_040723.pkl')

In [10]:
# Create vocabulary: every distinct whitespace-separated token that appears in
# any description. It is sorted because iterating a set of strings follows the
# per-process hash seed, which made the TF-IDF column order differ from one
# kernel start to the next.
vocabulary = sorted({
    token
    for tokens in df_podcast_clean['description_tokens']
    for token in tokens.split()
})
# Create TF-IDF model restricted to that vocabulary; column i of the matrix
# corresponds to vocabulary[i].
tfidf = TfidfVectorizer(vocabulary=vocabulary)
# Learn the IDF weights and vectorise the descriptions in a single pass.
tfidf_matrix = tfidf.fit_transform(df_podcast_clean['description_tokens'])

In [11]:
def gen_vector_T(tokens):
    """Return the dense TF-IDF vector of the first query string.

    Args:
        tokens: A sequence of strings (list or pandas Series) whose first
            element is the space-joined query. Only that first element
            contributes to the result.

    Returns:
        A 1-D float array with one entry per column of the fitted ``tfidf``
        model. Terms the model does not know contribute nothing, so an
        unknown or empty query yields an all-zero vector.
    """
    # The vectoriser already maps each known term to its column, so its first
    # output row is the query vector. The earlier per-token copy re-matched
    # terms against the raw whitespace split, which dropped the weight of any
    # term whose case differed from the vocabulary, hid every error behind a
    # bare except, and scanned the vocabulary list once per token.
    first_row = tfidf.transform(tokens)[0]
    return first_row.toarray().ravel()

In [12]:
def cosine_sim(a, b):
    """Cosine similarity of two 1-D vectors.

    Args:
        a: First vector (array-like of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``, or ``0.0`` when either vector has
        zero norm (the ratio is undefined there and would otherwise be NaN).
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # A zero vector shares no terms with anything; report "no similarity"
        # instead of propagating NaN into downstream sorting.
        return 0.0
    return np.dot(a, b) / denominator

In [13]:
def cosine_similarity_T(k, query):
    """Rank podcasts by TF-IDF cosine similarity to a free-text query.

    Args:
        k: Maximum number of results to return. Values below 1 give an empty
            result.
        query: Free-text search string.

    Returns:
        A DataFrame with up to ``k`` rows, best match first, and the columns
        ``index`` (row position in ``df_podcast_clean`` as a string),
        ``Podcast`` (title), ``Genre`` and ``Score`` (cosine similarity).
        Rows or queries with an all-zero vector score 0.0 rather than NaN.
    """
    # Strip punctuation and lowercase the query: the descriptions were
    # lowercased before the vocabulary was built, so a capitalised query word
    # would otherwise fail to match.
    cleaned_query = re.sub(r"\W+", " ", str(query)).strip().lower()
    lemmas = [wordnet_lemmatizer.lemmatize(word) for word in word_tokenize(cleaned_query)]
    query_vector = gen_vector_T(pd.Series([" ".join(lemmas)]))

    # Score every document at once on the sparse matrix; densifying the whole
    # matrix for each query is unnecessary.
    dot_products = np.asarray(tfidf_matrix.dot(query_vector), dtype=float).ravel()
    doc_norms = np.sqrt(
        np.asarray(tfidf_matrix.multiply(tfidf_matrix).sum(axis=1), dtype=float).ravel()
    )
    denominators = doc_norms * np.linalg.norm(query_vector)
    # Leave the score at 0 wherever the denominator is 0. A NaN there would be
    # sorted to the end by argsort and so be reported as a top hit.
    scores = np.divide(
        dot_products,
        denominators,
        out=np.zeros_like(dot_products),
        where=denominators > 0,
    )

    # Highest score first. Scores are read back through the same positions so
    # each row is guaranteed to carry its own score.
    top = scores.argsort()[::-1][:max(k, 0)]
    return pd.DataFrame({
        'index': [str(position) for position in top],
        # Matrix rows are positional, so look the metadata up by position.
        'Podcast': df_podcast_clean['title'].iloc[top].to_numpy(),
        'Genre': df_podcast_clean['genre'].iloc[top].to_numpy(),
        'Score': scores[top],
    })

In [14]:
# Example query: ten best matches for a food/health search.
cosine_similarity_T(10, 'healthy recipes')

Unnamed: 0,index,Podcast,Genre,Score
0,1604,Play Me a Recipe,Arts,0.346577
1,161,The One Recipe,Arts,0.323912
2,1051,Life Kit: Health,Science,0.169684
3,3523,The Laughing Couple,Kids & Family,0.164483
4,3014,Proof,Arts,0.157306
5,3088,Recipe Club,Arts,0.156601
6,2309,Crime Salad,True Crime,0.155852
7,1415,She's My Cherry Pie,Arts,0.14146
8,562,The Checkup with Doctor Mike,Health & Fitness,0.124995
9,167,"Fundamental Health with Paul Saladino, MD",Health & Fitness,0.123285


In [15]:
# Example query: ten best matches for a technology search.
cosine_similarity_T(10, 'machine learning')

Unnamed: 0,index,Podcast,Genre,Score
0,536,Adventures in Machine Learning,Technology,0.577918
1,581,Machine Learning,Technology,0.394043
2,2224,Gradient Dissent,Technology,0.380504
3,915,ChatGPT,News,0.311805
4,3263,Private Pilot Podcast by MzeroA.com,Leisure,0.296127
5,2229,"Practical AI: Machine Learning, Data Science",Technology,0.265195
6,1890,Data Skeptic,Technology,0.254437
7,321,Super Data Science,Technology,0.246207
8,91,News in Slow French,Education,0.242155
9,415,The TWIML AI Podcast (formerly This Week in Ma...,Technology,0.216544


In [16]:
# Example query: ten best matches for a religion search.
cosine_similarity_T(10, 'god and christianity')

Unnamed: 0,index,Podcast,Genre,Score
0,3461,Questioning Christianity with Tim Keller,Religion & Spirituality,0.335481
1,3588,The Alisa Childers Podcast,Religion & Spirituality,0.283445
2,2060,Compelled - Christian Stories & Testimonies,Religion & Spirituality,0.27195
3,2741,I Don't Have Enough FAITH to Be an ATHEIST,Religion & Spirituality,0.255668
4,2393,Bethel Redding Sermon of the Week,Religion & Spirituality,0.229422
5,2322,The Russell Moore Show,Religion & Spirituality,0.193257
6,2335,Heaven In Your Home,Religion & Spirituality,0.189199
7,2358,Daily Drive with Lakepointe Church,Religion & Spirituality,0.186813
8,1208,"Hearing Jesus: Daily Bible Study, Daily Devoti...",Religion & Spirituality,0.184402
9,2277,Blessed + Bossed Up,Business,0.177282


In [17]:
# Example query: ten best matches for a sports search.
cosine_similarity_T(10, 'american football')

Unnamed: 0,index,Podcast,Genre,Score
0,981,Legendary Upside,Sports,0.314015
1,3805,The Mina Kimes Show featuring Lenny,Sports,0.313506
2,1057,Fantasy Footballers - Fantasy Football Podcast,Sports,0.2955
3,2,The Domonique Foxworth Show,Sports,0.287226
4,1332,FantasyPros Dynasty Football Podcast,Sports,0.253914
5,2161,American Contingency,Government,0.247415
6,3425,The Andy Staples Show & Friends: A show about ...,Sports,0.246131
7,3208,The Athletic Football Show: A show about the NFL,Sports,0.238489
8,862,This American President,History,0.207673
9,693,Fantasy Footballers Dynasty - Fantasy Football...,Sports,0.201617


In [18]:
# Example query: ten best matches for a paranormal search.
cosine_similarity_T(10, 'ghost and haunted houses')

Unnamed: 0,index,Podcast,Genre,Score
0,3196,The Night Owl: True Ghost Stories,Society & Culture,0.519754
1,2067,Haunted Road,True Crime,0.273631
2,2895,Deeper Shades of House - weekly Deep House Pod...,Music,0.259715
3,1463,GraveYard Tales,Science,0.250926
4,489,True Hauntings & Scary Stories,Fiction,0.226258
5,462,"Ghost Town: Strange History, True Crime, & the...",History,0.224307
6,957,The Witch Farm,Fiction,0.224033
7,575,Jim Harold's Campfire,Society & Culture,0.221682
8,1067,Chapo Trap House,News,0.21715
9,75,In Another Room,Fiction,0.208114
