In [76]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from api_keys import client_id, client_secret

from keywords import keywords

import pandas as pd
import re

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret))

In [3]:
# Get total number of episodes for the show
show_id = '4rOoJ6Egrf8K2IrywzwOMk'
podcast = sp.show(show_id=show_id, market='US')

total_episodes = podcast['total_episodes']

print(total_episodes)

2112


# Use this cell if you need to get data from API

In [24]:
# Get data for all shows
showNum = 0
data = []
# Pagination for show requests
while showNum <= total_episodes:
    print("Current Show Count: ", showNum)
    result = sp.show_episodes(show_id=show_id, limit=50, offset=showNum, market='US')

    episodes = result['items']

    # Loop through the 50 episodes requested
    for episode in episodes:
        title = episode['name']
        description = episode['description']
        duration_ms = episode['duration_ms']
        release_date = episode['release_date']
        
        myEpisode = {
            'title' : title,
            'description' : description,
            'duration_ms' : duration_ms,
            'release_date' : release_date
        }
        
        data.append(myEpisode)
    showNum += 50

print(len(data))

Current Show Count:  0
Current Show Count:  50
Current Show Count:  100
Current Show Count:  150
Current Show Count:  200
Current Show Count:  250
Current Show Count:  300
Current Show Count:  350
Current Show Count:  400
Current Show Count:  450
Current Show Count:  500
Current Show Count:  550
Current Show Count:  600
Current Show Count:  650
Current Show Count:  700
Current Show Count:  750
Current Show Count:  800
Current Show Count:  850
Current Show Count:  900
Current Show Count:  950
Current Show Count:  1000
Current Show Count:  1050
Current Show Count:  1100
Current Show Count:  1150
Current Show Count:  1200
Current Show Count:  1250
Current Show Count:  1300
Current Show Count:  1350
Current Show Count:  1400
Current Show Count:  1450
Current Show Count:  1500
Current Show Count:  1550
Current Show Count:  1600
Current Show Count:  1650
Current Show Count:  1700
Current Show Count:  1750
Current Show Count:  1800
Current Show Count:  1850
Current Show Count:  1900
Current S

In [25]:
# Create DF from data
df = pd.DataFrame(data)
df.head()

Unnamed: 0,title,description,duration_ms,release_date
0,#1970 - Bill Ottman,"Bill Ottman is founder of Minds, an open sourc...",10369428,2023-04-13
1,#1969 - Sam Tallent,"Sam Tallent is a stand-up comic, writer, and a...",10797097,2023-04-12
2,#1968 - Jason Everman,Jason Everman is a musician and military veter...,9347177,2023-04-11
3,JRE MMA Show #138 with Cory Sandhagen,"Joe is joined by Cory Sandhagen, a professiona...",9816340,2023-04-07
4,#1967 - Mike Vecchione,Mike Vecchione is a stand-up comic and actor. ...,9674281,2023-04-06


In [26]:
# Change MS column to Seconds
df['duration_ms'] = df['duration_ms'].apply(lambda x: float(x / 60000))
df.head()

Unnamed: 0,title,description,duration_ms,release_date
0,#1970 - Bill Ottman,"Bill Ottman is founder of Minds, an open sourc...",172.8238,2023-04-13
1,#1969 - Sam Tallent,"Sam Tallent is a stand-up comic, writer, and a...",179.951617,2023-04-12
2,#1968 - Jason Everman,Jason Everman is a musician and military veter...,155.786283,2023-04-11
3,JRE MMA Show #138 with Cory Sandhagen,"Joe is joined by Cory Sandhagen, a professiona...",163.605667,2023-04-07
4,#1967 - Mike Vecchione,Mike Vecchione is a stand-up comic and actor. ...,161.238017,2023-04-06


In [27]:
# Rename Column
df.rename(columns={'duration_ms' : 'duration_seconds'})

Unnamed: 0,title,description,duration_seconds,release_date
0,#1970 - Bill Ottman,"Bill Ottman is founder of Minds, an open sourc...",172.823800,2023-04-13
1,#1969 - Sam Tallent,"Sam Tallent is a stand-up comic, writer, and a...",179.951617,2023-04-12
2,#1968 - Jason Everman,Jason Everman is a musician and military veter...,155.786283,2023-04-11
3,JRE MMA Show #138 with Cory Sandhagen,"Joe is joined by Cory Sandhagen, a professiona...",163.605667,2023-04-07
4,#1967 - Mike Vecchione,Mike Vecchione is a stand-up comic and actor. ...,161.238017,2023-04-06
...,...,...,...,...
2107,"#5 - John Heffron, Ari Shaffir (Part 1)","Joe sits down with John Heffron, and Ari Shaffir.",64.169500,2010-01-21
2108,"#5 - John Heffron, Ari Shaffir (Part 2)","Joe sits down with John Heffron, and Ari Shaffir.",72.936967,2010-01-21
2109,#3 - Ari Shaffir,Joe sits down with Ari Shaffir.,134.987267,2010-01-06
2110,#2 - Brian Redban,Joe sits down with Brian Redban.,152.107717,2009-12-29


In [28]:
# Split release date into seperate columns
df['release_date'] = df['release_date'].astype('datetime64[ns]')
df['day'] = df['release_date'].map(lambda x: x.day)
df['month'] = df['release_date'].map(lambda x: x.month)
df['year'] = df['release_date'].map(lambda x: x.year)
df.head()

Unnamed: 0,title,description,duration_ms,release_date,day,month,year
0,#1970 - Bill Ottman,"Bill Ottman is founder of Minds, an open sourc...",172.8238,2023-04-13,13,4,2023
1,#1969 - Sam Tallent,"Sam Tallent is a stand-up comic, writer, and a...",179.951617,2023-04-12,12,4,2023
2,#1968 - Jason Everman,Jason Everman is a musician and military veter...,155.786283,2023-04-11,11,4,2023
3,JRE MMA Show #138 with Cory Sandhagen,"Joe is joined by Cory Sandhagen, a professiona...",163.605667,2023-04-07,7,4,2023
4,#1967 - Mike Vecchione,Mike Vecchione is a stand-up comic and actor. ...,161.238017,2023-04-06,6,4,2023


In [50]:
# Find guest in show
guests = []
for row in df['title']:
    guest = re.findall(r"[A-Z][a-z]+,?\s+(?:[A-Z][a-z]*\.?\s*)?[A-Z][a-z]+", row)
    if guest:
         guests.append(guest[0])
    else:
        guests.append(pd.NaT)
df['guest'] = guests
df.head()

Unnamed: 0,title,description,duration_ms,release_date,day,month,year,guest
0,#1970 - Bill Ottman,"Bill Ottman is founder of Minds, an open sourc...",172.8238,2023-04-13,13,4,2023,Bill Ottman
1,#1969 - Sam Tallent,"Sam Tallent is a stand-up comic, writer, and a...",179.951617,2023-04-12,12,4,2023,Sam Tallent
2,#1968 - Jason Everman,Jason Everman is a musician and military veter...,155.786283,2023-04-11,11,4,2023,Jason Everman
3,JRE MMA Show #138 with Cory Sandhagen,"Joe is joined by Cory Sandhagen, a professiona...",163.605667,2023-04-07,7,4,2023,Cory Sandhagen
4,#1967 - Mike Vecchione,Mike Vecchione is a stand-up comic and actor. ...,161.238017,2023-04-06,6,4,2023,Mike Vecchione


In [51]:
df['guest'].count

<bound method Series.count of 0          Bill Ottman
1          Sam Tallent
2        Jason Everman
3       Cory Sandhagen
4       Mike Vecchione
             ...      
2107      John Heffron
2108      John Heffron
2109       Ari Shaffir
2110      Brian Redban
2111      Brian Redban
Name: guest, Length: 2112, dtype: object>

In [53]:
df.dropna()

Unnamed: 0,title,description,duration_ms,release_date,day,month,year,guest
0,#1970 - Bill Ottman,"Bill Ottman is founder of Minds, an open sourc...",172.823800,2023-04-13,13,4,2023,Bill Ottman
1,#1969 - Sam Tallent,"Sam Tallent is a stand-up comic, writer, and a...",179.951617,2023-04-12,12,4,2023,Sam Tallent
2,#1968 - Jason Everman,Jason Everman is a musician and military veter...,155.786283,2023-04-11,11,4,2023,Jason Everman
3,JRE MMA Show #138 with Cory Sandhagen,"Joe is joined by Cory Sandhagen, a professiona...",163.605667,2023-04-07,7,4,2023,Cory Sandhagen
4,#1967 - Mike Vecchione,Mike Vecchione is a stand-up comic and actor. ...,161.238017,2023-04-06,6,4,2023,Mike Vecchione
...,...,...,...,...,...,...,...,...
2107,"#5 - John Heffron, Ari Shaffir (Part 1)","Joe sits down with John Heffron, and Ari Shaffir.",64.169500,2010-01-21,21,1,2010,John Heffron
2108,"#5 - John Heffron, Ari Shaffir (Part 2)","Joe sits down with John Heffron, and Ari Shaffir.",72.936967,2010-01-21,21,1,2010,John Heffron
2109,#3 - Ari Shaffir,Joe sits down with Ari Shaffir.,134.987267,2010-01-06,6,1,2010,Ari Shaffir
2110,#2 - Brian Redban,Joe sits down with Brian Redban.,152.107717,2009-12-29,29,12,2009,Brian Redban


# Use these cells if you already have CSV File

In [72]:
# Pull Shows From CSV
df = pd.read_csv('data_files/podcast_data.csv')
df.head()

Unnamed: 0,title,description,duration_ms,release_date,day,month,year,guest
0,#1970 - Bill Ottman,"Bill Ottman is founder of Minds, an open sourc...",172.8238,2023-04-13,13,4,2023,Bill Ottman
1,#1969 - Sam Tallent,"Sam Tallent is a stand-up comic, writer, and a...",179.951617,2023-04-12,12,4,2023,Sam Tallent
2,#1968 - Jason Everman,Jason Everman is a musician and military veter...,155.786283,2023-04-11,11,4,2023,Jason Everman
3,JRE MMA Show #138 with Cory Sandhagen,"Joe is joined by Cory Sandhagen, a professiona...",163.605667,2023-04-07,7,4,2023,Cory Sandhagen
4,#1967 - Mike Vecchione,Mike Vecchione is a stand-up comic and actor. ...,161.238017,2023-04-06,6,4,2023,Mike Vecchione


In [54]:
# Save to CSV To Reuse
df.to_csv('data_files/podcast_data.csv', index=False)

In [95]:
from keywords import keywords

In [105]:
keywords = [
    'comedian',
    'comic',
    "phd",
    'actor',
    'writer',
    'author',
    'ufc',
    'musician',
    'artist',
    'founder',
    'youtuber',
    'podcast',
    'ceo',
    'philanthropist',
    'singer/songwriter',
    'screenwriter',
    'scientist',
    'co-host',
    'journalist',
    'entrepreneur',
    'astrophysicist',
    'politician',
    'pilot'
]

In [111]:
dataset = []
for desc in df['description']:
    newDesc = desc.lower()
    newDesc = newDesc.replace(',', '')
    newDesc = newDesc.replace('.', '')
    split = set(newDesc.split())
    found = split.intersection(keywords) 
    if found:
        dataset.append(found)
    else:
        dataset.append(pd.NaT)
    
df['category'] = dataset
df.head()

title           2112
description     2112
duration_ms     2112
release_date    2112
day             2112
month           2112
year            2112
guest           2023
category        1338
dtype: int64

In [121]:
df = df.dropna()

In [123]:
df.tail()

Unnamed: 0,title,description,duration_ms,release_date,day,month,year,guest,category
1817,"#316 - Enson Inoue, Chuck Liddell",Enson Inoue is a Japanese-American mixed marti...,116.756967,2013-01-22,22,1,2013,Enson Inoue,"{artist, ufc}"
1818,"#317 - David Choe, Yoshi Obayashi","David Choe is an American painter, muralist, g...",139.110383,2013-01-22,22,1,2013,David Choe,"{artist, comedian, podcast}"
1826,#307 - Bryan Callen,"Bryan Callen is an actor, stand-up comedian, a...",161.478033,2013-01-04,4,1,2013,Bryan Callen,"{comedian, actor, podcast}"
1827,#306 - Christopher Ryan,"Christopher Ryan, Ph.D., is a psychologist, sp...",166.43875,2013-01-03,3,1,2013,Christopher Ryan,"{phd, author}"
1845,#286 - Daniele Bolelli,"Daniele Bolelli is an Italian author, martial ...",137.318367,2012-11-19,19,11,2012,Daniele Bolelli,"{artist, author}"


In [None]:
category_subsets = {
    'comedy' : ['comedian', 'comic', 'stand-up'],
    'science' : ['phd', 'astrophysicist', 'scientist'],
    'music' : ['singer/songwriter', 'musician', 'artist'],
    'sports' : ['ufc', 'nfl', 'mlb', 'nba', 'nhl'],
    'business' : ['founder', 'entrepreneur', 'ceo']
}