In [8]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from api_keys import client_id, client_secret

from keywords import keywords

import pandas as pd
import re

sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=client_id, client_secret=client_secret))

In [2]:
# Get total number of episodes for the show

show_id = '4rOoJ6Egrf8K2IrywzwOMk'
podcast = sp.show(show_id=show_id, market='US')

total_episodes = podcast['total_episodes']

print(total_episodes)

2112


In [3]:
# Get data for all shows
showNum = 0
data = []
# Pagination for show requests
while showNum <= total_episodes:
    print("Current Show Count: ", showNum)
    result = sp.show_episodes(show_id=show_id, limit=50, offset=showNum, market='US')

    episodes = result['items']

    # Loop through the 50 episodes requested
    for episode in episodes:
        title = episode['name']
        description = episode['description']
        duration_ms = episode['duration_ms']
        release_date = episode['release_date']
        
        myEpisode = {
            'title' : title,
            'description' : description,
            'duration_ms' : duration_ms,
            'release_date' : release_date
        }
        
        data.append(myEpisode)
    showNum += 50

print(len(data))

Current Show Count:  0
Current Show Count:  50
Current Show Count:  100
Current Show Count:  150
Current Show Count:  200
Current Show Count:  250
Current Show Count:  300
Current Show Count:  350
Current Show Count:  400
Current Show Count:  450
Current Show Count:  500
Current Show Count:  550
Current Show Count:  600
Current Show Count:  650
Current Show Count:  700
Current Show Count:  750
Current Show Count:  800
Current Show Count:  850
Current Show Count:  900
Current Show Count:  950
Current Show Count:  1000
Current Show Count:  1050
Current Show Count:  1100
Current Show Count:  1150
Current Show Count:  1200
Current Show Count:  1250
Current Show Count:  1300
Current Show Count:  1350
Current Show Count:  1400
Current Show Count:  1450
Current Show Count:  1500
Current Show Count:  1550
Current Show Count:  1600
Current Show Count:  1650
Current Show Count:  1700
Current Show Count:  1750
Current Show Count:  1800
Current Show Count:  1850
Current Show Count:  1900
Current S

In [4]:
# Create DF from data
df = pd.DataFrame(data)
df.head()

Unnamed: 0,title,description,duration_ms,release_date
0,#1970 - Bill Ottman,"Bill Ottman is founder of Minds, an open sourc...",10369428,2023-04-13
1,#1969 - Sam Tallent,"Sam Tallent is a stand-up comic, writer, and a...",10797097,2023-04-12
2,#1968 - Jason Everman,Jason Everman is a musician and military veter...,9347177,2023-04-11
3,JRE MMA Show #138 with Cory Sandhagen,"Joe is joined by Cory Sandhagen, a professiona...",9816340,2023-04-07
4,#1967 - Mike Vecchione,Mike Vecchione is a stand-up comic and actor. ...,9674281,2023-04-06


In [5]:
# Change MS column to Seconds
df['duration_ms'] = df['duration_ms'].apply(lambda x: float(x / 60000)).round(decimals=2)
df.head()

Unnamed: 0,title,description,duration_ms,release_date
0,#1970 - Bill Ottman,"Bill Ottman is founder of Minds, an open sourc...",172.82,2023-04-13
1,#1969 - Sam Tallent,"Sam Tallent is a stand-up comic, writer, and a...",179.95,2023-04-12
2,#1968 - Jason Everman,Jason Everman is a musician and military veter...,155.79,2023-04-11
3,JRE MMA Show #138 with Cory Sandhagen,"Joe is joined by Cory Sandhagen, a professiona...",163.61,2023-04-07
4,#1967 - Mike Vecchione,Mike Vecchione is a stand-up comic and actor. ...,161.24,2023-04-06


In [21]:
# Rename Columns
df = df.rename(columns={'duration_ms' : 'duration_seconds'})

Unnamed: 0,title,description,duration_seconds,release_date,guest,category
0,#1970 - Bill Ottman,"Bill Ottman is founder of Minds, an open sourc...",172.82,2023-04-13,Bill Ottman,{founder}
1,#1969 - Sam Tallent,"Sam Tallent is a stand-up comic, writer, and a...",179.95,2023-04-12,Sam Tallent,"{writer, actor, comic, podcast, author}"
2,#1968 - Jason Everman,Jason Everman is a musician and military veter...,155.79,2023-04-11,Jason Everman,{musician}
3,JRE MMA Show #138 with Cory Sandhagen,"Joe is joined by Cory Sandhagen, a professiona...",163.61,2023-04-07,Cory Sandhagen,{artist}
4,#1967 - Mike Vecchione,Mike Vecchione is a stand-up comic and actor. ...,161.24,2023-04-06,Mike Vecchione,"{actor, comic}"
...,...,...,...,...,...,...
1817,"#316 - Enson Inoue, Chuck Liddell",Enson Inoue is a Japanese-American mixed marti...,116.76,2013-01-22,Enson Inoue,"{artist, ufc}"
1818,"#317 - David Choe, Yoshi Obayashi","David Choe is an American painter, muralist, g...",139.11,2013-01-22,David Choe,"{comedian, podcast, artist}"
1826,#307 - Bryan Callen,"Bryan Callen is an actor, stand-up comedian, a...",161.48,2013-01-04,Bryan Callen,"{comedian, actor, podcast}"
1827,#306 - Christopher Ryan,"Christopher Ryan, Ph.D., is a psychologist, sp...",166.44,2013-01-03,Christopher Ryan,"{phd, author}"


In [9]:
# Find guest in show
guests = []
for row in df['title']:
    guest = re.findall(r"[A-Z][a-z]+,?\s+(?:[A-Z][a-z]*\.?\s*)?[A-Z][a-z]+", row)
    if guest:
         guests.append(guest[0])
    else:
        guests.append(pd.NaT)
df['guest'] = guests
df.head()

Unnamed: 0,title,description,duration_ms,release_date,guest
0,#1970 - Bill Ottman,"Bill Ottman is founder of Minds, an open sourc...",172.82,2023-04-13,Bill Ottman
1,#1969 - Sam Tallent,"Sam Tallent is a stand-up comic, writer, and a...",179.95,2023-04-12,Sam Tallent
2,#1968 - Jason Everman,Jason Everman is a musician and military veter...,155.79,2023-04-11,Jason Everman
3,JRE MMA Show #138 with Cory Sandhagen,"Joe is joined by Cory Sandhagen, a professiona...",163.61,2023-04-07,Cory Sandhagen
4,#1967 - Mike Vecchione,Mike Vecchione is a stand-up comic and actor. ...,161.24,2023-04-06,Mike Vecchione


In [10]:
df['guest'].count

<bound method Series.count of 0          Bill Ottman
1          Sam Tallent
2        Jason Everman
3       Cory Sandhagen
4       Mike Vecchione
             ...      
2107      John Heffron
2108      John Heffron
2109       Ari Shaffir
2110      Brian Redban
2111      Brian Redban
Name: guest, Length: 2112, dtype: object>

In [11]:
df.dropna()

Unnamed: 0,title,description,duration_ms,release_date,guest
0,#1970 - Bill Ottman,"Bill Ottman is founder of Minds, an open sourc...",172.82,2023-04-13,Bill Ottman
1,#1969 - Sam Tallent,"Sam Tallent is a stand-up comic, writer, and a...",179.95,2023-04-12,Sam Tallent
2,#1968 - Jason Everman,Jason Everman is a musician and military veter...,155.79,2023-04-11,Jason Everman
3,JRE MMA Show #138 with Cory Sandhagen,"Joe is joined by Cory Sandhagen, a professiona...",163.61,2023-04-07,Cory Sandhagen
4,#1967 - Mike Vecchione,Mike Vecchione is a stand-up comic and actor. ...,161.24,2023-04-06,Mike Vecchione
...,...,...,...,...,...
2107,"#5 - John Heffron, Ari Shaffir (Part 1)","Joe sits down with John Heffron, and Ari Shaffir.",64.17,2010-01-21,John Heffron
2108,"#5 - John Heffron, Ari Shaffir (Part 2)","Joe sits down with John Heffron, and Ari Shaffir.",72.94,2010-01-21,John Heffron
2109,#3 - Ari Shaffir,Joe sits down with Ari Shaffir.,134.99,2010-01-06,Ari Shaffir
2110,#2 - Brian Redban,Joe sits down with Brian Redban.,152.11,2009-12-29,Brian Redban


In [22]:
# Split release date into seperate columns\n,
df['release_date'] = df['release_date'].astype('datetime64[ns]')
df['day'] = df['release_date'].map(lambda x: x.day)
df['month'] = df['release_date'].map(lambda x: x.month)
df['year'] = df['release_date'].map(lambda x: x.year)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['release_date'] = df['release_date'].astype('datetime64[ns]')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day'] = df['release_date'].map(lambda x: x.day)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['month'] = df['release_date'].map(lambda x: x.month)
A value is trying to be set on a

Unnamed: 0,title,description,duration_ms,release_date,guest,category,day,month,year
0,#1970 - Bill Ottman,"Bill Ottman is founder of Minds, an open sourc...",172.82,2023-04-13,Bill Ottman,{founder},13,4,2023
1,#1969 - Sam Tallent,"Sam Tallent is a stand-up comic, writer, and a...",179.95,2023-04-12,Sam Tallent,"{writer, actor, comic, podcast, author}",12,4,2023
2,#1968 - Jason Everman,Jason Everman is a musician and military veter...,155.79,2023-04-11,Jason Everman,{musician},11,4,2023
3,JRE MMA Show #138 with Cory Sandhagen,"Joe is joined by Cory Sandhagen, a professiona...",163.61,2023-04-07,Cory Sandhagen,{artist},7,4,2023
4,#1967 - Mike Vecchione,Mike Vecchione is a stand-up comic and actor. ...,161.24,2023-04-06,Mike Vecchione,"{actor, comic}",6,4,2023


# Use these cells if you already have CSV File

In [None]:
# Pull Shows From CSV
df = pd.read_csv('data_files/podcast_data.csv')
df.head()

In [13]:
# Save to CSV To Reuse
df.to_csv('data_files/podcast_data.csv', index=False)

In [15]:
from keywords import keywordsList

ImportError: cannot import name 'keywordsList' from 'keywords' (/Users/hunter/Documents/GitHub/Wether_Men_Proj_1/keywords.py)

In [16]:
keywords = [
    'comedian',
    'comic',
    "phd",
    'actor',
    'writer',
    'author',
    'ufc',
    'musician',
    'artist',
    'founder',
    'youtuber',
    'podcast',
    'ceo',
    'philanthropist',
    'singer/songwriter',
    'screenwriter',
    'scientist',
    'co-host',
    'journalist',
    'entrepreneur',
    'astrophysicist',
    'politician',
    'pilot'
]

In [17]:
dataset = []
for desc in df['description']:
    newDesc = desc.lower()
    newDesc = newDesc.replace(',', '')
    newDesc = newDesc.replace('.', '')
    split = set(newDesc.split())
    found = split.intersection(keywords) 
    if found:
        dataset.append(found)
    else:
        dataset.append(pd.NaT)
    
df['category'] = dataset
df.head()

Unnamed: 0,title,description,duration_ms,release_date,guest,category
0,#1970 - Bill Ottman,"Bill Ottman is founder of Minds, an open sourc...",172.82,2023-04-13,Bill Ottman,{founder}
1,#1969 - Sam Tallent,"Sam Tallent is a stand-up comic, writer, and a...",179.95,2023-04-12,Sam Tallent,"{writer, actor, comic, podcast, author}"
2,#1968 - Jason Everman,Jason Everman is a musician and military veter...,155.79,2023-04-11,Jason Everman,{musician}
3,JRE MMA Show #138 with Cory Sandhagen,"Joe is joined by Cory Sandhagen, a professiona...",163.61,2023-04-07,Cory Sandhagen,{artist}
4,#1967 - Mike Vecchione,Mike Vecchione is a stand-up comic and actor. ...,161.24,2023-04-06,Mike Vecchione,"{actor, comic}"


In [18]:
df = df.dropna()

In [19]:
df.tail()

Unnamed: 0,title,description,duration_ms,release_date,guest,category
1817,"#316 - Enson Inoue, Chuck Liddell",Enson Inoue is a Japanese-American mixed marti...,116.76,2013-01-22,Enson Inoue,"{artist, ufc}"
1818,"#317 - David Choe, Yoshi Obayashi","David Choe is an American painter, muralist, g...",139.11,2013-01-22,David Choe,"{comedian, podcast, artist}"
1826,#307 - Bryan Callen,"Bryan Callen is an actor, stand-up comedian, a...",161.48,2013-01-04,Bryan Callen,"{comedian, actor, podcast}"
1827,#306 - Christopher Ryan,"Christopher Ryan, Ph.D., is a psychologist, sp...",166.44,2013-01-03,Christopher Ryan,"{phd, author}"
1845,#286 - Daniele Bolelli,"Daniele Bolelli is an Italian author, martial ...",137.32,2012-11-19,Daniele Bolelli,"{author, artist}"


In [20]:
category_subsets = {
    'comedy'   : ['comedian', 'comic', 'stand-up'],
    'science'  : ['phd', 'astrophysicist', 'scientist'],
    'music'    : ['singer/songwriter', 'musician', 'artist'],
    'sports'   : ['ufc', 'nfl', 'mlb', 'nba', 'nhl'],
    'business' : ['founder', 'entrepreneur', 'ceo'],
    'movies'   : ['actor', 'screenwriter'],
    'writer'   : ['author', 'writer'],
}

catList = ['comedy', 'science', 'music', 'sports', 'business', 'movies', 'writer']

In [24]:
category_counts = {
    'comedy' : 0,
    'science' : 0,
    'music' : 0,
    'sports' : 0,
    'business' : 0,
    'movies' : 0,
    'writer' : 0
}


for myList in df['category']:
    for cat in myList:
        for mainCat in catList:
            if cat in category_subsets[mainCat]:
                category_counts[mainCat] += 1
    
    
print(category_counts)

{'comedy': 577, 'science': 61, 'music': 144, 'sports': 96, 'business': 184, 'movies': 322, 'writer': 525}
