In [None]:
import re
from pathlib import Path

import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

from api_keys import client_id, client_secret

# Spotify client authenticated with the app credentials imported from api_keys
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=client_id,
        client_secret=client_secret,
    )
)

# Load all Spotify data

In [None]:
# Fetch the show's metadata and read how many episodes it reports
show_id = '4rOoJ6Egrf8K2IrywzwOMk'
podcast = sp.show(show_id, market='US')
total_episodes = podcast['total_episodes']

print(total_episodes)

In [None]:
# Get data for all shows
PAGE_SIZE = 50  # episodes requested per call; also the offset step
showNum = 0
data = []
# Pagination for show requests.
# Strict `<`: once the offset reaches total_episodes there is nothing left to
# fetch, so `<=` would only add one more request that returns no items.
while showNum < total_episodes:
    print("Current Show Count: ", showNum)
    result = sp.show_episodes(show_id=show_id, limit=PAGE_SIZE, offset=showNum, market='US')

    # Keep only the fields used later in the notebook
    for episode in result['items']:
        if episode is None:
            # Defensive: skip a null entry instead of crashing on subscripting it
            continue
        data.append({
            'title': episode['name'],
            'description': episode['description'],
            'duration_ms': episode['duration_ms'],
            'release_date': episode['release_date'],
        })
    showNum += PAGE_SIZE

print(len(data))

# Create dataframe and format

In [None]:
# Build the working DataFrame: one row per collected episode dict
df = pd.DataFrame(data=data)
df.head(n=5)

In [None]:
# Change MS column to Seconds
# 1 second = 1000 ms. Dividing by 60000 would yield minutes, which would not
# match the 'duration_seconds' name this column receives when it is renamed.
# Vectorised division already produces floats, so no per-row lambda is needed.
df['duration_ms'] = (df['duration_ms'] / 1000).round(decimals=2)
df.head()

In [None]:
# Give the converted duration column a name that reflects its unit
df = df.rename({'duration_ms': 'duration_seconds'}, axis='columns')

In [None]:
# Find guest in show
# Heuristic: the first run of two or three capitalised words in the title
# (optional comma after the first word, optional middle word/initial with an
# optional period) is taken as the guest name.
guest_pattern = re.compile(r"[A-Z][a-z]+,?\s+(?:[A-Z][a-z]*\.?\s*)?[A-Z][a-z]+")

guests = []
for row in df['title']:
    match = guest_pattern.search(row)
    # None marks "no guest found"; pd.NaT is the datetime-specific missing
    # value and does not belong in a text column. dropna() treats None as missing.
    guests.append(match.group(0) if match else None)
df['guest'] = guests
df.head()

In [None]:
# Number of episodes for which a guest name was found (missing values are not counted).
# The call parentheses are required; without them the cell only shows the bound method.
df['guest'].count()

In [None]:
# Preview the frame with rows containing any missing value removed.
# The result is displayed only, not assigned, so `df` itself is unchanged by this cell.
df.dropna(axis=0, how='any')

In [None]:
# Split release date into separate day / month / year columns
df['release_date'] = df['release_date'].astype('datetime64[ns]')
# The vectorised .dt accessor replaces a Python-level lambda per row
df['day'] = df['release_date'].dt.day
df['month'] = df['release_date'].dt.month
df['year'] = df['release_date'].dt.year
df.head()

In [None]:
# Occupation / topic keywords searched for in the episode descriptions
keywords = [
    "comedian", "comic", "phd", "actor", "writer",
    "author", "ufc", "mma", "musician", "artist",
    "founder", "youtuber", "podcast", "ceo", "philanthropist",
    "singer/songwriter", "screenwriter", "scientist", "co-host", "journalist",
    "entrepreneur", "astrophysicist", "politician", "pilot", "fighter",
]

In [None]:
# Tag every episode with the keywords that occur in its description
dataset = []
for desc in df['description']:
    # Lower-case and drop commas/periods so e.g. "Comedian," matches 'comedian'
    tokens = set(desc.lower().replace(',', '').replace('.', '').split())
    found = tokens.intersection(keywords)
    # Store a sorted list instead of the raw set: set iteration order can differ
    # between runs, which would make the row order after explode() irreproducible.
    # None (rather than the datetime-only pd.NaT) marks "no keyword found";
    # dropna() treats it as missing.
    dataset.append(sorted(found) if found else None)

# Explicit Series keeps one list per row, aligned on df's index
df['category'] = pd.Series(dataset, index=df.index, dtype=object)
df.head()

In [None]:
# Keep only the rows that have no missing value in any column
df = df.dropna(axis=0, how='any')

In [None]:
# Show the last five remaining rows
df.tail(n=5)

In [None]:
# Broad category -> description keywords that count towards it
category_subsets = dict(
    comedy=['comedian', 'comic', 'stand-up'],
    science=['phd', 'astrophysicist', 'scientist'],
    music=['singer/songwriter', 'musician', 'artist'],
    sports=['mma', 'ufc', 'nfl', 'mlb', 'nba', 'nhl', 'fighter'],
    business=['founder', 'entrepreneur', 'ceo'],
    movies=['actor', 'screenwriter'],
    writer=['author', 'writer'],
)

# Category names in definition order (dicts preserve insertion order)
catList = list(category_subsets)

In [None]:
def changeListToFalse(myList):
    """Set every value of the mapping ``myList`` to ``False`` in place.

    Despite the name, ``myList`` must be a dict-like object; its keys (and
    their order) are kept and only the values are overwritten.

    Returns:
        The same ``myList`` object that was passed in.
    """
    myList.update(dict.fromkeys(myList, False))
    return myList

In [None]:

# Number of episodes per broad category; an episode counts at most once per
# category, however many of that category's keywords it carries.
category_counts = {name: 0 for name in catList}

# Per-episode flags: does the current episode belong to the category?
category_found = {name: False for name in catList}

for row_keywords in df['category']:
    # Flag each category that shares at least one keyword with this episode
    for mainCat in catList:
        subset = category_subsets[mainCat]
        category_found[mainCat] = any(cat in subset for cat in row_keywords)

    # Tally the flagged categories and clear the flags for the next episode
    for mainCat, hit in category_found.items():
        if hit:
            category_counts[mainCat] += 1
        category_found[mainCat] = False

print(category_counts)

In [None]:
# Expand the 'category' collections so that each keyword gets its own row
df = df.explode(column='category')

df.tail(n=5)

In [None]:
# Rows whose keyword belongs to the comedy group
comedyDF = df[df['category'].isin(category_subsets['comedy'])]

comedyDF.tail(n=5)

In [None]:
# Rows whose keyword belongs to the science group
scienceDF = df[df['category'].isin(category_subsets['science'])]

scienceDF.tail(n=5)

In [None]:
# Rows whose keyword belongs to the music group
musicDF = df[df['category'].isin(category_subsets['music'])]

musicDF.tail(n=5)

In [None]:
# Rows whose keyword belongs to the sports group
sportsDF = df[df['category'].isin(category_subsets['sports'])]

sportsDF.tail(n=5)

In [None]:
# Rows whose keyword belongs to the business group
businessDF = df[df['category'].isin(category_subsets['business'])]

businessDF.tail(n=5)

In [None]:
# Rows whose keyword belongs to the movies group
moviesDF = df[df['category'].isin(category_subsets['movies'])]

moviesDF.tail(n=5)

In [None]:
# Rows whose keyword belongs to the writer group
writerDF = df[df['category'].isin(category_subsets['writer'])]

writerDF.tail(n=5)

# Save and Load CSV Files

In [None]:
# Save to CSV To Reuse
csv_path = Path('data_files/podcast_data.csv')
# Create the output folder first so the write does not fail when it is missing
csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_path, index=False)

In [None]:
# Reload the saved episodes from the CSV file
df = pd.read_csv(filepath_or_buffer='data_files/podcast_data.csv')
df.head(n=5)