In this notebook, we'll be using requests and some custom functions to take the list of shows we got in the previous notebook and search for them in the TVMaze API, returning some additional metadata and the IMDB ID of each show.

In [267]:
import pandas as pd
import numpy as np
import requests
import time
import re
shows = pd.read_pickle("ismyshowcancelled_final.pkl")

In [249]:
# Building a requester for the shows we want more information on
def show_request(show):
    
    # proper formatting
    show = '-'.join(show.lower().split()).replace(':','')
    
    # define params for json request
    params = {'q':show}#,'embed':'episodes'}

    # send request via requests
    r = requests.get('http://api.tvmaze.com/singlesearch/shows?',params=params)
    
    return r

In [250]:
# Extract the information we want for each show
def extract_info(r):
    
    # Check that the request worked
    if r.reason != 'OK':
        print('No Matches found for {}'.format(show))
        attributes = 7*[np.nan]
    
    else:
        data = r.json()

        # Pulling the show info we want
        name = data['name']
        tv_id = data['id']
        imdb = data['externals']['imdb']
        prem_date = data['premiered']
        rating = data['rating']['average']
        runtime = data['runtime']
        if len(data['schedule']['days']) == 1: 
            ep_day = data['schedule']['days'][0]
        else: ep_day = 'Multiple'

        attributes = [name,tv_id,imdb,prem_date,rating,runtime,ep_day]

        return attributes

In [270]:
# Create a dataframe to house this data
tvmaze = pd.DataFrame(columns=['name','tv_id','imdb','prem_date','rating','runtime','ep_day'])
index = 0

# Creating a loop to hopefully get all this show info!
for show in shows['title']:
    
    # Make the request for the show
    r = show_request(show)
    
    # Extract the attributes
    attributes = extract_info(r)
    
    # Add these as an entry in the dataframe (nulls populate if our search wasn't successful)
    tvmaze.loc[index] = attributes
    
    index += 1
    
    # Sleep for half a second so we don't get "too many requests" errors
    time.sleep(0.5)

No Matches found for 24: Live Another Day
No Matches found for Extreme Makeover: Weight Loss Edition
No Matches found for Lopez Tonight
No Matches found for Match Game (2016)
No Matches found for Numb3rs (aka Numbers)
No Matches found for Panic 911
No Matches found for Rock Center with Brian Williams
No Matches found for Teach: Tony Danza
No Matches found for The $100,000 Pyramid (2016)
No Matches found for The Gong Show (2017)
No Matches found for The Martha Stewart Show
No Matches found for The X-Files (2016)
No Matches found for To Tell The Truth (2016)
No Matches found for Twin Peaks (2017)


In [272]:
tvmaze.to_csv("tvmaze_tmp_1.csv")

In [365]:
tvmaze.head()

Unnamed: 0,name,tv_id,imdb,prem_date,rating,runtime,ep_day
0,$#*! My Dad Says,1986,tt1612578,2010-09-23,6.2,30,Thursday
1,100 Code,3953,tt3515512,2015-03-11,8.1,60,Wednesday
2,101 Ways to Leave a Gameshow,12166,tt1702030,2010-07-10,,60,Saturday
3,12 Monkeys,614,tt3148266,2015-01-16,7.9,60,Friday
4,13 Reasons Why,7194,tt1837492,2017-03-31,8.2,60,Friday


In [856]:
tvmaze = pd.read_csv("tvmaze_tmp_1.csv",index_col=0)

In [857]:
# Shows we need to redo
re_search_shows = ['24: Live Another Day',
 'Extreme Makeover: Weight Loss Edition',
 'Lopez Tonight',
 'Match Game (2016)',
 'Numb3rs (aka Numbers)',
 'Panic 911',
 'Rock Center with Brian Williams',
 'Teach: Tony Danza',
 'The $100,000 Pyramid (2016)',
 'The Gong Show (2017)',
 'The Martha Stewart Show',
 'The X-Files (2016)',
 'To Tell The Truth (2016)',
 'Twin Peaks (2017)']

# Their indices in our original data
indices = []
for item in re_search_shows:
    indices.append(shows[shows['title'] == item].index[0])

# Redoing searches with some simple rules to see if it helps
new_terms = []
for show in re_search_shows:
    show = re.sub(r'\(.*\)','', show).strip()
    new_terms.append(show)
    
# Changing the pyramid one
new_terms[8] = 'the-100000-pyramid'

In [859]:
new_terms

['24: Live Another Day',
 'Extreme Makeover: Weight Loss Edition',
 'Lopez Tonight',
 'Match Game',
 'Numb3rs',
 'Panic 911',
 'Rock Center with Brian Williams',
 'Teach: Tony Danza',
 'the-100000-pyramid',
 'The Gong Show',
 'The Martha Stewart Show',
 'The X-Files',
 'To Tell The Truth',
 'Twin Peaks']

This would probably work better with the above terms. A more surefire way, though, would be to just look up the IMDB IDs for these ten and index off those, just to be safe. This has the added benefit that even if we don't find them in tvmaze, we'll still be able to look up their IMDB information later.

In [869]:
imdb_ids= ['tt1598754',
 'tt1713288',
 'tt1489432',
 'tt5672484',
 'tt0433309',
 'tt2738058',
 'tt2084611',
 'tt1442170',
 'tt5330088',
 'tt6128376',
 'tt3450386',
 'tt0106179',
 'tt5817158',
 'tt4093826']

In [871]:
imdb_dict = {x:y for x,y in zip(imdb_ids,indices)}

In [872]:
# Building a requester for the shows we want more information on
def imdb_request(imdb):
    
    # define params for json request
    params = {'imdb':imdb}

    # send request via requests
    r = requests.get('http://api.tvmaze.com/lookup/shows?',params=params)
    
    return r

In [873]:
# Extract the information we want for each show
def extract_imdb_info(r,imdb):
    
    # Check that the request worked
    if r.reason != 'OK':
        print('No Matches found for {}'.format(shows.iloc[imdb_dict[imdb_id],:].loc['title']))
        attributes = np.array([np.nan,np.nan,imdb,np.nan,np.nan,np.nan,np.nan])
        attributes[2] = imdb
    
    else:
        data = r.json()

        # Pulling the show info we want
        name = data['name']
        tv_id = data['id']
        imdb = data['externals']['imdb']
        prem_date = data['premiered']
        rating = data['rating']['average']
        runtime = data['runtime']
        if len(data['schedule']['days']) == 1: 
            ep_day = data['schedule']['days'][0]
        else: ep_day = 'Multiple'

        attributes = [name,tv_id,imdb,prem_date,rating,runtime,ep_day]
        print(attributes)
        return attributes

In [874]:
redo_shows = pd.DataFrame(data={'index':indices,'re_search_shows':re_search_shows,'imdb':imdb_ids})
redo_shows

Unnamed: 0,imdb,index,re_search_shows
0,tt1598754,12,24: Live Another Day
1,tt1713288,413,Extreme Makeover: Weight Loss Edition
2,tt1489432,737,Lopez Tonight
3,tt5672484,805,Match Game (2016)
4,tt0433309,890,Numb3rs (aka Numbers)
5,tt2738058,924,Panic 911
6,tt2084611,1017,Rock Center with Brian Williams
7,tt1442170,1189,Teach: Tony Danza
8,tt5330088,1202,"The $100,000 Pyramid (2016)"
9,tt6128376,1289,The Gong Show (2017)


In [875]:
# Let's try to ping for this info!
for imdb_id, index in zip(redo_shows['imdb'],redo_shows['index']):
    
    # Request based on the imdb id
    r = imdb_request(imdb_id)
    
    # Extract from the result, saving the id if we don't get anything back
    attributes = extract_imdb_info(r,imdb_id)        
    
    tvmaze.iloc[index,:] = attributes

No Matches found for 24: Live Another Day
['Extreme Weight Loss', 877, 'tt1713288', '2011-05-30', None, 120, 'Tuesday']
No Matches found for Lopez Tonight
No Matches found for Match Game (2016)
['Numb3rs', 682, 'tt0433309', '2005-01-23', 8, 60, 'Friday']
['Panic 9-1-1', 19907, 'tt2738058', '2012-11-29', None, 60, 'Thursday']
No Matches found for Rock Center with Brian Williams
No Matches found for Teach: Tony Danza
['The $100,000 Pyramid', 11491, 'tt5330088', '2016-06-26', None, 60, 'Sunday']
No Matches found for The Gong Show (2017)
No Matches found for The Martha Stewart Show
['The X-Files', 430, 'tt0106179', '1993-09-10', 8.7, 60, 'Wednesday']
No Matches found for To Tell The Truth (2016)
No Matches found for Twin Peaks (2017)


In [912]:
tvmaze[(tvmaze['name'].isnull()) & (tvmaze['imdb'].isnull())]

Unnamed: 0,name,tv_id,imdb,prem_date,rating,runtime,ep_day
12,,,,,,,
737,,,,,,,
805,,,,,,,
1017,,,,,,,
1189,,,,,,,
1289,,,,,,,
1337,,,,,,,
1451,,,,,,,
1482,,,,,,,


Don't quite understand why this isn't at least appending the IMDB IDs. Doing that here...

In [877]:
for imdb,index in zip(redo_shows['imdb'],redo_shows['index']):
    print(index,imdb)

12 tt1598754
413 tt1713288
737 tt1489432
805 tt5672484
890 tt0433309
924 tt2738058
1017 tt2084611
1189 tt1442170
1202 tt5330088
1289 tt6128376
1337 tt3450386
1438 tt0106179
1451 tt5817158
1482 tt4093826


In [928]:
for imdb,index in zip(redo_shows.copy()['imdb'],redo_shows.copy()['index']):
    tvmaze.loc[:,'imdb'].iloc[index] = imdb

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [937]:
tvmaze.loc[:,'imdb'].iloc[924]

'tt2738058'

In [935]:
tvmaze[(tvmaze['name'].isnull()) & (tvmaze['imdb'].isnull())]

Unnamed: 0,name,tv_id,imdb,prem_date,rating,runtime,ep_day


In [938]:
tvmaze.to_csv("tvmaze_tmp_2.csv")

Phew, okay now we at least have an IMDB ID or tvmaze supplemental information for each show. Let's do one last pass with the cleaned show names we abandoned earlier.

In [946]:
new_show_dict = {x:y for x,y in zip(new_terms,indices)}
for item in ['Panic 911','Extreme Makeover: Weight Loss Edition','Numb3rs','the-100000-pyramid','The X-Files']:
    new_show_dict.pop(item)
new_show_dict

{'24: Live Another Day': 12,
 'Lopez Tonight': 737,
 'Match Game': 805,
 'Rock Center with Brian Williams': 1017,
 'Teach: Tony Danza': 1189,
 'The Gong Show': 1289,
 'The Martha Stewart Show': 1337,
 'To Tell The Truth': 1451,
 'Twin Peaks': 1482}

We have values for Extreme Weight Loss, Numb3rs, Panic 911, 100k pyramid, and x files, so we won't iterate over those again

In [947]:
tvmaze.loc[:, tvmaze.columns != 'imdb'].iloc[924]

name         Panic 9-1-1
tv_id              19907
prem_date     2012-11-29
rating               NaN
runtime               60
ep_day          Thursday
Name: 924, dtype: object

In [949]:
# Creating a loop to hopefully get all this show info!
for show,index in new_show_dict.items():
    
    # Make the request for the show
    r = show_request(show)
    
    # Extract the attributes
    attributes = extract_info(r)
    
    # Making sure IMDB id is not overwritten
    if attributes is not None:
        #attributes = [x for i,x in enumerate(attributes) if i!=2]
        print(attributes)

No Matches found for 24: Live Another Day
No Matches found for Lopez Tonight
['Match Game', 16694, None, '2016-06-26', None, 60, 'Thursday']
No Matches found for Rock Center with Brian Williams
No Matches found for Teach: Tony Danza
['The Gong Show', 21746, None, '2017-06-22', 5, 60, 'Thursday']
No Matches found for The Martha Stewart Show
['To Tell the Truth', 16041, None, '2016-06-14', None, 60, 'Monday']
['Twin Peaks', 156, 'tt0098936', '1990-04-08', 8.2, 60, 'Sunday']


Cool, four more matches. I'll do this manually since I'm having trouble not overriding the preexisting IMDB IDs that we've independently collected and verified.

In [951]:
redo_shows

Unnamed: 0,imdb,index,re_search_shows
0,tt1598754,12,24: Live Another Day
1,tt1713288,413,Extreme Makeover: Weight Loss Edition
2,tt1489432,737,Lopez Tonight
3,tt5672484,805,Match Game (2016)
4,tt0433309,890,Numb3rs (aka Numbers)
5,tt2738058,924,Panic 911
6,tt2084611,1017,Rock Center with Brian Williams
7,tt1442170,1189,Teach: Tony Danza
8,tt5330088,1202,"The $100,000 Pyramid (2016)"
9,tt6128376,1289,The Gong Show (2017)


In [966]:
tvmaze.iloc[805] = ['Match Game', 16694, 'tt5672484', '2016-06-26', None, 60, 'Thursday']
tvmaze.iloc[1289] = ['The Gong Show', 21746, 'tt6128376', '2017-06-22', 5, 60, 'Thursday']
tvmaze.iloc[1451] = ['To Tell the Truth', 16041, 'tt5817158', '2016-06-14', None, 60, 'Monday']
# Manually changing this to just go with the newer version of Twin Peaks
tvmaze.iloc[1482] = ['Twin Peaks', 156, 'tt4093826', '1990-04-08', 8.2, 60, 'Sunday']

In [969]:
tvmaze[tvmaze['name'].isnull()]

Unnamed: 0,name,tv_id,imdb,prem_date,rating,runtime,ep_day
12,,,tt1598754,,,,
737,,,tt1489432,,,,
1017,,,tt2084611,,,,
1189,,,tt1442170,,,,
1337,,,tt3450386,,,,


Pretty good. Only five total shows unaccounted for in the tvmaze data, and for all of those we have the IMDB IDs to pull more data.

In [970]:
tvmaze.to_csv("tvmaze_tmp_3.csv")

In [972]:
tvmaze.iloc[737]

name              None
tv_id              NaN
imdb         tt1489432
prem_date         None
rating             NaN
runtime            NaN
ep_day            None
Name: 737, dtype: object

In [973]:
shows.iloc[737]

genre                                                           Talk
link               http://www.ismyshowcancelled.com/show/2009/lop...
network                                                          TBS
status                                                     Cancelled
tagline            A late-night talk show starring comedian Georg...
title                                                  Lopez Tonight
years                                                    2009 - 2011
start_year                                                      2009
end_year                                                        2011
synopsis           Lopez Tonight is an hour-long talk show that f...
primary_genre                                                   Talk
secondary_genre                                                  NaN
Comedy                                                             0
Drama                                                              0
Game Show                         

In [2]:
import pandas as pd

In [5]:
tvmaze = pd.read_csv("tvmaze_tmp_3.csv",index_col=0)

In [6]:
tvmaze.dtypes

name          object
tv_id        float64
imdb          object
prem_date     object
rating       float64
runtime      float64
ep_day        object
dtype: object

In [None]:
# Doing some downcasting to save to pkl...
tvmaze[['name','imdb','tv_id','ep_day']] = tvmaze[['name','imdb','tv_id','ep_day']].apply(lambda x: x.astype('category')) 