In this notebook we'll be using beautifulsoup and requests to scrape for a list of shows and their cancellation information, along with some rudimentary metadata.

In [263]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import math

In [2]:
url = "http://www.ismyshowcancelled.com/shows/all/"

content = requests.get(url).content

soup = BeautifulSoup(content, "lxml")

print(soup.prettify())

<!DOCTYPE html>
<html class="no-js" lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="user-scalable = yes, width = device-width" name="viewport"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <title>
   All TV Shows - Cancelled, Renewed, and Concluded TV Shows -  - IsMyShowCancelled.com
  </title>
  <meta content="Our full list of TV shows, including cancelled, on air, and concluded series. Get the latest information about all your favorite television shows at IsMyShowCancelled.com" name="description"/>
  <meta content="is my show cancelled,is my show canceled,cancelled tv show, tv show, television, television show, cancelled tv shows, on air shows, cancelled shows, canceled show, tv news, canceled,ismyshowcancelled,is my,ismy,TV series, renewed, renew, season, premiere, finale, network, cable" name="keywords"/>
  <meta content="INDEX,FOLLOW,NOODP" name="ROBOTS"/>
  <link href="http://www.ismyshowcancelled.com/feed/news/" rel="alternate" title="Is My Show Canc



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [3]:
# Locating the name of the show
titles = []

for item in soup.find_all('div',class_='text-section'):
    if item.div is not None:
        title = item.find_all('h3')[0].string
        titles.append(title)

In [4]:
# Getting show statuses
statuses = []

# Defining boolean function for multiple possible classes
# E.g. cancelled, concluded, or currently on air series
def status_class(css_class):
    return css_class in ['status-can','status-conc','status-onair','status-new']

for status in soup.find_all("span",class_=status_class):
    statuses.append(status.string)

In [5]:
# Locating the shows' attributes
attributes = []

for attr in soup.find_all('div',class_='stat'):
    attributes.append(attr.find_all('span')[1].string)

years = [attributes[i] for i in range(0, len(attributes), 3)]
networks = [attributes[i] for i in range(1, len(attributes), 3)]
genres = [attributes[i] for i in range(2, len(attributes), 3)]

In [6]:
# Locating taglines for each show
taglines = []

for desc in soup.find_all('div',class_='excerpt'):
    description = desc.text
    #getting rid of some extra test that's in some of them:
    if '\xa0' in description:
        description = description.split('\xa0')[0]
    taglines.append(description)

In [7]:
# Grabbing links to get more information (we'll use this later)
links = []

for item in soup.find_all('div',class_='excerpt'):
    for link in item.find_all('a'):
         links.append(link.attrs['href'])

In [8]:
# Creating a dataframe to house the data
shows = pd.DataFrame(data={'title':titles,'status':statuses,'years':years,'network':networks,'genre':genres,\
                          'tagline':taglines,'link':links})

Now let's scrape a bunch more pages!!! I'm going to delete the previous df so we can run it all together in one clean run.

In [16]:
# Creating the loops of the pages we want to crawl
scrape_urls = ['http://www.ismyshowcancelled.com/shows/all/']

for num in np.arange(2,159):
    new_url = 'http://www.ismyshowcancelled.com/shows/all/page/{}/'.format(num)
    scrape_urls.append(new_url)

# Defining boolean function for multiple possible classes
# E.g. cancelled, concluded, or currently on air series
def status_class(css_class):
    return css_class in ['status-can','status-conc','status-onair','status-new']

# Setting this to none so we can create it, as opposed to append to it
shows = None

In [17]:
for url in scrape_urls:
    
    content = requests.get(url).content

    soup = BeautifulSoup(content, "lxml")
    
    # Locating the name of the show
    titles = []

    for item in soup.find_all('div',class_='text-section'):
        if item.div is not None:
            title = item.find_all('h3')[0].string
            titles.append(title)
    
    # Getting show statuses
    statuses = []

    for status in soup.find_all("span",class_=status_class):
        statuses.append(status.string)
        
    # Locating the shows' attributes
    attributes = []

    for attr in soup.find_all('div',class_='stat'):
        attributes.append(attr.find_all('span')[1].string)

    years = [attributes[i] for i in range(0, len(attributes), 3)]
    networks = [attributes[i] for i in range(1, len(attributes), 3)]
    genres = [attributes[i] for i in range(2, len(attributes), 3)]
    
    # Locating taglines for each show
    taglines = []

    for desc in soup.find_all('div',class_='excerpt'):
        description = desc.text
        #getting rid of some extra test that's in some of them:
        if '\xa0' in description:
            description = description.split('\xa0')[0]
        taglines.append(description)
        
    # Grabbing links to get more information (we'll use this later)
    links = []

    for item in soup.find_all('div',class_='excerpt'):
        for link in item.find_all('a'):
             links.append(link.attrs['href'])
                
    # Creating a dataframe to house the data
    new_shows = pd.DataFrame(data={'title':titles,'status':statuses,'years':years,'network':networks,'genre':genres,\
                              'tagline':taglines,'link':links})
    
    if shows is None:
        shows = new_shows.copy()
    else:
        shows = shows.append(new_shows,ignore_index=True)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [18]:
shows

Unnamed: 0,genre,link,network,status,tagline,title,years
0,Comedy,http://www.ismyshowcancelled.com/show/2010/ble...,CBS,Cancelled,"A sitcom based on the Twitter feed ""S*** My Da...",$#*! My Dad Says,2010 - 2011
1,Drama / Crime,http://www.ismyshowcancelled.com/show/2018/100...,WGN America,Coming Soon,A thriller following an New York cop who trave...,100 Code,2018 - Present
2,Game Show,http://www.ismyshowcancelled.com/show/2011/101...,ABC,Cancelled,A game show competition where contestants are ...,101 Ways to Leave a Gameshow,2011 - 2011
3,Drama / Sci-fi,http://www.ismyshowcancelled.com/show/2015/12-...,Syfy,On Air,A drama following a man sent back in time to p...,12 Monkeys,2015 - Present
4,Drama,http://www.ismyshowcancelled.com/show/2017/13-...,Netflix,On Air,A drama following the revelation of why a youn...,13 Reasons Why,2017 - Present
5,Reality,http://www.ismyshowcancelled.com/show/2009/16-...,MTV,On Air,A reality series documenting the hardships of ...,16 and Pregnant,2009 - Present
6,Comedy,http://www.ismyshowcancelled.com/show/2012/160...,NBC,Cancelled,A comedy following a dysfunctional family livi...,1600 Penn,2012 - 2013
7,Comedy,http://www.ismyshowcancelled.com/show/2010/18-...,CBC,Cancelled,A Canadian comedy series about a modern-day Ro...,18 to Life,2010 - 2011
8,Reality,http://www.ismyshowcancelled.com/show/2008/19-...,TLC,Cancelled,A reality series following the Duggar family.,19 Kids and Counting,2008 - 2015
9,Comedy,http://www.ismyshowcancelled.com/show/2011/2-b...,CBS,Cancelled,A comedy following the lives of two waitresses...,2 Broke Girls,2011 - 2017


Saving to pkl now...

In [57]:
#shows.to_pickle("ismyshowcancelled_raw_pull.pkl")
shows = pd.read_pickle("ismyshowcancelled_raw_pull.pkl")

Creating a version that's a little cleaned up and has downcasted variables

In [58]:
shows.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1575 entries, 0 to 1574
Data columns (total 7 columns):
genre      1567 non-null object
link       1575 non-null object
network    1574 non-null object
status     1575 non-null object
tagline    1575 non-null object
title      1575 non-null object
years      1575 non-null object
dtypes: object(7)
memory usage: 1.1 MB


In [64]:
# Downcasting certain columns
shows[['genre','network','link','status','tagline','title']] \
    = shows[['genre','network','link','status','tagline','title']].apply(lambda x: x.astype('category'))

# Fixing erroneous end year of "0" for Tori & Dean show
shows.iat[1462,6] = '2007 - 2012'

# Splitting out the years given into actual years
shows['start_year'] = [x[0] for x in shows['years'].str.split(' - ')]
shows['end_year'] = [x[1] for x in shows['years'].str.split(' - ')]

# Changing "present" end date to null
shows['end_year'] = shows['end_year'].replace('Present',np.NaN)

# Downcasting
shows[['start_year','end_year']] = shows[['start_year','end_year']].apply(pd.to_numeric,downcast='integer')

# Getting rid of the "http://" bit in the links - this seems to cause errors when we open those pages
#shows['link'] = shows['link'].str.replace('http://','').values

In [65]:
shows.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1575 entries, 0 to 1574
Data columns (total 9 columns):
genre         1567 non-null category
link          1575 non-null category
network       1574 non-null category
status        1575 non-null category
tagline       1575 non-null category
title         1575 non-null category
years         1575 non-null object
start_year    1575 non-null int16
end_year      979 non-null float64
dtypes: category(6), float64(1), int16(1), object(1)
memory usage: 867.6 KB


In [38]:
{title for title in shows['title']}

{'Lopez Tonight',
 'The Newsroom',
 'Til Death',
 'Dominion',
 'Breakout Kings',
 "Snoop Dogg Presents The Joker's Wild",
 "Adam Devine's House Party",
 'Ghosted',
 'Sons of Tucson',
 'See Dad Run',
 'Gossip Girl',
 'Bitten',
 'Candy Crush',
 'Young Sheldon',
 'Skins',
 'King & Maxwell',
 'Pose',
 'Seven Seconds',
 'This Is Us',
 'NCIS: New Orleans',
 'Amish Mafia',
 'Criminal Minds: Beyond Borders',
 'Lipstick Jungle',
 'At Home with Amy Sedaris',
 "Lachey's Bar",
 'Modern Family',
 'Quick Draw',
 'The McCarthys',
 'Lucky 7',
 'Billions',
 'Atlanta',
 'Moonlight',
 'Banshee',
 'Married at First Sight',
 'You Deserve It',
 "Let's Stay Together",
 'The Jay Leno Show',
 'Up All Night',
 'Kim of Queens',
 'Gold Rush',
 'Queer Eye',
 'Vinyl',
 'Reverie',
 'Make It or Break It',
 'My Five Wives',
 'Opposite Worlds',
 'Camp',
 'Weird Loners',
 'Stuck in the Middle',
 'Bad Teacher',
 'The Odd Couple',
 'Ash vs. Evil Dead',
 "Harry's Law",
 'Marry Me',
 'CSI: Miami',
 'Mozart in the Jungle',
 

In [56]:
for link in shows['link'][0:5]:
    print(link)

www.ismyshowcancelled.com/show/2010/bleep-my-dad-says/
www.ismyshowcancelled.com/show/2018/100-code/
www.ismyshowcancelled.com/show/2011/101-ways-to-leave-a-gameshow/
www.ismyshowcancelled.com/show/2015/12-monkeys/
www.ismyshowcancelled.com/show/2017/13-reasons-why/


In [83]:
shows[shows.synopsis.isnull()]

Unnamed: 0,genre,link,network,status,tagline,title,years,start_year,end_year,synopsis
1112,Reality,http://www.ismyshowcancelled.com/show/2008/so-...,CTV,Cancelled,"Hosted by Leah Miller, this weekly dance compe...",So You Think You Can Dance Canada,2008 - 2011,2008,2011.0,
1113,Reality,http://www.ismyshowcancelled.com/show/2013/som...,TLC,On Air,A reality series where brides-to-be get a choi...,"Something Borrowed, Something New",2013 - Present,2013,,
1114,Drama / Crime,http://www.ismyshowcancelled.com/show/2017/som...,ABC,On Air,A thriller following a woman helping police tr...,Somewhere Between,2017 - Present,2017,,
1115,Comedy / Animated,http://www.ismyshowcancelled.com/show/2016/son...,Fox,Cancelled,A comedy following an animated warrior who ret...,Son of Zorn,2016 - 2017,2016,2017.0,
1116,Drama,http://www.ismyshowcancelled.com/show/2008/son...,FX,Concluded,A drama focused on the lives of a close-knit o...,Sons of Anarchy,2008 - 2014,2008,2014.0,
1117,Reality,http://www.ismyshowcancelled.com/show/2011/son...,Discovery,Cancelled,A reality series showcasing one of America's m...,Sons of Guns,2011 - 2014,2011,2014.0,
1118,Comedy,http://www.ismyshowcancelled.com/show/2010/son...,Fox,Cancelled,A family comedy following three brothers who h...,Sons of Tucson,2010 - 2010,2010,2010.0,
1119,Reality,http://www.ismyshowcancelled.com/show/2011/sou...,truTV,Cancelled,"A ""docudrama"" set in the world of car reposses...",South Beach Tow,2011 - 2014,2011,2014.0,
1120,Drama / Horror,http://www.ismyshowcancelled.com/show/2015/sou...,WE,On Air,A drama following an exorcist-for-hire who has...,South of Hell,2015 - Present,2015,,
1121,Comedy / Animated,http://www.ismyshowcancelled.com/show/1997/sou...,Comedy Central,On Air,An animated comedy that uses dark humor to sat...,South Park,1997 - Present,1997,,


In [80]:
# Adding an empty column to house the synopses we're scraping for
shows['synopsis'] = np.nan

index = 0

# Looping through all the additional info links
for url in shows['link']:
    
    content = requests.get(url).content

    soup = BeautifulSoup(content, "lxml")
    
    shows.loc[:,'synopsis'].iloc[index] = soup.find_all('div',class_='plot')[0].text
    
    print(shows.loc[:,'title'].iloc[index])
    print(shows.loc[:,'synopsis'].iloc[index])
    
    index += 1

print('Completed. Show synopses have been scraped.')
shows.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


KeyboardInterrupt: 

In [111]:
index = 1531
# I didn't get all of them that first go-round, so now getting the rest
for url in shows['link'][1531:]:
    
    content = requests.get(url).content

    soup = BeautifulSoup(content, "lxml")
    
    shows.loc[:,'synopsis'].iloc[index] = soup.find_all('div',class_='plot')[0].text
    
    # To check that it continues to work
    print(shows.loc[:,'title'].iloc[index])
    print(shows.loc[:,'synopsis'].iloc[index])
    
    index += 1

print('Completed. Show synopses have been scraped.')
shows.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Wendell & Vinnie
Vinnie is loving the care-free life of owning a pop culture memorabilia store, playing video games, and eating junk food. But his world is turned upside down after his nephew's parents are suddenly killed, making Vinnie his legal guardian.
Westworld
This dark science fiction odyssey is set at the intersection of the near future and the re-imagined past, during the emergence of artificial consciousness and the evolution of sin.Westworld is a high-tech adult theme park where every human appetite can be indulged, regardless of how noble or depraved.
What Not To Wear
In What Not To Wear, fashion experts Stacy London and Clinton Kelly help makeover selected participants by using style advice and makeup modifications.
What Would Ryan Lochte Do?
In this reality series, cameras capture the daily life of competitive swimmer and Olympian Ryan Lochte as he prepares for the 2016 Summer Olympic Games. In between training he also spends time working on his fashion line, making media

Working Class
Carli Mitchell is a single mom who never had a lot of money. Trying to provide a better life for three kids, she moves to an upscale neighborhood that is a little out of her pay grade.It's not long before she finds that trying to live the good life is a little more difficult than she expected.
Working the Engels
The Engel family find themselves in debt after the family patriarch's untimely death. Jenna, an attorney, steps up to keep the family law firm running, with the help from her less-than-qualified mother and siblings.
World of Dance
This series brings together some of the world's most elite dancers in a fierce competition of artistry and athleticism. Solo dancers, duos, and crews engage in epic battles that showcase a wide range of dance styles including everything from hip hop and break dancing to ballet and ballroom.A judging panel of experts scores the acts using a precise point system that takes into account performance, technique, choreography, creativity and p

Unnamed: 0,genre,link,network,status,tagline,title,years,start_year,end_year,synopsis
0,Comedy,http://www.ismyshowcancelled.com/show/2010/ble...,CBS,Cancelled,"A sitcom based on the Twitter feed ""S*** My Da...",$#*! My Dad Says,2010 - 2011,2010,2011.0,Ed is an opinionated and divorced 72-year-old ...
1,Drama / Crime,http://www.ismyshowcancelled.com/show/2018/100...,WGN America,Coming Soon,A thriller following an New York cop who trave...,100 Code,2018 - Present,2018,,NYPD Detective Tommy Conley travels to Sweden ...
2,Game Show,http://www.ismyshowcancelled.com/show/2011/101...,ABC,Cancelled,A game show competition where contestants are ...,101 Ways to Leave a Gameshow,2011 - 2011,2011,2011.0,"In 101 Ways to Leave a Game Show, contestants ..."
3,Drama / Sci-fi,http://www.ismyshowcancelled.com/show/2015/12-...,Syfy,On Air,A drama following a man sent back in time to p...,12 Monkeys,2015 - Present,2015,,"By the year 2043, a deadly virus has wiped out..."
4,Drama,http://www.ismyshowcancelled.com/show/2017/13-...,Netflix,On Air,A drama following the revelation of why a youn...,13 Reasons Why,2017 - Present,2017,,Hannah Baker is a teenage girl who takes her o...


Doing some spot checks...

In [127]:
row = 900

In [128]:
shows.iloc[row:row+50,:]

Unnamed: 0,genre,link,network,status,tagline,title,years,start_year,end_year,synopsis
900,Drama / Comedy,http://www.ismyshowcancelled.com/show/2018/on-...,Netflix,On Air,A dramedy following a group of friends enterin...,On My Block,2018 - Present,2018,,"Monse, Ruby, Jamal, and Cesar are four smart, ..."
901,Reality,http://www.ismyshowcancelled.com/show/2014/on-...,TNT,On Air,A cooking competition featuring everyday cooks...,On The Menu,2014 - Present,2014,,Ty Pennington and chef Emeril Lagasse headline...
902,Drama / Fantasy,http://www.ismyshowcancelled.com/show/2011/onc...,ABC,Concluded,A fantasy drama inspired by classic fairy tale...,Once Upon a Time,2011 - 2018,2011,2018.0,"In Once Upon a Time, fairy tales and the moder..."
903,Drama / Fantasy,http://www.ismyshowcancelled.com/show/2013/onc...,ABC,Cancelled,A fantasy drama that puts a twist on the class...,Once Upon a Time in Wonderland,2013 - 2014,2013,2014.0,"A spin-off of Once Upon a Time, this fantasy d..."
904,Comedy,http://www.ismyshowcancelled.com/show/2015/one...,NBC,Cancelled,A comedy following a lesbian who is pregnant w...,One Big Happy,2015 - 2015,2015,2015.0,Lizzy and Luke have been best friends since th...
905,Reality,http://www.ismyshowcancelled.com/show/2011/one...,Lifetime,On Air,A hospital documentary series that takes an in...,One Born Every Minute,2011 - Present,2011,,"""One Born Every Minute"" captures the unexpecte..."
906,Comedy,http://www.ismyshowcancelled.com/show/2017/one...,Netflix,On Air,A comedy following three generations of a Cuba...,One Day at a Time,2017 - Present,2017,,Penelope Alvarez is a former military veteran ...
907,Drama,http://www.ismyshowcancelled.com/show/1968/one...,ABC,Cancelled,A daytime soap opera that explores social issu...,One Life To Live,1968 - 2012,1968,2012.0,"Set in the fictional town of Llanview, One Lif..."
908,Comedy,http://www.ismyshowcancelled.com/show/2015/one...,Amazon,Cancelled,A comedy following a woman returning to her ho...,One Mississippi,2015 - 2017,2015,2017.0,Loosely inspired by comedian Tig Notaro's life...
909,Drama,http://www.ismyshowcancelled.com/show/2003/one...,The CW,Concluded,A drama following the lives and loves of two h...,One Tree Hill,2003 - 2012,2003,2012.0,"Set in the fictional small town of Tree Hill, ..."


Looks very good! Now let's resave for this checkpoint.

In [388]:
# Having some issues saving to pickle, so I'll save to CSV, reupload, and do the downcasting again before I save the pkl
#shows.to_csv("ismyshowcancelled_tmp_1.csv")
shows = pd.read_csv("ismyshowcancelled_tmp_1.csv",index_col=0)

In [389]:
# creating a save point to presere the synopses we do have...
shows2 = shows.copy()

# Downcasting certain columns
shows2[['genre','network','link','status','tagline','title','synopsis']] \
    = shows2[['genre','network','link','status','tagline','title','synopsis']].apply(lambda x: x.astype('category'))

shows2[['start_year','end_year']] = shows2[['start_year','end_year']].apply(pd.to_numeric,downcast='integer')

shows2.to_pickle("ismyshowcancelled_tmp_1.pkl")
shows = pd.read_pickle("ismyshowcancelled_tmp_1.pkl")

In [390]:
shows.dtypes

genre         category
link          category
network       category
status        category
tagline       category
title         category
years           object
start_year       int16
end_year       float64
synopsis      category
dtype: object

In [391]:
shows.isnull().sum()

genre           8
link            0
network         1
status          0
tagline         0
title           0
years           0
start_year      0
end_year      596
synopsis        0
dtype: int64

Checking that everything's intact...

Let's do some additional cleaning and make the features a little more usable. First, we do have some null values for genre (8) and network (1). I'm going to look these up manually and input the correct values.

In [392]:
shows[shows['genre'].isnull()]

Unnamed: 0,genre,link,network,status,tagline,title,years,start_year,end_year,synopsis
84,,http://www.ismyshowcancelled.com/show/2015/ano...,Comedy Central,On Air,A satirical comedy following an upscale family...,Another Period,2015 - Present,2015,,The Bellacourts are an extravagantly rich fami...
148,,http://www.ismyshowcancelled.com/show/2012/bes...,Oxygen,Cancelled,A competition reality series where tattoo arti...,Best Ink,2012 - 2014,2012,2014.0,"Hosted by recording artist Kimberly Caldwell, ..."
579,,http://www.ismyshowcancelled.com/show/2015/hom...,Fox,On Air,A competition series where couples compete to ...,Home Free,2015 - Present,2015,,Mike Holmes hosts this reality competition whe...
864,,http://www.ismyshowcancelled.com/show/2008/my-...,NBC,Cancelled,A drama following a secret agent who has no kn...,My Own Worst Enemy,2008 - 2008,2008,2008.0,Edward Albright is a secret agent who has had ...
1163,,http://www.ismyshowcancelled.com/show/2013/sup...,ABC,Cancelled,A comedy following an attorney whose relations...,Super Fun Night,2013 - 2014,2013,2014.0,Kimmie Boubier is a junior attorney and for th...
1474,,http://www.ismyshowcancelled.com/show/2013/tro...,ABC,Cancelled,A comedy following a woman who marries an olde...,Trophy Wife,2013 - 2014,2013,2014.0,Kate is a former party girl who has recently b...
1523,,http://www.ismyshowcancelled.com/show/2013/we-...,CBS,Cancelled,A comedy following four single guys who unexpe...,We Are Men,2013 - 2013,2013,2013.0,Carter has just moved into a short-term rental...
1534,,http://www.ismyshowcancelled.com/show/2013/wha...,E!,Cancelled,A reality series following Olympic swimmer Rya...,What Would Ryan Lochte Do?,2013 - 2013,2013,2013.0,"In this reality series, cameras capture the da..."


In [393]:
# Looked up the shows with missing genre and determined best fit
shows.loc[shows['title'] == 'Another Period','genre'] = 'Comedy'
shows.loc[shows['title'] == 'Best Ink','genre'] = 'Reality'
shows.loc[shows['title'] == 'Home Free','genre'] = 'Reality'
shows.loc[shows['title'] == 'My Own Worst Enemy','genre'] = 'Drama'
shows.loc[shows['title'] == 'Super Fun Night','genre'] = 'Comedy'
shows.loc[shows['title'] == 'Trophy Wife','genre'] = 'Comedy'
shows.loc[shows['title'] == 'We Are Men','genre'] = 'Comedy'
shows.loc[shows['title'] == 'What Would Ryan Lochte Do?','genre'] = 'Reality'

# Looks like the site has out of date info for this "Home Free" show
# It wasn't explicitly cancelled, but FOX decided not to bring it back, so let's make that adjustment
shows.loc[shows['title'] == 'Home Free','years'] = '2015 - 2016'
shows.loc[shows['title'] == 'Home Free','start_year'] = '2015'
shows.loc[shows['title'] == 'Home Free','end_year'] = '2016'
shows['years'] = shows['years'].astype('category')
shows[['start_year','end_year']] = shows[['start_year','end_year']].apply(pd.to_numeric,downcast='integer')

# Top of the Lake doesn't have a network value. The show is technically a BBC series,
# but is carried by Sundance domestically. Since this is a new network, we'll have to upcast then re-downcast the dtype
shows['network'] = shows['network'].astype('object')
shows.loc[shows['title'] == 'Top of the Lake','network'] = 'Sundance'
shows['network'] = shows['network'].astype('category')

In [394]:
shows.isnull().sum()

genre           0
link            0
network         0
status          0
tagline         0
title           0
years           0
start_year      0
end_year      595
synopsis        0
dtype: int64

Our dataset is pretty clean now, with no nulls except for in the end year (which are intentional and represent shows that have not ended yet). There's still one quick improvement to make: splitting out genres into new columns that have primary and secondary genre, for those that have more than one. Let's also do an encoding of show genres, which will have a binary flag for if a genre is present, irrespective of whether it's the primary or secondary genre for that show.

In [395]:
shows['primary_genre'] = shows['genre'].str.split(' / ').apply(lambda x: x[0])
shows['secondary_genre'] = shows['genre'].str.split(' / ').apply(lambda x: x[1] if len(x)>1 else np.nan)

In [396]:
dummies = pd.get_dummies(shows.primary_genre)

for index, value in enumerate(shows.secondary_genre):
    
    if (value is not np.nan) & (value not in dummies.columns):
        
        # If the encoded column doesn't exist, create it and merge it back to dummies
        encode = pd.get_dummies(shows.secondary_genre)[value]
        dummies = pd.concat([dummies,encode],axis=1)
        print('Added: {}'.format(dummies.columns[-1]))

Added: Crime
Added: Action
Added: Fantasy
Added: Animated
Added: Horror
Added: Legal
Added: Medical


In [397]:
shows = pd.concat([shows,dummies],axis=1)
shows.head()

Unnamed: 0,genre,link,network,status,tagline,title,years,start_year,end_year,synopsis,...,Reality,Sci-fi,Talk,Crime,Action,Fantasy,Animated,Horror,Legal,Medical
0,Comedy,http://www.ismyshowcancelled.com/show/2010/ble...,CBS,Cancelled,"A sitcom based on the Twitter feed ""S*** My Da...",$#*! My Dad Says,2010 - 2011,2010,2011.0,Ed is an opinionated and divorced 72-year-old ...,...,0,0,0,0,0,0,0,0,0,0
1,Drama / Crime,http://www.ismyshowcancelled.com/show/2018/100...,WGN America,Coming Soon,A thriller following an New York cop who trave...,100 Code,2018 - Present,2018,,NYPD Detective Tommy Conley travels to Sweden ...,...,0,0,0,1,0,0,0,0,0,0
2,Game Show,http://www.ismyshowcancelled.com/show/2011/101...,ABC,Cancelled,A game show competition where contestants are ...,101 Ways to Leave a Gameshow,2011 - 2011,2011,2011.0,"In 101 Ways to Leave a Game Show, contestants ...",...,0,0,0,0,0,0,0,0,0,0
3,Drama / Sci-fi,http://www.ismyshowcancelled.com/show/2015/12-...,Syfy,On Air,A drama following a man sent back in time to p...,12 Monkeys,2015 - Present,2015,,"By the year 2043, a deadly virus has wiped out...",...,0,0,0,0,0,0,0,0,0,0
4,Drama,http://www.ismyshowcancelled.com/show/2017/13-...,Netflix,On Air,A drama following the revelation of why a youn...,13 Reasons Why,2017 - Present,2017,,Hannah Baker is a teenage girl who takes her o...,...,0,0,0,0,0,0,0,0,0,0


In [399]:
shows[['primary_genre','secondary_genre']] = shows[['primary_genre','secondary_genre']].apply(lambda x: x.astype('category'))
shows.to_pickle("ismyshowcancelled_final.pkl")
shows.to_csv("ismyshowcancelled_final.csv")

In [400]:
# One last downcasting check...
shows.dtypes

genre              category
link               category
network            category
status             category
tagline            category
title              category
years              category
start_year            int16
end_year            float64
synopsis           category
primary_genre      category
secondary_genre    category
Comedy                uint8
Drama                 uint8
Game Show             uint8
Reality               uint8
Sci-fi                uint8
Talk                  uint8
Crime                 uint8
Action                uint8
Fantasy               uint8
Animated              uint8
Horror                uint8
Legal                 uint8
Medical               uint8
dtype: object