# Import modules

In [1]:
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup #Pull data out of HTML and XML.
from selenium import webdriver
import time
import re
import collections

# Scrapping

In [2]:
'''
 initialisation des variables contenants les liens du site
'''
url_rank_serie = 'https://www.imdb.com/chart/toptv/'
url_imdb_base = 'https://www.imdb.com'
url_serie_ = f'?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=12230b0e-0e00-43ed-9e59-8d5353703cce&pf_rd_r=BFQY8EMES4H7BK35D10R&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=toptv&ref_=chttvtp_tt_'


In [3]:
def get_html_from_link(page_link):
    '''
        Get HTML from web page and parse it.

        :param page_link: link of the webpage we want to scrap
        :type page_link: string
        :return: BeautifulSoup object (HTML parsed)
        :rtype: bs4.BeautifulSoup
    '''

    response = requests.get(page_link)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

In [4]:
html_all = get_html_from_link(url_rank_serie)

In [5]:
def get_link_to_serie(root_html):

    """
    This function extract the link to acces the series information page.
    
    :param root_html: BeautifulSoup Element that contains all books links.
    :type book_html: bs4.BeautifulSoup.
    :return: list of all serie links in the page.
    :rtype: list(str).
    """
    serie_links = []
    reg = re.compile('/title/+')
    for elem in root_html.find_all('td', {'class':'titleColumn'}):
        for elements in elem.find_all('a', {'href' : reg}):
            attribut = elements["href"]
            serie_links.append(attribut)
    #len(serie_links)
    return(serie_links)

In [6]:
serie_links=get_link_to_serie(html_all)
#serie_links

In [94]:
def get_info_serie(serie_html):
    
    """
    Return series informations
    
    :param serie_html: BeautifulSoup element that contains serie infos.
    :type serie_html: bs4.element.Tag.
    :return:
            - serie_title: tile of the TV serie.
            - serie_genre: genre-s of the TV serie.
            - serie_nb_season:
            - serie_type: the serie can be TV Mini-Series or TV Series.
            - serie_actors : actors who play in the TV serie.
            - serie_creator : creator-s of the TV serie
            - serie_origin: origins/country of the TV serie.
            - serie_language: speaking language of the TV serie.
            - serie_nb_episode:
            - serie_certification : certificate of the TV serie (Tous public, 12, 16...).
            - serie_rating: rating of the TV serie.
    :rtype: tuple(list(str), string, string, list(str), list(str), string)
    """
    
    serie_title = serie_html.find('h1').text.strip()
    
    serie_genre=[]
    reg_search = re.compile('/search/title+')
    for div in serie_html.find_all("div", {"class":"see-more inline canwrap"}):
        for a in div.find_all("a", {"href": reg_search}):
            genre = a.text
            serie_genre.append(genre)
            
            
    nb_season=[]
    reg_season = re.compile('/title/')
    for div in serie_html.find_all("div", {"class":"seasons-and-year-nav"}):
        for c in div.find_all("a", {"href": reg_season}):
                season_not_filtered = c.text
                nb_season.append(season_not_filtered)
                serie_nb_season = ""
                if nb_season != 1:
                        serie_nb_season = nb_season[0]
                        
                        
    serie_nb_episode=serie_html.find('span', {'class':'bp_sub_heading'}).text
    
    
    reg = re.compile('/title/+')
    serie_type = serie_html.find("a", {"href": reg, "title":"See more release dates"}).text
    
    
    serie_actors=[]
    reg_name = re.compile('/name/nm+')
    for div in serie_html.find_all("div", {"class":"article", "id":"titleCast"}):
        for a in div.find_all("a", {"href": reg_name}):
            actors = a.text
            if actors != '':
                serie_actors.append(actors)
                
                
    serie_creators=[]
    for div in serie_html.find_all("div", {"class":"credit_summary_item"}):
        creators = ""
        if div.find("h4", {"class":"inline"}).text == "Creator:":
            for c in div.find_all("a", {"href": reg_name}):
                creators = c.text
                serie_creators.append(creators)
        elif div.find("h4", {"class":"inline"}).text == "Creators:":
            for c in div.find_all("a", {"href": reg_name}):
                creators = c.text
                serie_creators.append(creators)
                
    
    serie_origin=[]
    for div in serie_html.find_all("div", {"class":"txt-block"}):
        for h in div.find_all("h4", {"class":"inline"}):
            country = ""
            if h.text == "Country:":
                for a in div.find_all("a", {"href": reg_search}):
                    country = a.text
                    serie_origin.append(country)
    
    serie_language = ""
    for div in serie_html.find_all("div", {"class":"txt-block"}):
        for h in div.find_all("h4", {"class":"inline"}):
            if h.text == "Language:":
                for a in div.find_all("a", {"href": reg_search}):
                    serie_language = a.text
                    
                    
    
    certif =[]
    serie_certification = ""
    certificate_not_filtered = ""
    for div in serie_html.find_all("div", {"class":"txt-block"}):
        for h in div.find_all("h4", {"class":"inline"}):
            if h.text == "Certificate:":
                for s in div.find_all("span"):
                    certificate_not_filtered = s.text
                    certif.append(certificate_not_filtered)
                    if len(certif) != 1:
                        serie_certification = certif[0]
                        
                        
    serie_rating = serie_html.find('span', {'itemprop':'ratingValue'}).text
                        
    
    return(serie_title, serie_genre, serie_nb_season, serie_nb_episode, serie_type, serie_actors, serie_creators, serie_origin, serie_language, serie_certification,  serie_rating)

Loop on the first 100 serie links to get informations.

The list start at rank 0 : the first film with classement rank 1 on IMDb starts at rank 0 in the list. The classement rank is set at n + 1 (0 + 1).

In [150]:
'''
      creation d'une liste avec les séries classées par rang
'''

%%time
serie_details = []

#Initiate the rank number.
rank_number = 0

for n in range(len(serie_links)-150):
    
    rank_number = n + 1
    # Add rank_number at the en of the url.
    url_serie_rank = url_serie_ + str(rank_number)
    
    # Get the entire url of the page that contains serie informations.
    link_serie = url_imdb_base + serie_links[n] + url_serie_rank
    #print(link_serie) 
    
    html = get_html_from_link(link_serie)
    
    # Apply get_info_serie function to return every link informations.
    info = get_info_serie(html)
    # Add rank.
    info = (rank_number,) + info
    
    # Info contains tuples : add them to a list.
    serie_details.append(info)
    print(rank_number)
    #print(serie_details)
    
print(serie_details)

    

1
2
3
4
5
6
7


KeyboardInterrupt: 

In [152]:
url_test = url_imdb_base + serie_links[3] + url_serie_ + str(4)
#print(url_test)
cast = get_html_from_link(url_test)
#print(type(cast))

serie_language = ""
for div in cast.find_all("div", {"class":"txt-block"}):
    
    for h in div.find_all("h4", {"class":"inline"}):
            
        if h.text == "Language:":
            for a in div.find_all("a", {"href": reg_search}):
                serie_language = a.text
serie_language

NameError: name 'reg_search' is not defined

## Put de data into a dataframe.

Put the data from the list of tuple into a dataframe df_serie.
Then, export the dataframe to a csv file to save it.

In [91]:
df_serie = pd.DataFrame(serie_details, 
                        columns = ["Rank", "Title", "Genre", "Number_of_season", "Number_of_episodes", "Type", "Actors", 
                                   "Creators", "Origin", "Language", "Certification", "Rating"])

df_serie.to_csv('C:\\Users\\stosc\\Documents\\ESME\\Ingé2_2019-2020\\S2\\UE1\\DataTools\\Projet\series_data.csv', 
                index=False, header=True)

df_serie

Unnamed: 0,Rank,Title,Genre,Number_of_season,Number_of_episodes,Type,Actors,Creators,Origin,Language,Certification,Rating
0,1,Planet Earth II,[ Documentary],1,6 episodes,TV Mini-Series (2016)\n,[ David Attenborough\n],[],[UK],English,,9.5
1,2,Planet Earth,[ Documentary],1,11 episodes,TV Mini-Series (2006)\n,"[ David Attenborough\n, Sigourney Weaver\n, ...",[],[UK],English,,9.4
2,3,Band of Brothers,"[ Action, Drama, History, War]",1,10 episodes,TV Mini-Series (2001)\n,"[ Scott Grimes\n, Damian Lewis\n, Ron Living...",[],"[USA, UK]",Lithuanian,,9.4
3,4,Breaking Bad,"[ Crime, Drama, Thriller]",5,62 episodes,TV Series (2008–2013)\n,"[ Bryan Cranston\n, Anna Gunn\n, Aaron Paul\...",[Vince Gilligan],[USA],Spanish,,9.5
4,5,Chernobyl,"[ Drama, History, Thriller]",1,5 episodes,TV Mini-Series (2019)\n,"[ Jessie Buckley\n, Jared Harris\n, Stellan ...",[Craig Mazin],"[USA, UK]",English,,9.4
5,6,The Wire,"[ Crime, Drama, Thriller]",5,60 episodes,TV Series (2002–2008)\n,"[ Dominic West\n, John Doman\n, Deirdre Love...",[David Simon],[USA],Spanish,,9.3
6,7,Blue Planet II,[ Documentary],1,8 episodes,TV Mini-Series (2017–2018)\n,"[ David Attenborough\n, Peter Drost\n]",[],[UK],English,,9.3
7,8,Our Planet,[ Documentary],1,8 episodes,TV Mini-Series (2019)\n,[ David Attenborough\n],[],"[USA, UK]",English,,9.3
8,9,Cosmos: A Spacetime Odyssey,[ Documentary],1,13 episodes,TV Series (2014)\n,[ Neil deGrasse Tyson\n],[],[USA],English,,9.3


In [16]:
# Read the csv file into a dataframe.
df_serie = pd.read_csv("C:\\Users\\stosc\\Documents\\ESME\\Ingé2_2019-2020\\S2\\UE1\\DataTools\\Projet\series_data.csv",
                      header=0, index_col=0)

df_serie

Unnamed: 0_level_0,Title,Genre,Number_of_season,Number_of_episodes,Type,Actors,Creators,Origin,Language,Certification,Rating
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Planet Earth II,[' Documentary'],1,6 episodes,TV Mini-Series (2016)\n,[' David Attenborough\n'],[],['UK'],English,,9.5
2,Planet Earth,[' Documentary'],1,11 episodes,TV Mini-Series (2006)\n,"[' David Attenborough\n', ' Sigourney Weaver\n...",[],['UK'],English,,9.4
3,Band of Brothers,"[' Action', ' Drama', ' History', ' War']",1,10 episodes,TV Mini-Series (2001)\n,"[' Scott Grimes\n', ' Damian Lewis\n', ' Ron L...",[],"['USA', 'UK']",Lithuanian,,9.4
4,Breaking Bad,"[' Crime', ' Drama', ' Thriller']",5,62 episodes,TV Series (2008–2013)\n,"[' Bryan Cranston\n', ' Anna Gunn\n', ' Aaron ...",['Vince Gilligan'],['USA'],Spanish,,9.5
5,Chernobyl,"[' Drama', ' History', ' Thriller']",1,5 episodes,TV Mini-Series (2019)\n,"[' Jessie Buckley\n', ' Jared Harris\n', ' Ste...",['Craig Mazin'],"['USA', 'UK']",English,,9.4
6,The Wire,"[' Crime', ' Drama', ' Thriller']",5,60 episodes,TV Series (2002–2008)\n,"[' Dominic West\n', ' John Doman\n', ' Deirdre...",['David Simon'],['USA'],Spanish,,9.3
7,Blue Planet II,[' Documentary'],1,8 episodes,TV Mini-Series (2017–2018)\n,"[' David Attenborough\n', ' Peter Drost\n']",[],['UK'],English,,9.3
8,Our Planet,[' Documentary'],1,8 episodes,TV Mini-Series (2019)\n,[' David Attenborough\n'],[],"['USA', 'UK']",English,,9.3
9,Cosmos: A Spacetime Odyssey,[' Documentary'],1,13 episodes,TV Series (2014)\n,[' Neil deGrasse Tyson\n'],[],['USA'],English,,9.3


In [17]:
'''
    Moyenne du nombre de saison par série
    
 '''

print(df_serie['Number_of_season'].mean())

1.8888888888888888
9.377777777777776


In [149]:
'''
       Reccupération de la langue la plus utilisée et des occurences des 3 plus communes
'''

print(df_serie['Language'].value_counts().idxmax())
redondant_language=collections.Counter(df_serie['Language']).most_common(3)
print(redondant_language)

English
[('English', 6), ('Spanish', 2), ('Lithuanian', 1)]


In [153]:
'''
       Reccupération du format d'épisode et du type les plus communs
'''
redondant_format=collections.Counter(df_serie['Number_of_episodes']).most_common(3)
print(redondant_format)


redondant_type=collections.Counter(df_serie['Type']).most_common(3)
print(redondant_type)

[('8 episodes', 2), ('6 episodes', 1), ('11 episodes', 1)]
[('TV Mini-Series (2019)\n', 2), ('TV Mini-Series (2016)\n', 1), ('TV Mini-Series (2006)\n', 1)]


In [154]:
'''
       Reccupération des 10 acteurs les plus cotés sur le top 10
'''
list_acteur=[]
for f in df_serie["Actors"] :
    list_acteur+=f
redondant_actors=collections.Counter(list).most_common(10)
print(redondant_actors)

[(' David Attenborough\n', 4), (' Philip Barantini\n', 2), (' Sigourney Weaver\n', 1), (' Thomas Anguti Johnston\n', 1), (' Scott Grimes\n', 1), (' Damian Lewis\n', 1), (' Ron Livingston\n', 1), (' Shane Taylor\n', 1), (' Donnie Wahlberg\n', 1), (' Peter Youngblood Hills\n', 1)]


In [147]:
'''
       Reccupération du genre qui marche le mieux 
'''
list_genre=[]
for f in df_serie["Genre"] :
    list_genre+=f
from collections import Counter 
redondant_genre=collections.Counter(list_genre).most_common(3)
print(redondant_genre)

[(' Documentary', 5), (' Drama', 4), (' Thriller', 3)]


In [133]:
list_creators=[]
for f in df_serie["Creators"] :
    list_creators+=f
from collections import Counter 
redondant_creators=collections.Counter(list_creators).most_common(6)
print(redondant_creators)

[('Vince Gilligan', 1), ('Craig Mazin', 1), ('David Simon', 1)]


In [146]:
list_origin=[]
for f in df_serie["Origin"] :
    list_origin+=f
from collections import Counter 
redondant_origin=collections.Counter(list_origin).most_common(2)
print(redondant_origin)

[('UK', 6), ('USA', 6)]
