# Import modules

In [2]:
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup #Pull data out of HTML and XML.
from selenium import webdriver
import time
import re


# Scrapping

In [3]:
url_rank_serie = 'https://www.imdb.com/chart/toptv/'
url_imdb_base = 'https://www.imdb.com'
url_serie_ = f'?pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=12230b0e-0e00-43ed-9e59-8d5353703cce&pf_rd_r=BFQY8EMES4H7BK35D10R&pf_rd_s=center-1&pf_rd_t=15506&pf_rd_i=toptv&ref_=chttvtp_tt_'


In [4]:
def get_html_from_link(page_link):
    '''
        Get HTML from web page and parse it.

        :param page_link: link of the webpage we want to scrap
        :type page_link: string
        :return: BeautifulSoup object (HTML parsed)
        :rtype: bs4.BeautifulSoup
    '''

    response = requests.get(page_link)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

In [5]:
html_all = get_html_from_link(url_rank_serie)
#print(html_all.prettify())

In [6]:
def get_link_to_serie(root_html):

    """
    This function extract the link to acces the series information page.
    
    :param root_html: BeautifulSoup Element that contains all books links.
    :type book_html: bs4.BeautifulSoup.
    :return: list of all serie links in the page.
    :rtype: list(str).
    """
    serie_links = []
    reg = re.compile('/title/+')
    for elem in root_html.find_all('td', {'class':'titleColumn'}):
        for elements in elem.find_all('a', {'href' : reg}):
            attribut = elements["href"]
            serie_links.append(attribut)
    #len(serie_links)
    return(serie_links)

In [7]:
serie_links=get_link_to_serie(html_all)
#serie_links

In [29]:
def get_info_serie(serie_html):
    
    """
    Return series informations
    
    :param serie_html: BeautifulSoup element that contains serie infos.
    :type serie_html: bs4.element.Tag.
    :return:
            - serie_title: tile of the TV serie.
            - serie_genre: genre-s of the TV serie.
            - serie_nb_season: number of season of the serie.
            - serie_nb_episode: number of episode of the serie.
            - serie_type: the serie can be TV Mini-Series or TV Series.
            - serie_actors : actors who play in the TV serie.
            - serie_creator : creator-s of the TV serie
            - serie_origin: origins/country of the TV serie.
            - serie_language: speaking language of the TV serie.
            - serie_certification : certificate of the TV serie (Tous public, 12, 16...).
            - serie_rating: rating of the TV serie.
    :rtype: tuple(string, list(str), string, string, string, list(str), list(str), list(str), string, string, string)
    """
    
    serie_title = serie_html.find('h1').text.strip()
    
    serie_genre=[]
    reg_search = re.compile('/search/title+')
    for div in serie_html.find_all("div", {"class":"see-more inline canwrap"}):
        for a in div.find_all("a", {"href": reg_search}):
            genre = a.text
            serie_genre.append(genre)
            
            
    nb_season=[]
    reg_season = re.compile('/title/')
    for div in serie_html.find_all("div", {"class":"seasons-and-year-nav"}):
        for c in div.find_all("a", {"href": reg_season}):
                season_not_filtered = c.text
                nb_season.append(season_not_filtered)
                serie_nb_season = ""
                if nb_season != 1:
                        serie_nb_season = nb_season[0]
                        
                        
    serie_nb_episode=serie_html.find('span', {'class':'bp_sub_heading'}).text
    
    
    reg = re.compile('/title/+')
    serie_type = serie_html.find("a", {"href": reg, "title":"See more release dates"}).text
    
    
    serie_actors=[]
    reg_name = re.compile('/name/nm+')
    for div in serie_html.find_all("div", {"class":"article", "id":"titleCast"}):
        for a in div.find_all("a", {"href": reg_name}):
            actors = a.text
            if actors != '':
                serie_actors.append(actors)
                
                
    serie_creators=[]
    for div in serie_html.find_all("div", {"class":"credit_summary_item"}):
        creators = ""
        if div.find("h4", {"class":"inline"}).text == "Creator:":
            for c in div.find_all("a", {"href": reg_name}):
                creators = c.text
                serie_creators.append(creators)
        elif div.find("h4", {"class":"inline"}).text == "Creators:":
            for c in div.find_all("a", {"href": reg_name}):
                creators = c.text
                serie_creators.append(creators)
                
    
    serie_origin=[]
    for div in serie_html.find_all("div", {"class":"txt-block"}):
        for h in div.find_all("h4", {"class":"inline"}):
            country = ""
            if h.text == "Country:":
                for a in div.find_all("a", {"href": reg_search}):
                    country = a.text
                    serie_origin.append(country)
    
    serie_language = ""
    for div in serie_html.find_all("div", {"class":"txt-block"}):
        for h in div.find_all("h4", {"class":"inline"}):
            if h.text == "Language:":
                for a in div.find_all("a", {"href": reg_search}):
                    serie_language = a.text
                    
                    
    
    certif =[]
    serie_certification = ""
    certificate_not_filtered = ""
    for div in serie_html.find_all("div", {"class":"txt-block"}):
        for h in div.find_all("h4", {"class":"inline"}):
            if h.text == "Certificate:":
                for s in div.find_all("span"):
                    certificate_not_filtered = s.text
                    certif.append(certificate_not_filtered)
                    if len(certif) != 1:
                        serie_certification = certif[0]
                        
                        
    serie_rating = serie_html.find('span', {'itemprop':'ratingValue'}).text
                        
    
    return(serie_title, serie_genre, serie_nb_season, serie_nb_episode, serie_type, serie_actors, serie_creators, serie_origin, serie_language, serie_certification,  serie_rating)

In [30]:
def get_serie_storyline(serie_html):
    
    """
    Return series storyline.
    
    :param serie_html: BeautifulSoup element that contains serie infos.
    :type serie_html: bs4.element.Tag.
    :return: storyline of the TV series.
    :rtype: tuple(string)
    """
    
    
    for div in serie_html.find_all("div", {"id":"titleStoryLine"}):
        for story in div.find_all("div", {"class":"inline canwrap"}):
            storyline = story.find("span").text
    
    return(storyline)


Loop on the first 100 serie links to get informations.

The list start at rank 0 : the first film with classement rank 1 on IMDb starts at rank 0 in the list. The classement rank is set at n + 1 (0 + 1).

In [33]:
%%time # ~4min

serie_details = []

#Initiate the rank number.
rank_number = 0

for n in range(len(serie_links)-150):
    
    rank_number = n + 1
    # Add rank_number at the en of the url.
    url_serie_rank = url_serie_ + str(rank_number)
    
    # Get the entire url of the page that contains serie informations.
    link_serie = url_imdb_base + serie_links[n] + url_serie_rank
    #print(link_serie) 
    
    html = get_html_from_link(link_serie)
    
    # Apply get_info_serie function to return every link informations.
    info = get_info_serie(html)
    # Add rank.
    info = (rank_number,) + info
    
    # Info contains tuples : add them to a list.
    serie_details.append(info)
    
    print(rank_number)
    #print(serie_details)
    
#print(serie_details)

    

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
Wall time: 3min 58s


Récupération des storylines pour le NLP.

In [26]:
%%time # ~3min30

serie_resume = []

#Initiate the rank number.
rank_number = 0

for n in range(len(serie_links)-150):
    
    rank_number = n + 1
    # Add rank_number at the en of the url.
    url_serie_rank = url_serie_ + str(rank_number)
    
    # Get the entire url of the page that contains serie informations.
    link_serie = url_imdb_base + serie_links[n] + url_serie_rank
    #print(link_serie) 
    
    html_story = get_html_from_link(link_serie)
    
    # Apply get_info_storyline function to return every storyline.
    serie_storyline = get_serie_storyline(html_story)
    
    serie_resume.append(serie_storyline)

    
    print(rank_number)
    
    

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
Wall time: 3min 24s


In [37]:
#Get list of rank numbers to insert into dataframe.

rank_number = 0
rank=[]

for n in range(len(serie_links)-150):
    
    rank_number = n + 1
    
    rank.append(rank_number)
    
print(rank)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]


In [36]:
url_test = url_imdb_base + serie_links[3] + url_serie_ + str(4)
#print(url_test)
cast = get_html_from_link(url_test)
#print(type(cast))

print(rank)

[1, 2, 3]


## Put de data into a dataframe.

Put the data from the list of tuple into a dataframe df_serie.
Then, export the dataframe to a csv file to save it.

In [100]:
df_serie = pd.DataFrame(serie_details, 
                        columns = ["Rank", "Title", "Genre", "Number_of_season", "Number_of_episodes", "Type", "Actors", 
                                   "Creators", "Origin", "Language", "Certification", "Rating"])

df_serie.to_csv('C:\\Users\\stosc\\Documents\\ESME\\Ingé2_2019-2020\\S2\\UE1\\DataTools\\Projet\series_data.csv', 
                index=False, header=True)

#df_serie

Unnamed: 0,Rank,Title,Genre,Number_of_season,Number_of_episodes,Type,Actors,Creators,Origin,Language,Certification,Rating
0,1,Planet Earth II,[ Documentary],1,6 episodes,TV Mini-Series (2016)\n,[ David Attenborough\n],[],[UK],English,,9.5
1,2,Planète Terre,[ Documentary],1,11 episodes,TV Mini-Series (2006)\n,"[ David Attenborough\n, Sigourney Weaver\n, ...",[],[UK],English,Tous publics,9.4
2,3,Frères d'armes,"[ Action, Drama, History, War]",1,10 episodes,TV Mini-Series (2001)\n,"[ Scott Grimes\n, Damian Lewis\n, Ron Living...",[],"[USA, UK]",Lithuanian,Tous publics,9.4
3,4,Breaking Bad,"[ Crime, Drama, Thriller]",5,62 episodes,TV Series (2008–2013)\n,"[ Bryan Cranston\n, Anna Gunn\n, Aaron Paul\...",[Vince Gilligan],[USA],Spanish,Tous publics,9.5
4,5,Chernobyl,"[ Drama, History, Thriller]",1,5 episodes,TV Mini-Series (2019)\n,"[ Jessie Buckley\n, Jared Harris\n, Stellan ...",[Craig Mazin],"[USA, UK]",English,12,9.4
...,...,...,...,...,...,...,...,...,...,...,...,...
95,96,The Thick of It,[ Comedy],4,24 episodes,TV Series (2005–2012)\n,"[ Chris Addison\n, James Smith\n, Peter Capa...",[],[UK],English,,8.7
96,97,Demon Slayer,"[ Animation, Action, Fantasy, Thriller]",1,27 episodes,TV Series (2019– )\n,"[ Natsuki Hanae\n, Zach Aguilar\n, Abby Trot...",[],[Japan],Japanese,Tous publics avec avertissement,8.8
97,98,The Promised Neverland,"[ Animation, Fantasy, Horror, Mystery, Sci...",2,13 episodes,TV Series (2019– )\n,"[ Sumire Morohoshi\n, Maaya Uchida\n, Mariya...",[],[Japan],Japanese,,8.8
98,99,South Park,"[ Animation, Comedy]",24,309 episodes,TV Series (1997– )\n,"[ Trey Parker\n, Matt Stone\n, Mona Marshall...","[Trey Parker, Matt Stone, Brian Graden]",[USA],English,16,8.7


Save the storylines of the TV series as dataframe to analyse them using NLP.

In [43]:
df_storyline = pd.DataFrame(serie_resume, columns=["Storyline"], index=rank)

df_storyline.to_csv('C:\\Users\\stosc\\Documents\\ESME\\Ingé2_2019-2020\\S2\\UE1\\DataTools\\Projet\series_storylines.csv', 
                header=True)

df_storyline

Unnamed: 0,Storyline
1,David Attenborough returns in this breatht...
2,Each 50 minute episode features a global o...
3,"This is the story of ""E"" Easy Company, 506..."
4,When chemistry teacher Walter White is dia...
5,"In April 1986, a huge explosion erupted at..."
...,...
96,"Nathaniel Fisher, his wife Ruth, and their..."
97,"From the earliest times, the humanity know..."
98,"The curious, adventure-seeking, fourth gra..."
99,"At Grace Field House, life couldn't be bet..."


In [99]:
# Read the csv file into a dataframe.
df_serie = pd.read_csv("C:\\Users\\stosc\\Documents\\ESME\\Ingé2_2019-2020\\S2\\UE1\\DataTools\\Projet\series_data.csv",
                      header=0, index_col=0)

df_serie

Unnamed: 0_level_0,Title,Genre,Number_of_season,Number_of_episodes,Type,Actors,Creators,Origin,Language,Certification,Rating
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Planet Earth II,[' Documentary'],1,6 episodes,TV Mini-Series (2016)\n,[' David Attenborough\n'],[],['UK'],English,,9.5
2,Planète Terre,[' Documentary'],1,11 episodes,TV Mini-Series (2006)\n,"[' David Attenborough\n', ' Sigourney Weaver\n...",[],['UK'],English,Tous publics,9.4
3,Frères d'armes,"[' Action', ' Drama', ' History', ' War']",1,10 episodes,TV Mini-Series (2001)\n,"[' Scott Grimes\n', ' Damian Lewis\n', ' Ron L...",[],"['USA', 'UK']",Lithuanian,Tous publics,9.4
4,Breaking Bad,"[' Crime', ' Drama', ' Thriller']",5,62 episodes,TV Series (2008–2013)\n,"[' Bryan Cranston\n', ' Anna Gunn\n', ' Aaron ...",['Vince Gilligan'],['USA'],Spanish,Tous publics,9.5
5,Chernobyl,"[' Drama', ' History', ' Thriller']",1,5 episodes,TV Mini-Series (2019)\n,"[' Jessie Buckley\n', ' Jared Harris\n', ' Ste...",['Craig Mazin'],"['USA', 'UK']",English,12,9.4
...,...,...,...,...,...,...,...,...,...,...,...
96,The Thick of It,[' Comedy'],4,24 episodes,TV Series (2005–2012)\n,"[' Chris Addison\n', ' James Smith\n', ' Peter...",[],['UK'],English,,8.7
97,Demon Slayer,"[' Animation', ' Action', ' Fantasy', ' Thrill...",1,27 episodes,TV Series (2019– )\n,"[' Natsuki Hanae\n', ' Zach Aguilar\n', ' Abby...",[],['Japan'],Japanese,Tous publics avec avertissement,8.8
98,The Promised Neverland,"[' Animation', ' Fantasy', ' Horror', ' Myster...",2,13 episodes,TV Series (2019– )\n,"[' Sumire Morohoshi\n', ' Maaya Uchida\n', ' M...",[],['Japan'],Japanese,,8.8
99,South Park,"[' Animation', ' Comedy']",24,309 episodes,TV Series (1997– )\n,"[' Trey Parker\n', ' Matt Stone\n', ' Mona Mar...","['Trey Parker', 'Matt Stone', 'Brian Graden']",['USA'],English,16,8.7


In [45]:
df_storylines = pd.read_csv("C:\\Users\\stosc\\Documents\\ESME\\Ingé2_2019-2020\\S2\\UE1\\DataTools\\Projet\series_storylines.csv",
                      #header=0, index_col=0)

df_storylines