# Creating the dataset using beautiful soup

We will be scraping Wikipedia pages to create a dataset on Tarantino movies. 

### imports

In [1]:
from bs4 import BeautifulSoup as bs
import requests

### loading the first wiki webpage

let's start by scraping data from the pulp fiction wiki page: https://en.wikipedia.org/wiki/Pulp_Fiction

In [2]:
#https://url
r=requests.get("https://en.wikipedia.org/wiki/Pulp_Fiction")
#covert the web page into a beautifulsoup object
soup=bs(r.content)
#print a html using bs
contents=soup.prettify()
#print(contents)
infobox=soup.find(class_="infobox vevent")

In [3]:
#print(infobox.prettify())

In [4]:
## we'll be building a dictionary from the little infobox on wiki pages
inforows=infobox.find_all("tr")
for row in inforows:
    print(row.prettify())


<tr>
 <th class="infobox-above summary" colspan="2" style="font-size: 125%; font-style: italic;">
  Pulp Fiction
 </th>
</tr>

<tr>
 <td class="infobox-image" colspan="2">
  <a class="image" href="/wiki/File:Pulp_Fiction_(1994)_poster.jpg" title='A pulp-magazine themed poster shows with a woman in a bedroom lying on her stomach in a bed holding a cigarette. Her left hands lays over a novel that reads "Pulp Fiction" on it. An ash tray, pack of cigarettes, and a pistol is laid down near her. The top tagline reads "WINNER - BEST PICTURE - 1994 CANNES FILM FESTIVAL". A sticker below the title reads "10₵".'>
   <img alt='A pulp-magazine themed poster shows with a woman in a bedroom lying on her stomach in a bed holding a cigarette. Her left hands lays over a novel that reads "Pulp Fiction" on it. An ash tray, pack of cigarettes, and a pistol is laid down near her. The top tagline reads "WINNER - BEST PICTURE - 1994 CANNES FILM FESTIVAL". A sticker below the title reads "10₵".' class="thumbb

In [5]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0"," ") for li in row_data.find_all("li")]
    else:
        return(row_data.get_text(" ", strip=True).replace("\xa0"," "))

movieinfo={}
for index,row in enumerate(inforows):
    #movie title
    if index==0 : 
        movieinfo['title'] = row.find("th").get_text(" ", strip=True)
    #picture don't needit
    elif index==1:
        continue
    #the table
    else:
        contentkey=row.find("th").get_text(" ", strip=True)
        contentvalue=get_content_value(row.find("td"))
        movieinfo[contentkey]=contentvalue


In [6]:
movieinfo

{'title': 'Pulp Fiction',
 'Directed by': 'Quentin Tarantino',
 'Written by': 'Quentin Tarantino',
 'Story by': ['Quentin Tarantino', 'Roger Avary'],
 'Produced by': 'Lawrence Bender',
 'Starring': ['John Travolta',
  'Samuel L. Jackson',
  'Uma Thurman',
  'Harvey Keitel',
  'Tim Roth',
  'Amanda Plummer',
  'Maria de Medeiros',
  'Ving Rhames',
  'Eric Stoltz',
  'Rosanna Arquette',
  'Christopher Walken',
  'Bruce Willis'],
 'Cinematography': 'Andrzej Sekuła',
 'Edited by': 'Sally Menke',
 'Production companies': ['A Band Apart', 'Jersey Films'],
 'Distributed by': 'Miramax Films',
 'Release dates': ['May 21, 1994 ( 1994-05-21 ) ( Cannes )',
  'October 14, 1994 ( 1994-10-14 ) (United States)'],
 'Running time': '154 minutes [1]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$8–8.5 million [2] [3]',
 'Box office': '$213.9 million [2]'}

### loding data from every tarantino box-office movie

https://fr.wikipedia.org/wiki/Pulp_Fiction

<b> get one movie info

In [7]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")
#cleaning refrences[*] and double(date)
def clean_tags(soup):
    for tag in soup.find_all(["sup", "span"]):
        tag.decompose()
        
def get_info_box(url):

    r = requests.get(url)
    soup = bs(r.content)
    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.find_all("tr")
    
    clean_tags(soup)

    movie_info = {}
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['title'] = row.find("th").get_text(" ", strip=True)
        else:
            header = row.find('th')
            if header:
                content_key = row.find("th").get_text(" ", strip=True)
                content_value = get_content_value(row.find("td"))
                movie_info[content_key] = content_value
            
    return movie_info 

In [8]:
get_info_box("https://en.wikipedia.org/wiki/Kill_Bill:_Volume_1")


{'title': 'Kill Bill: Volume 1',
 'Directed by': 'Quentin Tarantino',
 'Written by': 'Quentin Tarantino',
 'Produced by': 'Lawrence Bender',
 'Starring': ['Uma Thurman',
  'Lucy Liu',
  'Vivica A. Fox',
  'Michael Madsen',
  'Daryl Hannah',
  'David Carradine',
  'Sonny Chiba',
  'Julie Dreyfus',
  'Chiaki Kuriyama',
  'Gordon Liu',
  'Michael Parks'],
 'Cinematography': 'Robert Richardson',
 'Edited by': 'Sally Menke',
 'Music by': 'RZA',
 'Production company': 'A Band Apart',
 'Distributed by': 'Miramax Films',
 'Release date': ['October 10, 2003'],
 'Running time': '111 minutes',
 'Country': 'United States',
 'Languages': ['English', 'Chinese', 'Japanese'],
 'Budget': '$30 million',
 'Box office': '$180.9 million'}

In [9]:
r = requests.get("https://en.wikipedia.org/wiki/Quentin_Tarantino_filmography")
soup = bs(r.content)
movies = soup.find("table",{"class":"wikitable plainrowheaders sortable"})

#movies = soup.select(".wikitable.plainrowheaders.sortable")
base_path = "https://en.wikipedia.org/"

movie_info_list = []
links=movies.find_all("a")
links=[i for i in links if links.count(i) == 1]
for link in links:
    if (link.get('title') is not None):
        try:
            relative_path = link.get('href')
            full_path = base_path + relative_path
            title = link.get('title')
            
            movie_info_list.append(get_info_box(full_path))
            
        except Exception as e:
            print(e)

'NoneType' object has no attribute 'find_all'
'NoneType' object has no attribute 'find_all'


In [10]:

len(movie_info_list)

21

### Save/Reload Movies Data

In [11]:
import json

def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [12]:
import json

def load_data(title):
    with open(title, encoding="utf-8") as f:
        return json.load(f)

In [13]:
save_data("tarantino_data_cleaned.json", movie_info_list)

### Cleaning the data

In [14]:
movie_info_list = load_data("tarantino_data_cleaned.json")

we will be 

undefined. Convert running time into an integer

undefined. Convert dates into datetime object

undefined. Split up the long strings

undefined. Convert Budget & Box office to numbers

In [15]:
movie_info_list[0]

{'title': "My Best Friend's Birthday",
 'Directed by': 'Quentin Tarantino',
 'Written by': ['Quentin Tarantino', 'Craig Hamann'],
 'Produced by': ['Quentin Tarantino', 'Craig Hamann', 'Rand Vossler'],
 'Starring': ['Quentin Tarantino',
  'Craig Hamann',
  'Crystal Shaw',
  'Allen Garfield',
  'Al Harrell',
  'Brenda Hillhouse',
  'Linda Kaye',
  'Stevo Polyi',
  'Alan Sanborn',
  'Rich Turner',
  'Rowland Wafford'],
 'Cinematography': ['Roger Avary',
  'Scott Magill',
  'Roberto A. Quezada',
  'Rand Vossler'],
 'Edited by': 'Quentin Tarantino',
 'Distributed by': 'Super Happy Fun',
 'Release date': ['1987'],
 'Running time': ['70 minutes (original version)',
  '36 minutes (remaining version)'],
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$5,000 (estimated)'}

In [16]:
print([movie.get('Running time', 'N/A') for movie in movie_info_list])

[['70 minutes (original version)', '36 minutes (remaining version)'], '100 minutes', '99 minutes', '118 minutes', '154 minutes', ['119 minutes'], '78 minutes', '116 minutes', '98 minutes', '108 minutes', '136 minutes', '154 minutes', '111 minutes', '137 minutes', '124 minutes', '113 minutes', '105 minutes 91 minutes (Grindhouse)', '153 minutes', '165 minutes', ['187 minutes ( Roadshow )', '168 minutes ( General )'], '161 minutes']


In [17]:
# "85 minutes"
def minutes_to_integer(running_time):
    if running_time == "N/A":
        return None
    if isinstance(running_time, list):
        return int(running_time[0].split(" ")[0])
    else: # is a string
        return int(running_time.split(" ")[0])

for movie in movie_info_list:
    movie['Running time (int)'] = minutes_to_integer(movie.get('Running time', "N/A"))

In [18]:
print([movie.get('Running time (int)', 'N/A') for movie in movie_info_list])

[70, 100, 99, 118, 154, 119, 78, 116, 98, 108, 136, 154, 111, 137, 124, 113, 105, 153, 165, 187, 161]


In [19]:
print([movie.get('Release date', 'N/A') for movie in movie_info_list])

[['1987'], ['October 18, 1991'], 'N/A', ['September 10, 1993'], 'N/A', ['August 26, 1994'], ['August 26, 1994 (United States)'], ['May 12, 1995'], ['December 25, 1995'], ['January 19, 1996'], ['June 7, 1996'], 'N/A', ['October 10, 2003'], 'N/A', 'N/A', ['April 6, 2007'], ['April 6, 2007'], 'N/A', 'N/A', 'N/A', 'N/A']


In [20]:
# June 28, 1950
from datetime import datetime

dates = [movie.get('Release date', 'N/A') for movie in movie_info_list]

def clean_date(date):
    return date.split("(")[0].strip()

def date_conversion(date):
    if isinstance(date, list):
        date = date[0]
        
    if date == "N/A":
        return None
        
    date_str = clean_date(date)

    fmts = ["%B %d, %Y", "%d %B %Y"]
    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except:
            pass
    return None

In [21]:
for movie in movie_info_list:
    movie['Release date (datetime)'] = date_conversion(movie.get('Release date', 'N/A'))

In [22]:
print([movie.get('Budget', 'N/A') for movie in movie_info_list])


['$5,000 (estimated)', 'N/A', '$1.2–3 million', '$12.5 million', '$8–8.5 million', '$34 million', '$8 million', '$53 million', '$4 million', '$19 million', '$75 million', '$12 million', '$30 million', '$30 million', '$40 million', '$30 million', '$23 million', '$70 million', '$100 million', '$44–62 million', '$90–96 million']


In [23]:
import re

amounts = r"thousand|million|billion"
number = r"\d+(,\d{3})*\.*\d*"

word_re = rf"\${number}(-|\sto\s|–)?({number})?\s({amounts})"
value_re = rf"\${number}"

def word_to_value(word):
    value_dict = {"thousand": 1000, "million": 1000000, "billion": 1000000000}
    return value_dict[word]

def parse_word_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    word = re.search(amounts, string, flags=re.I).group().lower()
    word_value = word_to_value(word)
    return value*word_value

def parse_value_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    return value

'''
money_conversion("$12.2 million") --> 12200000 ## Word syntax
money_conversion("$790,000") --> 790000        ## Value syntax
'''
def money_conversion(money):
    if money == "N/A":
        return None

    if isinstance(money, list):
        money = money[0]
        
    word_syntax = re.search(word_re, money, flags=re.I)
    value_syntax = re.search(value_re, money)

    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    elif value_syntax:
        return parse_value_syntax(value_syntax.group())

    else:
        return None

In [24]:
for movie in movie_info_list:
    movie['Budget (float)'] = money_conversion(movie.get('Budget', "N/A"))
    movie['Box office (float)'] = money_conversion(movie.get('Box office', "N/A"))

In [25]:
print([movie.get('Release date', 'N/A') for movie in movie_info_list])


[['1987'], ['October 18, 1991'], 'N/A', ['September 10, 1993'], 'N/A', ['August 26, 1994'], ['August 26, 1994 (United States)'], ['May 12, 1995'], ['December 25, 1995'], ['January 19, 1996'], ['June 7, 1996'], 'N/A', ['October 10, 2003'], 'N/A', 'N/A', ['April 6, 2007'], ['April 6, 2007'], 'N/A', 'N/A', 'N/A', 'N/A']


In [26]:
import pickle

def save_data_pickle(name, data):
    with open(name, 'wb') as f:
        pickle.dump(data, f)

In [27]:
import pickle

def load_data_pickle(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

In [28]:
save_data_pickle("tarantino_movie_data_cleaned_more.pickle", movie_info_list)

In [29]:
a = load_data_pickle("tarantino_movie_data_cleaned_more.pickle")

In [30]:
a == movie_info_list

True

### Attach IMDB/Rotten Tomatoes/Metascore scores

In [31]:
movie_info_list = load_data_pickle('tarantino_movie_data_cleaned_more.pickle')

In [32]:
movie_info_list[-20]

{'title': 'Past Midnight',
 'Directed by': 'Jan Eliasberg',
 'Written by': ['Frank Norwood', 'Quentin Tarantino', '(uncredited)'],
 'Produced by': ['Lisa M. Hansen',
  'Catalaine Knell',
  'Nancy Rae Stone',
  'Quentin Tarantino'],
 'Starring': ['Rutger Hauer',
  'Natasha Richardson',
  'Clancy Brown',
  'Guy Boyd'],
 'Cinematography': 'Robert D. Yeoman',
 'Edited by': 'Christopher Rouse',
 'Music by': 'Steve Bartek',
 'Production company': 'CineTel Films',
 'Distributed by': ['New Line Cinema',
  'Cineplex Odeon Films',
  'Columbia TriStar'],
 'Release date': ['October 18, 1991'],
 'Running time': '100 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Running time (int)': 100,
 'Release date (datetime)': datetime.datetime(1991, 10, 18, 0, 0),
 'Budget (float)': None,
 'Box office (float)': None}

In [35]:
import requests
import urllib
import os

def get_omdb_info(title):
    base_url = "http://www.omdbapi.com/?"
    parameters = {"apikey": 'f8685777', 't': title}
    params_encoded = urllib.parse.urlencode(parameters)
    full_url = base_url + params_encoded
    return requests.get(full_url).json()

def get_rotten_tomato_score(omdb_info):
    ratings = omdb_info.get('Ratings', [])
    for rating in ratings:
        if rating['Source'] == 'Rotten Tomatoes':
            return rating['Value']
    return None

get_omdb_info("pulp fiction")


{'Title': 'Pulp Fiction',
 'Year': '1994',
 'Rated': 'R',
 'Released': '14 Oct 1994',
 'Runtime': '154 min',
 'Genre': 'Crime, Drama',
 'Director': 'Quentin Tarantino',
 'Writer': 'Quentin Tarantino, Roger Avary',
 'Actors': 'John Travolta, Uma Thurman, Samuel L. Jackson',
 'Plot': 'The lives of two mob hitmen, a boxer, a gangster and his wife, and a pair of diner bandits intertwine in four tales of violence and redemption.',
 'Language': 'English, Spanish, French',
 'Country': 'United States',
 'Awards': 'Won 1 Oscar. 70 wins & 75 nominations total',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BNGNhMDIzZTUtNTBlZi00MTRlLWFjM2ItYzViMjE3YzI5MjljXkEyXkFqcGdeQXVyNzkwMjQ5NzM@._V1_SX300.jpg',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '8.9/10'},
  {'Source': 'Rotten Tomatoes', 'Value': '92%'},
  {'Source': 'Metacritic', 'Value': '94/100'}],
 'Metascore': '94',
 'imdbRating': '8.9',
 'imdbVotes': '2,014,735',
 'imdbID': 'tt0110912',
 'Type': 'movie',
 'DVD': '20 Aug 20

In [36]:
for movie in movie_info_list:
    title = movie['title']
    omdb_info = get_omdb_info(title)
    movie['genre'] = omdb_info.get('Genre', None)
    movie['imdb'] = omdb_info.get('imdbRating', None)
    movie['metascore'] = omdb_info.get('Metascore', None)
    movie['rotten_tomatoes'] = get_rotten_tomato_score(omdb_info)

In [39]:
for movie in movie_info_list:
    movie['imdb'] = float(movie['imdb'])
    movie['metascore'] = float(movie['metascore'])
    movie['rotten_tomatoes'] = float(movie['rotten_tomatoes'].strip('%'))

ValueError: could not convert string to float: 'N/A'

In [40]:
save_data_pickle('tarantino_movie_data_final.pickle', movie_info_list)

### Save data as JSON & CSV

In [42]:
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [43]:
for movie in movie_info_copy:
    current_date = movie['Release date (datetime)']
    if current_date:
        movie['Release date (datetime)'] = current_date.strftime("%B %d, %Y")
    else:
        movie['Release date (datetime)'] = None

In [44]:
save_data("tarantino_data_final.json", movie_info_copy)

Convert data to CSV

In [45]:
import pandas as pd

df = pd.DataFrame(movie_info_list)

In [46]:
df.head()

Unnamed: 0,title,Directed by,Written by,Produced by,Starring,Cinematography,Edited by,Distributed by,Release date,Running time,...,Production company,Production companies,Release dates,Box office,Story by,Screenplay by,Based on,Languages,Countries,Narrated by
0,My Best Friend's Birthday,Quentin Tarantino,"[Quentin Tarantino, Craig Hamann]","[Quentin Tarantino, Craig Hamann, Rand Vossler]","[Quentin Tarantino, Craig Hamann, Crystal Shaw...","[Roger Avary, Scott Magill, Roberto A. Quezada...",Quentin Tarantino,Super Happy Fun,[1987],"[70 minutes (original version), 36 minutes (re...",...,,,,,,,,,,
1,Past Midnight,Jan Eliasberg,"[Frank Norwood, Quentin Tarantino, (uncredited)]","[Lisa M. Hansen, Catalaine Knell, Nancy Rae St...","[Rutger Hauer, Natasha Richardson, Clancy Brow...",Robert D. Yeoman,Christopher Rouse,"[New Line Cinema, Cineplex Odeon Films, Columb...","[October 18, 1991]",100 minutes,...,CineTel Films,,,,,,,,,
2,Reservoir Dogs,Quentin Tarantino,Quentin Tarantino,Lawrence Bender,"[Harvey Keitel, Tim Roth, Chris Penn, Steve Bu...",Andrzej Sekuła,Sally Menke,Miramax Films,,99 minutes,...,,"[Live America Inc., Dog Eat Dog Productions]","[January 21, 1992 ( Sundance ), October 9, 199...",$2.9 million,,,,,,
3,True Romance,Tony Scott,Quentin Tarantino,"[Samuel Hadida, Steve Perry, Bill Unger]","[Christian Slater, Patricia Arquette, Dennis H...",Jeffrey L. Kimball,"[Michael Tronick, Christian Wagner]",Warner Bros.,"[September 10, 1993]",118 minutes,...,,"[Morgan Creek Productions, Davis Films, A Band...",,$12.6 million,,,,,,
4,Pulp Fiction,Quentin Tarantino,Quentin Tarantino,Lawrence Bender,"[John Travolta, Samuel L. Jackson, Uma Thurman...",Andrzej Sekuła,Sally Menke,Miramax Films,,154 minutes,...,,"[A Band Apart, Jersey Films]","[May 21, 1994 ( Cannes ), October 14, 1994 (Un...",$213.9 million,"[Quentin Tarantino, Roger Avary]",,,,,


In [57]:
_deepnote_run_altair(df, """{"data":{"name":"placeholder"},"mark":{"type":"bar","tooltip":true},"height":220,"$schema":"https://vega.github.io/schema/vega-lite/v4.json","autosize":{"type":"fit"},"encoding":{"x":{"sort":null,"type":"nominal","field":"title","scale":{"type":"linear","zero":false}},"y":{"sort":null,"type":"nominal","field":"Running time","scale":{"type":"linear","zero":true}},"color":{"sort":null,"type":"quantitative","field":"Box office (float)","scale":{"type":"linear","zero":false}}}}""")

and i thoughn once upon a time was long ! it turns out be the shortest one wtf !!

![Picture title](image-20220905-141628.png)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=79814980-67cc-47de-82db-161486c6da89' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>