In [1182]:
import pandas as pd
import numpy as np
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

# Scrap the BBC page

In [1183]:
raw_html = urlopen("http://www.bbc.com/culture/story/20160819-the-21st-centurys-100-greatest-films-who-voted").read()
soup_doc = BeautifulSoup(raw_html, "html.parser")

In [1184]:
top_ten_list = []

paragraphs = (soup_doc
              .find('div', attrs={'class':'body-content'})
              .find_all('p', text=re.compile(r'^.*–.*\(.*\)$')))

for p in paragraphs:
    films = []
    for film in p.find_next_sibling():
        if film.name == None:
            films.append(re.search(r'^\d\d?\. (.*)', film).group(1))
    top_ten_list.append({
        'author': p.get_text(),
        'list': films
    })
    
top_ten_list[0]

{'author': 'Simon Abrams – Freelance film critic (US)',
 'list': ['Mulholland Drive (David Lynch, 2001)',
  'In the Mood for Love (Wong Kar-wai, 2000)',
  'The Tree of Life (Terrence Malick, 2011)',
  'Yi Yi: A One and a Two (Edward Yang, 2000)',
  'Goodbye to Language (Jean-Luc Godard, 2014)',
  'The White Meadows (Mohammad Rasoulof, 2009)',
  'Night Across the Street (Raoul Ruiz, 2012)',
  'Certified Copy (Abbas Kiarostami, 2010)',
  'Sparrow (Johnnie To, 2008)',
  'Fados (Carlos Saura, 2007)']}

# Create lists of unique values
### And save them in psql
Be sure to avoid duplications

## Films
Extract the films from each dictionary

In [1185]:
# Extract the films from each dictionary
film_list = [top_ten['list'] for top_ten in top_ten_list]
# Flatten the list
film_list = [film for films in film_list for film in films]
# Get the unique values
films = list(set(film_list))
films.sort()

In [1186]:
# Check for duplications
films

['12 Years a Slave (Steve McQueen, 2013)',
 '2046 (Wong Kar-wai, 2004)',
 '24 Hour Party People (Michael Winterbottom, 2002)',
 '25th Hour (Spike Lee, 2002)',
 '3-Iron (Kim Ki-duk, 2004)',
 '35 Shots of Rum (Claire Denis, 2008)',
 '4 Months, 3 Weeks & 2 Days (Cristian Mungiu, 2007)',
 '5 Broken Cameras (Emad Burnat and Guy Davidi, 2011)',
 '5x2 (François Ozon, 2004)',
 '678 (Mohamed Diab, 2010)',
 '7 Letters (Boo Junfeng, Eric Khoo, Jack Neo, K. Rajagopal, Tan Pin Pin, Royston Tan and Kelvin Tong, 2015)',
 '99 Homes (Ramin Bahrani, 2014)',
 'A Borrowed Identity (Eran Riklis, 2014)',
 'A Commuter’s Life (What a Life!) (Ernie Gehr, 2014)',
 'A Girl Walks Home Alone at Night (Ana Lily Amirpour, 2014)',
 'A History of Violence (David Cronenberg, 2005)',
 'A Letter to Nelson Mandela (Khalo Matabane, 2013)',
 'A Pigeon Sat on a Branch Reflecting on Existence (Roy Andersson, 2014)',
 'A Prophet (Jacques Audiard, 2009)',
 'A Separation (Asghar Farhadi, 2011)',
 'A Serious Man (Joel and Ethan C

In [1187]:
films = [f.replace('Man on Wire (James Marsch, 2008)', 'Man on Wire (James Marsh, 2008)') for f in films]
films = [f.replace('Morvern Callar (Lynne Ramsay, 2012)', 'Morvern Callar (Lynne Ramsay, 2002)') for f in films]
films = [f.replace('Moolaadé (Ousmane, Sembèène 2004)', 'Moolaadé (Ousmane Sembène, 2004)') for f in films]
films = [f.replace('Fantastic Mr Fox (Wes Anderson, 2012)', 'Fantastic Mr Fox (Wes Anderson, 2009)') for f in films]
films = [f.replace('Finding Nemo (Andrew Stanton and Lee Unkrich, 2003)', 'Finding Nemo (Andrew Stanton, 2003)') for f in films]

In [1188]:
films = list(set(films))
films.sort()
films

['12 Years a Slave (Steve McQueen, 2013)',
 '2046 (Wong Kar-wai, 2004)',
 '24 Hour Party People (Michael Winterbottom, 2002)',
 '25th Hour (Spike Lee, 2002)',
 '3-Iron (Kim Ki-duk, 2004)',
 '35 Shots of Rum (Claire Denis, 2008)',
 '4 Months, 3 Weeks & 2 Days (Cristian Mungiu, 2007)',
 '5 Broken Cameras (Emad Burnat and Guy Davidi, 2011)',
 '5x2 (François Ozon, 2004)',
 '678 (Mohamed Diab, 2010)',
 '7 Letters (Boo Junfeng, Eric Khoo, Jack Neo, K. Rajagopal, Tan Pin Pin, Royston Tan and Kelvin Tong, 2015)',
 '99 Homes (Ramin Bahrani, 2014)',
 'A Borrowed Identity (Eran Riklis, 2014)',
 'A Commuter’s Life (What a Life!) (Ernie Gehr, 2014)',
 'A Girl Walks Home Alone at Night (Ana Lily Amirpour, 2014)',
 'A History of Violence (David Cronenberg, 2005)',
 'A Letter to Nelson Mandela (Khalo Matabane, 2013)',
 'A Pigeon Sat on a Branch Reflecting on Existence (Roy Andersson, 2014)',
 'A Prophet (Jacques Audiard, 2009)',
 'A Separation (Asghar Farhadi, 2011)',
 'A Serious Man (Joel and Ethan C

### Refine the list

In [1189]:
for index, film in enumerate(films):
    title = re.search(r'(.*) \(', film).group(1)
    director = re.search(r'\(((.(?!\) \())+), [0-9]{4}\)$', film).group(1)
    release_year = int(re.search(r'\((.*), ([0-9]{4})', film).group(2))
    films[index] = {
        'title': title, 
        'director': director, # Can be a list of directors
        'release_year': release_year
    }
    
df_films = pd.DataFrame(films)

In [1190]:
df_films.head()

Unnamed: 0,director,release_year,title
0,Steve McQueen,2013,12 Years a Slave
1,Wong Kar-wai,2004,2046
2,Michael Winterbottom,2002,24 Hour Party People
3,Spike Lee,2002,25th Hour
4,Kim Ki-duk,2004,3-Iron


In [1191]:
# Save the CSV to be imported in PSQL
df_films.to_csv('films.csv', index=False)

In [1192]:
# Get the CSV from PSQL
df_films = pd.read_csv('films-psql.csv', index_col='id')
df_films.head()

Unnamed: 0_level_0,title,director,release_year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1468,12 Years a Slave,Steve McQueen,2013
1469,2046,Wong Kar-wai,2004
1470,24 Hour Party People,Michael Winterbottom,2002
1471,25th Hour,Spike Lee,2002
1472,3-Iron,Kim Ki-duk,2004


## Directors

In [1193]:
# Extract the directors from each dictionary
directors = []

for film in films:
    for d in re.findall(r"[^,]+", film['director']):
        for dd in d.split(' and '):
            directors.append(dd.lstrip())

directors = list(set(directors))
directors.sort()
directors

['Abbas Kiarostami',
 'Abdellatif Kechiche',
 'Abderrahmane Sissako',
 'Adam Curtis',
 'Adam McKay',
 'Agnieszka Holland',
 'Agnès Jaoui',
 'Agnès Varda',
 'Aki Kaurismäki',
 'Alain Cavalier',
 'Alain Gomis',
 'Alain Guiraudie',
 'Alain Resnais',
 'Alan Mak',
 'Albert Serra',
 'Alejandro González Iñárritu',
 'Aleksandr Sokurov',
 'Aleksey Fedorchenko',
 'Aleksey German',
 'Alex Garland',
 'Alexander Payne',
 'Alfonso Cuarón',
 'Amma Asante',
 'Ana Lily Amirpour',
 'Andrea Arnold',
 'Andrew Adamson',
 'Andrew Dominik',
 'Andrew Dosunmu',
 'Andrew Haigh',
 'Andrew Lau',
 'Andrew Stanton',
 'Andrey Zvyagintsev',
 'Andrzej Wajda',
 'Andrzej Zulawski',
 'André Singer',
 'Ang Lee',
 'Angela Ricci Lucchi',
 'Annemarie Jacir',
 'Anthony',
 'Antonio Di Trapani',
 'Anurag Kashyap',
 'Apichatpong Weerasethakul',
 'Ari Folman',
 'Arnaud Desplechin',
 'Asghar Farhadi',
 'Ashutosh Gowariker',
 'Asif Kapadia',
 'Ava DuVernay',
 'Avi Nesher',
 'Bahman Ghobadi',
 'Bart Layton',
 'Baz Luhrmann',
 'Bazi 

### Refine the list

In [1194]:
for index, director in enumerate(directors):
    if re.search(r'((.*)( .*)?) (.*)$', director) == None:
        first_name = director
        last_name = np.nan
    else:
        first_name = re.search(r'((.*)( .*)?) (.*)$', director).group(1)
        last_name = re.search(r'((.*)( .*)?) (.*)$', director).group(4)
        
    directors[index] = {
        'first_name': first_name, 
        'last_name': last_name
    }
    
df_directors = pd.DataFrame(directors)

In [1195]:
df_directors.head()

Unnamed: 0,first_name,last_name
0,Abbas,Kiarostami
1,Abdellatif,Kechiche
2,Abderrahmane,Sissako
3,Adam,Curtis
4,Adam,McKay


In [1196]:
# Save the CSV to be imported in PSQL
df_directors.to_csv('directors.csv', index=False)

In [1197]:
# Manually correct the CSV...
# A few last names are missing

In [1198]:
# Get the CSV from PSQL
df_directors = pd.read_csv('directors-psql.csv', index_col='id')
df_directors.head()

Unnamed: 0_level_0,first_name,last_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Abbas,Kiarostami
2,Abdellatif,Kechiche
3,Abderrahmane,Sissako
4,Adam,Curtis
5,Adam,McKay


## Critics
Extract the critic from each dictionary


In [1199]:
# Extract the author from each dictionary
critics = [top_ten['author'] for top_ten in top_ten_list]

### Refine the list 

In [1200]:
for index, critic in enumerate(critics):
    first_name = re.search(r'((.*)( .*)?) (.*) \–', critic).group(1)
    last_name = re.search(r'((.*)( .*)?) (.*) \–', critic).group(4)
    nationality = re.search(r'\((.*)\)', critic).group(1)
    company = re.search(r' – (.*) \(', critic).group(1)
    critics[index] = {
        'first_name': first_name, 
        'last_name': last_name,
        'nationality': nationality,
        'company': company
    }
    
df_critics = pd.DataFrame(critics)

In [1201]:
df_critics.head()

Unnamed: 0,company,first_name,last_name,nationality
0,Freelance film critic,Simon,Abrams,US
1,Freelance film critic,Sam,Adams,US
2,Freelance film critic,Thelma,Adams,US
3,Rolling Stone Mexico,Arturo,Aguilar,Mexico
4,BBC Culture,Matthew,Anderson,UK


#### Replace the country by it's ISO code

In [1202]:
ISO_countries = pd.read_csv('iso_3166_2_countries.csv')[['Common Name', 'ISO 3166-1 2 Letter Code']]
ISO_countries.columns = ['nationality', 'nationality_code']
ISO_countries.head()

Unnamed: 0,nationality,nationality_code
0,Afghanistan,AF
1,Albania,AL
2,Algeria,DZ
3,Andorra,AD
4,Angola,AO


In [1203]:
# Merge the nationalities with the codes
# left: use only keys from left frame, similar to a SQL left outer join; preserve key order
df_critics = df_critics.merge(ISO_countries, on='nationality', how='left')

In [1204]:
# create a list of the countries that have not been found
list(df_critics[df_critics['nationality_code'].isnull()]['nationality'].unique())

['US', 'UK', 'South Korea', 'UAE', 'Namibia', 'Taiwan', 'China']

In [1205]:
# Manually replace the missing countries unrecognized by the merge
countries = {
    'US':'US', 
    'UK':'GB', 
    'South Korea':'KR', 
    'UAE':'AE', 
    'Namibia':'NA', 
    'Taiwan':'TW', 
    'China':'CN'
}
df_critics.loc[df_critics['nationality_code'].isnull(), 'nationality_code'] = df_critics[df_critics['nationality_code'].isnull()]['nationality']
df_critics['nationality_code'].replace(countries, inplace=True)

In [1206]:
# Get rid of the non-standard country column
df_critics = df_critics.drop('nationality', 1)

In [1207]:
# Save the CSV to be imported in PSQL
df_critics.to_csv('critics.csv', index=False)

In [1208]:
# Get the CSV from PSQL
df_critics = pd.read_csv('critics-psql.csv', index_col='id')
df_critics.head()

Unnamed: 0_level_0,first_name,last_name,nationality_code,company
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Simon,Abrams,US,Freelance film critic
2,Sam,Adams,US,Freelance film critic
3,Thelma,Adams,US,Freelance film critic
4,Arturo,Aguilar,MX,Rolling Stone Mexico
5,Matthew,Anderson,GB,BBC Culture


# Create a flatten dataframe of all information available

In [1209]:
df_critics.head()

Unnamed: 0_level_0,first_name,last_name,nationality_code,company
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Simon,Abrams,US,Freelance film critic
2,Sam,Adams,US,Freelance film critic
3,Thelma,Adams,US,Freelance film critic
4,Arturo,Aguilar,MX,Rolling Stone Mexico
5,Matthew,Anderson,GB,BBC Culture


In [1210]:
reviews = []

for top_ten in top_ten_list:
    for index, review in enumerate(top_ten['list']):
        reviews.append({
            'critic': top_ten['author'],
            'film': review,
            'rank': index
        })

df_reviews = pd.DataFrame(reviews)

In [1211]:
df_reviews.head()

Unnamed: 0,critic,film,rank
0,Simon Abrams – Freelance film critic (US),"Mulholland Drive (David Lynch, 2001)",0
1,Simon Abrams – Freelance film critic (US),"In the Mood for Love (Wong Kar-wai, 2000)",1
2,Simon Abrams – Freelance film critic (US),"The Tree of Life (Terrence Malick, 2011)",2
3,Simon Abrams – Freelance film critic (US),"Yi Yi: A One and a Two (Edward Yang, 2000)",3
4,Simon Abrams – Freelance film critic (US),"Goodbye to Language (Jean-Luc Godard, 2014)",4


# Correct all the duplications

In [1212]:
df_reviews['film'].unique()

array(['Mulholland Drive (David Lynch, 2001)',
       'In the Mood for Love (Wong Kar-wai, 2000)',
       'The Tree of Life (Terrence Malick, 2011)',
       'Yi Yi: A One and a Two (Edward Yang, 2000)',
       'Goodbye to Language (Jean-Luc Godard, 2014)',
       'The White Meadows (Mohammad Rasoulof, 2009)',
       'Night Across the Street (Raoul Ruiz, 2012)',
       'Certified Copy (Abbas Kiarostami, 2010)',
       'Sparrow (Johnnie To, 2008)', 'Fados (Carlos Saura, 2007)',
       'Eternal Sunshine of the Spotless Mind (Michel Gondry, 2004)',
       'Syndromes and a Century (Apichatpong Weerasethakul, 2006)',
       'Spirited Away (Hayao Miyazaki, 2001)',
       'The Act of Killing (Joshua Oppenheimer, 2012)',
       'The Grand Budapest Hotel (Wes Anderson, 2014)',
       'The New World (Terrence Malick, 2004)',
       'The World (Jia Zhangke, 2004)', 'Elephant (Gus Van Sant, 2003)',
       'Zero Dark Thirty (Kathryn Bigelow, 2012)',
       'A History of Violence (David Cronenberg, 2

In [1213]:
df_reviews.loc[df_reviews['film'] == 'Man on Wire (James Marsch, 2008)', 'film'] = 'Man on Wire (James Marsh, 2008)'
df_reviews.loc[df_reviews['film'] == 'Morvern Callar (Lynne Ramsay, 2012)', 'film'] = 'Morvern Callar (Lynne Ramsay, 2002)'
df_reviews.loc[df_reviews['film'] == 'Moolaadé (Ousmane, Sembèène 2004)', 'film'] = 'Moolaadé (Ousmane Sembène, 2004)'
df_reviews.loc[df_reviews['film'] == 'Fantastic Mr Fox (Wes Anderson, 2012)', 'film'] = 'Fantastic Mr Fox (Wes Anderson, 2009)'
df_reviews.loc[df_reviews['film'] == 'Finding Nemo (Andrew Stanton and Lee Unkrich, 2003)', 'film'] = 'Finding Nemo (Andrew Stanton, 2003)'

In [1214]:
len(df_reviews['film'].unique())

595

# Create the reviews table

## Films

In [1215]:
films = []

def set_films(row):
    if row['film'] not in films:
        films.append(row['film'])
    return films.index(row['film'])
    
df_reviews['film'] = df_reviews.apply(set_films, axis=1)
df_reviews.rename(columns={'film':'film_id'}, inplace=True)

## Critics

In [1216]:
critics = []

def set_critics(row):
    if row['critic'] not in critics:
        critics.append(row['critic'])
    return critics.index(row['critic'])
    
df_reviews['critic'] = df_reviews.apply(set_critics, axis=1)
df_reviews.rename(columns={'critic':'critic_id'}, inplace=True)

In [1217]:
df_reviews.head()

Unnamed: 0,critic_id,film_id,rank
0,0,0,0
1,0,1,1
2,0,2,2
3,0,3,3
4,0,4,4


## Save the reviews table 

In [1218]:
df_reviews.index.name = 'id'
df_reviews.to_csv('reviews.csv')

In [1219]:
# Control if everything is ok with the csv exported from pandas
df_reviews = pd.read_csv('reviews.csv', index_col='id')
df_reviews.head()

Unnamed: 0_level_0,critic_id,film_id,rank
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0,0
1,0,1,1
2,0,2,2
3,0,3,3
4,0,4,4


In [1220]:
# Control if everything is ok with the csv exported from psql
df_reviews = pd.read_csv('reviews-psql.csv', index_col='id')
df_reviews.head()

Unnamed: 0_level_0,critic_id,film_id,rank
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0,0
1,0,1,1
2,0,2,2
3,0,3,3
4,0,4,4


In [1221]:
# Basic analytics
df_reviews[df_reviews['rank'] <= 2].groupby('film_id')['rank'].value_counts().groupby(level=1).nlargest(3)

rank  film_id  rank
0     0        0       16
      1        0       13
      2        0        7
1     0        1       10
      33       1        9
      1        1        6
2     0        2        8
      10       2        5
      42       2        5
Name: rank, dtype: int64

## Create the Critics table

In [1222]:
for index, critic in enumerate(critics):
    first_name = re.search(r'((.*)( .*)?) (.*) \–', critic).group(1)
    last_name = re.search(r'((.*)( .*)?) (.*) \–', critic).group(4)
    nationality = re.search(r'\((.*)\)', critic).group(1)
    company = re.search(r' – (.*) \(', critic).group(1)
    critics[index] = {
        'first_name': first_name, 
        'last_name': last_name,
        'nationality': nationality,
        'company': company
    }
    
df_critics = pd.DataFrame(critics)
df_critics.index.name = 'id'

In [1223]:
df_critics.head()

Unnamed: 0_level_0,company,first_name,last_name,nationality
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Freelance film critic,Simon,Abrams,US
1,Freelance film critic,Sam,Adams,US
2,Freelance film critic,Thelma,Adams,US
3,Rolling Stone Mexico,Arturo,Aguilar,Mexico
4,BBC Culture,Matthew,Anderson,UK


In [1278]:
ISO_countries = pd.read_csv('iso_3166_2_countries.csv')[['Common Name', 'ISO 3166-1 2 Letter Code']]
ISO_countries.columns = ['nationality', 'nationality_code']
ISO_countries.head()

Unnamed: 0,nationality,nationality_code
0,Afghanistan,AF
1,Albania,AL
2,Algeria,DZ
3,Andorra,AD
4,Angola,AO


In [1279]:
# Merge the nationalities with the codes
# left: use only keys from left frame, similar to a SQL left outer join; preserve key order
df_critics = df_critics.merge(ISO_countries, on='nationality', how='left')

In [1280]:
# create a list of the countries that have not been found
list(df_critics[df_critics['nationality_code'].isnull()]['nationality'].unique())

['US', 'UK', 'South Korea', 'UAE', 'Namibia', 'Taiwan', 'China']

In [1281]:
# Manually replace the missing countries unrecognized by the merge
countries = {
    'US':'US', 
    'UK':'GB', 
    'South Korea':'KR', 
    'UAE':'AE', 
    'Namibia':'NA', 
    'Taiwan':'TW', 
    'China':'CN'
}
df_critics.loc[df_critics['nationality_code'].isnull(), 'nationality_code'] = df_critics[df_critics['nationality_code'].isnull()]['nationality']
df_critics['nationality_code'].replace(countries, inplace=True)

In [1282]:
# Get rid of the non-standard country column
df_critics = df_critics.drop('nationality', 1)

In [1283]:
df_critics.to_csv('critics.csv')

## Create the films table

In [1224]:
directors_list = []

for index, film in enumerate(films):
    title = re.search(r'(.*) \(', film).group(1)
    director = re.search(r'\(((.(?!\) \())+), [0-9]{4}\)$', film).group(1)
    release_year = int(re.search(r'\((.*), ([0-9]{4})', film).group(2))
    directors_list.append(director)
    films[index] = {
        'title': title, 
        'release_year': release_year
    }
    
df_films = pd.DataFrame(films)
df_films.index.name = 'id'
df_films.to_csv('films.csv')

In [1225]:
df_films.head()

Unnamed: 0_level_0,release_year,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2001,Mulholland Drive
1,2000,In the Mood for Love
2,2011,The Tree of Life
3,2000,Yi Yi: A One and a Two
4,2014,Goodbye to Language


## Create the directions and directors table

In [1234]:
directions = []
directors = []

for index, d in enumerate(directors_list):
    for director in re.findall(r"[^,]+", d):
        for dd in director.split(' and '):
            if dd not in directors:
                directors.append(dd.lstrip())
            directions.append({
                'director_id': directors.index(dd.lstrip()),
                'film_id': index
            })
            
df_directions = pd.DataFrame(directions)
df_directions.index.name = 'id'
df_directions.to_csv('directions.csv')

In [1236]:
df_directions.head()

Unnamed: 0_level_0,director_id,film_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,0
1,1,1
2,2,2
3,3,3
4,4,4


In [1237]:
for index, director in enumerate(directors):
    if re.search(r'((.*)( .*)?) (.*)$', director) == None:
        first_name = director
        last_name = np.nan
    else:
        first_name = re.search(r'((.*)( .*)?) (.*)$', director).group(1)
        last_name = re.search(r'((.*)( .*)?) (.*)$', director).group(4)
        
    directors[index] = {
        'first_name': first_name, 
        'last_name': last_name
    }
    
df_directors = pd.DataFrame(directors)

In [1240]:
# Check for errors
df_directors[df_directors['last_name'].isnull()]

Unnamed: 0,first_name,last_name
39,Joel,
135,Jean-Pierre,
143,Josh,
427,Anthony,


In [1244]:
# Manually correct the missing last names
df_directors.loc[(df_directors['first_name'] == 'Joel') & df_directors['last_name'].isnull(), 'last_name'] = 'Cohen'
df_directors.loc[(df_directors['first_name'] == 'Jean-Pierre') & df_directors['last_name'].isnull(), 'last_name'] = 'Dardenne'
df_directors.loc[(df_directors['first_name'] == 'Josh') & df_directors['last_name'].isnull(), 'last_name'] = 'Safdie'
df_directors.loc[(df_directors['first_name'] == 'Anthony') & df_directors['last_name'].isnull(), 'last_name'] = 'Russo'

In [1246]:
# Check again
df_directors[df_directors['last_name'].isnull()]

Unnamed: 0,first_name,last_name


In [1249]:
# Save the directors table
df_directors = pd.DataFrame(directors)
df_directors.index.name = 'id'
df_directors.to_csv('directors.csv')

In [1250]:
# Manually correct the comma misplacements in the csv

In [1251]:
df_directors = pd.read_csv('directors.csv', index_col='id')

In [1253]:
df_directors.head()

Unnamed: 0_level_0,first_name,last_name
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,David,Lynch
1,Wong,Kar-wai
2,Terrence,Malick
3,Edward,Yang
4,Jean-Luc,Godard


# All the tables

In [1277]:
print('\n-----------------------')
print('Critics')
print('-----------------------')
print(df_critics.dtypes)
print('\n-----------------------')
print('Films')
print('-----------------------')
print(df_films.dtypes)
print('\n-----------------------')
print('Directors')
print('-----------------------')
print(df_directors.dtypes)
print('\n-----------------------')
print('Reviews')
print('-----------------------')
print(df_reviews.dtypes)
print('\n-----------------------')
print('Directions')
print('-----------------------')
print(df_directions.dtypes)


-----------------------
Critics
-----------------------
company        object
first_name     object
last_name      object
nationality    object
dtype: object

-----------------------
Films
-----------------------
release_year     int64
title           object
dtype: object

-----------------------
Directors
-----------------------
first_name    object
last_name     object
dtype: object

-----------------------
Reviews
-----------------------
critic_id    int64
film_id      int64
rank         int64
dtype: object

-----------------------
Directions
-----------------------
director_id    int64
film_id        int64
dtype: object
