In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import os

## Capturing Movies Image from Wikipedia (MediaWiki)
We will use `wptools`, a wrapper around the MediaWiki API that makes it even easier to extract information for specific 'wiki-pages'.

In [25]:
import wptools
import requests
from PIL import Image
from io import BytesIO
import os
import pandas as pd
from tqdm import tqdm_notebook as progressbar

In [14]:
media_wiki_title_list = [
 'The_Wizard_of_Oz_(1939_film)',
 'Citizen_Kane',
 'The_Third_Man',
 'Get_Out_(film)',
 'Mad_Max:_Fury_Road',
 'The_Cabinet_of_Dr._Caligari',
 'All_About_Eve',
 'Inside_Out_(2015_film)',
 'The_Godfather',
 'Metropolis_(1927_film)',
 'E.T._the_Extra-Terrestrial',
 'Modern_Times_(film)',
 'It_Happened_One_Night',
 "Singin'_in_the_Rain",
 'Boyhood_(film)',
 'Casablanca_(film)',
 'Moonlight_(2016_film)',
 'Psycho_(1960_film)',
 'Laura_(1944_film)',
 'Nosferatu',
 'Snow_White_and_the_Seven_Dwarfs_(1937_film)',
 "A_Hard_Day%27s_Night_(film)",
 'La_Grande_Illusion',
 'North_by_Northwest',
 'The_Battle_of_Algiers',
 'Dunkirk_(2017_film)',
 'The_Maltese_Falcon_(1941_film)',
 'Repulsion_(film)',
 '12_Years_a_Slave_(film)',
 'Gravity_(2013_film)',
 'Sunset_Boulevard_(film)',
 'King_Kong_(1933_film)',
 'Spotlight_(film)',
 'The_Adventures_of_Robin_Hood',
 'Rashomon',
 'Rear_Window',
 'Selma_(film)',
 'Taxi_Driver',
 'Toy_Story_3',
 'Argo_(2012_film)',
 'Toy_Story_2',
 'The_Big_Sick',
 'Bride_of_Frankenstein',
 'Zootopia',
 'M_(1931_film)',
 'Wonder_Woman_(2017_film)',
 'The_Philadelphia_Story_(film)',
 'Alien_(film)',
 'Bicycle_Thieves',
 'Seven_Samurai',
 'The_Treasure_of_the_Sierra_Madre_(film)',
 'Up_(2009_film)',
 '12_Angry_Men_(1957_film)',
 'The_400_Blows',
 'Logan_(film)',
 'All_Quiet_on_the_Western_Front_(1930_film)',
 'Army_of_Shadows',
 'Arrival_(film)',
 'Baby_Driver',
 'A_Streetcar_Named_Desire_(1951_film)',
 'The_Night_of_the_Hunter_(film)',
 'Star_Wars:_The_Force_Awakens',
 'Manchester_by_the_Sea_(film)',
 'Dr._Strangelove',
 'Frankenstein_(1931_film)',
 'Vertigo_(film)',
 'The_Dark_Knight_(film)',
 'Touch_of_Evil',
 'The_Babadook',
 'The_Conformist_(film)',
 'Rebecca_(1940_film)',
 "Rosemary%27s_Baby_(film)",
 'Finding_Nemo',
 'Brooklyn_(film)',
 'The_Wrestler_(2008_film)',
 'The_39_Steps_(1935_film)',
 'L.A._Confidential_(film)',
 'Gone_with_the_Wind_(film)',
 'The_Good,_the_Bad_and_the_Ugly',
 'Skyfall',
 'Rome,_Open_City',
 'Tokyo_Story',
 'Hell_or_High_Water_(film)',
 'Pinocchio_(1940_film)',
 'The_Jungle_Book_(2016_film)',
 'La_La_Land_(film)',
 'Star_Trek_(film)',
 'High_Noon',
 'Apocalypse_Now',
 'On_the_Waterfront',
 'The_Wages_of_Fear',
 'The_Last_Picture_Show',
 'Harry_Potter_and_the_Deathly_Hallows_–_Part_2',
 'The_Grapes_of_Wrath_(film)',
 'Roman_Holiday',
 'Man_on_Wire',
 'Jaws_(film)',
 'Toy_Story',
 'The_Godfather_Part_II',
 'Battleship_Potemkin'
]

In [57]:
folder = 'bestofrt_posters'
# Make directory if it doesn't already exist
if not os.path.exists(folder):
    os.makedirs(folder)

In [58]:
# List of dictionaries to build and convert to a DataFrame later
images_list = []
image_errors = {}
for title in progressbar(media_wiki_title_list):
    
    try:
        # This cell is slow so print ranking to gauge time remaining
        ranking = media_wiki_title_list.index(title) + 1
        
        page = wptools.page(title, silent=True).get()
        
        # Retrieving Image url
        image_url = page.data['image'][0]['url']
        r = requests.get(image_url)
        
        # Download movie poster image
        i = Image.open(BytesIO(r.content))
        file_format = image_url.split('.')[-1]
        
        # If image already donwloaded, skip to next movie
        if os.path.isfile(f'{folder}/{str(ranking)}_{title}.{file_format}'):
            continue
        
        i.save(f'{folder}/{str(ranking)}_{title}.{file_format}')
        
        # Append to list of dictionaries
        images_list.append({'ranking': int(ranking),
                        'title': title,
                        'poster_url': image_url})
    
    # Not best practice to catch all exceptions but fine for this short script
    except Exception as e:
        image_errors[str(ranking) + "_" + title] = str(ranking) + "_" + title

HBox(children=(IntProgress(value=0), HTML(value='')))

API error: {'code': 'invalidtitle', 'info': 'Bad title "A_Hard_Day%27s_Night_(film)".', 'docref': 'See https://en.wikipedia.org/w/api.php for API usage. Subscribe to the mediawiki-api-announce mailing list at &lt;https://lists.wikimedia.org/mailman/listinfo/mediawiki-api-announce&gt; for notice of API deprecations and breaking changes.'}
API error: {'code': 'invalidtitle', 'info': 'Bad title "Rosemary%27s_Baby_(film)".', 'docref': 'See https://en.wikipedia.org/w/api.php for API usage. Subscribe to the mediawiki-api-announce mailing list at &lt;https://lists.wikimedia.org/mailman/listinfo/mediawiki-api-announce&gt; for notice of API deprecations and breaking changes.'}


In [59]:
# Check all images that could not be downloaded
for key in image_errors.keys():
    print(key)

22_A_Hard_Day%27s_Night_(film)
64_Dr._Strangelove
72_Rosemary%27s_Baby_(film)
83_Hell_or_High_Water_(film)


In [61]:
# Inspect unidentifiable images and download them individually
for rank_title in progressbar(image_errors.keys()):
    if rank_title == '22_A_Hard_Day%27s_Night_(film)':
        url = 'https://upload.wikimedia.org/wikipedia/en/4/47/A_Hard_Days_night_movieposter.jpg'
        'https://upload.wikimedia.org/wikipedia/pt/8/8f/The_Beatles_-_A_Hard_Day%27s_Night.jpg'
    if rank_title == '64_Dr._Strangelove':
        url = 'https://upload.wikimedia.org/wikipedia/pt/7/73/Dr._Strangelove.jpg'
    if rank_title == '72_Rosemary%27s_Baby_(film)':
        url = 'https://upload.wikimedia.org/wikipedia/en/e/ef/Rosemarys_baby_poster.jpg'
    if rank_title == '83_Hell_or_High_Water_(film)':
        url = 'https://upload.wikimedia.org/wikipedia/pt/3/37/Hell_or_High_Water.png'
    
    title = rank_title[3:]
    images_list.append({'ranking': int(rank_title[:2]),
                    'title': title,
                    'poster_url': url})
    
    r = requests.get(url)
    # Download movie poster image
    i = Image.open(BytesIO(r.content))
    file_format = url.split('.')[-1]
    i.save(f'{folder}/{rank_title}.{file_format}')

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

In [62]:
len(images_list)

100

In [65]:
# Create DataFrame from list of dictionaries
df = pd.DataFrame(images_list, columns = ['ranking', 'title', 'poster_url'])
df = df.sort_values('title').reset_index(drop=True)

# Store the dataframe into a CSV file
df.to_csv('movie_poster_urls.csv', index=False)