In [1]:
# Basics
import pandas as pd
import numpy as np
import os
# Scraping
import re
import requests
import time
# html parsing
from bs4 import BeautifulSoup
# Save images locally
import shutil

In [28]:
# Load the saved poster links; the first CSV column is used as the index.
df = pd.read_csv('movie_poster_links.csv', index_col=0)
# Older versions of the file carry `title` as a regular column. Once the file
# has been re-saved with `title` as its index, the column no longer exists, so
# only re-index when it is actually present (keeps this cell re-runnable).
if 'title' in df.columns:
    df = df.set_index('title')
df

Unnamed: 0_level_0,img_link
title,Unnamed: 1_level_1
Toy Story (1995),https://posters.movieposterdb.com/13_05/1995/1...
Grumpier Old Men (1995),https://posters.movieposterdb.com/09_04/1995/1...
Heat (1995),https://posters.movieposterdb.com/05_05/1995/0...
Seven (a.k.a. Se7en) (1995),https://posters.movieposterdb.com/10_02/1960/5...
"Usual Suspects, The (1995)",https://posters.movieposterdb.com/09_04/2009/1...
...,...
Terms of Endearment (1983),https://posters.movieposterdb.com/08_02/1983/8...
Little Nicky (2000),https://posters.movieposterdb.com/05_02/2000/0...
Joe Dirt (2001),https://posters.movieposterdb.com/05_12/2001/0...
RoboCop 2 (1990),https://posters.movieposterdb.com/07_10/1990/1...


In [29]:
# Write the title-indexed links back to the CSV
# (to_csv stores the index as the first column by default).
df.to_csv('movie_poster_links.csv')

In [9]:

# Reload the saved links, using the first CSV column as the index
df = pd.read_csv('movie_poster_links.csv', index_col=0)
# Label lookup on the link column; a label that occurs several times in the
# index yields one entry per occurrence.
filename = df['img_link'].loc['Toy Story (1995)']
filename

title
Toy Story (1995)    https://posters.movieposterdb.com/13_05/1995/1...
Toy Story (1995)    https://posters.movieposterdb.com/13_05/1995/1...
Toy Story (1995)    https://posters.movieposterdb.com/13_05/1995/1...
Toy Story (1995)    https://posters.movieposterdb.com/13_05/1995/1...
Toy Story (1995)    https://posters.movieposterdb.com/13_05/1995/1...
                                          ...                        
Toy Story (1995)    https://posters.movieposterdb.com/13_05/1995/1...
Toy Story (1995)    https://posters.movieposterdb.com/13_05/1995/1...
Toy Story (1995)    https://posters.movieposterdb.com/13_05/1995/1...
Toy Story (1995)    https://posters.movieposterdb.com/13_05/1995/1...
Toy Story (1995)    https://posters.movieposterdb.com/13_05/1995/1...
Name: img_link, Length: 215, dtype: object

In [10]:
# Folder that holds the "ml-latest-small" CSV files (home-relative path)
DATA_DIR = '~/Documents/spiced_projects/fenugreek-student-code/week10/data/ml-latest-small'
movies_path = DATA_DIR + '/movies.csv'
ratings_path = DATA_DIR + '/ratings.csv'

In [11]:
def load_data(movies_file=None, ratings_file=None, min_ratings=20):
    """
    Load the ratings and movie tables and keep only frequently rated movies.

    Parameters
    ----------
    movies_file : str or None
        CSV with at least a `movieId` column. Defaults to the module-level
        `movies_path`.
    ratings_file : str or None
        CSV with at least `userId` and `movieId` columns. Defaults to the
        module-level `ratings_path`.
    min_ratings : int
        A movie is kept only if it has strictly MORE than this many ratings
        (a movie with exactly `min_ratings` ratings is dropped).

    Returns
    -------
    pandas.DataFrame
        Ratings left-joined with movie metadata, restricted to the kept
        movies. The result is an independent copy, so callers can add
        columns without triggering SettingWithCopyWarning.
    """
    # Resolve defaults at call time so the notebook-level paths are honoured
    if movies_file is None:
        movies_file = movies_path
    if ratings_file is None:
        ratings_file = ratings_path

    # Load the data
    df_movies = pd.read_csv(movies_file)
    df_ratings = pd.read_csv(ratings_file)

    # Attach movie metadata to every rating
    df_full = pd.merge(left=df_ratings, right=df_movies, how='left', on='movieId')

    # Number of ratings per movie, broadcast back onto every rating row
    ratings_per_movie = df_full.groupby('movieId')['userId'].transform('count')

    # Keep only movies rated more than `min_ratings` times
    return df_full[ratings_per_movie > min_ratings].copy()

In [12]:
# Load movie data (from https://grouplens.org/datasets/movielens/)
# load_data() joins the ratings with the movie metadata and returns the filtered frame.
df = load_data()
df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...,...,...
100803,610,148626,4.0,1493847175,"Big Short, The (2015)",Drama
100808,610,152081,4.0,1493846503,Zootopia (2016),Action|Adventure|Animation|Children|Comedy
100829,610,164179,5.0,1493845631,Arrival (2016),Sci-Fi
100830,610,166528,4.0,1493879365,Rogue One: A Star Wars Story (2016),Action|Adventure|Fantasy|Sci-Fi


In [13]:
# Start every movie with an empty poster link
df['img_link'] = ''
# Title-indexed lookup table that only carries the link column
link_columns = ['title', 'img_link']
df_img_links = df.loc[:, link_columns].set_index('title')
df_img_links

Unnamed: 0_level_0,img_link
title,Unnamed: 1_level_1
Toy Story (1995),
Grumpier Old Men (1995),
Heat (1995),
Seven (a.k.a. Se7en) (1995),
"Usual Suspects, The (1995)",
...,...
"Big Short, The (2015)",
Zootopia (2016),
Arrival (2016),
Rogue One: A Star Wars Story (2016),


In [22]:
def _poster_filename(title):
    """Return the file name under which the poster for `title` is stored."""
    # Same mapping as the original replace() chain, so posters already on disk
    # are still recognised: '/' and ' ' become '_', while ',', '(', ')' and
    # ':' are removed.
    cleanup = str.maketrans({'/': '_', ' ': '_', ',': None, '(': None, ')': None, ':': None})
    return title.lower().translate(cleanup) + '.jpg'


def get_poster(title, poster_dir='../movie_poster', timeout=10):
    """
    Download the poster for a movie from https://www.movieposterdb.com.

    The last whitespace-separated token of `title` (the "(year)" part in
    titles such as "Toy Story (1995)") is dropped to build the search query.
    Nothing is downloaded if the poster file already exists.

    Parameters
    ----------
    title : str
        Movie title, e.g. "Toy Story (1995)".
    poster_dir : str
        Folder the poster is stored in.
    timeout : float
        Timeout in seconds applied to both HTTP requests.

    Returns
    -------
    None
        Always. On success the image is written to `poster_dir`, the link is
        stored in the module-level `df_img_links`, and a message is printed.

    Raises
    ------
    requests.RequestException
        If one of the HTTP requests fails or times out.
    """
    # Local import keeps the helper self-contained (stdlib only)
    from urllib.parse import quote

    # Only download if the poster does not exist yet
    poster_path = os.path.join(poster_dir, _poster_filename(title))
    if os.path.exists(poster_path):
        return None

    # Wait a random amount of time (0.2 - 0.6 s) between requests
    time.sleep(np.random.randint(2, 7) / 10)

    # Browser-like User-Agent for the search request
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.4 Safari/605.1.15'}

    # Title without its last token (the year)
    name_without_year = ' '.join(title.split()[:-1])

    # Percent-encode the query so characters like '&', '#' or '?' in a title
    # cannot break the URL (spaces still become %20)
    url_search = 'https://www.movieposterdb.com/search?q=' + quote(name_without_year)

    # `headers` must be passed by keyword: the second positional argument of
    # requests.get is `params`, which would put the User-Agent in the query string
    response = requests.get(url_search, headers=headers, timeout=timeout)

    # Parse the search result page and collect the poster <img> tags
    results = BeautifulSoup(response.text, 'html')
    image_links = results.find_all(name='img', class_='vertical-image img-responsive poster_img')

    # Option 1: poster whose alt text is exactly "<title without year> poster",
    # skipping "no posters yet" placeholders
    alt_name = name_without_year + ' poster'
    image_url = ''
    for image_link in image_links:
        src = image_link.get('src', '')
        if image_link.get('alt') == alt_name and src and 'no-posters-yet' not in src:
            image_url = src
            break

    # Option 2: fall back to the first search result
    if image_url == '' and image_links:
        image_url = image_links[0].get('src', '')

    if not image_url:
        print(f'No image_url found for {title}!')
        return None

    # Download the poster; the context manager releases the streamed connection
    with requests.get(image_url, stream=True, timeout=timeout) as poster:
        if poster.status_code == 200:
            poster.raw.decode_content = True
            with open(poster_path, 'wb') as f:
                shutil.copyfileobj(poster.raw, f)
            # Save link in df
            df_img_links.loc[title, 'img_link'] = image_url
            print(f'Poster for {title} sucessfully downloaded: ')
        else:
            print(f'Poster for {title} couldn\'t be retrieved')

    return None

In [17]:
# First title in the ratings frame (identical to the first unique title)
title = df['title'].iloc[0]
title

'Toy Story (1995)'

In [227]:
# Try the download for the single title currently stored in `title`
get_poster(title)

In [30]:
# Download all posters, one request per distinct title
unique_titles = df['title'].unique()
for title in unique_titles:
    get_poster(title)

Poster for Toy Story (1995) sucessfully downloaded: 
Poster for Quiz Show (1994) couldn't be retrieved
Poster for Adaptation (2002) couldn't be retrieved
Poster for Wayne's World 2 (1993) couldn't be retrieved
Poster for Animal House (1978) couldn't be retrieved
Poster for Welcome to the Dollhouse (1995) couldn't be retrieved
Poster for Man in the Iron Mask, The (1998) couldn't be retrieved
Poster for Beach, The (2000) couldn't be retrieved


In [33]:
# Manual fix: set the link by hand for a title the download loop reported as not retrieved
df_img_links.loc['Adaptation (2002)', 'img_link'] = 'https://xl.movieposterdb.com/10_12/2002/268126/xl_268126_b0ed07c2.jpg'

In [34]:
# Manual fix: set the link by hand for a title the download loop reported as not retrieved
df_img_links.loc['Animal House (1978)', 'img_link'] = 'https://xl.movieposterdb.com/13_06/1978/77975/xl_77975_032130dd.jpg'

In [35]:
# Manual fix: set the link by hand for a title the download loop reported as not retrieved
df_img_links.loc['Man in the Iron Mask, The (1998)', 'img_link'] = 'https://xl.movieposterdb.com/07_10/1998/120744/xl_120744_38aa4f16.jpg'

In [36]:
# Manual fix: set the link by hand for a title the download loop reported as not retrieved
df_img_links.loc['Beach, The (2000)', 'img_link'] =  'https://xl.movieposterdb.com/11_02/2000/163978/xl_163978_b8eaeca0.jpg'

In [37]:
#df_img_links.to_csv('movie_poster_links.csv')

In [41]:
# Manual download for the remaining titles the automatic search could not
# resolve (e.g. Adaptation: wrong alt text; The Beach: title is "Beach, The").

title = 'Beach, The (2000)'
filename = title.lower().replace('/', '_').replace(',','').replace(' ', '_').replace('(','').replace(')','').replace(':', '') + '.jpg'
image_url = 'https://xl.movieposterdb.com/11_02/2000/163978/xl_163978_b8eaeca0.jpg'

# The context manager releases the streamed connection when done; the timeout
# keeps the cell from hanging forever on an unresponsive server.
with requests.get(image_url, stream=True, timeout=10) as poster:
    if poster.status_code == 200:
        poster.raw.decode_content = True
        with open(f'../movie_poster/{filename}','wb') as f:
            shutil.copyfileobj(poster.raw, f)
        print(f'Poster for {title} sucessfully downloaded: ')
    else:
        print(f'Poster for {title} couldn\'t be retrieved')

Poster for Beach, The (2000) sucessfully downloaded: 


In [17]:
# Rename files (if needed): strip ':' from the poster file names
path = '../movie_poster/'
files = os.listdir(path)

for file in files:
    print(file)
    file_new = file.replace(':', '')
    # Only touch files whose name actually changes
    if file_new != file:
        os.rename(os.path.join(path, file), os.path.join(path, file_new))

gone_in_60_seconds_2000.jpg
hulk_2003.jpg
nightmare_before_christmas_the_1993.jpg
my_family_1995.jpg
transporter_the_2002.jpg
hurt_locker_the_2008.jpg
time_bandits_1981.jpg
raging_bull_1980.jpg
seven_a.k.a._se7en_1995.jpg
jumanji_1995.jpg
dark_knight_the_2008.jpg
relative_fear_1994.jpg
ghostbusters_ii_1989.jpg
jimmy_hollywood_1994.jpg
pi_1998.jpg
disclosure_1994.jpg
army_of_darkness_1993.jpg
forever_young_1992.jpg
corpse_bride_2005.jpg
white_man's_burden_1995.jpg
rush_hour_2_2001.jpg
sweet_home_alabama_2002.jpg
shadow_the_1994.jpg
underneath_1995.jpg
star_wars:_episode_v_-_the_empire_strikes_back_1980.jpg
indiana_jones_and_the_kingdom_of_the_crystal_skull_2008.jpg
harry_potter_and_the_sorcerer's_stone_a.k.a._harry_potter_and_the_philosopher's_stone_2001.jpg
stargate_1994.jpg
maverick_1994.jpg
edward_scissorhands_1990.jpg
fear_and_loathing_in_las_vegas_1998.jpg
cruel_intentions_1999.jpg
transformers_2007.jpg
in_the_realm_of_the_senses_ai_no_corrida_1976.jpg
river_runs_through_it_a_1992.

In [11]:
# Quick check of how a title containing '/' looks once the slash is removed
text = "3/10_to_yuma_2007".replace('/', '')
text

'310_to_yuma_2007'