In [1]:
#Import the required modules
import pandas as pd
import numpy as np
import io
import sys
import os.path
import urllib.request
from tqdm import tqdm
from os import listdir
from PIL import Image
import glob
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the movie CSV (it includes a Poster_URL column) and preview the first rows
df = pd.read_csv(filepath_or_buffer="new.csv")
df.head(n=5)

Unnamed: 0,Title,Release_Year,Genre,Synopsis,Poster_URL,Movie_ID,Movie_URL,Duration,Certification,Voters,Rating
0,New Amsterdam,(2018–2023),Drama,A new medical director breaks the rules to hea...,https://m.media-amazon.com/images/M/MV5BNDEyZG...,7817340,https://www.imdb.com/title/tt7817340/,43 min,Not Rated,44918,8.0
1,Bad Sisters,(2022– ),"Comedy,Drama,Thriller",The Garvey sisters are bound together by their...,https://m.media-amazon.com/images/M/MV5BNjc2ZW...,15469618,https://www.imdb.com/title/tt15469618/,53 min,A,25559,8.3
2,Charmed,(1998–2006),"Drama,Fantasy,Mystery",Three Halliwell sisters discover that they are...,https://m.media-amazon.com/images/M/MV5BNTIxNm...,158552,https://www.imdb.com/title/tt0158552/,42 min,U,87169,7.1
3,Star Trek Into Darkness,(2013),"Action,Adventure,Sci-Fi",After the crew of the Enterprise find an unsto...,https://m.media-amazon.com/images/M/MV5BMTk2Nz...,1408101,https://www.imdb.com/title/tt1408101/,132 min,UA,489386,7.7
4,The Northman,(2022),"Action,Adventure,Drama",A young Viking prince is on a quest to avenge ...,https://m.media-amazon.com/images/M/MV5BMzVlMm...,11138512,https://www.imdb.com/title/tt11138512/,137 min,R,228278,7.0


In [3]:
# (rows, columns) of the dataframe as loaded
n_rows, n_cols = df.shape
(n_rows, n_cols)

(10000, 11)

In [4]:
# De-duplicate on Movie_ID, keeping the first occurrence of each ID.
# `df` is rebound instead of using inplace=True, so the frame returned by
# read_csv is not mutated behind the back of anything still referencing it.
df = df.drop_duplicates(subset="Movie_ID")

# Report whether any cell in any column is still missing
check_nan_in_df = df.isnull().values.any()
print(check_nan_in_df)

True


In [5]:
# Keep only fully populated rows: a row is discarded as soon as one column is NaN
df = df.dropna(axis="index", how="any")
df.shape

(6963, 11)

In [6]:
# Download every poster to Posters/<Movie_ID>.jpg using urllib.
# `found` collects the saved file paths and `not_found` the Movie_IDs whose
# download failed; both are filled in dataframe row order.
POSTER_DIR = "Posters"
REQUEST_TIMEOUT_S = 30  # give up on a stalled connection instead of hanging the cell

# Without the target folder every open() below would fail and each poster
# would be silently counted as "not found".
os.makedirs(POSTER_DIR, exist_ok=True)

found = []
not_found = []
# total= lets tqdm show a percentage and ETA instead of a bare counter
for index, row in tqdm(df.iterrows(), total=len(df)):
    url = row['Poster_URL']
    imdb_id = row['Movie_ID']

    file_path = POSTER_DIR + "/" + str(imdb_id) + ".jpg"

    try:
        # Context managers close the connection and the file even if a step fails
        with urllib.request.urlopen(url, timeout=REQUEST_TIMEOUT_S) as response:
            data = response.read()
        with open(file_path, 'wb') as poster_file:
            poster_file.write(data)
    except Exception:
        # Best-effort download: any failure (bad URL, HTTP error, timeout, disk
        # error) just marks the poster as missing. Unlike a bare `except:`,
        # this still lets Ctrl-C / kernel interrupt stop the loop.
        not_found.append(imdb_id)
    else:
        found.append(file_path)

print("Number of posters not found = ", len(not_found))
print("Following IMDB_ID posters were not found::", not_found)

6963it [35:37,  3.26it/s]

Number of posters not found =  15
Following IMDB_ID posters were not found:: [118715, 16304446, 3488710, 9883996, 114323, 406759, 327162, 1403241, 10451914, 6450186, 369735, 1623288, 15384586, 7488208, 17219484]





In [7]:
# Keep only the movies whose poster download did not fail
poster_missing = df['Movie_ID'].isin(not_found)
df = df.loc[~poster_missing]
df.shape

(6948, 11)

In [8]:
# Check which downloaded images are corrupt and record them in `bad_images`.
# Note: this cell only reports the files; it does not delete them from disk
# or from the dataframe.
bad_images = []
for file in glob.glob("Posters/*.jpg"):
    try:
        # The context manager closes the file handle after each check, so
        # scanning thousands of posters does not leak open files.
        with Image.open(file) as img:
            img.verify()  # raises if the file is not a valid image
    except (OSError, SyntaxError):
        print('Bad file:', file)
        bad_images.append(file)

print("Number of corrupt files:", len(bad_images))

Number of corrupt files: 0


In [9]:
# Attach each poster's local path to its row. The path is rebuilt from Movie_ID
# (the same "Posters/<Movie_ID>.jpg" pattern the download cell writes to)
# instead of copying the `found` list positionally, so it cannot be misaligned
# with the rows or fail on a length mismatch when cells are re-run.
# assign() returns a new frame rather than writing into a filtered slice.
df = df.assign(Path="Posters/" + df["Movie_ID"].astype(str) + ".jpg")

In [10]:
# Persist the enriched dataframe (row index omitted), then render it
output_csv = 'data_with_path.csv'
df.to_csv(output_csv, index=False)
display(df)

Unnamed: 0,Title,Release_Year,Genre,Synopsis,Poster_URL,Movie_ID,Movie_URL,Duration,Certification,Voters,Rating,Path
0,New Amsterdam,(2018–2023),Drama,A new medical director breaks the rules to hea...,https://m.media-amazon.com/images/M/MV5BNDEyZG...,7817340,https://www.imdb.com/title/tt7817340/,43 min,Not Rated,44918,8.0,Posters/7817340.jpg
1,Bad Sisters,(2022– ),"Comedy,Drama,Thriller",The Garvey sisters are bound together by their...,https://m.media-amazon.com/images/M/MV5BNjc2ZW...,15469618,https://www.imdb.com/title/tt15469618/,53 min,A,25559,8.3,Posters/15469618.jpg
2,Charmed,(1998–2006),"Drama,Fantasy,Mystery",Three Halliwell sisters discover that they are...,https://m.media-amazon.com/images/M/MV5BNTIxNm...,158552,https://www.imdb.com/title/tt0158552/,42 min,U,87169,7.1,Posters/158552.jpg
3,Star Trek Into Darkness,(2013),"Action,Adventure,Sci-Fi",After the crew of the Enterprise find an unsto...,https://m.media-amazon.com/images/M/MV5BMTk2Nz...,1408101,https://www.imdb.com/title/tt1408101/,132 min,UA,489386,7.7,Posters/1408101.jpg
4,The Northman,(2022),"Action,Adventure,Drama",A young Viking prince is on a quest to avenge ...,https://m.media-amazon.com/images/M/MV5BMzVlMm...,11138512,https://www.imdb.com/title/tt11138512/,137 min,R,228278,7.0,Posters/11138512.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...
9991,Mirai nikki,(2011–2013),"Animation,Action,Drama",A young man competes with people around the wo...,https://m.media-amazon.com/images/M/MV5BMjExOW...,2069441,https://www.imdb.com/title/tt2069441/,25 min,18,18609,7.5,Posters/2069441.jpg
9993,The Pirates! In an Adventure with Scientists!,(2012),"Animation,Adventure,Comedy",Pirate Captain sets out on a mission to defeat...,https://m.media-amazon.com/images/M/MV5BNDhkOG...,1430626,https://www.imdb.com/title/tt1430626/,88 min,U,49931,6.7,Posters/1430626.jpg
9995,Charlie Says,(2018),"Biography,Crime,Drama",The tragic tale of an all-American girl who wa...,https://m.media-amazon.com/images/M/MV5BMTU4Nj...,1759744,https://www.imdb.com/title/tt1759744/,110 min,R,5084,5.9,Posters/1759744.jpg
9998,Invader ZIM,(2001–2006),"Animation,Action,Adventure",An alien named Zim from the planet Irk is sent...,https://m.media-amazon.com/images/M/MV5BOTZjZW...,235923,https://www.imdb.com/title/tt0235923/,24 min,PG,21982,8.4,Posters/235923.jpg
