In [1]:
# Explicit %pip magic installs into the running kernel's environment;
# pinned to the version this notebook's recorded output was produced with.
%pip install movieposters==0.0.7

Collecting movieposters
  Using cached movieposters-0.0.7-py3-none-any.whl (16 kB)
Collecting beautifulsoup4<5.0.0,>=4.11.1
  Using cached beautifulsoup4-4.12.2-py3-none-any.whl (142 kB)
Installing collected packages: beautifulsoup4, movieposters
  Attempting uninstall: beautifulsoup4
    Found existing installation: beautifulsoup4 4.10.0
    Uninstalling beautifulsoup4-4.10.0:
      Successfully uninstalled beautifulsoup4-4.10.0
Successfully installed beautifulsoup4-4.12.2 movieposters-0.0.7
Note: you may need to restart the kernel to use updated packages.


In [1]:
# Python Package imports
import concurrent.futures
import threading

import movieposters as mp
import pandas as pd
import requests
from bs4 import BeautifulSoup
from dateutil.parser import parse

In [2]:
# Upper bound on the number of worker threads the thread pool may spawn;
# the pool uses fewer when there are fewer URLs than this.
MAX_THREADS = 50

In [3]:
# Module-level accumulators, one list per output column.
# Descriptive fields of a listing
title, year, genres, synopsis = [], [], [], []
# Poster link, IMDb identifier and IMDb page link
poster, ids, url = [], [], []
# Runtime, vote count, score and age certificate
duration, voters, rating, certificate = [], [], [], []

In [4]:
# Creating functions

#function to get the movie title
def getMovieTitle(header):
    """Return the text of the first <a> element inside header[0].

    Args:
        header: indexable collection whose first item exposes ``find``.

    Returns:
        The anchor text, or 'NA' when it cannot be extracted
        (empty collection, missing anchor, header is None, ...).
    """
    try:
        return header[0].find("a").getText()
    except Exception:
        # Best-effort lookup; unlike a bare except this lets
        # KeyboardInterrupt / SystemExit propagate.
        return 'NA'

#function to get the movie release year
def getReleaseYear(header):
    """Return the text of the 'lister-item-year' span inside header[0].

    Args:
        header: indexable collection whose first item exposes ``find``.

    Returns:
        The span text exactly as found, or 'NA' when it cannot be extracted.
    """
    try:
        return header[0].find('span', class_='lister-item-year').text
    except Exception:
        # Best-effort lookup; unlike a bare except this lets
        # KeyboardInterrupt / SystemExit propagate.
        return 'NA'

#function to get the movie genres
def getGenre(muted_text):
    """Return the text of the span with class 'genre' inside muted_text.

    Args:
        muted_text: element exposing ``find``.

    Returns:
        The raw span text (not trimmed here), or 'NA' when it cannot be
        extracted.
    """
    try:
        return muted_text.find("span",  {"class":  "genre"}).getText()
    except Exception:
        # Best-effort lookup; unlike a bare except this lets
        # KeyboardInterrupt / SystemExit propagate.
        return 'NA'

#function to get the movie synopsis/plot
def getsynopsys(movie):
    """Return the text of the second <p class="text-muted"> inside movie.

    Args:
        movie: element exposing ``find_all``.

    Returns:
        The raw paragraph text (not trimmed here), or 'NA' when there is
        no second matching paragraph or it cannot be read.
    """
    try:
        return movie.find_all("p", {"class":  "text-muted"})[1].getText()
    except Exception:
        # Best-effort lookup; unlike a bare except this lets
        # KeyboardInterrupt / SystemExit propagate.
        return 'NA'

#function to get the movie poster link
def getPoster(image):
    """Return the value of the image's 'loadlate' attribute.

    Args:
        image: element exposing ``get`` (may be None).

    Returns:
        The attribute value, or 'NA' when the image is missing, the
        attribute is absent/empty, or it cannot be read. Always a string
        placeholder on failure, so callers can safely treat the result as
        text (e.g. call len() on it).
    """
    try:
        poster_url = image.get('loadlate')
    except Exception:
        # Best-effort lookup; unlike a bare except this lets
        # KeyboardInterrupt / SystemExit propagate.
        return 'NA'
    # ``get`` yields None for a missing attribute; normalise to the placeholder
    return poster_url if poster_url else 'NA'

#function to get the movie duration
def getDuration(header):
    """Return the text of the 'runtime' span in the element's first <p>.

    The span is looked up once, inside ``header.p``, and that same result
    is used both for the existence check and for reading the text.

    Args:
        header: element exposing a ``p`` attribute whose value has ``find``.

    Returns:
        The runtime text, '-' when the first paragraph has no runtime span,
        or 'NA' when the lookup itself fails (e.g. no paragraph at all).
    """
    try:
        runtime = header.p.find('span', class_='runtime')
        return runtime.text if runtime else '-'
    except Exception:
        # Best-effort lookup; unlike a bare except this lets
        # KeyboardInterrupt / SystemExit propagate.
        return 'NA'

#function to get the voters count
def getVoters(header):
    """Return the text of the first <span name="nv"> inside the element.

    Args:
        header: element exposing ``find_all``.

    Returns:
        The span text exactly as found, or 'NA' when no such span exists
        or it cannot be read.
    """
    try:
        nv = header.find_all('span', attrs={'name':'nv'})
        return nv[0].text
    except Exception:
        # Best-effort lookup; unlike a bare except this lets
        # KeyboardInterrupt / SystemExit propagate.
        return 'NA'

#function to get the movie Rating
def getRating(header):
    """Return the element's first <strong> text converted to float.

    Args:
        header: element exposing a ``strong`` attribute with ``text``.

    Returns:
        The rating as a float, or the string 'NA' when the element is
        missing or its text is not a valid number.
    """
    try:
        return float(header.strong.text)
    except Exception:
        # Best-effort lookup; unlike a bare except this lets
        # KeyboardInterrupt / SystemExit propagate.
        return 'NA'
    
#function to get the movie certification
def getCertificate(muted_text):
    """Return the text of the span with class 'certificate' inside muted_text.

    Args:
        muted_text: element exposing ``find``.

    Returns:
        The span text, or 'NA' when there is no such span or it cannot
        be read.
    """
    try:
        return muted_text.find("span",  {"class":  "certificate"}).getText()
    except Exception:
        # Best-effort lookup; unlike a bare except this lets
        # KeyboardInterrupt / SystemExit propagate.
        return 'NA'

In [5]:
# Main function: takes one IMDb search-results URL as its argument

# Held while a page's rows are copied into the module-level column lists.
# main() is run on several worker threads at once; without the lock their
# per-column appends interleave, so index i of one column can end up
# describing a different movie than index i of another.
_RESULTS_LOCK = threading.Lock()

# Custom poster dimensions appended to every rewritten poster URL
_POSTER_SUFFIX = "._V1_QL75_UX280_CR0,3,280,414_.jpg"


def _buildPosterUrl(raw_url):
    """Swap the trailing part of a poster URL for _POSTER_SUFFIX.

    The URL is cut at the first of these markers, scanning left to right:
    '@@' or '@' (kept) or '._' (dropped). When no marker is present the
    last four characters are dropped instead. A missing URL or the 'NA'
    placeholder yields 'NA'.
    """
    if not raw_url or raw_url == 'NA':
        return 'NA'
    for pos, char in enumerate(raw_url):
        # Slicing (rather than indexing pos + 1) cannot run past the end
        # of the string when the marker is the final character.
        next_char = raw_url[pos + 1:pos + 2]
        if char == '@':
            stem_end = pos + 2 if next_char == '@' else pos + 1
            return raw_url[:stem_end] + _POSTER_SUFFIX
        if char == '.' and next_char == '_':
            return raw_url[:pos] + _POSTER_SUFFIX
    return raw_url[:-4] + _POSTER_SUFFIX


def main(imdb_url, timeout=30):
    """Scrape one search-results page and append its movies to the result lists.

    Every "lister-item mode-advanced" div on the page becomes one row. Rows
    are collected locally first and then appended to the module-level lists
    (title, year, genres, synopsis, poster, ids, url, duration, voters,
    rating, certificate) while holding _RESULTS_LOCK, so all columns stay
    row-aligned even when several threads run this function concurrently.

    Args:
        imdb_url: URL of the search-results page to fetch.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        None. Fields that cannot be extracted are stored as 'NA'.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched.
    """
    # Fetch the page and parse it with BeautifulSoup
    response = requests.get(imdb_url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # One div per movie
    movies_list = soup.find_all("div", {"class": "lister-item mode-advanced"})

    rows = []
    for movie in movies_list:
        header = movie.find_all("h3", {"class":  "lister-item-header"})

        # A listing without these elements still produces a complete row:
        # the getters turn a None element into 'NA'.
        muted_paragraphs = movie.find_all("p", {"class":  "text-muted"})
        muted_text = muted_paragraphs[0] if muted_paragraphs else None
        imageDiv = movie.find("div", {"class": "lister-item-image float-left"})
        image = imageDiv.find("img", "loadlate") if imageDiv is not None else None

        # IMDb id and the title URL derived from it
        movie_id = image.get('data-tconst') if image is not None else None
        if movie_id:
            movie_url = "https://www.imdb.com/title/" + movie_id + "/"
        else:
            movie_id = movie_url = 'NA'

        # Field order must match the `columns` tuple below
        rows.append((
            getMovieTitle(header),                # title
            getReleaseYear(header),               # release year
            getGenre(muted_text),                 # genre
            getsynopsys(movie),                   # synopsis
            _buildPosterUrl(getPoster(image)),    # poster URL with custom size
            movie_id,                             # IMDb id
            movie_url,                            # IMDb URL
            getDuration(movie),                   # duration
            getVoters(movie),                     # voters count
            getRating(movie),                     # rating
            getCertificate(muted_text),           # certificate
        ))

    columns = (title, year, genres, synopsis, poster, ids, url,
               duration, voters, rating, certificate)
    with _RESULTS_LOCK:
        for row in rows:
            for column, value in zip(columns, row):
                column.append(value)

In [6]:
# Number of result pages to request
MAX_PAGE = 40

# Collects every search URL that will be queried, in page order
imageArr = []

# Build one URL per page
for i in range(MAX_PAGE):

    # Record offset for this page: 0 for the first page, otherwise
    # one past the 250 * i records covered by the preceding pages
    if i == 0:
        totalRecords = 0
    else:
        totalRecords = (250*i)+1

    # Search URL for this page (the offset is written after a literal "0")
    imdb_url = f'https://www.imdb.com/search/title/?release_date=1990-01-01,2023-12-31&user_rating=5.0,10.0&languages=en&adult=include&count=250&start=0{totalRecords}&ref_=adv_nxt'
    imageArr.append(imdb_url)

In [7]:
# Function that manages the whole download through a pool of threads
def download_stories(story_urls):
    """Run main() on every URL using a pool of worker threads.

    The pool size is the smaller of MAX_THREADS and the number of URLs.
    A URL whose worker raises is reported on stdout and skipped; the
    remaining URLs are still processed.

    Args:
        story_urls: iterable of URLs, each passed to main().

    Returns:
        None.
    """
    story_urls = list(story_urls)
    if not story_urls:
        # ThreadPoolExecutor rejects max_workers=0, and there is nothing to do
        return

    threads = min(MAX_THREADS, len(story_urls))
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        pending = {executor.submit(main, story_url): story_url for story_url in story_urls}
        # Inspect every future: an exception raised inside a worker is stored
        # on its future and would otherwise disappear without a trace.
        for future in concurrent.futures.as_completed(pending):
            error = future.exception()
            if error is not None:
                print(f"Failed to scrape {pending[future]}: {error!r}")

In [8]:
# Call the download function with the array of URLs called imageArr
download_stories(imageArr)

# Attach all the data to the pandas dataframe. You can optionally write it to a CSV file as well
movieDf = pd.DataFrame({
    "Title": title,
    "Release_Year": year,
    "Genre": genres,
    "Synopsis": synopsis,
    "Poster_URL": poster,
    "Movie_ID": ids,
    "Movie_URL": url,
    "Duration": duration,
    "Certification": certificate,
    "Voters": voters,
    "Rating": rating
})


def _dropLeading(value, count):
    """Remove the first `count` characters, leaving the 'NA' placeholder intact."""
    return value if value == 'NA' else value[count:]


# To organize our dataframe: Genre and Synopsis lose their first character
# (presumably a leading newline in the scraped text - TODO confirm) and
# Movie_ID loses its two-character prefix. The 'NA' placeholder is kept
# as-is instead of being cut down to 'A' / ''.
movieDf['Genre'] = [_dropLeading(x, 1) for x in movieDf['Genre']]
movieDf['Movie_ID'] = [_dropLeading(x, 2) for x in movieDf['Movie_ID']]
movieDf['Synopsis'] = [_dropLeading(x, 1) for x in movieDf['Synopsis']]
# Remove every space from the genre list
movieDf['Genre'] = movieDf['Genre'].str.replace(" ", "")

# Print the dataframe
print('--------- Complete CSV Formed --------')
display(movieDf)

# Save the dataframe
movieDf.to_csv('new.csv', index=False)

--------- Complete CSV Formed --------


Unnamed: 0,Title,Release_Year,Genre,Synopsis,Poster_URL,Movie_ID,Movie_URL,Duration,Certification,Voters,Rating
0,New Amsterdam,(2018–2023),Drama,A new medical director breaks the rules to hea...,https://m.media-amazon.com/images/M/MV5BNDEyZG...,7817340,https://www.imdb.com/title/tt7817340/,43 min,Not Rated,44918,8.0
1,Bad Sisters,(2022– ),"Comedy,Drama,Thriller",The Garvey sisters are bound together by their...,https://m.media-amazon.com/images/M/MV5BNjc2ZW...,15469618,https://www.imdb.com/title/tt15469618/,53 min,A,25559,8.3
2,Charmed,(1998–2006),"Drama,Fantasy,Mystery",Three Halliwell sisters discover that they are...,https://m.media-amazon.com/images/M/MV5BNTIxNm...,0158552,https://www.imdb.com/title/tt0158552/,42 min,U,87169,7.1
3,Star Trek Into Darkness,(2013),"Action,Adventure,Sci-Fi",After the crew of the Enterprise find an unsto...,https://m.media-amazon.com/images/M/MV5BMTk2Nz...,1408101,https://www.imdb.com/title/tt1408101/,132 min,UA,489386,7.7
4,The Northman,(2022),"Action,Adventure,Drama",A young Viking prince is on a quest to avenge ...,https://m.media-amazon.com/images/M/MV5BMzVlMm...,11138512,https://www.imdb.com/title/tt11138512/,137 min,R,228278,7.0
...,...,...,...,...,...,...,...,...,...,...,...
9995,Charlie Says,(2018),"Biography,Crime,Drama",The tragic tale of an all-American girl who wa...,https://m.media-amazon.com/images/M/MV5BMTU4Nj...,1759744,https://www.imdb.com/title/tt1759744/,110 min,R,5084,5.9
9996,Mr. D,(2012–2018),Comedy,Follows a teacher juggle through being a teach...,https://m.media-amazon.com/images/M/MV5BMTQ5NT...,2128016,https://www.imdb.com/title/tt2128016/,30 min,,2528,7.4
9997,Wheeler Dealers,(2003– ),Documentary,Car enthusiast Mike Brewer teams up with mecha...,https://m.media-amazon.com/images/M/MV5BZWIxMD...,1549918,https://www.imdb.com/title/tt1549918/,22 min,,3360,8.2
9998,Invader ZIM,(2001–2006),"Animation,Action,Adventure",An alien named Zim from the planet Irk is sent...,https://m.media-amazon.com/images/M/MV5BOTZjZW...,0235923,https://www.imdb.com/title/tt0235923/,24 min,PG,21982,8.4
