In [1]:
import numpy as np
import regex as re
import pandas as pd

import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from difflib import SequenceMatcher

## Load Disney List

In [2]:
disney_df = pd.read_csv('../data/disney_musicals.csv')

In [3]:
disney_df.sort_values(by='Year',ascending=False)

Unnamed: 0,Title,Year
111,The Little Mermaid (Live Action),2023
93,Sneakerella,2022
123,Zombies 3,2022
78,Hollywood Stargirl,2022
59,Better Nate Than Ever,2022
...,...,...
116,The Reluctant Dragon,1941
10,Dumbo,1941
89,Pinocchio,1940
74,Fantasia,1940


## Selenium Setup

In [4]:
driver_path = 'C:/Users/Mohammed/pyprojects/chromedriver.exe'
brave_path = 'C:/Program Files/Google/Chrome/Application/chrome.exe'
extensions = open('../data/paths.txt','r').read().split('\n')
u_block = extensions[2]

In [5]:
option = webdriver.ChromeOptions()
option.binary_location = brave_path
option.add_argument('--disable-dev-shm-usage')
option.add_argument('--load-extension='+u_block)

# Create new Instance of Chrome
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

# enable browser logging
d = DesiredCapabilities.CHROME
d['goog:loggingPrefs'] = { 'browser':'ALL' }

## Movie Tuples

In [6]:
movie_tuples = list(zip(disney_df.Title,disney_df.Year))
movie_tuples[:5]

[('Aladdin', 1992),
 ('Almost Angels', 1962),
 ('Annie', 1999),
 ('Babes in Toyland', 1961),
 ('Beauty and the Beast', 1991)]

## Functions

In [151]:
def deBracket(string):
    sub = re.sub('\([^)]*\)','',string).strip()
    sub = re.sub('[^(\w\s)]','',sub)
    
    return sub

In [161]:
def isSimilar(movie_title,imdb_title):
    #remove text within brackets
    movie_title = deBracket(movie_title)
    imdb_title = deBracket(imdb_title)

    #split into lists
    movie_split = movie_title.split()
    imdb_split = imdb_title.split()

    if len(imdb_split) > len(movie_split):
        imdb_split = imdb_split[:len(movie_split)]
        return SequenceMatcher(isjunk=None,a=movie_split,b=imdb_split).ratio()>=0.75
    else:
        return SequenceMatcher(isjunk=None,a=movie_title,b=imdb_title).ratio()>=0.75

In [12]:
def clickXP(string):
    element = wait.until(EC.element_to_be_clickable((By.XPATH, string)))
    return element    

def selectXP(string):
    # element = browser.find_element(By.XPATH,string)
    element = wait.until(EC.presence_of_element_located((By.XPATH, string)))
    return element

In [164]:
def correctResult(search_text):
    title = search_text[:-5]
    year = search_text[-4:]

    #get first three elements from search bar results
    base_result_path = '//*[@id="react-autowhatever-1--item-{}"]'
    
    for r in range(4):
        
        result_path = base_result_path.format(r)
        result_text = clickXP(result_path).text
        
        if 'See all results' in result_text: #list of results exhausted
            correct_result = 'No correct result'
            break
        else:
            result_title = result_text.split('\n')[0]
            
            if isSimilar(title,result_title) and year in result_text:
                correct_result = result_path + '/a/child::div/child::div'
                break
            else:
                correct_result = 'No correct result'

    return correct_result

In [13]:
def searchTitle(search_text):    
    #wait until search bar is clickable and click
    search_bar = clickXP('//*[@id="suggestion-search"]')
    #javascript click because sometimes selenium can't click by itself
    browser.execute_script("arguments[0].click();", search_bar)
    
    #select all and delete to clear -- note: search_bar.clear() doesnt always work
    search_bar.send_keys(Keys.CONTROL + "A")
    search_bar.send_keys(Keys.DELETE)
    #clear search bar
    #enter text
    search_bar.send_keys(search_text)
    #click on correct result
    correct_result = correctResult(search_text)
    
    if correct_result == 'No correct result':
        print(correct_result,'for',search_text)
        return 0 
    else:
        browser.implicitly_wait(30)
        search_result = clickXP(correct_result)
        browser.execute_script("arguments[0].click();",search_result)
        return 1

In [81]:
def searchBar(search_text):
    try:
        try:
            search = searchTitle(search_text)
        except:
            #a prompt may come up and interrupt
            prompt_path = '/html/body/div[4]/div[2]/div/div[2]/div/div[2]/div[2]/button[2]/div'
            prompt_button = clickXP(prompt_path)
            #close prompt

            browser.execute_script('arguments[0].click();',prompt_button)

            #search again
            search = searchTitle(search_text)
        
        if search == 0:
            return 0
        else:
            return 1
    except Exception as e: #timeout
        print(e,'No results were retrieved for',search_text)
        return 0

In [15]:
def containerButton(path,text_match):
    text_match = text_match.lower()
    buttons = browser.find_elements(By.XPATH,path)
    elements = [button for button in buttons if text_match in button.text.lower()][0]
    
    return elements

def containerContent(button,subpath):
    path = div_sib + subpath
    elements = button.find_elements(By.XPATH, path)
    content = [e.text for e in elements]
    
    return content

In [192]:
#variables with similar naming pattern (only the suffix is different)
base_path = '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/'
title_path = base_path+'/h1'
year_path = base_path+'div[2]/div[1]/div/ul/li[1]/span'
pg_path = base_path+'div[2]/div[1]/div/ul/li[2]/span'
runtime_path = base_path+'div[2]/div[1]/div/ul/li[3]'
rating_path = base_path+'div[3]/div[2]/div[1]/div[2]/div/div[1]/a/div/div/div[2]/div[1]/span[1]'
votes_path = base_path+'div[3]/div[2]/div[1]/div[2]/div/div[1]/a/div/div/div[2]/div[3]'
userreviews_path = base_path+'/div[2]/ul/li[1]/a/span/span[1]'
metascore_path = base_path+'/div[3]/div[2]/div[2]/ul/li[3]/a/span/span[1]/span'
criticreviews_path = base_path+'/div[2]/ul/li[2]/a/span/span[1]'

#list of all fetch paths
fetch_paths = [title_path,year_path,pg_path,runtime_path,rating_path,votes_path,userreviews_path,
               metascore_path, criticreviews_path]

#container variables
base_cont = '//*[@id="__next"]/main/div/section[1]/div/section/div/div[1]/child::section'
directors_path = base_cont+'/ul/li[1]/child::*'
writers_path = base_cont+'/ul/li[2]/child::*'
topcast_path = base_cont+'/child::div'
boxoffice_path = base_cont+'/div[1]'
#list of all container paths
cont_paths = [directors_path,writers_path,topcast_path,boxoffice_path]

#subpaths
releasedate_subpath = './following-sibling::*'
directors_subpath = './following-sibling::*/child::ul/child::li'
writers_subpath = './following-sibling::*/child::ul/child::li'
genres_subpath = './following-sibling::*/child::*/child::*'
topcast_subpath = '.' + '/parent::*'*4 + '/following-sibling::*/child::div/child::div'
boxoffice_subpath = '.'+'/parent::*'*4 + '/following-sibling::*/child::*/child::*'
#list of subpaths
cont_subpaths = [releasedate_subpath, directors_subpath, writers_subpath,
                 genres_subpath, topcast_subpath, boxoffice_subpath]


#list of all suffixes to fetch
fetch_list = ['title','year','pg','runtime','rating','votes','user_reviews','critic_reviews']
#list of all container suffixes
containers_list = ['Release date', 'Director','Writer','Genres','Top cast','Box office']

In [186]:
cont = 'Genres'
subpath = genres_subpath
xpath = "//*[contains(text(), '{i}')]".format(i='Genres')
header_elements = browser.find_elements(By.XPATH, xpath)
#find list of elements that have `cont` as text and pick first element
header = [element for element in header_elements if cont in element.text][0]
#navigate to relevant subpath
target_container = header.find_elements(By.XPATH, subpath)
#get list of information as long as text is not empty
#and as long as the text is not IMDb Pro promotion
target = [t.text for t in target_container if t.text if 'IMDbPro' not in t.text]# '//*[@id="__next"]/main/div/section[1]/div/section/div/div[1]/section[6]/div[2]/ul[2]/li[2]/button'

In [193]:
def scrapeIMDb(title='title'):
    #creating a dictionary for each path's results
    imdb_dict = {}
    key = 'no_key'
    
    #get all fetch_path values
    try:
        for (key,path) in zip(fetch_list,fetch_paths):
            imdb_dict[key] = selectXP(path).text
    except Exception as e:
        print(repr(e),key, 'info for',title,'not obtained')

    # imdb_dict = {key: selectXP(path).text for (key, path) in zip(fetch_list,fetch_paths)}
    
    for (cont,subpath) in list(zip(containers_list,cont_subpaths)):
        key = cont.lower().replace(' ','_') #create more uniform key format
        
        try:
            #find element on page containing container header e.g. Directors
            xpath = "//*[contains(text(), '{i}')]".format(i=cont)
            header_elements = browser.find_elements(By.XPATH, xpath)
            #find list of elements that have `cont` as text and pick first element
            header = [element for element in header_elements if cont in element.text][0]
            #navigate to relevant subpath
            target_container = header.find_elements(By.XPATH, subpath)
            #get list of information as long as text is not empty
            #and as long as the text is not IMDb Pro promotion
            target = [t.text for t in target_container if t.text if 'IMDbPro' not in t.text]
            #add to dict
            imdb_dict[key] = target
        except Exception as e:
            print(repr(e), 'Could not find', key, 'info for', title)
            imdb_dict[key] = np.nan

    return imdb_dict

# Launch Selenium Chrome

In [253]:
def seleniumCall():
    browser = webdriver.Chrome(executable_path=driver_path, options=option, desired_capabilities=d);
    wait = WebDriverWait(browser, timeout=10, poll_frequency=1)

    browser.get('https://www.imdb.com/')
    winds = browser.window_handles

    browser.switch_to.window(winds[0])

    if 'IMDb' not in browser.title:
        browser.switch_to.window(winds[1])    

    browser.maximize_window()

## Iterate

In [None]:
seleniumCall()

In [195]:
movie_dicts = []

for (i,tup) in enumerate(movie_tuples):
    search_text = tup[0] + ' ' + str(tup[1])
    
    search = searchBar(search_text)
    
    if search==0:
        imdb_dict = {'titleId': i}
    else:
        imdb_dict = scrapeIMDb(title=search_text)
        imdb_dict['titleId'] = i
    
    movie_dicts.append(imdb_dict)

IndexError('list index out of range') Could not find genres info for Aladdin 1992
TimeoutException() critic_reviews info for Almost Angels 1962 not obtained
IndexError('list index out of range') Could not find box_office info for Almost Angels 1962
No correct result for Annie 1999
IndexError('list index out of range') Could not find genres info for Babes in Toyland 1961
IndexError('list index out of range') Could not find genres info for Beauty and the Beast 1991
IndexError('list index out of range') Could not find genres info for Beauty and the Beast (Live-Action) 2017
TimeoutException() year info for Beauty and the Beast: The Enchanted Christmas 1997 not obtained
IndexError('list index out of range') Could not find box_office info for Beauty and the Beast: The Enchanted Christmas 1997
TimeoutException() year info for Camp Rock 2008 not obtained
IndexError('list index out of range') Could not find box_office info for Camp Rock 2008
TimeoutException() year info for Camp Rock 2: The Fin

In [227]:
movie_dicts = movie_dicts[:76]

In [228]:
len(movie_dicts)

76

In [230]:
err = 76
for (i,tup) in enumerate(movie_tuples[err:]):
    search_text = tup[0] + ' ' + str(tup[1])
    i = i + err
    search = searchBar(search_text)
    
    if search==0:
        imdb_dict = {'titleId': i}
    else:
        imdb_dict = scrapeIMDb(title=search_text)
        imdb_dict['titleId'] = i
    
    movie_dicts.append(imdb_dict)

IndexError('list index out of range') Could not find genres info for Hannah Montana & Miley Cyrus: Best of Both Worlds Concert 2008
IndexError('list index out of range') Could not find genres info for Hannah Montana: The Movie 2009
IndexError('list index out of range') Could not find genres info for Hollywood Stargirl 2022
IndexError('list index out of range') Could not find box_office info for Hollywood Stargirl 2022
IndexError('list index out of range') Could not find genres info for Inside Out 2015
IndexError('list index out of range') Could not find genres info for James and the Giant Peach 1996
IndexError('list index out of range') Could not find genres info for Lilo and Stitch 2002
IndexError('list index out of range') Could not find genres info for Meet the Robinsons 2007
TimeoutException() critic_reviews info for Mickey's Christmas Carol 1983 not obtained
TimeoutException() year info for Mickey's Magical Christmas 2001 not obtained
IndexError('list index out of range') Could no

In [308]:
imdb_df = pd.DataFrame(data=movie_dicts)

In [309]:
imdb_df[[imdb_df.columns[-1]] + list(imdb_df.columns[:-1])]

Unnamed: 0,titleId,title,year,pg,runtime,rating,votes,user_reviews,critic_reviews,release_date,director,writer,genres,top_cast,box_office
0,0,Aladdin,1992,G,1h 30m,8.0,427K,391,86,"[November 25, 1992 (United States)]","[Ron Clements, John Musker]","[Ron Clements(screenplay by), John Musker(scre...",,"[Scott Weinger\nAladdin(voice), Robin Williams...","[Budget\n$28,000,000 (estimated), Gross US & C..."
1,1,Almost Angels,1962,Not Rated,1h 33m,6.4,601,13,,"[December 21, 1962 (United States)]",[],"[Vernon Harris(screenplay by), Robert A. Stemm...","[Comedy, Family, Music]","[Vincent Winter\nTony Fiala, Sean Scully\nPete...",
2,2,,,,,,,,,,,,,,
3,3,Babes in Toyland,1961,Approved,1h 46m,6.1,4.4K,46,55,"[December 14, 1961 (United States)]",[Jack Donohue],"[Victor Herbert(operetta), Glen MacDonough(ope...",,"[Ray Bolger\nBarnaby Barnicle, Tommy Sands\nTo...","[Gross US & Canada\n$10,218,316]"
4,4,Beauty and the Beast,1991,G,1h 24m,8.0,459K,490,95,"[November 22, 1991 (United States)]","[Gary Trousdale, Kirk Wise]","[Linda Woolverton(animation screenplay by), Br...",,"[Paige O'Hara\nBelle(voice), Robby Benson\nBea...","[Budget\n$25,000,000 (estimated), Gross US & C..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120,120,Up,,,1h 36m,,,1K,88,"[May 29, 2009 (United States)]","[Pete Docter, Bob Peterson(co-director)]","[Pete Docter(story by), Bob Peterson(story by)...",,[Edward Asner\nCarl Fredricksen(voice)(as Ed A...,"[Budget\n$175,000,000 (estimated), Gross US & ..."
121,121,Wreck-It Ralph,,,1h 41m,,,494,72,"[November 2, 2012 (United States)]",[Rich Moore],"[Rich Moore(story by), Phil Johnston(story by)...",,"[John C. Reilly\nRalph(voice), Jack McBrayer\n...","[Budget\n$165,000,000 (estimated), Gross US & ..."
122,122,Zombies 2: The Collab,,,,,,,,"[February 2, 2020 (United States)]",[Elek Hendrickson],,"[Short, Comedy, Musical, Romance]","[Ben Azelart\nWyatt, Sofie Dossi\nWinter, Madi...",
123,123,,,,,,,,,,,,,,


In [236]:
imdb_df.to_csv('../data/imdb_info.csv')

In [252]:
imdb_df[imdb_df['box_office'].notnull()]

Unnamed: 0,title,year,pg,runtime,rating,votes,user_reviews,critic_reviews,release_date,director,writer,genres,top_cast,box_office,titleId
0,Aladdin,1992,G,1h 30m,8.0,427K,391,86,"[November 25, 1992 (United States)]","[Ron Clements, John Musker]","[Ron Clements(screenplay by), John Musker(scre...",,"[Scott Weinger\nAladdin(voice), Robin Williams...","[Budget\n$28,000,000 (estimated), Gross US & C...",0
3,Babes in Toyland,1961,Approved,1h 46m,6.1,4.4K,46,55,"[December 14, 1961 (United States)]",[Jack Donohue],"[Victor Herbert(operetta), Glen MacDonough(ope...",,"[Ray Bolger\nBarnaby Barnicle, Tommy Sands\nTo...","[Gross US & Canada\n$10,218,316]",3
4,Beauty and the Beast,1991,G,1h 24m,8.0,459K,490,95,"[November 22, 1991 (United States)]","[Gary Trousdale, Kirk Wise]","[Linda Woolverton(animation screenplay by), Br...",,"[Paige O'Hara\nBelle(voice), Robby Benson\nBea...","[Budget\n$25,000,000 (estimated), Gross US & C...",4
5,Beauty and the Beast,2017,PG,2h 9m,7.1,314K,1.2K,65,"[March 17, 2017 (United States)]",[Bill Condon],"[Stephen Chbosky(screenplay by), Evan Spilioto...",,"[Emma Watson\nBelle, Dan Stevens\nBeast, Luke ...","[Budget\n$160,000,000 (estimated), Gross US & ...",5
9,Confessions of a Teenage Drama Queen,2004,PG,1h 29m,4.6,30K,108,33,"[February 20, 2004 (United States)]",[Sara Sugarman],"[Dyan Sheldon(book), Gail Parent(screenplay)]",,"[Lindsay Lohan\nLola, Megan Fox\nCarla, Adam G...","[Budget\n$15,000,000 (estimated), Gross US & C...",9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,The Reluctant Dragon,,,1h 14m,,,28,,"[June 20, 1941 (United States)]","[Alfred L. Werker, Hamilton Luske(cartoon sequ...","[Kenneth Grahame(based on the story by), Ted S...","[Animation, Comedy, Family]","[Robert Benchley\nRobert Benchley, Frances Gif...","[Gross US & Canada\n$872,000]",116
117,The Sword in the Stone,,,1h 19m,,,120,61,"[June 21, 1964 (United States)]","[Wolfgang Reitherman, Clyde Geronimi(earlier f...","[Bill Peet(story), T.H. White(based on the boo...",,"[Rickie Sorensen\nWart(voice), Sebastian Cabot...","[Budget\n$3,000,000 (estimated), Gross US & Ca...",117
119,Toy Story,,,1h 21m,,,756,96,"[November 22, 1995 (United States)]",[John Lasseter],"[John Lasseter(original story by), Pete Docter...",,"[Tom Hanks\nWoody(voice), Tim Allen\nBuzz Ligh...","[Budget\n$30,000,000 (estimated), Gross US & C...",119
120,Up,,,1h 36m,,,1K,88,"[May 29, 2009 (United States)]","[Pete Docter, Bob Peterson(co-director)]","[Pete Docter(story by), Bob Peterson(story by)...",,[Edward Asner\nCarl Fredricksen(voice)(as Ed A...,"[Budget\n$175,000,000 (estimated), Gross US & ...",120


try1.head()

In [256]:
seleniumCall()

  browser = webdriver.Chrome(executable_path=driver_path, options=option, desired_capabilities=d);


In [258]:
browser = webdriver.Chrome(executable_path=driver_path, options=option, desired_capabilities=d);
wait = WebDriverWait(browser, timeout=10, poll_frequency=1)

browser.get('https://www.imdb.com/')
winds = browser.window_handles

browser.switch_to.window(winds[0])

if 'IMDb' not in browser.title:
    browser.switch_to.window(winds[1])    

browser.maximize_window()

  browser = webdriver.Chrome(executable_path=driver_path, options=option, desired_capabilities=d);


In [259]:
searchBar('High School Musical')

1

In [321]:
def isAnimation(genre_list):
    if type(genre_list)==list:
        return bool('Animation' in genre_list)
    else:
        return genre_list

In [322]:
isAnimation(['Animation','Comedy','Drama'])

True

In [323]:
imdb_df['is_animation'] = np.vectorize(isAnimation)(imdb_df.genres)
# imdb_df['is_animation'] = 
imdb_df['is_animation']

0      NaN
1      0.0
2      NaN
3      NaN
4      NaN
      ... 
120    NaN
121    NaN
122    0.0
123    NaN
124    NaN
Name: is_animation, Length: 125, dtype: float64

In [313]:
new_cols = list(imdb_df.columns[-2:]) + list(imdb_df.columns[:-2])
imdb_df = imdb_df[new_cols]

In [324]:
imdb_df.head()

Unnamed: 0,titleId,is_animation,title,year,pg,runtime,rating,votes,user_reviews,critic_reviews,release_date,director,writer,genres,top_cast,box_office
0,0,,Aladdin,1992.0,G,1h 30m,8.0,427K,391.0,86.0,"[November 25, 1992 (United States)]","[Ron Clements, John Musker]","[Ron Clements(screenplay by), John Musker(scre...",,"[Scott Weinger\nAladdin(voice), Robin Williams...","[Budget\n$28,000,000 (estimated), Gross US & C..."
1,1,0.0,Almost Angels,1962.0,Not Rated,1h 33m,6.4,601,13.0,,"[December 21, 1962 (United States)]",[],"[Vernon Harris(screenplay by), Robert A. Stemm...","[Comedy, Family, Music]","[Vincent Winter\nTony Fiala, Sean Scully\nPete...",
2,2,,,,,,,,,,,,,,,
3,3,,Babes in Toyland,1961.0,Approved,1h 46m,6.1,4.4K,46.0,55.0,"[December 14, 1961 (United States)]",[Jack Donohue],"[Victor Herbert(operetta), Glen MacDonough(ope...",,"[Ray Bolger\nBarnaby Barnicle, Tommy Sands\nTo...","[Gross US & Canada\n$10,218,316]"
4,4,,Beauty and the Beast,1991.0,G,1h 24m,8.0,459K,490.0,95.0,"[November 22, 1991 (United States)]","[Gary Trousdale, Kirk Wise]","[Linda Woolverton(animation screenplay by), Br...",,"[Paige O'Hara\nBelle(voice), Robby Benson\nBea...","[Budget\n$25,000,000 (estimated), Gross US & C..."


In [326]:
imdb_df.to_csv('../data/imdb_info.csv')

# Old

In [8]:
def scrapeIMDb():
    #variables with similar naming pattern (only the suffix is different)
    title_path = '/html/body/div[2]/main/div/section[1]/section/div[3]/section/section/div[2]/div[1]/h1'
    year_path = '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[2]/div[1]/div/ul/li[1]/span'
    pg_path = '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[2]/div[1]/div/ul/li[2]/span'
    runtime_path = '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[2]/div[1]/div/ul/li[3]'
    genre1_path = '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[1]/div[1]/div[1]/div[2]/a[1]/span'
    genre2_path = '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[1]/div[1]/div[1]/div[2]/a[2]/span'
    genre3_path = '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[1]/div[1]/div[1]/div[2]/a[3]/span'
    rating_path = '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[1]/div[2]/div/div[1]/a/div/div/div[2]/div[1]/span[1]'
    votes_path = '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[1]/div[2]/div/div[1]/a/div/div/div[2]/div[3]'
    userreviews_path = '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[2]/ul/li[1]/a/span/span[1]'
    criticreviews_path = '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[2]/ul/li[2]/a/span/span[1]'
    watchlist_path = '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[2]/div/div/button[1]/div/div[2]'
    #container variables
    i=4
    base_cont = '//*[@id="__next"]/main/div/section[1]/div/section/div/div[1]/section[{}]'
    dase_cont = '//*[@id="__next"]/main/div/section[1]/div/section/div/div[1]/section[2]/ul/li[1]/div/ul'
    directors_cont = base_cont.format(i)+'/ul/li[1]/div/ul'
    writers_cont = base_cont.format(i)+'/ul/li[2]/div/ul'
    topcast_cont = base_cont.format(i)+'/div[2]/div[2]'

    #list of all suffixes to fetch
    fetch_list = ['title','year','pg','runtime','genre1','genre2','genre3','rating','votes','userreviews','criticreviews']
    #list of all container suffixes
    containers_list = ['directors','writers','topcast']
    #creating a dictionary for each path's results
    imdb_dict = {}

    #iterating over list
    for fetch in fetch_list:
        try:
            #accesses locals variables, to assign value of each {suffix}_path to temp_path 
            temp_path = locals()['{f}_path'.format(f=fetch)]
            #fetch desired result from selenium
            temp_var = selectXP(temp_path).text
            #use string formatting to create dict key corresponding to suffix fetched
            imdb_dict['{f}'.format(f=fetch)] = temp_var
        except Exception as e:
            print(repr(e),fetch)
            
    print(imdb_dict)
    for cont in containers_list:
        try:
            temp_path = locals()['{c}_cont'.format(c=cont)]
            container = selectXP(temp_path)
            temp_list = [c.text for c in container.find_elements(By.XPATH,temp_path+"/child::*")]

            imdb_dict['{c}'.format(c=cont)] = temp_list
            print(imdb_dict)
        except Exception as e:
            i=2
            temp_path = locals()['{c}_cont'.format(c=cont)]
            container = selectXP(temp_path)
            temp_list = [c.text for c in container.find_elements(By.XPATH,temp_path+"/child::*")]

            imdb_dict['{c}'.format(c=cont)] = temp_list
            print(imdb_dict)
    return imdb_dict

In [127]:
base_cont = '//*[@id="__next"]/main/div/section[1]/div/section/div/div[1]/child::section'
directors_path = base_cont+'/ul/li[1]/button'
writers_path = base_cont+'/ul/li[2]/button'
topcast_path = base_cont+'/child::div'
boxoffice_path = base_cont+'/div[1]'
div_sib = './following-sibling::div'

def containerButton(path,text_match):
    text_match = text_match.lower()
    buttons = browser.find_elements(By.XPATH,path)
    elements = [button for button in buttons if text_match in button.text.lower()][0]
    
    return elements

def containerContent(button,path):
    path = div_sib + path
    elements = button.find_elements(By.XPATH, path)
    content = [e.text for e in elements]
    
    return content
    
ul_li = '/child::ul/child::li'
div_div = '/child::div/child::div'
ul_li_wild = '/child::ul/child::li/child::*'

d_button = containerButton(directors_path,'director')
w_button = containerButton(writers_path,'writer')
t_button = containerButton(topcast_path,'top cast')
b_button = containerButton(boxoffice_path,'box office')

directors = containerContent(d_button, ul_li)
writers = containerContent(w_button, ul_li)
topcast = containerContent(t_button, div_div)
boxoffice = containerContent(b_button, ul_li_wild)
# d_button = [db for db in browser.find_elements(By.XPATH,directors_path) if 'Director' in db.text][0]
# w_button = [wb for wb in browser.find_elements(By.XPATH,writers_path) if 'Writer' in wb.text][0]
# t_button = [tb for tb in browser.find_elements(By.XPATH,topcast_path) if 'top cast' in tb.text.lower()][0]
# f_button = [fb for fb in browser.find_elements(By.XPATH,


# directors = [d.text for d in d_button.find_elements(By.XPATH, div_sib+'/child::ul/child::li')]
# writers = [w.text for w in w_button.find_elements(By.XPATH, div_sib+'/child::ul/child::li')]
# topcast = [t.text for t in t_button.find_elements(By.XPATH, div_sib+'/child::div/child::div')]

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "C:\Users\Mohammed\.conda\envs\assessment4env\lib\site-packages\IPython\core\interactiveshell.py", line 3457, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Mohammed\AppData\Local\Temp\ipykernel_20892\3578594524.py", line 8, in <module>
    d_button = [db for db in browser.find_elements(By.XPATH,directors_path) if 'Director' in db.text][0]
  File "C:\Users\Mohammed\.conda\envs\assessment4env\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 860, in find_elements
    return self.execute(Command.FIND_ELEMENTS, {"using": by, "value": value})["value"] or []
  File "C:\Users\Mohammed\.conda\envs\assessment4env\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 438, in execute
    response = self.command_executor.execute(driver_command, params)
  File "C:\Users\Mohammed\.conda\envs\assessment4env\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 290, in execute
    r

TypeError: object of type 'NoneType' has no len()

In [167]:
base_cont = '//*[@id="__next"]/main/div/section[1]/div/section/div/div[1]/child::section'
directors_path = base_cont+'/ul/li[1]/child::*'
writers_path = base_cont+'/ul/li[2]/child::*'
topcast_path = base_cont+'/child::div'
boxoffice_path = base_cont+'/div[1]'
#subpaths
div_sib = './following-sibling::div'
ul_li = '/child::ul/child::li'
div_div = '/child::div/child::div'
ul_li_wild = '/child::ul/child::li/child::*'

#list of all suffixes to fetch
fetch_list = ['title','year','pg','runtime','genre1','genre2','genre3','rating','votes','userreviews','criticreviews']
#list of all container suffixes
containers_list = ['directors','writers','topcast','boxoffice']
#creating a dictionary for each path's results
imdb_dict = {}
key = 'no_key'

d_button = containerButton(directors_path,'director')
w_button = containerButton(writers_path,'writer')
t_button = containerButton(topcast_path,'top cast')
b_button = containerButton(boxoffice_path,'box office')

directors = containerContent(d_button, ul_li)
writers = containerContent(w_button, ul_li)
topcast = containerContent(t_button, div_div)
boxoffice = containerContent(b_button, ul_li_wild)