This file takes a while to run because it visits each link, loads the page, and scrapes the necessary data, even though I've added some multithreading. Because of that, I've also added code that takes any previously fetched movie data and ensures those movies are not scraped again.

In [None]:
import pickle
import os

MOVIE_LINKS_FILE = '2links24-12-2021-19-29-31-9457.pkl' # set to the output of notebook file 1
PREV_MOVIES_DATA_FILE = "test-1769.pkl" # set to None if no prev data file

# Small hand-picked sample of links. It is overwritten by the full list loaded
# from MOVIE_LINKS_FILE just below.
test_links = ['https://www.rottentomatoes.com/m/rumble_2021',
 'https://www.rottentomatoes.com/m/hurt_2021',
 'https://www.rottentomatoes.com/m/back_to_the_outback']

# Use a context manager so the file handle is closed as soon as the list is read.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open(os.path.join('data', MOVIE_LINKS_FILE), 'rb') as links_file:
    test_links = pickle.load(links_file)

print("Number of links", len(test_links))

In [None]:
import time
import logging
from datetime import datetime
from selenium.webdriver.common.by import By
from concurrent import futures
from SeleniumDriver import getDriver
from joblib import Parallel, delayed, cpu_count
print('Cpu count', cpu_count())
from concurrent.futures import ThreadPoolExecutor

def chunks(lst, n):
    """Split ``lst`` into consecutive slices of at most ``n`` items.

    Lazily yields one dict per slice: the slice itself under ``"chunk"`` and
    its zero-based position under ``"chunk_number"``.
    """
    slice_starts = range(0, len(lst), n)
    for position, start in enumerate(slice_starts):
        yield {"chunk": lst[start:start + n], "chunk_number": position}
# Timestamp computed once when this cell runs, so every file written by
# saveMovies during the same run carries the same date suffix.
NOW_DATE_STRING = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")


def saveMovies(movies, chunk_number):
    """Pickle ``movies`` into the ``raw_data`` directory.

    The file name encodes the number of movies, the chunk number and
    ``NOW_DATE_STRING``.

    Args:
        movies: Collection of scraped movie records to serialize.
        chunk_number: Identifier of the chunk the movies belong to; it is only
            used to build the file name.
    """
    name_of_file = 'movies-{num_of_movies}-chunk-{chunk_number}-date-{date}.pkl'.format(num_of_movies=len(movies), date=NOW_DATE_STRING, chunk_number=chunk_number)
    # Create the output directory on demand so a fresh checkout does not fail
    # with FileNotFoundError after the scrape has already done its work.
    os.makedirs('raw_data', exist_ok=True)
    with open(os.path.join('raw_data', name_of_file), 'wb') as f:
        pickle.dump(movies, f)

# Links that already appear in a previous run's output file. Only membership
# (``link in prev_movies_data_hash_map``) is used, so the values are just True.
prev_movies_data_hash_map = {}
if PREV_MOVIES_DATA_FILE:
    # Context manager closes the file handle once the data has been read.
    with open(os.path.join('data', PREV_MOVIES_DATA_FILE), 'rb') as prev_file:
        prev_movies_data = pickle.load(prev_file)
    for movie in prev_movies_data:
        prev_movies_data_hash_map[movie['link']] = True

def _scrapeMoviePage(driver, link):
    """Load ``link`` in ``driver`` and return the scraped fields as a dict.

    Any exception raised while loading the page or locating an element
    propagates to the caller, which treats it as a bad link.
    """
    driver.get(link)

    # TODO Don't skip whole movie if some fields error
    score_board_elm = driver.find_element(By.TAG_NAME, "score-board")
    audience_score = score_board_elm.get_attribute("audiencescore")
    tomato_meter_score = score_board_elm.get_attribute("tomatometerscore")
    release_date = driver.find_element(By.TAG_NAME, "time").text
    number_of_reviews = driver.find_element(By.CSS_SELECTOR,"a[data-qa='audience-rating-count']").text
    info = driver.find_element(By.CLASS_NAME,"content-meta")
    genre = info.find_element(By.CLASS_NAME,"genre").text
    page_title = driver.title
    return {
        "audience_score" :  audience_score,
        "tomato_meter_score" :  tomato_meter_score,
        "release_date": release_date,
        "page_title": page_title,
        "link": link,
        "number_of_reviews": number_of_reviews,
        "genre" : genre
    }


def scrapeMovies(test_links, chunk_number, checkpoint_every=800):
    """Scrape every link in ``test_links`` and save the results via saveMovies.

    Links found in ``prev_movies_data_hash_map`` are skipped. A link whose page
    fails to load or parse is recorded as errored and does not stop the run.

    Args:
        test_links: Sequence of movie page URLs to scrape.
        chunk_number: Identifier passed through to ``saveMovies``.
        checkpoint_every: Save the movies scraped so far each time the loop
            index reaches a positive multiple of this value. A falsy value
            disables the intermediate saves.

    Returns:
        None. Results are written to disk by ``saveMovies``.
    """
    if not test_links:
        # Nothing to scrape: avoid indexing an empty list and starting a browser.
        print('num of links ' , 0)
        return

    print('First url' , test_links[0])
    print('num of links ' , len(test_links))
    driver = getDriver()
    scraped_movies = []
    errored_links = []

    try:
        for i, link in enumerate(test_links):
            print("i", i)
            if checkpoint_every and i > 0 and (i % checkpoint_every == 0):
                saveMovies(scraped_movies, chunk_number)

            if link in prev_movies_data_hash_map:
                continue

            print( "Scraping link", link)
            try:
                scraped_movies.append(_scrapeMoviePage(driver, link))
            except Exception:
                # Deliberately best-effort: one broken page must not abort the chunk.
                print('bad link {link}'.format(link=link))
                # Traceback is emitted only when the logging level is DEBUG.
                logging.debug('Failed to scrape %s', link, exc_info=True)
                errored_links.append(link)

        saveMovies(scraped_movies, chunk_number)
        print('Number of errored movie links', len(errored_links))
    finally:
        # Always release the browser, even if a save or the loop itself raised.
        driver.quit()

SIZE_OF_CHUNK = 2364

# Materialize the generator once and derive both lists from the same pass.
numbered_chunks = list(chunks(test_links, SIZE_OF_CHUNK))
chunks_list = [chunk['chunk'] for chunk in numbered_chunks]
chunk_number = [chunk['chunk_number'] for chunk in numbered_chunks]
print('chunks_list', [len(chunk) for chunk in chunks_list])
print('chunk_numbers', chunk_number)

with ThreadPoolExecutor(max_workers=4) as executor:
    # Map each submitted future back to its chunk number for error reporting.
    pending = {
        executor.submit(scrapeMovies, links, number): number
        for links, number in zip(chunks_list, chunk_number)
    }
    # Retrieve every result: an exception raised inside a worker is otherwise
    # stored on the future and never shown.
    for finished in futures.as_completed(pending):
        try:
            finished.result()
        except Exception:
            logging.exception('Chunk %s failed', pending[finished])



In [None]:
# Combine multiple files
# Each entry in FILES is a pickle under raw_data; their contents are
# concatenated in order and written back out as a single pickle.
FILES = ["movies-2293-chunk-0-date-14-01-2022-23-49-10.pkl","movies-2304-chunk-1-date-14-01-2022-23-49-10.pkl","movies-2329-chunk-2-date-14-01-2022-23-49-10.pkl","movies-2341-chunk-3-date-14-01-2022-23-49-10.pkl"]
OUTPUT_FILE_NAME = "movies-with-num-rev"
combined = []
for FILE in FILES:
    # Context manager closes each input file before the next one is opened.
    with open(os.path.join('raw_data', FILE), 'rb') as f:
        movies = pickle.load(f)
    combined.extend(movies)

num_movies = len(combined)
print("Number of movies", num_movies)
# The output name records how many movies the combined file holds.
output_path = os.path.join('raw_data', "{OUTPUT_FILE_NAME}-{num_movies}.pkl".format(OUTPUT_FILE_NAME=OUTPUT_FILE_NAME, num_movies=num_movies))

with open(output_path, 'wb') as f:
    pickle.dump(combined, f)
