Brett D. Grell
DA 320
Midterm: MongoDB Notebooks
11/07/2022

In [1]:
# Initial runtime setup

import re
import urllib3
import certifi

# Construct regular expressions for data scraping

title_regex = re.compile(r"class=\"title\"><h3>(.+)</h3>")
date_regex = re.compile(r"class=\"clamp-details\">\s+<span>(.+)</span>")
description_regex = re.compile(r"<div class=\"summary\">\s*([\S\s]+?)\s*<\/div>")
score_regex = re.compile(r"<span class=\"title\">Metascore:</span>\s+<a class=\"metascore_anchor\"href=\"/movie/.*?/critic-reviews\">\s+<div class=\"metascore_w large movie.+\">(.?)</div>")
image_regex = re.compile(r"<a href=\"/movie/.*\"><img src=\"(.*)\"alt=\"")

# Construct HTTP pool for neccessary connections

http= urllib3.PoolManager(ca_certs=certifi.where())

In [2]:
# Connect to database

import json
import pymongo

# Retrieve credentials needed for database connection

with open('C:\\Users\\brett\\Desktop\\DA 320\\secrets.json') as f:
    data = json.load(f)
    secret_key = data['mongodb']

# Fetch required database

client = pymongo.MongoClient(secret_key, tlsCAFile=certifi.where())
my_database = client['DA_320_Movies']
metacritic_data = my_database['Metacritic']

In [9]:
# Lets scrape some data!

import pandas

# Retrieve list of movies from specified year and page of Metacritic

def data_scraper(year: int, page: int) -> pandas.DataFrame:

    # Fetch webpage
    url = f"https://www.metacritic.com/browse/movies/score/metascore/year/filtered?year_selected={year}&sort=desc&view=detailed&page{page}"
    response = http.request('GET', url, headers={'User-Agent': 'Mozilla/5.0'})
    datastring = str(response.data, "utf-8")

    # Execute regular expressions
    titles = title_regex.findall(datastring)
    dates = date_regex.findall(datastring)
    descriptions = description_regex.findall(datastring)
    scores = score_regex.findall(datastring)
    images = image_regex.findall(datastring)

    # Debug mismatched array length. Code posted on discussion board by Stuart Ketcham
    print("title: ", len(titles), "date: ", len(dates), "description: " , len(descriptions), "score: " , len(scores), "image: " , len(images))

    # Return unified collection
    dataset = {"title": titles, "date": dates, "description": descriptions, "score": scores, "image": images}
    return pandas.DataFrame(dataset)

    




In [6]:
import time

# Write a CSV file for retrieved data

for year in range(2000, 2023):
    page = 0
    print(f"Collecting data for {year} page {page}...")

    # Retry page multiple times if needed
    while True:
        data = data_scraper(year, page)

        # Stop when a page has 0 rows
        if len(data) == 0:
            break

        # Convert dataframe into a list of movies for INSERT into MongoDB
        movies_to_insert = []
        for row in data.itertuples():
            movie = {
                "title": row.title,
                "release_date": row.date,
                "description": row.description,
                "metascore": row.score,
                "image_url": row.image,
            }
            movies_to_insert.append(movie)

        # Insert records into MongoDB
        print(f"Inserting {len(movies_to_insert)} movies for the year {year} page {page}")
        metacritic_data.insert_many(movies_to_insert)
        page = page + 1

Collecting data for 2000 page 0...
title:  100 date:  100 description:  100 score:  0 image:  0


ValueError: All arrays must be of the same length