# Scrape movie script data from Springfield! Springfield!

## Written by Nicholas Fasano
## Created: 01/12/2022
### Website: https://www.springfieldspringfield.co.uk/

In [1]:
# Load in python packages
import requests
from bs4 import BeautifulSoup

import re
import numpy as np
import pandas as pd

import datetime
import time

import os
from concurrent.futures import ThreadPoolExecutor

In [2]:
# function to scrape script links
def getMovieScriptLinks(movieScriptLinks, movieTitle, link, i, timeout=30):
    """Record the script links and titles found on one index page.

    The results are written in place into the pre-allocated lists, in the
    block of ``numLinksPerPage`` slots that starts at ``i*numLinksPerPage``.

    Parameters
    ----------
    movieScriptLinks : list
        Output list; receives the absolute URL of each script page.
    movieTitle : list
        Output list; receives the anchor text of each script link.
    link : str
        URL of the index page to download.
    i : int
        Position of this index page; selects the block of output slots.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its timeout. Without one a
        stalled connection would block the calling worker indefinitely.

    Notes
    -----
    Any exception raised by ``requests.get`` (including a timeout)
    propagates to the caller. If a page yields more than
    ``numLinksPerPage`` matching anchors, the extra entries land in the
    slots reserved for the following page.
    """
    numLinksPerPage = 17
    baseUrl = 'https://www.springfieldspringfield.co.uk'

    # load in webpage content
    page = requests.get(link, timeout=timeout)
    soup = BeautifulSoup(page.content,'lxml')

    # every anchor whose href matches the script-page pattern is one movie
    results = soup.find_all('a',attrs={'href':re.compile("/movie_script.php")})
    firstSlot = i*numLinksPerPage
    for j, anchor in enumerate(results):
        # hrefs are concatenated onto the site root to form absolute URLs
        movieScriptLinks[firstSlot+j] = baseUrl+anchor['href']
        movieTitle[firstSlot+j] = anchor.text
        
        
def getScriptText(scriptText,link,i,timeout=30):
    """Download one script page and store its text in ``scriptText[i]``.

    Parameters
    ----------
    scriptText : list
        Output list; slot ``i`` receives the text of the script container.
    link : str
        URL of the script page to download.
    i : int
        Index of the slot in ``scriptText`` to fill.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its timeout. Without one a
        stalled connection would block the calling worker indefinitely.

    Raises
    ------
    IndexError
        If the page contains no ``div`` whose class matches
        ``movie_script``; ``scriptText[i]`` is left untouched in that case.

    Any exception raised by ``requests.get`` (including a timeout) also
    propagates to the caller.
    """
    page = requests.get(link, timeout=timeout)
    soup = BeautifulSoup(page.content,'lxml')
    results = soup.find_all('div',attrs={'class':re.compile("movie_script")})
    if not results:
        # same exception type as indexing an empty result list, but the
        # message now says which page was missing its script container
        raise IndexError('no movie_script div found at %s' % link)
    # only the first matching container is used
    scriptText[i] = results[0].text

In [3]:
# Scraping approach: 1) collect the links to the webpages that hold the script data
#                    2) scrape the script text from those webpages
#
# The site indexes movie scripts by the first character of the film title.
# Each index entry spans several pages with 17 movie titles per page, and
# every title links to the page that holds the script for that film.

# index characters used in the "order" query parameter: A-Z followed by '0'
letters = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ0')

# hard-coded number of index pages for each entry of `letters` (same order)
numPages = [
    151, 141, 117, 109, 47, 74, 65, 89, 59,   # A-I
    39, 47, 89, 126, 54, 39, 86, 6, 73,       # J-R
    182, 335, 21, 26, 70, 3, 16, 12, 33,      # S-Z, '0'
]
totalPages = np.sum(numPages)
print('%d pages with movie links' % totalPages)

2109 pages with movie links


## First find links to movie script data

In [4]:
# Build the URL of every index page: one per (letter, page number) pair,
# with page numbers starting at 1
movieTitlelinks = [
    'https://www.springfieldspringfield.co.uk/movie_scripts.php?order=' + letter + '&page='+str(pageNumber)
    for letter, pageCount in zip(letters, numPages)
    for pageNumber in range(1, pageCount + 1)
]
# NOTE: despite the name, this is the number of index pages, not of movies
numMovies = len(movieTitlelinks)

# Initialize python lists to collect movie script links and movie titles,
# reserving 17 slots for every index page
# Note 1: the movie title contains the year the movie was released
# Note 2: There may be fewer than 17 links on the final page of each letter, so
# the unused 'None' placeholders are trimmed after link extraction is completed
numSlots = numMovies * 17
movieScriptLinks = ['None'] * numSlots
movieTitle = ['None'] * numSlots

In [5]:
# Execution time: ~3.5 minutes at 10 pages per iteration
# Index pages are downloaded in batches; each batch gets its own thread pool,
# and leaving the `with` block waits for every request of the batch to finish
t0 = time.time()
numPagesPerIteration = 10
# number of batches, rounded up so a final partial batch is included
numIterations = (numMovies + numPagesPerIteration - 1) // numPagesPerIteration
for k in range(numIterations):
    batchStart = k*numPagesPerIteration
    batchStop = min(batchStart + numPagesPerIteration, numMovies)
    linkIndex = list(range(batchStart, batchStop))
    with ThreadPoolExecutor(max_workers=11) as executor:
        for i in linkIndex:
            # the worker writes its results straight into the shared lists
            executor.submit(getMovieScriptLinks, movieScriptLinks, movieTitle, movieTitlelinks[i], i)

t1 = time.time()

elapsedSeconds = t1-t0
print("Execution Time = %.2f sec" % elapsedSeconds)
    

Execution Time = 204.13 sec


In [6]:
# remove duplicated links and 'None' values
# The first occurrence of every link is kept, in its original order, along
# with the title stored at the same position. A set remembers the links that
# were already kept, so each membership test is constant time instead of a
# scan through the growing result list.
movieScriptLinksRed = []
movieTitleRed = []
seenLinks = set()
for scriptLink, title in zip(movieScriptLinks, movieTitle):
    # 'None' marks a slot that was never filled by the scraper
    if scriptLink == 'None' or scriptLink in seenLinks:
        continue
    seenLinks.add(scriptLink)
    movieScriptLinksRed.append(scriptLink)
    movieTitleRed.append(title)

In [7]:
# Store the titles and their script links in a Pandas dataframe and write it
# to a .csv file whose name carries the current local timestamp
now = datetime.datetime.now()
linksFileName = 'springfield_MovieScriptLinks_'+now.strftime("%Y_%m_%d_%Hhr_%Mmin_%Ssec")+'.csv'
df = pd.DataFrame({
    'Movie Title': movieTitleRed,
    'Movie Script Link': movieScriptLinksRed,
})
df.to_csv(linksFileName)

# Now collect movie script text using these links

In [8]:
# one slot per unique script link; 'None' marks a script not downloaded yet
numMovies = len(movieScriptLinksRed)
scriptText = ['None' for _ in range(numMovies)]

In [9]:
# Execution time: ~75 minutes at 10 pages per iteration
# Script pages are downloaded in batches; each batch gets its own thread pool,
# and leaving the `with` block waits for every request of the batch to finish
numPagesPerIteration = 10
numIterations = int(np.ceil(numMovies/numPagesPerIteration))
# Report progress about every 10% of the batches. The step is at least 1 so
# the modulo below is never taken by zero when there are fewer than 10 batches.
progressStep = max(1, numIterations // 10)
t0 = time.time()
for k in range(numIterations):
    linkIndex = [k*numPagesPerIteration + r for r in range(numPagesPerIteration) if k*numPagesPerIteration + r < numMovies]
    with ThreadPoolExecutor(max_workers=11) as executor:
        for i in linkIndex:
            # the worker writes the script text straight into scriptText[i]
            executor.submit(getScriptText, scriptText, movieScriptLinksRed[i], i)
    if (k+1) % progressStep == 0:
        t1 = time.time()
        print("%.1f Percent completed (Total Time = %.0f min)" % (100*(k+1)/numIterations,(t1-t0)/60))

10.0 Percent completed (Total Time = 8 min)
20.0 Percent completed (Total Time = 15 min)
30.0 Percent completed (Total Time = 23 min)
40.0 Percent completed (Total Time = 31 min)
50.0 Percent completed (Total Time = 39 min)
60.0 Percent completed (Total Time = 47 min)
70.0 Percent completed (Total Time = 56 min)
80.0 Percent completed (Total Time = 64 min)
90.0 Percent completed (Total Time = 74 min)
100.0 Percent completed (Total Time = 82 min)


In [10]:
# numMovies is the number of script links that were queued for download
summaryLine = 'Total number of movie scripts extracted was %d' % numMovies
print(summaryLine)

Total number of movie scripts extracted was 35595


In [11]:
# Clean up text before saving to .csv file:
# line breaks become spaces, apostrophes are dropped, and runs of spaces
# are collapsed into a single space
multipleSpaces = re.compile(' +')
for idx in range(numMovies):
    flattened = scriptText[idx].replace("\n", " ").replace("\r", " ").replace("\'", "")
    scriptText[idx] = multipleSpaces.sub(' ', flattened)

In [12]:
# Combine titles, links and cleaned script text in a pandas dataframe and
# write it to a .csv file whose name carries the current local timestamp
now = datetime.datetime.now()
scriptsFileName = 'SpringField_MovieScripts_'+now.strftime("%Y_%m_%d_%Hhr_%Mmin_%Ssec")+'.csv'
df = pd.DataFrame({
    'Movie Title': movieTitleRed,
    'Movie Script Link': movieScriptLinksRed,
    'Script Text': scriptText,
})
df.to_csv(scriptsFileName)