# Scrape television script data from Springfield! Springfield!

## Written by Nicholas Fasano
## Created: 01/12/2022
### Website: https://www.springfieldspringfield.co.uk/

In [1]:
# Load in python packages
import requests
from bs4 import BeautifulSoup

import re
import numpy as np
import pandas as pd

import datetime
import time

import os
from concurrent.futures import ThreadPoolExecutor

In [2]:
def getTvShowLinks(tvShowLinks, titleShow, link, i, timeout=30):
    """Scrape one index page and store its TV-show links in shared lists.

    Downloads `link`, finds every anchor whose href matches
    "/episode_scripts.php", and writes the absolute URL and the anchor text
    into `tvShowLinks` and `titleShow` at position i*18 + j, where j is the
    position of the anchor among the matches on that page.

    Parameters
    ----------
    tvShowLinks : list
        Pre-allocated output list of show URLs (mutated in place).
    titleShow : list
        Pre-allocated output list of show titles (mutated in place).
    link : str
        URL of the index page to download.
    i : int
        Zero-based page index; selects which block of 18 slots is written.
    timeout : float, optional
        Seconds to wait for the server. Without a timeout a stalled
        connection would block the worker thread (and the batch waiting on
        it) indefinitely; with one, requests raises a timeout exception.

    Returns
    -------
    None
        Results are delivered through the two output lists.
    """
    # Each page owns a fixed block of 18 slots. If a page ever yields more
    # than 18 matching anchors, the extra ones land in the next page's block.
    numLinksPerPage = 18

    # load in webpage content
    page = requests.get(link, timeout=timeout)
    soup = BeautifulSoup(page.content, 'lxml')

    # find links to TV show pages and store them in this page's slots
    results = soup.find_all('a', attrs={'href': re.compile("/episode_scripts.php")})
    offset = i * numLinksPerPage
    for j, anchor in enumerate(results):
        tvShowLinks[offset + j] = 'https://www.springfieldspringfield.co.uk' + anchor['href']
        titleShow[offset + j] = anchor.text

def getTvEpisodeLinks(tvEpisodeLinks, titleEpisode, titleShowEp, titleShow, link, i, timeout=30):
    """Scrape one TV-show page and store its episode links in shared lists.

    Downloads `link`, finds every anchor whose href matches
    "view_episode_scripts.php", and stores three parallel lists (absolute
    episode URLs, episode titles, and the show title repeated once per
    episode) in slot `i` of the corresponding output lists.

    Parameters
    ----------
    tvEpisodeLinks : list
        Output list; slot `i` receives the list of episode URLs.
    titleEpisode : list
        Output list; slot `i` receives the list of episode titles.
    titleShowEp : list
        Output list; slot `i` receives `titleShow` repeated per episode.
    titleShow : str
        Title of the show whose page is being scraped.
    link : str
        URL of the show page to download.
    i : int
        Index of the slot to fill in the three output lists.
    timeout : float, optional
        Seconds to wait for the server. Without a timeout a stalled
        connection would block the worker thread indefinitely; with one,
        requests raises a timeout exception.

    Returns
    -------
    None
        Results are delivered through the three output lists.
    """
    # load in webpage content
    page = requests.get(link, timeout=timeout)
    soup = BeautifulSoup(page.content, 'lxml')

    # find links to TV episode scripts
    results = soup.find_all('a', attrs={'href': re.compile("view_episode_scripts.php")})

    # the hrefs are joined to the site root with an explicit '/'
    baseUrl = 'https://www.springfieldspringfield.co.uk/'
    tvEpisodeLinks[i] = [baseUrl + anchor['href'] for anchor in results]
    titleEpisode[i] = [anchor.text for anchor in results]
    titleShowEp[i] = [titleShow] * len(results)
    
def getScriptText(scriptText, link, i, timeout=30):
    """Download one episode page and store its script text in `scriptText[i]`.

    The script is taken from the first <div> whose class matches
    "scrolling-script-container".

    Parameters
    ----------
    scriptText : list
        Pre-allocated output list (mutated in place).
    link : str
        URL of the episode script page.
    i : int
        Index of the slot in `scriptText` to fill.
    timeout : float, optional
        Seconds to wait for the server. Without a timeout a stalled
        connection would block the worker thread indefinitely; with one,
        requests raises a timeout exception.

    Raises
    ------
    IndexError
        If the page contains no script container; `scriptText[i]` is then
        left untouched.
    """
    page = requests.get(link, timeout=timeout)
    soup = BeautifulSoup(page.content, 'lxml')
    results = soup.find_all('div', attrs={'class': re.compile("scrolling-script-container")})
    if not results:
        # same exception type as indexing an empty result list, but with a
        # message that identifies the page that failed
        raise IndexError("no scrolling-script-container found at %s" % link)
    scriptText[i] = results[0].text

In [3]:
# Scraping approach: 1) get titles for all shows in database
#                    2) find episode titles for each of the shows
#                    3) get script text for all episodes of each show
#
# Build the URL of every paginated index page (pages are numbered from 1).
numPages = 328
tvPageLinks = [
    'https://www.springfieldspringfield.co.uk/tv_show_episode_scripts.php?page=' + str(pageNumber)
    for pageNumber in range(1, numPages + 1)
]

# Pre-allocate 18 slots per index page; slots that are never filled keep the
# placeholder string 'None'.
numTvShows = len(tvPageLinks)
tvShowLinks = ['None'] * (numTvShows * 18)
titleShow = ['None'] * (numTvShows * 18)

In [4]:
# Execution time: ~25seconds at 10 pages per iteration
# The index pages are downloaded in batches of numPagesPerIteration; each
# batch gets its own thread pool, and leaving the `with` block waits for every
# request of the batch to finish before the next batch starts.
t0 = time.time()
numPagesPerIteration = 10
numIterations = int(np.ceil(numTvShows/numPagesPerIteration))
failedPages = []  # indices of pages whose worker raised an exception
for k in range(numIterations):
    batchStart = k*numPagesPerIteration
    linkIndex = range(batchStart, min(batchStart + numPagesPerIteration, numTvShows))
    with ThreadPoolExecutor(max_workers=11) as executor:
        futures = [executor.submit(getTvShowLinks, tvShowLinks, titleShow, tvPageLinks[i], i) for i in linkIndex]
    # An exception raised inside a worker is stored on its future and would
    # otherwise disappear silently, so record which pages failed.
    failedPages.extend(i for i, future in zip(linkIndex, futures) if future.exception() is not None)

t1 = time.time()
print("Execution Time = %.2f sec" % (t1-t0))
if failedPages:
    print("WARNING: %d of %d index pages could not be scraped" % (len(failedPages), numTvShows))
    

Execution Time = 30.62 sec


In [5]:
# remove duplicated links and 'None' placeholders, keeping the first
# occurrence of every link (and its title) in the original order
seenShowLinks = set()  # set membership is O(1); scanning the output list is O(n)
tvShowLinksRed = []
titleShowRed = []
for showLink, showTitle in zip(tvShowLinks, titleShow):
    if showLink == 'None' or showLink in seenShowLinks:
        continue
    seenShowLinks.add(showLink)
    tvShowLinksRed.append(showLink)
    titleShowRed.append(showTitle)

print('Total number of television shows is %d' % (len(tvShowLinksRed)))

Total number of television shows is 5566


In [6]:
# Put the show titles and links side by side in a dataframe and write it to
# a .csv file whose name carries the current date and time.
dictTemp = {
    'TV Show Title': titleShowRed,
    'TV Show Link': tvShowLinksRed,
}
df = pd.DataFrame(dictTemp)
now = datetime.datetime.now()
timeStamp = now.strftime("%Y_%m_%d_%Hhr_%Mmin_%Ssec")
df.to_csv('springfield_tvShowLinks_' + timeStamp + '.csv')

# Get episode links for all TV shows

In [7]:
# One result slot per TV show. Every slot gets its own empty list:
# `[[]]*numTvShows` would make all slots refer to the same list object, so
# any in-place change to one slot would show up in every slot.
numTvShows = len(tvShowLinksRed)
titleEpisode = [[] for _ in range(numTvShows)]
titleShowEp = [[] for _ in range(numTvShows)]
tvEpisodeLinks = [[] for _ in range(numTvShows)]


In [8]:
# Execution time: ~7-8 minutes at 10 pages per iteration
# The show pages are downloaded in batches of numPagesPerIteration; leaving
# the `with` block waits for every request of the batch to finish.
t0 = time.time()
numPagesPerIteration = 10
numIterations = int(np.ceil(numTvShows/numPagesPerIteration))
# Report progress roughly every 10%. The step is clamped to 1 because with
# fewer than 10 iterations floor(numIterations/10) is 0, and a modulo by zero
# would never trigger the report.
progressStep = max(1, numIterations // 10)
failedShows = []  # indices of shows whose worker raised an exception
for k in range(numIterations):
    batchStart = k*numPagesPerIteration
    linkIndex = range(batchStart, min(batchStart + numPagesPerIteration, numTvShows))
    with ThreadPoolExecutor(max_workers=11) as executor:
        futures = [executor.submit(getTvEpisodeLinks, tvEpisodeLinks, titleEpisode, titleShowEp, titleShowRed[i], tvShowLinksRed[i], i) for i in linkIndex]
    # An exception raised inside a worker is stored on its future and would
    # otherwise disappear silently, so record which shows failed.
    failedShows.extend(i for i, future in zip(linkIndex, futures) if future.exception() is not None)
    if (k+1) % progressStep == 0:
        t1 = time.time()
        print("%.1f Percent completed (Total Time = %.1f min)" % (100*(k+1)/numIterations,(t1-t0)/60))

if failedShows:
    print("WARNING: %d of %d show pages could not be scraped" % (len(failedShows), numTvShows))

# unwrap lists of lists
titleEpisode = [item for sublist in titleEpisode for item in sublist]
titleShowEp = [item for sublist in titleShowEp for item in sublist]
tvEpisodeLinks = [item for sublist in tvEpisodeLinks for item in sublist]


9.9 Percent completed (Total Time = 0.8 min)
19.7 Percent completed (Total Time = 1.6 min)
29.6 Percent completed (Total Time = 2.4 min)
39.5 Percent completed (Total Time = 3.2 min)
49.4 Percent completed (Total Time = 4.0 min)
59.2 Percent completed (Total Time = 4.8 min)
69.1 Percent completed (Total Time = 5.5 min)
79.0 Percent completed (Total Time = 6.3 min)
88.9 Percent completed (Total Time = 7.1 min)
98.7 Percent completed (Total Time = 7.9 min)


In [9]:
# remove duplicated links and 'None' values, keeping the first occurrence of
# every link (with its episode and show title) in the original order.
# A set is used for the "already seen" test: checking membership in the
# growing output list is quadratic in the number of links.
seenEpisodeLinks = set()
tvEpisodeLinksRed = []
titleEpisodeRed = []
titleShowEpRed = []
for episodeLink, episodeTitle, showTitle in zip(tvEpisodeLinks, titleEpisode, titleShowEp):
    if episodeLink == 'None' or episodeLink in seenEpisodeLinks:
        continue
    seenEpisodeLinks.add(episodeLink)
    tvEpisodeLinksRed.append(episodeLink)
    titleEpisodeRed.append(episodeTitle)
    titleShowEpRed.append(showTitle)

print('Total number of television script links extracted was %d' % (len(tvEpisodeLinksRed)))

Total number of television script links extracted was 132722


In [10]:
# Collect show title, episode title and script link per episode in a pandas
# dataframe and write it to a time-stamped .csv file.
dicttemp = {
    'TV Show': titleShowEpRed,
    'TV Episode Name': titleEpisodeRed,
    'TV Episode Script Link': tvEpisodeLinksRed,
}
df = pd.DataFrame(dicttemp)

now = datetime.datetime.now()
timeStamp = now.strftime("%Y_%m_%d_%Hhr_%Mmin_%Ssec")
df.to_csv('springfield_tvEpisodeLinks_' + timeStamp + '.csv')

In [11]:
# One slot per episode link; every slot starts as the placeholder string
# 'None' until a script text is stored in it.
numTvEpisodes = len(tvEpisodeLinksRed)
scriptText = ['None' for _ in range(numTvEpisodes)]

# Finally, get the script text using these links

In [12]:
# get script data -- every link is independent, so the pages are downloaded
#                    in parallel batches of numPagesPerIteration requests;
#                    leaving the `with` block waits for the whole batch.
# Execution time: ~4-4.5 hours at 10 pages per iteration
numPagesPerIteration = 10
numIterations = int(np.ceil(numTvEpisodes/numPagesPerIteration))
# Report progress roughly every 10%. The step is clamped to 1 because with
# fewer than 10 iterations floor(numIterations/10) is 0, and a modulo by zero
# would never trigger the report.
progressStep = max(1, numIterations // 10)
failedEpisodes = []  # indices of episodes whose worker raised an exception
t0 = time.time()
for k in range(numIterations):
    batchStart = k*numPagesPerIteration
    linkIndex = range(batchStart, min(batchStart + numPagesPerIteration, numTvEpisodes))
    with ThreadPoolExecutor(max_workers=11) as executor:
        futures = [executor.submit(getScriptText, scriptText, tvEpisodeLinksRed[i], i) for i in linkIndex]
    # An exception raised inside a worker is stored on its future and would
    # otherwise disappear silently, leaving that scriptText slot unfilled.
    failedEpisodes.extend(i for i, future in zip(linkIndex, futures) if future.exception() is not None)
    if (k+1) % progressStep == 0:
        t1 = time.time()
        print("%.1f Percent completed (Total Time = %.0f min)" % (100*(k+1)/numIterations,(t1-t0)/60))

if failedEpisodes:
    print("WARNING: %d of %d episode scripts could not be scraped" % (len(failedEpisodes), numTvEpisodes))

10.0 Percent completed (Total Time = 23 min)
20.0 Percent completed (Total Time = 47 min)
30.0 Percent completed (Total Time = 70 min)
40.0 Percent completed (Total Time = 94 min)
50.0 Percent completed (Total Time = 117 min)
60.0 Percent completed (Total Time = 141 min)
70.0 Percent completed (Total Time = 165 min)
80.0 Percent completed (Total Time = 188 min)
90.0 Percent completed (Total Time = 212 min)
100.0 Percent completed (Total Time = 235 min)


In [13]:
# Clean up text before saving to .csv file:
#   - newlines and carriage returns become single spaces
#   - apostrophes are dropped
#   - runs of spaces are collapsed into one space
charCleanup = str.maketrans({"\n": " ", "\r": " ", "'": None})
for j in range(numTvEpisodes):
    cleanedText = scriptText[j].translate(charCleanup)
    scriptText[j] = re.sub(' +', ' ', cleanedText)  # removes excess spaces

In [14]:
# Combine show title, episode title, script link and cleaned script text in a
# pandas dataframe and write it to a time-stamped .csv file.
dicttemp = {
    'TV Show': titleShowEpRed,
    'TV Episode Name': titleEpisodeRed,
    'TV Episode Script Link': tvEpisodeLinksRed,
    'Script Text': scriptText,
}
df = pd.DataFrame(dicttemp)
now = datetime.datetime.now()
timeStamp = now.strftime("%Y_%m_%d_%Hhr_%Mmin_%Ssec")
df.to_csv('SpringField_tvScripts_' + timeStamp + '.csv')