# Scrape movie script data from IMSDb.com

## Written by Nicholas Fasano
## Created: 01/08/2022
### Website: https://imsdb.com/

In [13]:
# Load in python packages
import requests
from bs4 import BeautifulSoup

import re
import numpy as np
import pandas as pd

import datetime
import time


In [14]:
# Collect the link to every movie's 'main page' from the IMSDb all-scripts index.
# Each main page links to the script text for that movie and also lists
# writers, genres, ratings and reviews.
#
# A timeout is passed because requests otherwise waits forever on a stalled
# connection, and the status is checked so that a failed index download stops
# here instead of silently producing an empty list of movies.
page = requests.get('https://imsdb.com/all-scripts.html', timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'lxml')

# Movie main pages are the anchors whose href contains "/Movie Scripts/".
results = soup.find_all('a', attrs={'href': re.compile("/Movie Scripts/")})
movieLinks = ['https://imsdb.com' + link['href'] for link in results]
numMovies = len(movieLinks)

In [15]:
# Get the script link, genres and title from each movie's main page.
# Ratings and comments are ignored for now.
#
# Results are stored in three lists indexed like movieLinks:
#   movieScriptLinks[j] - absolute URL of the script page, or
#                         'Script_Not_Available' when the page has no such link
#   genre[j]            - space-separated genre names ('' when none are found)
#   titleStr[j]         - movie title with the ' Script' suffix removed
genre = ['None'] * numMovies
titleStr = ['None'] * numMovies
movieScriptLinks = ['None'] * numMovies

# The first 18 "/genre/" anchors are skipped: per the original author's note
# they belong to the site sidebar, not to the movie itself.
NUM_SIDEBAR_GENRE_LINKS = 18

# Compile the href patterns once instead of on every iteration.
scriptHrefPattern = re.compile("/scripts/")
genreHrefPattern = re.compile("/genre/")

for j in range(numMovies):
    # A timeout keeps one stalled connection from hanging the whole scrape.
    # The HTTP status is intentionally not checked: missing pages are
    # recognised below through their 'Not Found' heading.
    page = requests.get(movieLinks[j], timeout=30)
    soup = BeautifulSoup(page.content, 'lxml')

    # First get the movie script link from the movie main page.
    results = soup.find('a', attrs={'href': scriptHrefPattern})
    if results is None:
        movieScriptLinks[j] = 'Script_Not_Available'
    else:
        movieScriptLinks[j] = 'https://imsdb.com' + results['href']

    # Next get the movie genres. An href such as "/genre/Action" splits into
    # ['', 'genre', 'Action'], so element 2 is the genre name.
    results = soup.find_all('a', attrs={'href': genreHrefPattern})
    genre[j] = ' '.join(
        anchor['href'].split('/')[2]
        for anchor in results[NUM_SIDEBAR_GENRE_LINKS:]
    )

    # Finally get the title. The page title is taken from the second <h1>;
    # when the page is a 'Not Found' page, or does not have two headings at
    # all, fall back to the file name at the end of the link.
    headings = soup.find_all('h1')
    if len(headings) < 2 or headings[0].text == 'Not Found':
        # get title from link
        titleStrTemp = movieLinks[j].split('/')[-1].split('.html')[0]
    else:
        # get title from webpage
        titleStrTemp = headings[1].text

    titleStr[j] = titleStrTemp.replace(' Script', '')
    

In [17]:
# Get the script text for all movies in the database.
#
# scriptText[j] ends up as one of:
#   - the text of the script page's <pre> element, when there is one
#   - otherwise the text of the first <td> whose class matches "scrtext"
#   - 'Script_Link_Broken' when the page has neither (404 error)
#   - 'Script_Not_Available' when the movie page had no script link
scriptText = ['None'] * numMovies

# Compile the class pattern once instead of on every iteration.
scriptCellClass = re.compile("scrtext")

for j in range(numMovies):
    URL = movieScriptLinks[j]
    if URL == 'Script_Not_Available':
        scriptText[j] = 'Script_Not_Available'
        continue

    # A timeout keeps one stalled connection from hanging the whole scrape.
    # The HTTP status is intentionally not checked: broken links are
    # recognised below by the missing script body.
    page = requests.get(URL, timeout=30)
    soup = BeautifulSoup(page.content, 'lxml')

    # Prefer the <pre> element when the page has one.
    preTag = soup.find('pre')
    if preTag is not None:
        scriptText[j] = preTag.text
        continue

    # Otherwise fall back to the first table cell with the script class.
    scriptCell = soup.find('td', attrs={'class': scriptCellClass})
    if scriptCell is None:
        scriptText[j] = 'Script_Link_Broken'  # 404 error
    else:
        scriptText[j] = scriptCell.text
    

In [18]:
# Normalise every script before it is written to the .csv file:
# line feeds and carriage returns become spaces, apostrophes are dropped,
# and any run of spaces is collapsed into a single space.
charTable = str.maketrans({"\n": " ", "\r": " ", "'": None})
spaceRun = re.compile(' +')
for idx in range(numMovies):
    flattened = scriptText[idx].translate(charTable)
    scriptText[idx] = spaceRun.sub(' ', flattened)

In [19]:
# Gather the scraped columns into a pandas DataFrame and write it to a .csv
# file whose name carries the current date and time.
dicttemp = {
    'Title': titleStr,
    'Genres': genre,
    'Movie Info Link': movieLinks,
    'Movie Script Link': movieScriptLinks,
    'Script Text': scriptText,
}
df = pd.DataFrame(dicttemp)

now = datetime.datetime.now()
timeStamp = now.strftime("%Y_%m_%d_%Hhr_%Mmin_%Ssec")
df.to_csv('IMSDb_MovieScripts_' + timeStamp + '.csv')