In [0]:
# import packages
from bs4 import BeautifulSoup
import requests
import re

In [2]:
def get_episodes_links():
  """Return the links to the scripts of all Seinfeld episodes"""
  episodes_page = 'http://www.seinology.com/scripts-english.shtml'
  r = requests.get(episodes_page)
  page_content = BeautifulSoup(r.content, "html.parser")


  base_url = 'http://www.seinology.com/'
  links = [base_url + episode['href'] for episode in page_content.select('a[href*=shtml]') if 'scripts/script' in episode['href']]

  # remove duplicates 
  # https://stackoverflow.com/questions/7961363/removing-duplicates-in-lists
  links = list(dict.fromkeys(links))


  # remove highlights episodes that have no scripts
  links = list(filter(lambda ep: '100' not in ep and '177' not in ep, links))
    
  return links 

# print out the links of the first 10 episodes
for episode in get_episodes_links()[:10]:
  print(episode)

http://www.seinology.com//scripts/script-01.shtml
http://www.seinology.com//scripts/script-02.shtml
http://www.seinology.com//scripts/script-03.shtml
http://www.seinology.com//scripts/script-04.shtml
http://www.seinology.com//scripts/script-05.shtml
http://www.seinology.com//scripts/script-06.shtml
http://www.seinology.com//scripts/script-07.shtml
http://www.seinology.com//scripts/script-08.shtml
http://www.seinology.com//scripts/script-09.shtml
http://www.seinology.com//scripts/script-10.shtml


In [3]:
def get_episode_script(link):
  """Given a link to an episode, returns the text of the script of this episode"""    
  r = requests.get(link)
  page_content = BeautifulSoup(r.content, "html.parser")
  text = page_content.get_text()
  
  # script starts after a line of 
  # ==================================================================
  # script content
  # 'The end' | 'The End' | 'THE END' + \s | \s\n
  # for episode 179-180
  # ==================================================================
  # cast info
  # ==================================================================
  # script content
  # The End
  text = re.sub(r'END OF SHOW|\(To be continued.*\)|To be continued.*', 'The End', text, flags=re.IGNORECASE) # episode 17, 41, 45, 63, 133
  if 'script-64' in link:
    text = re.sub(r'(===\[)|(\]={3,})', '', text) 
  if 'script-72' in link:
    text = re.sub(r'\tTHE END', '', text) 
    text = re.sub(r'TRUELY THE END', 'THE END', text) 
  if '82and83' in link:
    text = re.sub(r'(Copyright 2006 seinology)', r'The End\n\1', text)
  if '115' in link or '144' in link:
    text = text.replace('\xa0', '')
  if '119' in link:
    text = re.sub(r'\(IMG.*\)', '', text) 
  if '123' in link:
    text = re.sub(r'<IMG.*>', '', text) 
  
  pattern = re.compile(r'={30,}\n([^=]*)the end\W*\n', re.IGNORECASE)
  match = pattern.search(text)
  content = match.group(1)
  
  # replace curly quotes and apostrophes
  content = re.sub(r'(“|”)', r'"', content) 
  content = re.sub(r'(‘|’)', r"'", content) 
 
  # change Kessler name to Kramer in episode 1 script
  if 'script-01' in link:
    content = re.sub('KESSLER', 'KRAMER', content)
    content = re.sub('Kessler', 'Kramer', content)

  return content

# script of episode 51, The Contest, which is considered one of the best Seinfeld episodes
print(get_episode_script('http://www.seinology.com/scripts/script-51.shtml')[:2000])


							[Setting: Monk's Coffee shop] 

							(Jerry and Kramer are sitting opposite Elaine at a booth, eating lunch) 

							JERRY: (To Elaine) Let me ask you a question. 

							ELAINE: Mm-hm. 

							JERRY: You're a hostage, captured by terrorists- 

							ELAINE: (Smiling, chewing) Who, me? 

							JERRY: You, anybody - whatever. You're in the little room, you're chained to the floor, you're there for a long time.. do you think they would ever consider doing the laundry? 

							ELAINE: (Matter-of-factly) They have to, it's in the Geneva Convention. 

							KRAMER: (Imitating a Turkish terrorist) You! Take off your socks, your pants, your underwear. We're doing the wash. C'mon! Take it off, take it off! 

							(Jerry and Elaine both laugh at Kramer's impression as George slowly enters. He's in a melancholy state) 

							KRAMER: Hey, Georgie. 

							JERRY AND ELAINE: Hi. 

							(George sits down next to Elaine - opposite Kramer) 

							JERRY: (To George) What's the

In [4]:
def get_text_corpus():
  """Merge all the scripts into a text corpus - corpus.txt file"""
  corpus = open('text.txt', 'w') 
  for link in get_episodes_links():
    corpus.write(get_episode_script(link))
  corpus.close()
  
  def clean_corpus(input, output):
    """Remove empty line and strip the corpus"""
    #   https://stackoverflow.com/questions/37682955/how-to-delete-empty-lines-from-a-txt-file
    with open(input) as infile, open(output, 'w') as outfile:
      for line in infile:
        if line.strip():
          outfile.write(line.strip() + '\n') 
  
  clean_corpus('text.txt', 'corpus.txt')
  print('Done!')

get_text_corpus()

Done!


In [5]:
# test open and read from corpus.txt
filtered = open('corpus.txt', 'r') 
 
print(filtered.read(2000))

INT. COMEDY CLUB – NIGHT
(Jerry is on stage, performing.)
JERRY: Do you know what this is all about? Do you know, why we're here? To be out, this is out...and out is one of the single most enjoyable experiences of life. People...did you ever hear people talking about "We should go out"? This is what they're talking about...this whole thing, we're all out now, no one is home. Not one person here is home, we're all out! There are people tryin' to find us, they don't know where we are. (on an imaginary phone) "Did you ring?, I can't find him." "Where did he go?" "He didn't tell me where he was going". He must have gone out. You wanna go out: you get ready, you pick out the clothes, right? You take the shower, you get all ready, get the cash, get your friends, the car, the spot, the reservation...Then you're standing around, whatta you do? You go: "We gotta be getting back". Once you're out, you wanna get back! You wanna go to sleep, you wanna get up, you wanna go out again tomorrow, right

In [0]:
# download corpus.txt file from colab to local system
from google.colab import files

files.download('corpus.txt')