#!/usr/bin/env python
"""
Download videos for Coursera classes. Given a class name and the matching
cookies file, this script scrapes the course listing page for the week and
lecture names, then downloads the related videos into appropriately named
files and directories.

Examples:
coursera-dl -c cookies.txt saas
coursera-dl -c cookies.txt -l listing.html -o saas --skip-download

Author:
John Lehmann (first last at geemail dotcom)
"""

import argparse
import cookielib
import os
import re
import string
import StringIO
import subprocess
import sys
import urllib2

from BeautifulSoup import BeautifulSoup

def get_syllabus_url(class_name):
  """Return the Coursera index/syllabus URL for a class."""
  return "http://class.coursera.org/%s/lecture/index" % class_name

def load_cookies_file(cookies_file):
  """Load the cookies file, prepending the special Netscape header because
  the cookie loader is very particular about that string."""
  cookies = StringIO.StringIO()
  NETSCAPE_HEADER = "# Netscape HTTP Cookie File"
  # the trailing newline matters: without it the first cookie line would be
  # glued onto the header comment and silently dropped
  cookies.write(NETSCAPE_HEADER + "\n")
  cookies.write(open(cookies_file, 'r').read())
  cookies.flush()
  cookies.seek(0)
  return cookies
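
# For reference, a line in a Netscape-format cookies.txt is tab-separated:
#   domain  include_subdomains  path  secure  expiry  name  value
# (any concrete values here would be made up; real files come from a
# browser cookie export)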

def get_opener(cookies_file):
  """Use the cookies file to create a URL opener."""
  cj = cookielib.MozillaCookieJar()
  cookies = load_cookies_file(cookies_file)
  # nasty hack: cj.load() requires a filename, not a file object, and a
  # StringIO has no name on disk. NamedTemporaryFile was used before, but
  # it caused problems on Windows, so call the private loader directly.
  cj._really_load(cookies, "StringIO.cookies", False, False)
  return urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))

def get_page(url, cookies_file):
  """Download an HTML page using the cookiejar."""
  opener = get_opener(cookies_file)
  return opener.open(url).read()

def get_syllabus(class_name, cookies_file, local_page=False):
  """Get the course listing webpage."""
  if not (local_page and os.path.exists(local_page)):
    url = get_syllabus_url(class_name)
    page = get_page(url, cookies_file)
    print "Downloaded %s (%d bytes)" % (url, len(page))
    # cache the page if we're in 'local' mode
    if local_page:
      open(local_page, 'w').write(page)
  else:
    page = open(local_page).read()
  return page

def clean_filename(s):
  """Sanitize a string to be used as a filename."""
  # strip the trailing parenthesized portion, which holds the running time
  s = re.sub(r"\([^(]*$", "", s)
  s = s.strip().replace(':', '-').replace(' ', '_')
  valid_chars = "-_.()%s%s" % (string.ascii_letters, string.digits)
  return ''.join(c for c in s if c in valid_chars)
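
# Worked example of the sanitization above (hypothetical lecture title):
#   clean_filename("3.1: Intro to SaaS (12:34)") -> "3.1-_Intro_to_SaaS"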

def parse_syllabus(page):
  """Parse a Coursera course listing/syllabus page.
  Each section is one week of classes."""
  sections = []
  soup = BeautifulSoup(page)
  # traverse sections
  for stag in soup.findAll(attrs={'class': 'list_header'}):
    assert stag.string is not None, "couldn't find section"
    section_name = clean_filename(stag.string)
    print section_name
    videos = []
    # traverse videos
    for vtag in stag.parent.nextSibling.findAll('li'):
      assert vtag.a.contents[0], "couldn't get video name"
      vname = clean_filename(vtag.a.contents[0])
      print " ", vname,
      # find the anchor with the .mp4 reference
      url = vtag.find('a', {"href": re.compile(r"\.mp4")})["href"]
      print " ", url
      videos.append((vname, url))
    sections.append((section_name, videos))
  print "Found %d sections and %d videos on this page" % \
    (len(sections), sum(len(s[1]) for s in sections))
  if not sections:
    print "Probably bad cookies file (or wrong class name)"
  return sections
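
# The parsed result is a list of (section_name, videos) pairs, where videos
# is itself a list of (video_name, url) pairs; roughly (illustrative names):
#   [("Week_1", [("Lecture_1", "http://.../lec1.mp4"), ...]), ...]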

def download_videos(wget_bin, cookies_file, class_name, sections,
                    overwrite=False, skip_download=False,
                    only_section_num=None):
  """Download the videos described by sections."""

  def format_section(num, section):
    return "%s_%02d_%s" % (class_name.upper(), num, section)

  def format_video(num, section, video):
    return "%s_%02d_%s.mp4" % (section[0:7].lower(), num, video)

  for (secnum, (section, videos)) in enumerate(sections):
    if only_section_num and secnum+1 != only_section_num:
      continue
    sec = format_section(secnum+1, section)
    if not os.path.exists(sec):
      os.mkdir(sec)
    for (vidnum, (vname, url)) in enumerate(videos):
      vidfn = os.path.join(sec, format_video(vidnum+1, sec, vname))
      if overwrite or not os.path.exists(vidfn):
        if not skip_download:
          download_file(url, vidfn, cookies_file, wget_bin)
        else:
          open(vidfn, 'w').close() # touch

def download_file(url, fn, cookies_file, wget_bin):
  """Download a file, removing the partial file if the user aborts."""
  try:
    if wget_bin:
      download_file_wget(wget_bin, url, fn, cookies_file)
    else:
      download_file_nowget(url, fn, cookies_file)
  except KeyboardInterrupt:
    print "\nKeyboard Interrupt -- Removing partial file:", fn
    os.remove(fn)
    sys.exit()

def download_file_wget(wget_bin, url, fn, cookies_file):
  """Download a file using wget. Python could stream the file to disk
  instead, but wget is robust and gives nice visual feedback."""
  cmd = [wget_bin, url, "-O", fn, "--load-cookies", cookies_file]
  print "Executing wget:", cmd
  subprocess.call(cmd)
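
# The command built above amounts to something like (paths illustrative):
#   wget http://host/lec1.mp4 -O SAAS_01_Intro/saas_01_01_Lec.mp4 \
#     --load-cookies cookies.txt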

def download_file_nowget(url, fn, cookies_file):
  """'Native' Python downloader -- slower than wget."""
  print "Downloading %s -> %s" % (url, fn)
  urlfile = get_opener(cookies_file).open(url)
  chunk_sz = 1048576
  bytesread = 0
  f = open(fn, "wb")  # binary mode, or the video is corrupted on Windows
  while True:
    data = urlfile.read(chunk_sz)
    if not data:
      print "."
      break
    f.write(data)
    bytesread += len(data)
    print "\r%d bytes read" % bytesread,
    sys.stdout.flush()
  f.close()
  urlfile.close()

def parse_args():
  parser = argparse.ArgumentParser(description='Download Coursera.org videos.')
  # positional
  parser.add_argument('class_name', action='store',
    help='name of the class (e.g. "nlp")')
  # required
  parser.add_argument('-c', '--cookies_file', dest='cookies_file',
    action='store', required=True, help='full path to the cookies.txt file')
  # optional
  parser.add_argument('-w', '--wget_bin', dest='wget_bin',
    action='store', default=None,
    help='wget binary, if it should be used for downloading')
  parser.add_argument('-s', '--section_num', dest='only_section_num', type=int,
    action='store', help='only download this section number')
  parser.add_argument('-o', '--overwrite', dest='overwrite',
    action='store_true', default=False,
    help='whether existing video files should be overwritten (default: False)')
  parser.add_argument('-l', '--process_local_page', dest='local_page',
    help='for debugging: uses or creates local cached version of syllabus page')
  parser.add_argument('--skip-download', dest='skip_download',
    action='store_true', default=False,
    help='for debugging: skip actual downloading of videos')
  args = parser.parse_args()
  # check arguments
  if not os.path.exists(args.cookies_file):
    raise IOError("Cookies file not found: " + args.cookies_file)
  return args

def main():
  args = parse_args()
  page = get_syllabus(args.class_name, args.cookies_file, args.local_page)
  sections = parse_syllabus(page)
  download_videos(args.wget_bin, args.cookies_file, args.class_name,
                  sections, args.overwrite, args.skip_download,
                  args.only_section_num)

if __name__ == "__main__":
  main()