Permalink
Browse files

Completely rewritten

Refactored.
The script now downloads the videos itself and saves them under meaningful
names in hierarchical folders.
It uses youtube_dl code to perform the downloads.
  • Loading branch information...
1 parent 24b4b4a commit fd0f89f54edca2c2758dee3dce930b87e509366f @nonsleepr committed Oct 24, 2012
Showing with 5,261 additions and 54 deletions.
  1. +4 −3 README.md
  2. +86 −51 edu_10gen.py
  3. +1 −0 ydl_params.json
  4. +689 −0 youtube_dl/FileDownloader.py
  5. +3,365 −0 youtube_dl/InfoExtractors.py
  6. +198 −0 youtube_dl/PostProcessor.py
  7. +557 −0 youtube_dl/__init__.py
  8. +7 −0 youtube_dl/__main__.py
  9. +354 −0 youtube_dl/utils.py
View
@@ -1,9 +1,10 @@
-##Generate list of course videos from eudcation.10gen.com.
+##Download course videos from education.10gen.com.
File `config.py` should be populated with login/password.
-After completion, the script will create text file with youtube links, named after the course.
-This file could then be used to download videos with [youtube-dl](https://github.com/rg3/youtube-dl/).
+This script uses code from the [youtube-dl](https://github.com/rg3/youtube-dl/) project to download videos.
+
+The script skips videos that have already been downloaded, although it still looks up the video links on 10gen's site.
###Dependencies:
* Python 2.7
View
@@ -11,6 +11,11 @@
from urllib import urlencode
+YDL_PARAMS_FILE = 'ydl_params.json'
+from youtube_dl.FileDownloader import FileDownloader
+from youtube_dl.InfoExtractors import YoutubeIE
+from youtube_dl.utils import sanitize_filename
+
try:
from bs4 import BeautifulSoup
import mechanize
@@ -30,7 +35,7 @@
except ImportError:
TARGETDIR = ''
-site_url = 'https://education.10gen.com'
+SITE_URL = 'https://education.10gen.com'
login_url = '/login'
dashboard_url = '/dashboard'
youtube_url = 'http://www.youtube.com/watch?v='
@@ -61,63 +66,93 @@ def csrfCookie(csrftoken):
comment=None, comment_url=None,
rest={'HttpOnly': None}, rfc2109=False)
-
-br = mechanize.Browser()
-cj = mechanize.LWPCookieJar()
-csrftoken = makeCsrf()
-cj.set_cookie(csrfCookie(csrftoken))
-br.set_handle_robots(False)
-br.set_cookiejar(cj)
-br.addheaders.append(('X-CSRFToken',csrftoken))
-br.addheaders.append(('Referer','https://education.10gen.com'))
-try:
- login_resp = br.open(site_url + login_url, urlencode({'email':EMAIL, 'password':PASSWORD}))
-except mechanize.HTTPError, e:
- print "Unexpected error:", e.code
- exit()
-login_state = json.loads(login_resp.read())
-
-if not login_state.get('success'):
- print login_state.get('value')
- exit()
-
-dashboard = br.open(site_url + dashboard_url)
-dashboard_soup = BeautifulSoup(dashboard.read())
-username = dashboard_soup.find('section', 'user-info').findAll('span')[1].text
-print 'Logged as %s\n\n' % username
-
-my_courses = dashboard_soup.findAll('article', 'my-course')
-for my_course in my_courses:
- course_url = my_course.a['href']
- course_name = my_course.h3.text
- f = open(course_name + '.txt', 'w')
- print '%s' % course_name
- courseware_url = re.sub(r'\/info$','/courseware',course_url)
- courseware = br.open(site_url+courseware_url)
- courseware_soup = BeautifulSoup(courseware.read())
- chapters = courseware_soup.findAll('div','chapter')
- for chapter in chapters:
- chapter_title = chapter.find('h3').find('a').text
- print '\t%s' % chapter_title
- paragraphs = chapter.find('ul').findAll('li')
- for paragraph in paragraphs:
- par_name = paragraph.p.text
- par_url = paragraph.a['href']
- par = br.open(site_url + par_url)
+class TenGenBrowser(object):
+ def __init__(self):
+ self._br = mechanize.Browser()
+ self._cj = mechanize.LWPCookieJar()
+ csrftoken = makeCsrf()
+ self._cj.set_cookie(csrfCookie(csrftoken))
+ self._br.set_handle_robots(False)
+ self._br.set_cookiejar(self._cj)
+ self._br.addheaders.append(('X-CSRFToken',csrftoken))
+ self._br.addheaders.append(('Referer',SITE_URL))
+ self._logged_in = False
+ with open(YDL_PARAMS_FILE) as fydl:
+ self._fd = FileDownloader(json.load(fydl))
+ self._fd.add_info_extractor(YoutubeIE())
+ def login(self, email, password):
+ try:
+ login_resp = self._br.open(SITE_URL + login_url, urlencode({'email':email, 'password':password}))
+ login_state = json.loads(login_resp.read())
+ self._logged_in = login_state.get('success')
+ if not self._logged_in:
+ print login_state.get('value')
+ return self._logged_in
+ except mechanize.HTTPError, e:
+ sys.exit('Can\'t sign in')
+ def list_courses(self):
+ self.courses = []
+ if self._logged_in:
+ dashboard = self._br.open(SITE_URL + dashboard_url)
+ dashboard_soup = BeautifulSoup(dashboard.read())
+ my_courses = dashboard_soup.findAll('article', 'my-course')
+ i = 0
+ for my_course in my_courses:
+ i += 1
+ course_url = my_course.a['href']
+ courseware_url = re.sub(r'\/info$','/courseware',course_url)
+ course_name = my_course.h3.text
+ self.courses.append({'name':course_name, 'url':courseware_url})
+ print '[%02i] %s' % (i, course_name)
+ def list_chapters(self, course_i):
+ self.paragraphs = []
+ if course_i <= len(self.courses) and course_i >= 0:
+ course = self.courses[course_i - 1]
+ course_name = course['name']
+ courseware = self._br.open(SITE_URL+course['url'])
+ courseware_soup = BeautifulSoup(courseware.read())
+ chapters = courseware_soup.findAll('div','chapter')
+ i = 0
+ for chapter in chapters:
+ i += 1
+ chapter_name = chapter.find('h3').find('a').text
+ print '\t[%02i] %s' % (i, chapter_name)
+ paragraphs = chapter.find('ul').findAll('li')
+ j = 0
+ for paragraph in paragraphs:
+ j += 1
+ par_name = paragraph.p.text
+ par_url = paragraph.a['href']
+ self.paragraphs.append((course_name, i, chapter_name, par_name, par_url))
+ print '\t[%02i.%02i] %s' % (i, j, par_name)
+ def download(self):
+ j = 0
+ for (cn, i, chn, pn, url) in self.paragraphs:
+ j += 1
+ par = self._br.open(SITE_URL + url)
par_soup = BeautifulSoup(par.read())
contents = par_soup.findAll('div','seq_contents')
- par_part = 0
+ k = 0
for content in contents:
content_soup = BeautifulSoup(content.text)
try:
+ video_type = content_soup.h2.text.strip()
video_stream = content_soup.find('div','video')['data-streams']
video_id = video_stream.split(':')[1]
video_url = youtube_url + video_id
- video_type = content_soup.h2.text.strip()
- par_part += 1
- print '\t\t%s - %i-%s: %s' % (par_name, par_part, video_type, video_url)
- f.writelines(video_url+'\n')
+ k += 1
+ print '[%02i.%02i.%i] %s (%s)' % (i, j, k, pn, video_type)
+ #f.writelines(video_url+'\n')
+ outtmpl = sanitize_filename(cn) + '\\' + sanitize_filename(chn) + '\\' + '%02i.%02i.%i ' % (i,j,k) + sanitize_filename('%s (%s)' % (pn, video_type)) + '.%(ext)s'
+ self._fd.params['outtmpl'] = outtmpl
+ self._fd.download([video_url])
except:
pass
- f.close()
- print '\nYou can now downlaod lecture videos with the following command:\n youtube-dl -a "%s.txt" -A -t\n' % course_name
+
+
+tgb = TenGenBrowser()
+tgb.login(EMAIL, PASSWORD)
+tgb.list_courses()
+for c in range(0,len(tgb.courses)):
+ tgb.list_chapters(c)
+ tgb.download()
View
@@ -0,0 +1 @@
+{"username": null, "listformats": null, "skip_download": false, "usenetrc": false, "max_downloads": null, "noprogress": false, "forcethumbnail": false, "forceformat": false, "format_limit": null, "ratelimit": null, "nooverwrites": true, "forceurl": false, "writeinfojson": false, "simulate": false, "playliststart": 1, "continuedl": true, "password": null, "prefer_free_formats": false, "nopart": false, "retries": 10, "updatetime": true, "consoletitle": false, "verbose": true, "forcefilename": false, "ignoreerrors": false, "logtostderr": false, "format": null, "subtitleslang": null, "quiet": false, "outtmpl": "%(id)s.%(ext)s", "rejecttitle": null, "playlistend": -1, "writedescription": false, "forcetitle": false, "forcedescription": false, "writesubtitles": false, "matchtitle": null}
Oops, something went wrong.

0 comments on commit fd0f89f

Please sign in to comment.