Merge pull request #1 from nettoyeur/master

Fix indexing + support multiple EdX powered sites
2 parents 17b7201 + b36b158 · commit 695611c2e45524fba3cf47818dad56095a28b547 · @nonsleepr committed Jan 9, 2013
Showing with 30 additions and 9 deletions.
  1. +2 −2 README.md
  2. +8 −2 config.py
  3. +20 −5 edu_10gen.py
README.md
@@ -1,6 +1,6 @@
-##Download course videos from eudcation.10gen.com.
+##Download course videos from education.10gen.com or any other site 'Powered by EdX' (including, of course, http://edx.org itself).
-File `config.py` should be populated with login/password.
+File `config.py` should be populated with login/password and site you're downloading video from.
This script uses code from [youtube-dl](https://github.com/rg3/youtube-dl/) project to download videos.
config.py
@@ -1,2 +1,8 @@
-EMAIL = 'test@test.com'
-PASSWORD = 'password'
+# edx.org config
+EMAIL = 'your-email@he.re'
+PASSWORD='password'
+DOMAIN='www.edx.org'
+
+
+#common things
+SITE_URL = 'https://' + DOMAIN
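
The rewritten config.py doubles as the per-site switch. As a rough sketch (not part of the commit), pointing the script back at education.10gen.com should only mean changing DOMAIN, since the script reads just EMAIL, PASSWORD, DOMAIN and SITE_URL; the credentials below are placeholders:

# config.py for education.10gen.com (a sketch, not from the commit; credentials are placeholders)
EMAIL = 'your-email@he.re'
PASSWORD = 'password'
DOMAIN = 'education.10gen.com'  # also becomes the csrftoken cookie's domain (see the edu_10gen.py hunk below)

# common things
SITE_URL = 'https://' + DOMAIN  # prepended to /login, /dashboard and the course URLs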
edu_10gen.py
@@ -29,12 +29,17 @@
print "You should provide config.py file with EMAIL and PASSWORD."
sys.exit(1)
+try:
+ from config import SITE_URL, DOMAIN
+except ImportError:
+ print "You should provide config.py file with SITE_URL and DOMAIN."
+ sys.exit(1)
+
if len(sys.argv) == 2:
DIRECTORY = sys.argv[1].strip('"') + '/'
else:
DIRECTORY = ''
-SITE_URL = 'https://education.10gen.com'
login_url = '/login'
dashboard_url = '/dashboard'
youtube_url = 'http://www.youtube.com/watch?v='
@@ -52,7 +57,7 @@ def csrfCookie(csrftoken):
name='csrftoken',
value=csrftoken,
port=None, port_specified=False,
- domain='10gen.com',
+ domain=DOMAIN,
domain_specified=False,
domain_initial_dot=False,
path='/', path_specified=True,
@@ -75,6 +80,7 @@ def __init__(self):
with open(YDL_PARAMS_FILE) as fydl:
self._fd = FileDownloader(json.load(fydl))
self._fd.add_info_extractor(YoutubeIE())
+
def login(self, email, password):
try:
login_resp = self._br.open(SITE_URL + login_url, urlencode({'email':email, 'password':password}))
@@ -85,6 +91,7 @@ def login(self, email, password):
return self._logged_in
except mechanize.HTTPError, e:
sys.exit('Can\'t sign in')
+
def list_courses(self):
self.courses = []
if self._logged_in:
@@ -99,10 +106,11 @@ def list_courses(self):
course_name = my_course.h3.text
self.courses.append({'name':course_name, 'url':courseware_url})
print '[%02i] %s' % (i, course_name)
+
def list_chapters(self, course_i):
self.paragraphs = []
- if course_i <= len(self.courses) and course_i >= 0:
- course = self.courses[course_i - 1]
+ if course_i < len(self.courses) and course_i >= 0:
+ course = self.courses[course_i]
course_name = course['name']
courseware = self._br.open(SITE_URL+course['url'])
courseware_soup = BeautifulSoup(courseware.read())
@@ -120,19 +128,25 @@ def list_chapters(self, course_i):
par_url = paragraph.a['href']
self.paragraphs.append((course_name, i, j, chapter_name, par_name, par_url))
print '\t[%02i.%02i] %s' % (i, j, par_name)
+
def download(self):
+ print "\n-----------------------\nStart downloading\n-----------------------\n"
for (course_name, i, j, chapter_name, par_name, url) in self.paragraphs:
nametmpl = sanitize_filename(course_name) + '/' \
+ sanitize_filename(chapter_name) + '/' \
+ '%02i.%02i.*' % (i,j)
fn = glob.glob(DIRECTORY + nametmpl)
+
if fn:
+ print "Processing of %s skipped" % nametmpl
continue
+ print "Processing %s..." % nametmpl
par = self._br.open(SITE_URL + url)
par_soup = BeautifulSoup(par.read())
contents = par_soup.findAll('div','seq_contents')
k = 0
for content in contents:
+ #print "Content: %s" % content
content_soup = BeautifulSoup(content.text)
try:
video_type = content_soup.h2.text.strip()
@@ -148,7 +162,8 @@ def download(self):
+ sanitize_filename('%s (%s)' % (par_name, video_type)) + '.%(ext)s'
self._fd.params['outtmpl'] = outtmpl
self._fd.download([video_url])
- except:
+ except Exception as e:
+ #print "Error: %s" % e
pass
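
The "fix indexing" half of the commit is the two-line change in list_chapters above: the accepted range now matches direct indexing into self.courses, so an input of 0 no longer wraps around to the last course via courses[course_i - 1]. A minimal standalone sketch of the corrected check, assuming the printed [%02i] labels are zero-based (the course names here are made up):

# sketch of the corrected lookup; not taken from the repository
courses = ['Course A', 'Course B', 'Course C']

def pick(course_i):
    # old: course_i <= len(courses), then courses[course_i - 1]; 0 wrapped to the last entry
    # new: accept only 0 .. len(courses) - 1 and index directly
    if course_i < len(courses) and course_i >= 0:
        return courses[course_i]
    return None

print pick(0)  # 'Course A' with the fix; the old arithmetic would have returned 'Course C'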
