Merge pull request #1 from nettoyeur/master

Fix indexing + support multiple EdX powered sites
commit 695611c2e45524fba3cf47818dad56095a28b547 (2 parents: 17b7201, b36b158)
Alexander Bessonov authored January 09, 2013
README.md  (4 changes)

@@ -1,6 +1,6 @@
-##Download course videos from eudcation.10gen.com.
+##Download course videos from education.10gen.com or any other site 'Powered by EdX' (including, of course, http://edx.org itself).
 
-File `config.py` should be populated with login/password.
+File `config.py` should be populated with login/password and site you're downloading video from.
 
 This script uses code from [youtube-dl](https://github.com/rg3/youtube-dl/) project to download videos.
config.py  (10 changes)

@@ -1,2 +1,8 @@
-EMAIL = 'test@test.com'
-PASSWORD = 'password'
+# edx.org config
+EMAIL = 'your-email@he.re'
+PASSWORD='password'
+DOMAIN='www.edx.org'
+
+
+#common things
+SITE_URL = 'https://' + DOMAIN
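With the target site now configurable, pointing the downloader back at the original 10gen site is just a matter of editing `config.py`. A sketch of such a config (the e-mail and password are placeholders, not values from this commit; the domain comes from the `SITE_URL` line the next hunk removes):

```python
# config.py for the original education.10gen.com site
# (placeholder credentials; only the EMAIL/PASSWORD/DOMAIN/SITE_URL names come from the commit)
EMAIL = 'your-email@he.re'
PASSWORD = 'password'
DOMAIN = 'education.10gen.com'

# common things
SITE_URL = 'https://' + DOMAIN
```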
edu_10gen.py  (25 changes)

@@ -29,12 +29,17 @@
     print "You should provide config.py file with EMAIL and PASSWORD."
     sys.exit(1)
 
+try:
+    from config import SITE_URL, DOMAIN
+except ImportError:
+    print "You should provide config.py file with SITE_URL and DOMAIN."
+    sys.exit(1)
+
 if len(sys.argv) == 2:
     DIRECTORY = sys.argv[1].strip('"') + '/'
 else:
     DIRECTORY = ''
 
-SITE_URL = 'https://education.10gen.com'
 login_url = '/login'
 dashboard_url = '/dashboard'
 youtube_url = 'http://www.youtube.com/watch?v='
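The hard-coded `SITE_URL` is replaced by values imported from `config.py`, guarded the same way as the existing `EMAIL`/`PASSWORD` import. A minimal sketch of the module-level setup this hunk touches, assuming a `config.py` like the one above (the printed URLs are illustrative):

```python
# Sketch of the config-driven base URL and optional output directory.
import sys

try:
    from config import SITE_URL, DOMAIN
except ImportError:
    print("You should provide config.py file with SITE_URL and DOMAIN.")
    sys.exit(1)

# optional first argument: target directory, e.g.  python edu_10gen.py "My Courses"
DIRECTORY = sys.argv[1].strip('"') + '/' if len(sys.argv) == 2 else ''

# SITE_URL prefixes every request; DOMAIN is used later for the csrftoken cookie
login_url = '/login'
dashboard_url = '/dashboard'
print(SITE_URL + login_url)      # e.g. https://www.edx.org/login when DOMAIN = 'www.edx.org'
print(SITE_URL + dashboard_url)  # e.g. https://www.edx.org/dashboard
```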
@@ -52,7 +57,7 @@ def csrfCookie(csrftoken):
             name='csrftoken',
             value=csrftoken,
             port=None, port_specified=False,
-            domain='10gen.com',
+            domain=DOMAIN,
             domain_specified=False,
             domain_initial_dot=False,
             path='/', path_specified=True,
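Only part of `csrfCookie` is visible in this hunk. Assuming it builds a `cookielib.Cookie` (the keyword arguments shown match that constructor), a parameterized version might look roughly like the sketch below; the fields not visible in the hunk (version, secure, expires, and so on) are assumptions.

```python
# Sketch of a csrftoken cookie built with the configurable DOMAIN instead of
# the hard-coded '10gen.com'.
try:
    import cookielib                    # Python 2, as used by the script
except ImportError:
    import http.cookiejar as cookielib  # Python 3 equivalent

def csrf_cookie(csrftoken, domain):
    return cookielib.Cookie(
        version=0,
        name='csrftoken', value=csrftoken,
        port=None, port_specified=False,
        domain=domain, domain_specified=False, domain_initial_dot=False,
        path='/', path_specified=True,
        secure=False, expires=None, discard=True,
        comment=None, comment_url=None, rest={})
```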
@@ -75,6 +80,7 @@ def __init__(self):
         with open(YDL_PARAMS_FILE) as fydl:
             self._fd = FileDownloader(json.load(fydl))
             self._fd.add_info_extractor(YoutubeIE())
+
     def login(self, email, password):
         try:
             login_resp = self._br.open(SITE_URL + login_url, urlencode({'email':email, 'password':password}))
@@ -85,6 +91,7 @@ def login(self, email, password):
             return self._logged_in
         except mechanize.HTTPError, e:
             sys.exit('Can\'t sign in')
+
     def list_courses(self):
         self.courses = []
         if self._logged_in:
@@ -99,10 +106,11 @@ def list_courses(self):
                 course_name = my_course.h3.text
                 self.courses.append({'name':course_name, 'url':courseware_url})
                 print '[%02i] %s' % (i, course_name)
+
     def list_chapters(self, course_i):
         self.paragraphs = []
-        if course_i <= len(self.courses) and course_i >= 0:
-            course = self.courses[course_i - 1]
+        if course_i < len(self.courses) and course_i >= 0:
+            course = self.courses[course_i]
             course_name = course['name']
             courseware = self._br.open(SITE_URL+course['url'])
             courseware_soup = BeautifulSoup(courseware.read())
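This is the indexing fix from the commit title: assuming `list_courses` numbers courses from 0 (which the new bounds check implies), the old `courses[course_i - 1]` lookup fetched the course one slot before the one the user actually picked, and an index of 0 wrapped around to the last course. A tiny illustration with made-up course names:

```python
# Off-by-one illustrated (assumes the listing is printed as [00], [01], [02], ...).
courses = [{'name': 'Course A'}, {'name': 'Course B'}, {'name': 'Course C'}]

course_i = 0                               # user picks the course printed as [00]
old_pick = courses[course_i - 1]['name']   # 'Course C': wraps around to the last course
if course_i < len(courses) and course_i >= 0:
    new_pick = courses[course_i]['name']   # 'Course A': the course that was actually shown
```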
@@ -120,19 +128,25 @@ def list_chapters(self, course_i):
                     par_url = paragraph.a['href']
                     self.paragraphs.append((course_name, i, j, chapter_name, par_name, par_url))
                     print '\t[%02i.%02i] %s' % (i, j, par_name)
+
     def download(self):
+        print "\n-----------------------\nStart downloading\n-----------------------\n"
         for (course_name, i, j, chapter_name, par_name, url) in self.paragraphs:
             nametmpl = sanitize_filename(course_name) + '/' \
                      + sanitize_filename(chapter_name) + '/' \
                      + '%02i.%02i.*' % (i,j)
             fn = glob.glob(DIRECTORY + nametmpl)
+
             if fn:
+                print "Processing of %s skipped" % nametmpl
                 continue
+            print "Processing %s..." % nametmpl
             par = self._br.open(SITE_URL + url)
             par_soup = BeautifulSoup(par.read())
             contents = par_soup.findAll('div','seq_contents')
             k = 0
             for content in contents:
+                #print "Content: %s" % content
                 content_soup = BeautifulSoup(content.text)
                 try:
                     video_type = content_soup.h2.text.strip()
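The new print statements make `download()` report which filename patterns it skips because a matching file already exists; the skip logic itself is unchanged. A condensed sketch of that check, with an illustrative directory and pattern:

```python
# Skip-if-already-downloaded check in download(): glob the output pattern first,
# and only fetch the paragraph when nothing matches.
import glob

DIRECTORY = ''
nametmpl = 'Course Name/Chapter Name/01.02.*'   # illustrative pattern

if glob.glob(DIRECTORY + nametmpl):
    print("Processing of %s skipped" % nametmpl)   # a file matching the pattern exists
else:
    print("Processing %s..." % nametmpl)
    # ... fetch SITE_URL + url, parse the 'seq_contents' divs, download the video ...
```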
@@ -148,7 +162,8 @@ def download(self):
                             + sanitize_filename('%s (%s)' % (par_name, video_type)) + '.%(ext)s'
                     self._fd.params['outtmpl'] = outtmpl
                     self._fd.download([video_url])
-                except:
+                except Exception as e:
+                    #print "Error: %s" % e
                     pass
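Narrowing the bare `except:` to `except Exception as e` keeps the error object available for the commented-out debug print and, on Python 2.6+, no longer swallows `KeyboardInterrupt` or `SystemExit`. A minimal illustration with a simulated failure:

```python
# except Exception as e: keeps the error around for debugging while still
# skipping blocks that have no downloadable video.
try:
    raise ValueError('no downloadable video found in this block')  # simulated failure
except Exception as e:
    # print("Error: %s" % e)   # uncomment while debugging, as in the diff
    pass
```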
