Commit
Add Download Resume Feature
mxamin committed Jul 11, 2015
1 parent 5c233c8 commit afa350a
Showing 4 changed files with 136 additions and 47 deletions.
23 changes: 23 additions & 0 deletions README.md
@@ -222,6 +222,29 @@ instead. This is especially convenient, as typing usernames (email
addresses) and passwords directly on the command line can get tiresome (even
more if you happened to choose a "strong" password).

By default, when you interrupt the download process by pressing
CTRL+C, partially downloaded files are deleted from your disk and
you have to start the download from the beginning. If the download
was interrupted by something other than a KeyboardInterrupt
(CTRL+C), such as a sudden system crash, the partially downloaded
files remain on your disk, and the next time you start the process
these files are discarded from the download list, so it's your

@rbrito commented on Aug 5, 2015:

s/therefor/therefore/

@mxamin (author) commented on Aug 8, 2015:

Fixed

job to delete them manually before the next start. For this reason we
added an option called `--resume` which continues your downloads from
where they stopped:

coursera-dl -u <user> -p <pass> --resume sdn1-001

@rbrito commented on Aug 5, 2015:

First of all, sorry for not merging your great pull request earlier. I've been busy and I think that we may try to approach this differently.

What if we always allowed resuming (by default, like youtube-dl), by using the following strategy:

  • we always download the files with the .part suffix.
  • once we are done, we (atomically) rename the file to remove the .part suffix.
  • we assume that a file that doesn't have a .part suffix is already finished.

How does that sound?

@mxamin (author) commented on Aug 8, 2015:

No problem, it actually gave me more time to test my pull request on more courses and make sure it won't break the code :)

On adding .part suffix, GREAT idea, I'll work on that.
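The .part strategy rbrito sketches above could look roughly like this (a sketch only; `fetch` is a hypothetical helper standing in for the actual transfer code, not a function from this codebase):

```python
import os


def download_with_part_suffix(fetch, url, filename):
    """Download url to filename via a temporary .part file.

    A file that already exists without the .part suffix is assumed
    to be complete; otherwise we (re)download to filename.part and
    rename it once the transfer finishes.
    """
    if os.path.exists(filename):
        return  # no .part suffix: treated as already finished

    part = filename + '.part'
    fetch(url, part)           # hypothetical helper doing the transfer
    os.rename(part, filename)  # atomic on POSIX within one filesystem
```

Because the rename happens only after the transfer completes, an interrupted run leaves only a `.part` file behind, and a bare filename can safely be trusted as finished.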


This option can also be used with external downloaders:

coursera-dl --wget -u <user> -p <pass> --resume sdn1-001

*Note 1*: Some external downloaders use their own built-in resume feature,
which may not be compatible with others, so use them at your own risk.

@rbrito commented on Aug 5, 2015:

s/compatibale/compatible/

I feel that this point would become void if we used the .part strategy. Or wouldn't it?

@mxamin (author) commented on Aug 8, 2015:

Yes, that's correct, but there is one flaw: with most external downloaders, once you start downloading a course you can't switch to a different downloader.


*Note 2*: Remember that in resume mode, interrupted files **WON'T** be deleted from
your disk.

**NOTE**: If your password contains punctuation, quotes or other "funny
characters" (e.g., `<`, `>`, `#`, `&`, `|` and so on), then you may have to
escape them from your shell. With bash or other Bourne-shell clones (and
19 changes: 14 additions & 5 deletions coursera/coursera_dl.py
@@ -491,7 +491,8 @@ def download_lectures(downloader,
hooks=None,
playlist=False,
intact_fnames=False,
ignored_formats=None):
ignored_formats=None,
resume=False):
"""
Downloads lecture resources described by sections.
Returns True if the class appears completed.
@@ -533,10 +534,10 @@ def download_lectures(downloader,
lecfn = os.path.join(
sec, format_resource(lecnum + 1, lecname, title, fmt))

if overwrite or not os.path.exists(lecfn):
if overwrite or not os.path.exists(lecfn) or resume:
if not skip_download:
logging.info('Downloading: %s', lecfn)
downloader.download(url, lecfn)
downloader.download(url, lecfn, resume=resume)
else:
open(lecfn, 'w').close() # touch
last_update = time.time()
@@ -727,6 +728,12 @@ def parse_args(args=None):
help='use axel for downloading,'
' optionally specify axel bin')

parser.add_argument('--resume',
dest='resume',
action='store_true',
default=False,
help='resume incomplete downloads (default: False)')

parser.add_argument('-o',
'--overwrite',
dest='overwrite',
@@ -944,7 +951,8 @@ def download_class(args, class_name):
args.hooks,
args.playlist,
args.intact_fnames,
ignored_formats)
ignored_formats,
args.resume)

return completed

@@ -996,7 +1004,8 @@ def download_on_demand_class(args, class_name):
args.hooks,
args.playlist,
args.intact_fnames,
ignored_formats
ignored_formats,
args.resume
)
completed = completed and result

135 changes: 96 additions & 39 deletions coursera/downloaders.py
@@ -33,28 +33,30 @@ class Downloader(object):
>>> d.download('http://example.com', 'save/to/this/file')
"""

def _start_download(self, url, filename):
def _start_download(self, url, filename, resume):
"""
Actual method to download the given url to the given file.
This method should be implemented by the subclass.
"""
raise NotImplementedError("Subclasses should implement this")

def download(self, url, filename):
def download(self, url, filename, resume=False):
"""
Download the given url to the given file. When the download
is aborted by the user, the partially downloaded file is also removed.
"""

try:
self._start_download(url, filename)
self._start_download(url, filename, resume)
except KeyboardInterrupt as e:
logging.info(
'Keyboard Interrupt -- Removing partial file: %s', filename)
try:
os.remove(filename)
except OSError:
pass
# keep the file if resume is True
if not resume:
logging.info('Keyboard Interrupt -- Removing partial file: %s',
filename)
try:
os.remove(filename)
except OSError:
pass
raise e
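The keep-the-partial-file behaviour of `download` above can be exercised with a tiny stand-in subclass. This is a self-contained sketch (the base class is re-stated and `InterruptingDownloader` is invented for illustration):

```python
import os


class Downloader(object):
    """Minimal re-statement of the base class, for illustration only."""

    def _start_download(self, url, filename, resume):
        raise NotImplementedError("Subclasses should implement this")

    def download(self, url, filename, resume=False):
        try:
            self._start_download(url, filename, resume)
        except KeyboardInterrupt:
            # keep the partial file when resuming was requested
            if not resume:
                try:
                    os.remove(filename)
                except OSError:
                    pass
            raise


class InterruptingDownloader(Downloader):
    """Writes a few bytes, then simulates the user pressing CTRL+C."""

    def _start_download(self, url, filename, resume):
        with open(filename, 'wb') as f:
            f.write(b'partial')
        raise KeyboardInterrupt
```

With `resume=False` the partial file is removed before the interrupt propagates; with `resume=True` it survives so a later run can pick up where it stopped.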


@@ -94,6 +96,13 @@ def _prepare_cookies(self, command, url):
if cookie_values:
self._add_cookies(command, cookie_values)

def _enable_resume(self, command):
"""
Enable resume feature
"""

raise NotImplementedError("Subclasses should implement this")

def _add_cookies(self, command, cookie_values):
"""
Add the given cookie values to the command
@@ -107,9 +116,12 @@ def _create_command(self, url, filename):
"""
raise NotImplementedError("Subclasses should implement this")

def _start_download(self, url, filename):
def _start_download(self, url, filename, resume):
command = self._create_command(url, filename)
self._prepare_cookies(command, url)
if resume:
self._enable_resume(command)

logging.debug('Executing %s: %s', self.bin, command)
try:
subprocess.call(command)
@@ -126,6 +138,9 @@ class WgetDownloader(ExternalDownloader):

bin = 'wget'

def _enable_resume(self, command):
command.extend(['-c', ])

@rbrito commented on Aug 5, 2015:

Just an append would do it, instead of creating a list and then calling extend, but that's a really minor issue.

@mxamin (author) commented on Aug 8, 2015:

This is known as a copy-paste mistake :)
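rbrito's point is that a single flag needs no list wrapper; both forms below leave the command list identical (a trivial sketch with a hypothetical command):

```python
# Hypothetical command lists, mirroring how the patch builds the invocation.
command = ['wget', '-O', 'lecture.mp4']
command.extend(['-c', ])   # what the patch does

command2 = ['wget', '-O', 'lecture.mp4']
command2.append('-c')      # the plain append rbrito suggests

assert command == command2 == ['wget', '-O', 'lecture.mp4', '-c']
```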


def _add_cookies(self, command, cookie_values):
command.extend(['--header', "Cookie: " + cookie_values])

@@ -141,6 +156,9 @@ class CurlDownloader(ExternalDownloader):

bin = 'curl'

def _enable_resume(self, command):
command.extend(['-C', '-'])

@rbrito commented on Aug 5, 2015:

Thanks for reminding me why I don't like curl very much. :)

@mxamin (author) commented on Aug 8, 2015:

You're welcome


def _add_cookies(self, command, cookie_values):
command.extend(['--cookie', cookie_values])

@@ -156,6 +174,9 @@ class Aria2Downloader(ExternalDownloader):

bin = 'aria2c'

def _enable_resume(self, command):
command.extend(['-c', ])

@rbrito commented on Aug 5, 2015:

Same comment as for wget. :)

@mxamin (author) commented on Aug 8, 2015:

Fixed


def _add_cookies(self, command, cookie_values):
command.extend(['--header', "Cookie: " + cookie_values])

@@ -173,6 +194,10 @@ class AxelDownloader(ExternalDownloader):

bin = 'axel'

def _enable_resume(self, command):
logging.warn('Resume download not implemented for this '
'downloader!')

@rbrito commented on Aug 5, 2015:

Ah, I didn't know that!

BTW, it seems like axel is going to be "revived", at least in debian: https://bugs.debian.org/794421


def _add_cookies(self, command, cookie_values):
command.extend(['-H', "Cookie: " + cookie_values])

@@ -278,43 +303,75 @@ class NativeDownloader(Downloader):
def __init__(self, session):
self.session = session

def _start_download(self, url, filename):
logging.info('Downloading %s -> %s', url, filename)
def _start_download(self, url, filename, resume=False):
# resume has no meaning if the file doesn't exist
resume = resume and os.path.exists(filename)

headers = {}
filesize = None
if resume:
filesize = os.path.getsize(filename)
headers['Range'] = 'bytes={}-'.format(filesize)
logging.info('Resume downloading %s -> %s', url, filename)
else:
logging.info('Downloading %s -> %s', url, filename)

@rbrito commented on Aug 5, 2015:

Excellent work here!

@mxamin (author) commented on Aug 8, 2015:

Appreciate it.


attempts_count = 0
error_msg = ''
while attempts_count < 5:
r = self.session.get(url, stream=True)

if r.status_code is not 200:
logging.warn(
'Probably the file is missing from the AWS repository...'
' waiting.')

if r.reason:
error_msg = r.reason + ' ' + str(r.status_code)
r = self.session.get(url, stream=True, headers=headers)

if r.status_code != 200:
# because in resume state we are downloading only a

@rbrito commented on Aug 5, 2015:

s/downloadig/downloading/

# portion of requested file, server may return
# following HTTP codes:
# 206: Partial Content
# 416: Requested Range Not Satisfiable

@rbrito commented on Aug 5, 2015:

Should this comment here be 416?

And if we have a requested range not satisfiable, I think that we may have to download the file from scratch? I'm not really that knowledgeable, but I vaguely remember that that's what wget does (or doesn't it?).

@mxamin (author) commented on Aug 8, 2015:

Yes, 416 is correct.

I guessed the same thing about 416 (Requested Range Not Satisfiable), but after running the code some files didn't download completely, or they started downloading from the beginning.
So I looked up the wget code, and this is exactly what it does when facing HTTP code 416.
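The resume logic under discussion boils down to sending a `Range` header and classifying the status code. A minimal sketch of just that decision (the function name is illustrative; the real code interleaves this with the retry loop):

```python
def classify_resume_response(status_code, resume):
    """Decide what to do with a response while (possibly) resuming.

    Returns 'write' (stream the body to disk), 'done' (the file is
    already complete) or 'retry' (treat as an error and back off).
    """
    if status_code == 200:
        # Full body: either we never sent a Range header, or the
        # server ignored it (callers must then write from scratch,
        # not append).
        return 'write'
    if resume and status_code == 206:
        return 'write'  # Partial Content: exactly the requested range
    if resume and status_code == 416:
        return 'done'   # Range not satisfiable: nothing left to fetch
    return 'retry'
```

The header itself would be built as `{'Range': 'bytes=%d-' % os.path.getsize(filename)}`, matching the code above.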

# which are OK for us.
if resume and r.status_code == 206:
pass
elif resume and r.status_code == 416:
logging.info('%s already downloaded', filename)
r.close()
return True
else:
error_msg = 'HTTP Error ' + str(r.status_code)

wait_interval = 2 ** (attempts_count + 1)
msg = 'Error downloading, will retry in {0} seconds ...'
print(msg.format(wait_interval))
time.sleep(wait_interval)
attempts_count += 1
continue
print(r.status_code)
print(url)
print(filesize)
logging.warn('Probably the file is missing from the AWS '
'repository... waiting.')

if r.reason:
error_msg = r.reason + ' ' + str(r.status_code)
else:
error_msg = 'HTTP Error ' + str(r.status_code)

wait_interval = 2 ** (attempts_count + 1)

@rbrito commented on Aug 5, 2015:

Is this retrying thing really useful for people? I always disliked it, but kept it for the sake of those that may need it...

@mxamin (author) commented on Aug 8, 2015:

Me neither, but my goal in adding a new feature is to add a feature, not to remove an already implemented one. It's actually up to you (the project maintainer) to remove unneeded parts.
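For reference, the retry schedule being debated is a plain exponential backoff; the waits grow as `2 ** (attempts_count + 1)` seconds. A sketch of just the arithmetic (the function is illustrative, not part of the codebase):

```python
def backoff_schedule(max_attempts=5):
    """Seconds waited before each retry, as in the loop above."""
    return [2 ** (attempts_count + 1) for attempts_count in range(max_attempts)]
```

`backoff_schedule()` evaluates to `[2, 4, 8, 16, 32]`, i.e. about a minute of total waiting across the five attempts before the downloader gives up.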

msg = 'Error downloading, will retry in {0} seconds ...'
print(msg.format(wait_interval))
time.sleep(wait_interval)
attempts_count += 1
continue

if resume and r.status_code == 200:
# if the server returns HTTP code 200 while we are in
# resume mode, it means that the server does not support
# partial downloads.
resume = False

content_length = r.headers.get('content-length')
progress = DownloadProgress(content_length)
chunk_sz = 1048576
with open(filename, 'wb') as f:
progress.start()
while True:
data = r.raw.read(chunk_sz, decode_content=True)
if not data:
progress.stop()
break
progress.report(r.raw.tell())
f.write(data)
progress = DownloadProgress(content_length)
progress.start()
f = open(filename, 'ab') if resume else open(filename, 'wb')
while True:
data = r.raw.read(chunk_sz, decode_content=True)
if not data:
progress.stop()
break
progress.report(r.raw.tell())
f.write(data)
f.close()
r.close()

@rbrito commented on Aug 5, 2015:

Double close?

@mxamin (author) commented on Aug 8, 2015:

The first one closes the file handle and the second one closes the request handle. Should I change anything?
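Nothing is wrong with the two `close()` calls, but the same cleanup can be expressed with context managers so neither handle leaks if writing raises. A sketch with a stand-in response object (`FakeResponse` and `write_body` are invented for illustration):

```python
from contextlib import closing


class FakeResponse(object):
    """Stand-in for the requests response; only close() matters here."""

    def __init__(self):
        self.closed = False

    def close(self):
        self.closed = True


def write_body(filename, response, chunks):
    # both the file and the response are closed even if a write fails
    with open(filename, 'wb') as f, closing(response):
        for data in chunks:
            f.write(data)
```

`contextlib.closing` wraps any object with a `close()` method, so the real `requests` response could be handled the same way.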

return True

@rbrito commented on Aug 5, 2015:

This is a very nice contribution. I think that we should split it into a separate file. I would certainly like to beat it a bit more in other projects, so that we can expose it to more widespread usage.

@mxamin (author) commented on Aug 8, 2015:

I agree, happy to help.


6 changes: 3 additions & 3 deletions coursera/test/test_downloaders.py
@@ -70,7 +70,7 @@ def test_bin_not_found_raises_exception():
d = downloaders.ExternalDownloader(None, bin='no_way_this_exists')
d._prepare_cookies = lambda cmd, cv: None
d._create_command = lambda x, y: ['no_way_this_exists']
assertRaises(OSError, d._start_download, 'url', 'filename')
assertRaises(OSError, d._start_download, 'url', 'filename', False)


def test_bin_is_set():
@@ -186,7 +186,7 @@ class IObject(object):

class MockSession(object):

def get(self, url, stream=True):
def get(self, url, stream=True, headers={}):
object_ = IObject()
object_.status_code = 400
object_.reason = None
@@ -197,7 +197,7 @@ def get(self, url, stream=True):

session = MockSession()
d = downloaders.NativeDownloader(session)
assert d._start_download('download_url', 'save_to') is False
assert d._start_download('download_url', 'save_to', False) is False

time.sleep = _sleep


0 comments on commit afa350a
