Massive revamp, packaging
iceTwy committed Mar 30, 2014
1 parent 869ed0a commit 5a47ee5
Showing 8 changed files with 233 additions and 1 deletion.
3 changes: 3 additions & 0 deletions MANIFEST.in
@@ -0,0 +1,3 @@
exclude .gitignore
global-exclude __pycache__
include README.rst LICENSE
16 changes: 15 additions & 1 deletion README.md
@@ -2,4 +2,18 @@ imgur-scraper
=============

An imgur album scraper written in Python.
Licensed under the MIT license.

### Components

The modules in [imgur-scraper](https://github.com/iceTwy/imgur-scraper/tree/master/imgurscraper) are used by
`imgur-scrape`, a script that fetches all of the images in an imgur album or gallery album.

### Requirements

imgur-scraper requires [requests](https://github.com/kennethreitz/requests).

You also need to register an account at [imgur](https://imgur.com) to obtain an [API client ID](https://imgur.com/account/settings/apps).

### License

imgur-scraper is licensed under the MIT license; refer to the LICENSE file.
40 changes: 40 additions & 0 deletions README.rst
@@ -0,0 +1,40 @@
Introduction
============

imgur-scraper retrieves all of the images from an imgur album or gallery album.

Registration
============

imgur-scraper queries the imgur API to retrieve information about albums.

You need to register an account at imgur_, then generate an `API client ID`_.

.. _imgur: https://imgur.com/register
.. _`API client ID`: https://imgur.com/account/settings/apps


Installation
============

Use ``pip`` to install imgur-scraper::

    $ pip install imgur-scraper

Installing imgur-scraper provides the ``imgur-scrape`` command.

Usage
=====

imgur-scrape takes the following arguments::

    usage: imgur-scrape [-h] [-v] client_id url path

    positional arguments:
      client_id      imgur API client id
      url            URL of the imgur album to scrape
      path           path to save album images

    optional arguments:
      -h, --help     show this help message and exit
      -v, --verbose  increase output verbosity
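
For example, to download an album into a sub-directory of ``~/Pictures`` (the client ID and album ID below are hypothetical placeholders)::

    $ imgur-scrape YOUR_CLIENT_ID https://imgur.com/a/Dx7eL ~/Pictures -v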
17 changes: 17 additions & 0 deletions imgur-scrape
@@ -0,0 +1,17 @@
#!/usr/bin/env python

import argparse
from imgurscraper import scraper

# Command-line interface for the scraper.
parser = argparse.ArgumentParser()
parser.add_argument("client_id", help="imgur API client id")
parser.add_argument("url", help="URL of the imgur album to scrape")
parser.add_argument("path", help="path to save album images")
parser.add_argument("-v", "--verbose", help="increase output verbosity",
                    action="store_true")
args = parser.parse_args()

# Validate the URL, resolve the album ID, create the output directory,
# then download every image in the album.
imgur_resource = scraper.prepare_url(args.client_id, args.url)
album_id = scraper.get_album_id(args.client_id, imgur_resource)
album_dir = scraper.mkdir_album(args.path, album_id)
scraper.scrape(args.client_id, album_id, album_dir, imgur_resource, args.verbose)
Empty file added imgurscraper/__init__.py
Empty file.
43 changes: 43 additions & 0 deletions imgurscraper/imgur.py
@@ -0,0 +1,43 @@
# -*- coding: utf8 -*-

import shutil

import requests

class ImgurScraper(object):
    def __init__(self, client_id):
        """
        Sets the client ID (obtain yours here: https://api.imgur.com/oauth2/addclient)
        and the imgur API base URL.
        """
        self.client_id = client_id
        self.api_url = "https://api.imgur.com/3/"

    def request(self, endpoint):
        """
        Sends a request to the imgur API. Only publicly available data is accessible.
        Returns the response as JSON.
        """
        headers = {'Authorization': 'Client-ID ' + self.client_id,
                   'Accept': 'application/json'}
        request = requests.get(self.api_url + endpoint, headers=headers)
        request.raise_for_status()
        return request.json()

    def resource(self, resource, resource_id):
        """
        Retrieves a resource (e.g. an album) from the imgur API.
        Returns the response as JSON.
        """
        api_request_path = '{0}/{1}'.format(resource, resource_id)
        return self.request(api_request_path)

    def save_image(self, link, path):
        """
        Downloads an image from imgur and writes it to the given path.
        """
        request = requests.get(link, stream=True)
        request.raise_for_status()
        with open(path, 'wb') as out_file:
            shutil.copyfileobj(request.raw, out_file)

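A minimal sketch of how this class can be used directly, assuming a hypothetical client ID and album ID; the ['data']['images'][...]['link'] layout is the one scraper.py below relies on:

from imgurscraper.imgur import ImgurScraper

client = ImgurScraper('YOUR_CLIENT_ID')      # hypothetical client ID
album = client.resource('album', 'Dx7eL')    # hypothetical album ID
for image in album['data']['images']:
    # derive a file name from the image link, e.g. 'abc123.jpg'
    filename = image['link'].rsplit('/', 1)[-1]
    client.save_image(image['link'], filename)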
89 changes: 89 additions & 0 deletions imgurscraper/scraper.py
@@ -0,0 +1,89 @@
import re
import os

from imgurscraper.imgur import ImgurScraper


def prepare_url(client_id, url):
    """
    Checks that the given URL is a valid imgur album or gallery album URL.
    Returns a resource string of the form 'a/id', 'album/id' or 'gallery/id'.
    """
    url_regex = re.compile(r'^(?:https?://)?(?:i\.)?imgur\.com/(.*[^/])')
    regex_match = re.match(url_regex, url)

    if regex_match is None:
        raise ValueError('Enter a valid imgur URL.')

    imgur_resource = regex_match.group(1)
    if not imgur_resource.startswith(('a/', 'album/', 'gallery/')):
        raise ValueError('Enter the valid URL of an imgur album or gallery album.')

    client = ImgurScraper(client_id)
    request = client.request(imgur_resource)
    if imgur_resource.startswith('gallery/') and not request['data']['is_album']:
        raise ValueError('This gallery resource is not an album.')

    return imgur_resource


def get_album_id(client_id, imgur_resource):
    """
    Gets an album ID from the imgur API.
    """
    client = ImgurScraper(client_id)
    request = client.request(imgur_resource)
    return request['data']['id']


def mkdir_album(path, album_id):
    """
    Creates a sub-directory for an album to store images,
    within a specified path (i.e. path/album_id).
    Returns the full path to the directory.
    """
    path = os.path.expanduser(path)
    if not os.path.exists(path):
        raise ValueError('Incorrect path.')

    album_dir = os.path.join(path, album_id)
    if not os.path.exists(album_dir):
        os.mkdir(album_dir)
    return album_dir


def scrape(client_id, album_id, album_dir, resource, verbose=False):
    """
    Scrapes images from an imgur album.
    Verbose: prints the album ID, the number of images in
    the album, and the current download progress.
    """
    client = ImgurScraper(client_id)
    request = client.request(resource)

    images_count = request['data']['images_count']
    if verbose:
        print('Album ID: {0}\nNumber of images: {1}'.format(album_id, images_count))

    image_regex = re.compile(r'^(?:https?://)?(?:i\.)?imgur\.com/(.*)$')
    for i in range(images_count):
        image_url = request['data']['images'][i]['link']
        image_id = re.match(image_regex, image_url).group(1)
        path = os.path.join(album_dir, image_id)
        if os.path.exists(path):
            if verbose:
                print('{0}/{1} already downloaded'.format(i + 1, images_count))
        else:
            client.save_image(image_url, path)
            if verbose:
                print('Downloaded {0}/{1}'.format(i + 1, images_count))

    if verbose:
        print('Finished downloading the album.')
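
To illustrate what prepare_url() accepts, here is a minimal sketch of its URL-matching step in isolation, using hypothetical album IDs (the full function also queries the API to confirm that a gallery resource is an album):

import re

url_regex = re.compile(r'^(?:https?://)?(?:i\.)?imgur\.com/(.*[^/])')
for url in ('https://imgur.com/a/Dx7eL',
            'http://imgur.com/gallery/Dx7eL',
            'imgur.com/album/Dx7eL'):
    print(re.match(url_regex, url).group(1))
# prints: a/Dx7eL, gallery/Dx7eL, album/Dx7eL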
26 changes: 26 additions & 0 deletions setup.py
@@ -0,0 +1,26 @@
from setuptools import setup

with open('README.rst') as file:
    README = file.read()

setup(
name = 'imgur-scraper',
packages = ['imgurscraper'],
scripts = ['imgur-scrape'],
install_requires = ['requests'],
version = '0.1',
description = 'An imgur album scraper',
long_description = README,
author = 'iceTwy',
author_email = 'icetwy@icetwy.re',
license = 'MIT',
url = 'https://github.com/iceTwy/imgur-scraper',
keywords = ['imgur', 'album', 'scraper'],
classifiers = [
'Programming Language :: Python :: 3',
'Development Status :: 3 - Alpha',
'License :: OSI Approved :: MIT License',
'Operating System :: OS Independent',
'Topic :: Internet'
]
)
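
With this setup.py and the MANIFEST.in above, a source distribution can be built and installed locally using standard setuptools commands (generic commands, not part of the commit):

    $ python setup.py sdist
    $ pip install dist/imgur-scraper-0.1.tar.gz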
