Massive revamp, packaging
iceTwy committed Mar 30, 2014
1 parent 869ed0a commit 5a47ee5
Showing 8 changed files with 233 additions and 1 deletion.
3 changes: 3 additions & 0 deletions MANIFEST.in
@@ -0,0 +1,3 @@
exclude .gitignore
global-exclude __pycache__
include README.rst LICENSE
16 changes: 15 additions & 1 deletion README.md
@@ -2,4 +2,18 @@ imgur-scraper
=============

An imgur album scraper written in Python.
Licensed under the MIT license.

### Components

The modules in [imgur-scraper](https://github.com/iceTwy/imgur-scraper/tree/master/imgurscraper) are used by
`imgur-scrape`, a script that fetches all of the images in an imgur album or gallery album.

### Requirements

imgur-scraper requires [requests](https://github.com/kennethreitz/requests).

You also need to register an account at [imgur](https://imgur.com) to obtain an [API client ID](https://imgur.com/account/settings/apps).

### License

imgur-scraper is licensed under the MIT license; refer to the LICENSE file.
40 changes: 40 additions & 0 deletions README.rst
@@ -0,0 +1,40 @@
Introduction
============

imgur-scraper retrieves all of the images from an imgur album or gallery album.

Registration
============

imgur-scraper queries the imgur API to retrieve information about albums.

You need to register an account at imgur_, then generate an `API client ID`_.

.. _imgur: https://imgur.com/register
.. _`API client ID`: https://imgur.com/account/settings/apps


Installation
============

Use ``pip`` to install imgur-scraper::

    $ pip install imgur-scraper

Installing imgur-scraper provides the ``imgur-scrape`` command.

Usage
=====

imgur-scrape takes the following arguments::

    usage: imgur-scrape [-h] [-v] client_id url path

    positional arguments:
      client_id      imgur API client id
      url            URL of the imgur album to scrape
      path           path to save album images

    optional arguments:
      -h, --help     show this help message and exit
      -v, --verbose  increase output verbosity
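
For example, to download an album into a sub-directory of ``~/Pictures`` (the client ID and album ID below are hypothetical placeholders)::

    $ imgur-scrape YOUR_CLIENT_ID https://imgur.com/a/Dx7eL ~/Pictures -v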
17 changes: 17 additions & 0 deletions imgur-scrape
@@ -0,0 +1,17 @@
#!/usr/bin/env python

import argparse
from imgurscraper import scraper

# Command-line interface for the scraper.
parser = argparse.ArgumentParser()
parser.add_argument("client_id", help="imgur API client id")
parser.add_argument("url", help="URL of the imgur album to scrape")
parser.add_argument("path", help="path to save album images")
parser.add_argument("-v", "--verbose", help="increase output verbosity",
                    action="store_true")
args = parser.parse_args()

# Validate the URL, resolve the album ID, create the output directory,
# then download every image in the album.
imgur_resource = scraper.prepare_url(args.client_id, args.url)
album_id = scraper.get_album_id(args.client_id, imgur_resource)
album_dir = scraper.mkdir_album(args.path, album_id)
scraper.scrape(args.client_id, album_id, album_dir, imgur_resource, args.verbose)
Empty file added imgurscraper/__init__.py
Empty file.
43 changes: 43 additions & 0 deletions imgurscraper/imgur.py
@@ -0,0 +1,43 @@
# -*- coding: utf8 -*-

import shutil

import requests

class ImgurScraper(object):
    def __init__(self, client_id):
        """
        Sets the client ID (obtain yours here: https://api.imgur.com/oauth2/addclient)
        and the imgur API base URL.
        """
        self.client_id = client_id
        self.api_url = "https://api.imgur.com/3/"

    def request(self, endpoint):
        """
        Sends a request to the imgur API. Only publicly available data is accessible.
        Returns the response as JSON.
        """
        headers = {'Authorization': 'Client-ID ' + self.client_id,
                   'Accept': 'application/json'}
        request = requests.get(self.api_url + endpoint, headers=headers)
        request.raise_for_status()
        return request.json()

    def resource(self, resource, resource_id):
        """
        Retrieves a resource (e.g. an album) from the imgur API.
        Returns the response as JSON.
        """
        api_request_path = '{0}/{1}'.format(resource, resource_id)
        return self.request(api_request_path)

    def save_image(self, link, path):
        """
        Downloads an image from imgur and writes it to the given path.
        """
        request = requests.get(link, stream=True)
        request.raise_for_status()
        with open(path, 'wb') as out_file:
            shutil.copyfileobj(request.raw, out_file)

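A minimal sketch of how this class can be used directly, assuming a hypothetical client ID and album ID; the ['data']['images'][...]['link'] layout is the one scraper.py below relies on:

from imgurscraper.imgur import ImgurScraper

client = ImgurScraper('YOUR_CLIENT_ID')      # hypothetical client ID
album = client.resource('album', 'Dx7eL')    # hypothetical album ID
for image in album['data']['images']:
    # derive a file name from the image link, e.g. 'abc123.jpg'
    filename = image['link'].rsplit('/', 1)[-1]
    client.save_image(image['link'], filename)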
89 changes: 89 additions & 0 deletions imgurscraper/scraper.py
@@ -0,0 +1,89 @@
import re
import os

from imgurscraper.imgur import ImgurScraper


def prepare_url(client_id, url):
    """
    Checks that the given URL is a valid imgur album or gallery album URL.
    Returns a resource string of the form 'a/id', 'album/id' or 'gallery/id'.
    """
    url_regex = re.compile(r'^(?:https?://)?(?:i\.)?imgur\.com/(.*[^/])')
    regex_match = re.match(url_regex, url)

    if regex_match is None:
        raise ValueError('Enter a valid imgur URL.')

    imgur_resource = regex_match.group(1)
    if not imgur_resource.startswith(('a/', 'album/', 'gallery/')):
        raise ValueError('Enter the valid URL of an imgur album or gallery album.')

    client = ImgurScraper(client_id)
    request = client.request(imgur_resource)
    if imgur_resource.startswith('gallery/') and not request['data']['is_album']:
        raise ValueError('This gallery resource is not an album.')

    return imgur_resource


def get_album_id(client_id, imgur_resource):
    """
    Gets an album ID from the imgur API.
    """
    client = ImgurScraper(client_id)
    request = client.request(imgur_resource)
    return request['data']['id']


def mkdir_album(path, album_id):
    """
    Creates a sub-directory for an album to store images,
    within a specified path (i.e. path/album_id).
    Returns the full path to the directory.
    """
    path = os.path.expanduser(path)
    if not os.path.exists(path):
        raise ValueError('Incorrect path.')

    album_dir = os.path.join(path, album_id)
    if not os.path.exists(album_dir):
        os.mkdir(album_dir)
    return album_dir


def scrape(client_id, album_id, album_dir, resource, verbose=False):
    """
    Scrapes images from an imgur album.
    Verbose: prints the album ID, the number of images in
    the album, and the current download progress.
    """
    client = ImgurScraper(client_id)
    request = client.request(resource)

    images_count = request['data']['images_count']
    if verbose:
        print('Album ID: {0}\nNumber of images: {1}'.format(album_id, images_count))

    image_regex = re.compile(r'^(?:https?://)?(?:i\.)?imgur\.com/(.*)$')
    for i in range(images_count):
        image_url = request['data']['images'][i]['link']
        image_id = re.match(image_regex, image_url).group(1)
        path = os.path.join(album_dir, image_id)
        if os.path.exists(path):
            if verbose:
                print('{0}/{1} already downloaded'.format(i + 1, images_count))
        else:
            client.save_image(image_url, path)
            if verbose:
                print('Downloaded {0}/{1}'.format(i + 1, images_count))

    if verbose:
        print('Finished downloading the album.')
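
To illustrate what prepare_url() accepts, here is a minimal sketch of its URL-matching step in isolation, using hypothetical album IDs (the full function also queries the API to confirm that a gallery resource is an album):

import re

url_regex = re.compile(r'^(?:https?://)?(?:i\.)?imgur\.com/(.*[^/])')
for url in ('https://imgur.com/a/Dx7eL',
            'http://imgur.com/gallery/Dx7eL',
            'imgur.com/album/Dx7eL'):
    print(re.match(url_regex, url).group(1))
# prints: a/Dx7eL, gallery/Dx7eL, album/Dx7eL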
26 changes: 26 additions & 0 deletions setup.py
@@ -0,0 +1,26 @@
from setuptools import setup

with open('README.rst') as file:
    README = file.read()

setup(
name = 'imgur-scraper',
packages = ['imgurscraper'],
scripts = ['imgur-scrape'],
install_requires = ['requests'],
version = '0.1',
description = 'An imgur album scraper',
long_description = README,
author = 'iceTwy',
author_email = 'icetwy@icetwy.re',
license = 'MIT',
url = 'https://github.com/iceTwy/imgur-scraper',
keywords = ['imgur', 'album', 'scraper'],
classifiers = [
'Programming Language :: Python :: 3',
'Development Status :: 3 - Alpha',
'License :: OSI Approved :: MIT License',
'Operating System :: OS Independent',
'Topic :: Internet'
]
)
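
With this setup.py and the MANIFEST.in above, a source distribution can be built and installed locally using standard setuptools commands (generic commands, not part of the commit):

    $ python setup.py sdist
    $ pip install dist/imgur-scraper-0.1.tar.gz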
