-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
iceTwy
committed
Mar 30, 2014
1 parent
869ed0a
commit 5a47ee5
Showing
8 changed files
with
233 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
exclude .gitignore | ||
global-exclude __pycache__ | ||
include README.rst LICENSE |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
Introduction | ||
============ | ||
|
||
imgur-scraper is a scraper that helps you retrieve all images from an imgur album. | ||
|
||
Registration | ||
============ | ||
|
||
imgur-scraper queries the imgur API to retrieve information about albums. | ||
|
||
You need to register an account at imgur_ and then generate an `API client ID`_. | ||
|
||
.. _imgur: https://imgur.com/register | ||
.. _`API client ID`: https://imgur.com/account/settings/apps | ||
|
||
|
||
Installation | ||
============ | ||
|
||
Simply use `pip` to install imgur-scraper: | ||
|
||
$ pip install imgur-scraper | ||
|
||
Installing imgur-scraper provides the `imgur-scrape` command-line script. | ||
|
||
Usage | ||
===== | ||
|
||
imgur-scrape can be used with the following arguments: | ||
|
||
usage: imgur-scrape [-h] [-v] client_id url path | ||
|
||
positional arguments: | ||
client_id imgur API client id | ||
url URL of the imgur album to scrape | ||
path path to save album images | ||
|
||
optional arguments: | ||
-h, --help show this help message and exit | ||
-v, --verbose increase output verbosity |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
#!/usr/bin/env python | ||
|
||
import argparse | ||
from imgurscraper import scraper | ||
|
||
# Command-line interface: three required positionals followed by one flag.
parser = argparse.ArgumentParser()
for arg_name, arg_help in (
    ("client_id", "imgur API client id"),
    ("url", "URL of the imgur album to scrape"),
    ("path", "path to save album images"),
):
    parser.add_argument(arg_name, help=arg_help)
parser.add_argument("-v", "--verbose", action="store_true",
                    help="increase output verbosity")
args = parser.parse_args()

# Validate the URL, look up the album, create its directory, then download.
resource = scraper.prepare_url(args.client_id, args.url)
album_id = scraper.get_album_id(args.client_id, resource)
album_dir = scraper.mkdir_album(args.path, album_id)
scraper.scrape(args.client_id, album_id, album_dir, resource, args.verbose)
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# -*- coding: utf8 -*- | ||
|
||
import json | ||
import requests | ||
import shutil | ||
|
||
class ImgurScraper(object):
    """
    Thin wrapper around the imgur API that authenticates with a client ID.
    """

    def __init__(self, client_id):
        """
        Stores the client_id (obtain yours here:
        https://api.imgur.com/oauth2/addclient) and the imgur API base URL.
        """
        self.client_id = client_id
        self.api_url = "https://api.imgur.com/3/"

    def request(self, input):
        """
        Sends a GET request to the API. Only publicly available data is
        accessible.

        input: request path, appended to the API base URL.
        Returns the decoded JSON body of the response.
        Raises requests.HTTPError if the API answers with an error status.
        """
        headers = {'Authorization': 'Client-ID ' + self.client_id,
                   'Accept': 'application/json'}
        response = requests.get(self.api_url + input, headers=headers)
        response.raise_for_status()
        return response.json()

    def resource(self, resource, id):
        """
        Retrieves a resource from the imgur API.

        resource: resource type, used as the first path segment.
        id: identifier of the resource, used as the second path segment.
        Returns the decoded JSON body of the response.
        """
        api_request_path = '{0}/{1}'.format(resource, id)
        return self.request(api_request_path)

    def save_image(self, link, path):
        """
        Downloads an image from imgur and writes it to path.

        link: URL of the image.
        path: destination file; it is overwritten if it already exists.
        Raises requests.HTTPError if the download answers with an error
        status. If the transfer fails midway the partial file is removed,
        so that a truncated image is never mistaken for a finished one.
        """
        import os

        response = requests.get(link, stream=True)
        try:
            response.raise_for_status()
            try:
                with open(path, 'wb') as out_file:
                    # iter_content() undoes any gzip/deflate transfer
                    # encoding, which reading response.raw directly does not.
                    for chunk in response.iter_content(chunk_size=64 * 1024):
                        out_file.write(chunk)
            except BaseException:
                # Also covers Ctrl-C: do not leave a truncated file behind.
                if os.path.exists(path):
                    os.remove(path)
                raise
        finally:
            # A streamed response holds its connection until it is closed.
            response.close()
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
import re | ||
import argparse | ||
import os | ||
from imgurscraper.imgur import ImgurScraper | ||
|
||
def prepare_url(client_id, url):
    """
    Checks if the inputted URL is a valid imgur album URL.

    client_id: imgur API client id used to query the API.
    url: album URL, e.g. https://imgur.com/a/<id> or
         https://imgur.com/gallery/<id>; the scheme is optional.

    Returns a string of the form: 'album/id' or 'gallery/id'.
    Raises ValueError if the URL is not an imgur album or gallery URL, or
    if the gallery resource is not an album. Errors raised by the API
    request itself are propagated unchanged.
    """
    # Stop at '?' or '#' so query strings and fragments (e.g. '#0') never
    # end up in the API request path.
    url_regex = re.compile(
        r'^(?:https?://)?(?:(?:www|i|m)\.)?imgur\.com/([^?#]+)')
    regex_match = url_regex.match(url)
    if regex_match is None:
        raise ValueError('Enter a valid imgur URL.')

    # Compare whole path segments: a substring test such as "'a' in path"
    # would accept any path that merely contains the letter "a".
    segments = [part for part in regex_match.group(1).split('/') if part]
    if len(segments) < 2 or segments[0] not in ('a', 'album', 'gallery'):
        raise ValueError('Enter the valid URL of an imgur album or gallery album.')

    # The website abbreviates albums as "a/<id>"; the API path is "album/<id>".
    kind = 'album' if segments[0] == 'a' else segments[0]
    imgur_resource = '{0}/{1}'.format(kind, segments[1])

    # Querying the API also confirms that the resource actually exists.
    data = ImgurScraper(client_id).request(imgur_resource)['data']
    if kind == 'gallery' and not data.get('is_album'):
        raise ValueError('This gallery resource is not an album.')

    return imgur_resource
|
||
def get_album_id(client_id, imgur_resource):
    """
    Looks up the given resource through the imgur API and returns the
    ID found in the response data.
    """
    response_data = ImgurScraper(client_id).request(imgur_resource)['data']
    return response_data['id']
|
||
def mkdir_album(path, album_id):
    """
    Creates a sub-directory for an album to store images,
    within a specified path (i.e. path/album_id).

    path: existing directory; a leading '~' is expanded.
    album_id: name of the sub-directory to create.

    Returns the full path to the directory. An already existing album
    directory is reused.
    Raises ValueError if path is not an existing directory. Exits with
    status 1 and an error message if the directory cannot be created.
    """
    # Expand '~' once and use the expanded form everywhere, so the path that
    # was validated is the same one the directory is created under.
    base_dir = os.path.expanduser(path)
    if not os.path.isdir(base_dir):
        raise ValueError('Incorrect path.')

    album_dir = os.path.join(base_dir, album_id)
    try:
        os.mkdir(album_dir)
    except OSError as error:
        # An existing directory is fine; anything else is fatal, but the
        # reason is reported instead of exiting silently.
        if not os.path.isdir(album_dir):
            raise SystemExit('Could not create album directory {0}: {1}'.format(
                album_dir, error))
    return album_dir
|
||
def scrape(client_id, album_id, album_dir, resource, verbose=False):
    """
    Scrapes images from an imgur album.

    client_id: imgur API client id used to query the API.
    album_id: album ID, only used in the verbose output.
    album_dir: directory the images are saved into.
    resource: API request path of the album.
    verbose: if true, prints the album ID, the number of images in
             the album, and the current download progress.

    Images whose target file already exists are skipped.
    """
    client = ImgurScraper(client_id)
    images = client.request(resource)['data']['images']
    # Count the list that is actually iterated, so the loop cannot run past
    # the end of it if the reported image count disagrees with it.
    total = len(images)
    if verbose:
        print('Album ID: {0}\nNumber of images: {1}'.format(album_id, total))

    for position, image in enumerate(images, 1):
        image_url = image['link']
        # The file name is the last path segment of the image link.
        file_name = image_url.rstrip('/').rsplit('/', 1)[-1]
        path = os.path.join(album_dir, file_name)

        if os.path.exists(path):
            if verbose:
                print('{0}/{1} already downloaded'.format(position, total))
            continue

        client.save_image(image_url, path)
        if verbose:
            print('Downloaded {0}/{1}'.format(position, total))

    if verbose and total:
        print('Finished downloading the album.')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
import io
import os

from setuptools import setup

# Read the long description from the README that sits next to this file,
# so the build works regardless of the current working directory, and decode
# it explicitly instead of relying on the platform's default encoding.
HERE = os.path.abspath(os.path.dirname(__file__))
with io.open(os.path.join(HERE, 'README.rst'), encoding='utf-8') as readme_file:
    README = readme_file.read()

setup(
    name='imgur-scraper',
    packages=['imgurscraper'],
    scripts=['imgur-scrape'],
    install_requires=['requests'],
    version='0.1',
    description='An imgur album scraper',
    long_description=README,
    author='iceTwy',
    author_email='icetwy@icetwy.re',
    license='MIT',
    url='https://github.com/iceTwy/imgur-scraper',
    keywords=['imgur', 'album', 'scraper'],
    classifiers=[
        'Programming Language :: Python :: 3',
        'Development Status :: 3 - Alpha',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
        'Topic :: Internet',
    ],
)