Skip to content

Commit

Permalink
Enable upload by url (#27)
Browse files Browse the repository at this point in the history
Make `upload_single_file` support `upload_by_url` by providing
an url instead of a file path.

Introduces `up_all_from_url()` as an alternative to `up_all()` for uploading
multiple url files. This takes the output of `make_info` as its input and
creates one log file per possible upload outcome (error, warning, success).

Adds the `-type` argument to control which of `up_all_from_url()` and `up_all()`
to use.

Also:
* Fixes some pydocstyle issues which seem to be skipped due to
  something going wrong with the matching settings in tox.ini
* Re-factors logging by introducing a `LogFile` clas to `common`
  and adding timestamp capabilities (Partially removes the need for #15)
* Bumps version to 0.1.5

Task: [T158402](https://phabricator.wikimedia.org/T158402)
  • Loading branch information
lokal-profil committed May 29, 2017
1 parent 1caece4 commit dd0cbc3
Show file tree
Hide file tree
Showing 5 changed files with 295 additions and 34 deletions.
6 changes: 4 additions & 2 deletions README.md
Expand Up @@ -52,11 +52,13 @@ the file extension) as the key and the following values:
- Content categories
- File description
- Output filename
- Input filename (as key)
- Input filename or url to file (as key)
4. Run the prep-uploader to rename the media files and create the text file
for the associated file description page.
for the associated file description page. \*
5. Run the uploader to upload it all

\* This step is not needed for _upload by url_.

## Usage example:

For usage examples see [lokal-profil/upload_batches](https://github.com/lokal-profil/upload_batches).
Expand Down
48 changes: 45 additions & 3 deletions batchupload/common.py
Expand Up @@ -8,6 +8,7 @@
import operator # needed by sorted_dict
import sys # needed by convert_from_commandline()
import locale # needed by convert_from_commandline()
from datetime import datetime # needed for LogFile.write()
from pywikibot.tools import deprecated


Expand Down Expand Up @@ -234,7 +235,7 @@ def create_dir(out_path):

def to_unicode(text):
"""
Converts a str to unicode (if python2).
Convert a str to unicode (if python2).
@param text: text to convert
@return: unicode string
Expand Down Expand Up @@ -294,11 +295,52 @@ def interpret_bool(string):


class MyError(Exception):

"""Home made errors"""
"""Home made errors."""

def __init__(self, value):
"""Initialise the Error."""
self.value = value

def __str__(self):
"""A string representation of the Error."""
return repr(self.value)


class LogFile(object):
"""A simple object for doing logging."""

def __init__(self, output_dir, name):
"""
Initialise the LogFile.
@param output_dir: directory in which to store the file
@param name: the filename including extension
"""
self.file_name = os.path.join(output_dir, name)
self.file = open(self.file_name, 'a', encoding='utf-8')

def write(self, text):
"""
Write text to the log file, appending a linebreak.
Also flushes the log to ensure the contents have been stored in case of
a crash.
@param text: text to output
"""
self.file.write('{0}\n'.format(text))
self.file.flush()

def write_w_timestamp(self, text):
"""
Write text to the log prepending a timestamp, appending a linebreak.
@param text: text to output
"""
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
self.write('{0} {1}'.format(timestamp, text))

def close_and_confirm(self):
"""Close the log file and return a confirmation."""
self.file.close()
return 'Created {0}'.format(self.file_name)
220 changes: 192 additions & 28 deletions batchupload/uploader.py
@@ -1,14 +1,15 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
#
"""Tool for uploading a single or multiple files from disc or url."""
from __future__ import unicode_literals
from builtins import open
import batchupload.common as common
import batchupload.prepUpload as prepUpload
from batchupload.make_info import make_info_page
import os
import pywikibot

FILE_EXTS = ('.tif', '.jpg', '.tiff', '.jpeg')
URL_PROTOCOLS = ('http', 'https') # @todo: extend with supported protocols


def upload_single_file(file_name, media_file, text, target_site,
Expand All @@ -19,7 +20,7 @@ def upload_single_file(file_name, media_file, text, target_site,
Upload a single file in chunks.
@param file_name: sanitized filename to use for upload
@param media_file: path to media file to upload
@param media_file: path or URL to media file to upload
@param text: file description page
@param target_site: pywikibot.Site object to which file should be uploaded
@param chunk_size: Size of chunks (in MB) in which to upload file
Expand Down Expand Up @@ -63,11 +64,22 @@ def allow_warnings(warning_list):
file_page = pywikibot.FilePage(target_site, file_name)
file_page.text = text

# support upload_by_url: cannot just look for :// due to file://
source_filename = source_url = None
protocol, _, rest = media_file.partition('://')
if protocol in URL_PROTOCOLS:
source_url = media_file
else:
source_filename = media_file

try:
success = target_site.upload(
file_page, source_filename=media_file,
file_page,
source_filename=source_filename,
source_url=source_url,
ignore_warnings=ignore_warnings,
report_success=False, chunk_size=chunk_size)
report_success=False,
chunk_size=chunk_size)
except pywikibot.data.api.APIError as error:
result['error'] = error
result['log'] = 'Error: %s: %s' % (file_page.title(), error)
Expand Down Expand Up @@ -108,7 +120,7 @@ def up_all(in_path, cutoff=None, target='Uploaded', file_exts=None,
@param target: sub-directory for uploaded files (defaults to "Uploaded")
@param file_exts: tuple of allowed file extensions (defaults to FILE_EXTS)
@param verbose: whether to output confirmation after each upload
@param test: set to True to test but not upload (deprecated?)
@param test: set to True to test but not upload
@param target_site: pywikibot.Site to which file should be uploaded,
defaults to Commons.
@param chunked: Whether to do chunked uploading or not.
Expand All @@ -133,8 +145,7 @@ def up_all(in_path, cutoff=None, target='Uploaded', file_exts=None,
common.create_dir(warnings_dir)

# logfile
logfile = os.path.join(in_path, '¤uploader.log')
flog = open(logfile, 'a', encoding='utf-8')
flog = common.LogFile(in_path, '¤uploader.log')

# find all content files
found_files = prepUpload.find_files(path=in_path, file_exts=file_exts,
Expand All @@ -148,7 +159,8 @@ def up_all(in_path, cutoff=None, target='Uploaded', file_exts=None,
base_name = os.path.basename(f)
base_info_name = os.path.basename(info_file)
if not os.path.exists(info_file):
flog.write('%s: Found multimedia file without info\n' % base_name)
flog.write(
'{0}: Found multimedia file without info'.format(base_name))
continue

# prepare upload
Expand All @@ -172,36 +184,178 @@ def up_all(in_path, cutoff=None, target='Uploaded', file_exts=None,
if verbose:
pywikibot.output(result.get('log'))

flog.write('%s\n' % result.get('log'))
flog.write(result.get('log'))
os.rename(f, os.path.join(target_dir, base_name))
os.rename(info_file, os.path.join(target_dir, base_info_name))
counter += 1
flog.flush()

flog.close()
pywikibot.output('Created %s' % logfile)
pywikibot.output(flog.close_and_confirm())


def up_all_from_url(info_path, cutoff=None, target='upload_logs',
file_exts=None, verbose=False, test=False,
target_site=None):
"""
Upload all images provided as urls in a make_info json file.
Media (image) files and metadata files with the expected extension .info
should be in the same directory. Metadata files should contain the entirety
of the desired description page (in wikitext).
Outputs separate logfiles for files triggering errors, warnings (and
successful) so that these can be used in latter runs.
@param info_path: path to the make_info json file
@param cutoff: number of files to upload (defaults to all)
@param target: sub-directory for log files (defaults to "upload_logs")
@param file_exts: tuple of allowed file extensions (defaults to FILE_EXTS)
@param verbose: whether to output confirmation after each upload
@param test: set to True to test but not upload
@param target_site: pywikibot.Site to which file should be uploaded,
defaults to Commons.
"""
# set defaults unless overridden
file_exts = file_exts or FILE_EXTS
target_site = target_site or pywikibot.Site('commons', 'commons')
target_site.login()

# load info file
info_datas = common.open_and_read_file(info_path, as_json=True)

# create target directory if it doesn't exist
output_dir = os.path.join(os.path.dirname(info_path), target)
common.create_dir(output_dir)

# create all log files
logs = {
'success': common.LogFile(output_dir, 'success.log'),
'warning': common.LogFile(output_dir, 'warnings.log'),
'error': common.LogFile(output_dir, 'errors.log'),
'general': common.LogFile(output_dir, 'uploader.log')
}

# shortcut to the general/verbose logfile
flog = logs['general']

# @todo: filtering/removal of entries based on new input parameters

counter = 1
for url, data in info_datas.items():
if cutoff and counter > cutoff:
break

# verify that the file extension is ok
try:
ext = verify_url_file_extension(url, file_exts)
except common.MyError as e:
flog.write(e)
continue

# verify that info and output filenames are provided
if not data['info']:
flog.write(
'{url}: Found url missing the info field (at least)'.format(
url=url))
continue
elif not data['filename']:
flog.write(
'{url}: Found url missing the output filename'.format(
url=url))
continue

# prepare upload
txt = make_info_page(data)
filename = '{filename}{ext}'.format(filename=data['filename'], ext=ext)

if test:
pywikibot.output(
'Test upload "{filename}" from {url} with the following '
'description: {txt}\n'.format(
filename=filename, url=url, txt=txt))
continue
# stop here if testing

result = upload_single_file(
filename, url, txt, target_site, upload_if_badprefix=True)
if result.get('error'):
logs['error'].write(url)
elif result.get('warning'):
logs['warning'].write(url)
else:
logs['success'].write(url)
if verbose:
pywikibot.output(result.get('log'))

flog.write(result.get('log'))
counter += 1

for log in logs.values():
pywikibot.output(log.close_and_confirm())


def verify_url_file_extension(url, file_exts, url_protocols=None):
"""
Verify that a url contains a file extension and that it is allowed.
Also checks that the protocol is whitelisted.
@param url: the url to check
@param file_exts: tuple of allowed file extensions
@param url_protocols: tuple of allowed url protocols
@return: the file extension
@raises: common.MyError
"""
url_protocols = url_protocols or URL_PROTOCOLS

protocol, _, rest = url.partition('://')
if protocol not in url_protocols:
raise common.MyError(
'{0}: Found url with a disallowed protocol'.format(url))

try:
ext = os.path.splitext(url)[1]
except IndexError:
raise common.MyError(
'{0}: Found url without a file extension'.format(url))
else:
if not ext:
raise common.MyError(
'{0}: Found url without a file extension'.format(url))

if ext not in file_exts:
raise common.MyError(
'{0}: Found url with a disallowed file extension ({1})'.format(
url, ext))

return ext


def main(*args):
"""Command line entry-point."""
usage = 'Usage:' \
'\tpython uploader.py -in_path:PATH -dir:PATH -cutoff:NUM\n' \
'\t-in_path:PATH path to the directory containing the media files\n' \
'\t-dir:PATH specifies the path to the directory containing a ' \
'user_config.py file (optional)\n' \
'\t-cutoff:NUM stop the upload after the specified number of files ' \
'(optional)\n' \
'\t-confirm Whether to output a confirmation after each upload ' \
'attempt (optional)\n' \
'\t-nochunk Whether to turn of chunked uploading, this is slow ' \
'and does not support files > 100Mb (optional)\n' \
'\tExample:\n' \
'\tpython uploader.py -in_path:../diskkopia -cutoff:100\n'
usage = (
'Usage:'
'\tpython uploader.py -in_path:PATH -dir:PATH -cutoff:NUM\n'
'\t-in_path:PATH path to the directory containing the media files or '
'to the make_info output file if "-type" is set to url\n'
'\t-type:STRING the type of upload to make. Must be either "FILES" '
'or "URL". Defaults to FILES (optional)\n'
'\t-dir:PATH specifies the path to the directory containing a '
'user_config.py file (optional)\n'
'\t-cutoff:NUM stop the upload after the specified number of files '
'(optional)\n'
'\t-confirm Whether to output a confirmation after each upload '
'attempt (optional)\n'
'\t-nochunk Whether to turn of chunked uploading, this is slow '
'and does not support files > 100Mb (optional)\n'
'\tExample:\n'
'\tpython uploader.py -in_path:../diskkopia -cutoff:100\n'
)
cutoff = None
in_path = None
test = False
confirm = False
chunked = True
typ = 'files'

# Load pywikibot args and handle local args
for arg in pywikibot.handle_args(args):
Expand All @@ -217,13 +371,23 @@ def main(*args):
confirm = True
elif option == '-nochunk':
chunked = False
elif option == '-type':
if value.lower() == 'url':
typ = 'url'
elif value.lower() not in ('url', 'files'):
pywikibot.output(usage)
return
elif option == '-usage':
pywikibot.output(usage)
return

if in_path:
up_all(in_path, cutoff=cutoff, test=test, verbose=confirm,
chunked=chunked)
if typ == 'files':
up_all(in_path, cutoff=cutoff, test=test, verbose=confirm,
chunked=chunked)
elif typ == 'url':
up_all_from_url(in_path, cutoff=cutoff, test=test,
verbose=confirm)
else:
pywikibot.output(usage)

Expand Down
2 changes: 1 addition & 1 deletion setup.py
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from setuptools import setup
version = '0.1.4'
version = '0.1.5'
repo = 'BatchUploadTools'

setup(
Expand Down

0 comments on commit dd0cbc3

Please sign in to comment.