In [None]:
#!/usr/bin/python
# -*- coding: utf-8 -*-

import pywikibot as pb
from pywikibot.specialbots import UploadRobot

import requests
from requests.compat import quote
from bs4 import BeautifulSoup
from mako.template import Template

import os, re
import shutil
import calendar

# Pywikibot site handle for Wikimedia Commons. Later cells use it to check
# whether a file page already exists and as the target site of the upload.
commons_site = pb.Site("commons", "commons")

In [None]:
# Make this folder and its parent importable so that the project's
# wikimedia.utils module can be found.
import sys, inspect

current_folder = os.path.realpath(
    os.path.abspath(
        os.path.dirname(inspect.getfile(inspect.currentframe()))
    )
)
folder_parts = current_folder.split(os.sep)
parent_folder = os.sep.join(folder_parts[:-1])

# Same order as before: the current folder is handled first, then the parent,
# each one being pushed to the front of sys.path only if it is missing.
for folder in (current_folder, parent_folder):
    if folder not in sys.path:
        sys.path.insert(0, folder)

from wikimedia.utils import is_commons_file, get_hash

In [None]:
# Creation of images folder (downloaded files are staged here before upload)
cwd = os.getcwd()

images_directory = os.path.join(cwd, 'images')
# exist_ok=True creates the folder only when needed, without the separate
# "exists?" check that could race with another process creating it.
os.makedirs(images_directory, exist_ok=True)

In [None]:
# Configuration
#   url               article page to scrape
#   categories        Commons categories added to every uploaded image
#   uploader_category optional extra category (ignored when None)
#   head_picture      also collect the "wp-image" pictures of the article
#   article_title     overrides the scraped title when set
#   pub_date          overrides the scraped publication date when set
#   article_content   overrides the scraped description when set
#   excluded          positions in the image list that must not be uploaded
config = {
    'url': 'https://diario.madrid.es/blog/notas-de-prensa/la-alcaldesa-me-siento-orgullosa-de-la-solidaridad-de-espana-con-la-acogida-a-los-migrantes-del-aquarius/',
    # Every entry needs its trailing comma: adjacent string literals are
    # silently concatenated by Python into a single (wrong) category name.
    'categories': ['Manuela Carmena in 2018',
                   'Cuartel del Conde Duque, Madrid',
                   '2018 events in Madrid'],
    'uploader_category': None,
    'head_picture': True,
    'article_title': None,
    'pub_date': None,
    'article_content': None,
    'excluded': [6]
}

# Drop empty entries (e.g. an unset uploader category) before adding the
# maintenance category that every upload receives.
categories = [category for category in (config['categories'] + [config['uploader_category']]) if category]
categories.append("Images from Ayuntamiento de Madrid (to classify)")
categories

In [None]:
# Retrieval of base page for extracting gallery information
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0"}
# requests never times out by default; bound the wait so the cell cannot hang.
r = requests.get(config['url'], headers=headers, timeout=30)
# Stop here on HTTP errors instead of parsing an error page as the article.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')

In [None]:
# Image title: the article heading, unless the configuration provides one
title_heading = soup.find_all("h1", class_="post-title")[0]
article_title = title_heading.get_text().strip().replace('  ', ' ')
image_name = config['article_title'] or article_title
image_name

In [None]:
# Image date
if not config['pub_date']:
    # The scraped date is slash-separated; its parts are reversed and joined
    # with '-' so that the year comes first and the month second.
    raw_date = soup.find_all("div", class_="post-date")[0].get_text().strip()
    pub_date = '-'.join(raw_date.split('/')[::-1])
    date_parts = pub_date.split('-')
    year = date_parts[0]
    month = calendar.month_name[int(date_parts[1])]
    month_category = f"{month} {year} in Madrid"
    # Guarded so that re-running this cell does not add the category twice.
    if month_category not in categories:
        categories.append(month_category)
else:
    pub_date = config['pub_date']

pub_date

In [None]:
# Image description: the first paragraph of the article body that is longer
# than 10 characters, unless the configuration provides the text.
if not config['article_content']:
    body = soup.find_all("div", class_="post-content")
    p_description = body[0].find_all("p")
    for p in p_description:
        if len(p.get_text()) > 10:
            description = p.get_text()
            break
    else:
        # Without this branch `description` would stay undefined (or keep a
        # stale value from a previous run of the notebook).
        raise ValueError(
            "No paragraph longer than 10 characters was found in the article; "
            "set config['article_content'] explicitly."
        )
else:
    description = config['article_content']
description

In [None]:
# Wikitext of the file description page, written as a Mako template whose
# ${...} placeholders are filled from the values gathered in earlier cells.
template = u"""=={{int:filedesc}}==
{{Information
|description={{es|1=${description}}}
|date=${pub_date}
|source=[${url} Diario de Madrid - ${title}]
|author=[https://diario.madrid.es/ Diario de Madrid]
|permission=[https://diario.madrid.es/contenidos-libres/ License information for all contents in diario.madrid.es]
|other versions=
}}

=={{int:license-header}}==
{{Diario de Madrid}}

${cat_string}"""

# Named `template_vars` rather than `vars` so the builtin vars() is not
# shadowed for the rest of the kernel session.
template_vars = {
    "url": config['url'],
    "description": description,
    "pub_date": pub_date,
    "title": article_title,
    # One [[Category:...]] link per line
    "cat_string": '\n'.join(['[[Category:'+i+']]' for i in categories])
}
t = Template(template)
_text = t.render(**template_vars)
_text

In [None]:
# One entry per gallery thumbnail: the href of its anchor is the image URL and
# the title defaults to the article-level image name.
gallery_icons = soup.find_all("div", class_="gallery-icon")
image_list = []
for icon in gallery_icons:
    image_list.append({"url": icon.a["href"], "title": image_name})
image_list

In [None]:
# Optionally add the pictures whose class starts with "wp-image"
if config['head_picture']:
    head_images = soup.find_all("img", class_=lambda value: value and value.startswith("wp-image"))
    for img in head_images:
        # .get() instead of [] so that a picture that is not wrapped in a link
        # is skipped instead of aborting the cell with a KeyError.
        link = img.parent.get("href")
        if not link:
            print("Head picture without a link. Skipping")
            continue
        # A missing alt text is treated like an empty one (replaced below)
        image_list.append({"url": link, "title": img.get("alt", "")})
# Entries with an empty title fall back to the article-level image name
image_list = [{'url': image['url'], 'title': image['title'] or image_name} for image in image_list]
image_list

In [None]:
#### User input:
excluded = config['excluded']

# Recognises names that already end in a two-digit counter ("Title 03.jpg").
# Raw string: '\.' inside a plain literal is an invalid escape sequence.
numbered_name_pattern = re.compile(r'(.*) ([0-9]{2}\.jpg)')
# Seconds to wait for the image server before a download is given up.
download_timeout = 60

used_names = []
global_counter = 1
for i, image in enumerate(image_list):
    # If the image is excluded, skip
    if i in excluded:
        print("Image excluded. Skipping")
        continue

    # First, the image is downloaded and stored.
    # The file name is kept in its own variable so that the article-level
    # `image_name` (the default title used by earlier cells) is not overwritten.
    image_url = quote(image["url"].encode('utf-8'), ':/')
    base_title = image["title"].replace(':', ' -').replace('  ', ' ')
    if len(image_list) == 1:
        file_name = '{}.jpg'.format(base_title)
    else:
        file_name = '{} {:02d}.jpg'.format(base_title, global_counter)
    image_path = os.path.join(images_directory, file_name)
    try:
        # The context manager releases the streamed connection in every case.
        with requests.get(image_url, headers=headers, stream=True,
                          timeout=download_timeout) as response:
            # Without this an HTTP error page would be saved as the image
            response.raise_for_status()
            with open(image_path, 'wb') as out_file:
                # iter_content reports transfer problems as requests exceptions
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    out_file.write(chunk)
    except (requests.RequestException, OSError) as error:
        # Only download/storage errors are handled here; anything else
        # (including KeyboardInterrupt) is allowed to propagate.
        print('Failed download. Skipping')
        print('Reason: {}'.format(error))
        # Do not leave a truncated file behind
        if os.path.exists(image_path):
            os.remove(image_path)
        continue

    # If the image is already in Commons, skip
    if is_commons_file(get_hash(image_path)):
        print("Image already in commons. Skipping")
        os.remove(image_path)
        continue

    # The sequence number is consumed only once the image is kept, so neither
    # duplicates nor failed downloads leave gaps in the numbering.
    if len(image_list) != 1:
        global_counter += 1

    # If the image name is already in commons, find a new name
    if pb.Page(commons_site, file_name, ns=6).exists():
        print("Image name ({0}) already used in Commons".format(file_name))
        used_names.append(file_name)

    # Keep generating candidates until one is neither taken in Commons nor
    # already used earlier in this run.
    while file_name in used_names:
        image_subject = '.'.join(file_name.split('.')[:-1])
        image_extension = 'jpg'
        m = numbered_name_pattern.match(file_name)
        if m is None:
            # No counter yet: start one
            file_name = image_subject + ' 01.' + image_extension
        else:
            # Bump the existing two-digit counter
            counter = int(m.group(2)[:2]) + 1
            file_name = '{} {:02d}.{}'.format(m.group(1), counter, image_extension)

        if pb.Page(commons_site, file_name, ns=6).exists():
            print("Image name ({0}) already used in Commons. Finding a new name".format(file_name))
            used_names.append(file_name)

    print("Preparing to upload image with name {0}".format(file_name))
    used_names.append(file_name)

    # image upload
    # NOTE(review): newer Pywikibot releases appear to spell these keyword
    # arguments in snake_case (use_filename, keep_filename, ...) - confirm
    # against the installed version.
    try:
        bot = UploadRobot([image_path],
                          description=_text,
                          useFilename=file_name,
                          keepFilename=True,
                          verifyDescription=False,
                          ignoreWarning=True,
                          targetSite=commons_site)
        bot.run()
    finally:
        # The staged copy is removed even when the upload raises
        os.remove(image_path)