In [1]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""This notebook builds the CSV-like Wikimedia Commons page 
which records the contributions to WLE in Spain in YEAR"""

import os, sys, inspect

try:
    import pywikibot as pb
    from pywikibot import pagegenerators, textlib
    import mwparserfromhell as mwh

except ImportError:
    # The packages could not be imported from the default search path.
    # Add the folder holding this file and its parent folder to sys.path,
    # then retry. Only ImportError triggers the fallback, so any other
    # failure (or a user interrupt) surfaces immediately instead of being
    # masked by a second import attempt.
    current_folder = os.path.realpath(os.path.abspath(os.path.split(inspect.getfile(inspect.currentframe()))[0]))
    folder_parts = current_folder.split(os.sep)
    pywikibot_folder = os.sep.join(folder_parts[0:-1])

    if current_folder not in sys.path:
        sys.path.insert(0, current_folder)
    if pywikibot_folder not in sys.path:
        sys.path.insert(0, pywikibot_folder)

    import pywikibot as pb
    from pywikibot import pagegenerators, textlib
    import mwparserfromhell as mwh

import pandas as pd
from io import StringIO
import re
from datetime import datetime, timedelta

In [2]:
# Contest identification.
YEAR = 2017
TAG = 'WLE'
TAG_EXT = 'Wiki Loves Earth'
# Name of the template whose first parameter carries the site code of an image.
CODE_TEMPLATE = 'lic'

# Wiki pages: the log page is written by this notebook, the SCI DB page is read.
BASE_NAME = "Commons:Wiki Loves in Spain/{1}/{0}".format(YEAR, TAG_EXT)
LOG_PAGE = BASE_NAME + '/Log'
# NOTE(review): the year in this title is hard-coded to 2016 rather than
# derived from YEAR -- confirm that this is intentional.
BASE_SITE_DB_NAME = "Commons:Wiki Loves Earth 2016 in Spain"
SITE_DB_PAGE = BASE_SITE_DB_NAME + "/SCI DB"

# Categories: all contest images, and the images that reached the final.
WLE_CAT = "Images from {1} {0} in Spain".format(YEAR, TAG_EXT)
WLE_FINALIST_CAT = "Evaluation of images from {1} {0} in Spain - Final".format(YEAR, TAG_EXT)

# Accepted upload window: from 1 May at midnight to one hour past midnight
# on 1 June.
START_TIME = datetime(YEAR, 5, 1)
END_TIME = datetime(YEAR, 6, 1) + timedelta(hours=1)

# Site handle for Wikimedia Commons; the later cells pass it to the category,
# page, file-page and user constructors.
commons_site = pb.Site("commons", "commons")

### Image selection

In [3]:
# List the members of the contest category and keep the titles (without the
# namespace prefix) of those that are file pages.
cat_wle = pb.Category(commons_site, WLE_CAT)
gen_wle = pagegenerators.CategorizedPageGenerator(cat_wle)

file_pages = (member for member in gen_wle if member.is_filepage())
images_wle = [member.title(withNamespace=False) for member in file_pages]
len(images_wle)

2492

### Retrieval of the sites of community importance

In [None]:
# Download the SCI (site of community importance) list and load it into a
# DataFrame; the site codes are then kept as a plain list for lookups.
pb.output('Retrieving --> WLE SCI list')
site_list_page = pb.Page(commons_site, SITE_DB_PAGE)

# Keep only what lies between the first and the last line break of the page,
# i.e. drop its first and last lines (presumably wrapper markup around the
# data -- TODO confirm against the page).
raw_site_text = site_list_page.text
first_newline = raw_site_text.find('\n')
last_newline = raw_site_text.rfind('\n')
site_list_text = StringIO(raw_site_text[first_newline + 1:last_newline])

# Column names assigned to the semicolon-separated fields, in order.
site_columns = [
    "name", "code", "magrama_url", "community",
    "bio_region", "continent", "min_altitude",
    "max_altitude", "avg_altitude", "longitude",
    "latitude", "area", "marine_percentage",
    "marine_area", "image", "commons_cat", "wikidata_id",
]
site_df = pd.read_csv(site_list_text, sep=";", index_col=False, names=site_columns)

pb.output('Retrieved --> WLE SCI list')

sites_list = site_df['code'].tolist()

Retrieving --> WLE SCI list
Retrieved --> WLE SCI list


### Image review and recording

In [None]:
# Build one log record per contest image by reading each file page: upload
# time, file dimensions, site code, date, uploader, quality-image status and
# finalist status. Records are collected in a list and turned into a
# DataFrame once at the end; growing a DataFrame row by row is quadratic and
# DataFrame.append no longer exists in pandas 2.x.

# Column order of the resulting DataFrame.
IMAGE_COLUMNS = ['image_title', 'lic_id', 'uploader', 'uploader_registration',
                 'timestamp', 'date', 'size', 'height', 'width', 'qi', 'finalist']

# Account whose uploads are exempt from the time window and whose real author
# is read from the information template instead of the revision history.
BOT_ACCOUNT = 'WikiLovesESBot'

image_rows = []

# Cache: uploader name -> registration date string, so that every user is
# looked up at most once.
uploaders = {}

for image_counter, image in enumerate(images_wle):
    if (image_counter != 0) and (image_counter % 50 == 0):
        pb.output('Retrieving --> %d image descriptions downloaded' % image_counter)

    image_row = {'image_title': None,
                 'lic_id': '',
                 'uploader': None,
                 'uploader_registration': None,
                 'timestamp': None,
                 'date': None,
                 'size': 0,
                 'height': 0,
                 'width': 0,
                 'qi': None,
                 'finalist': None}

    page = pb.FilePage(commons_site, image)
    text = page.text
    wikicode = mwh.parse(text)
    # Extracted once; every template lookup below reuses this list.
    templates = wikicode.filter_templates()

    image_row["image_title"] = page.title(withNamespace=False)

    # upload time management
    creation = page.oldest_revision
    # NOTE(review): the fixed +2h shift presumably converts the revision
    # timestamp to Spanish summer time -- confirm.
    creation_time = creation.timestamp + timedelta(hours=2)
    uploader = creation["user"]
    if ((creation_time > END_TIME) or (creation_time < START_TIME)) and uploader != BOT_ACCOUNT:
        print ("Image ({}) out of time".format(image_row["image_title"]))
        continue
    image_row["timestamp"] = creation_time.strftime("%Y-%m-%d %H:%M:%S")

    fileinfo = page.latest_file_info
    image_row["size"] = fileinfo['size']
    image_row["height"] = fileinfo['height']
    image_row["width"] = fileinfo['width']

    # code management
    wle_templates = [template for template in templates
                     if template.name.lower().strip() == CODE_TEMPLATE]
    site_code = None
    # has(1) guards against a code template written without any parameter,
    # for which get(1) would raise ValueError and abort the whole run.
    if wle_templates and wle_templates[0].has(1):
        site_code = wle_templates[0].get(1).value.strip()
    if site_code is not None and site_code in sites_list:
        image_row["lic_id"] = site_code
    else:
        print ("Image with no or wrong code ({0})".format(image_row["image_title"]))

    # date management
    info_templates = [template for template in templates
                      if template.name.lower().strip() == "information"]
    if info_templates:
        info_template = info_templates[0]
        # "date" takes precedence over "Date" when both are present.
        date_key = next((key for key in ("date", "Date") if info_template.has(key)), None)
        if date_key is None:
            print ("Image with no date ({0})".format(image_row["image_title"]))
        else:
            date_info = info_template.get(date_key).value.strip()

            # If the date is wrapped in a template, use the value of that
            # template's first parameter; otherwise keep the raw text.
            date_templates = mwh.parse(date_info).filter_templates()
            if date_templates and date_templates[0].has(1):
                date_info = date_templates[0].get(1).value.strip()
            date_info = date_info.split(' ')[0]

            # Pad partial dates (year only, or year-month) to a full date.
            num_date_tokens = len(date_info.split('-'))
            if num_date_tokens == 2:
                date_info = '{}-1'.format(date_info)
            elif num_date_tokens == 1:
                date_info = '{}-1-1'.format(date_info)

            try:
                # Parsed only for validation; the string itself is stored.
                datetime.strptime(date_info, '%Y-%m-%d')
                image_row["date"] = date_info
            except ValueError:
                print ("Image {0} with wrong date ({1})".format(image_row["image_title"], date_info))
    else:
        print ("Image with no info template ({0})".format(image_row["image_title"]))

    # uploader and uploader registration management
    if uploader != BOT_ACCOUNT:
        image_row["uploader"] = uploader

        if uploader not in uploaders:
            try:
                # Too old users do not have a user registration time
                user_registration = pb.User(commons_site, title=uploader).registration()
                uploaders[uploader] = user_registration.strftime("%Y-%m-%d")
            except Exception:
                # Best effort: any lookup failure falls back to a fixed early
                # date. Exception (not a bare except) keeps Ctrl-C working
                # during this long-running loop.
                uploaders[uploader] = '2006-01-01'
        image_row["uploader_registration"] = uploaders[uploader]
    else:
        # Bot upload: the author is the first bracketed link in the "Author"
        # field of the information template, when there is one.
        author_links = []
        if info_templates and info_templates[0].has("Author"):
            author_links = re.findall(r"\[(.*)\]", info_templates[0].get("Author").value.strip())
        if author_links:
            image_row["uploader"] = author_links[0]
        else:
            # A missing template, field or link used to raise and stop the
            # run; the record is now kept without an uploader.
            print ("Image with no author ({0})".format(image_row["image_title"]))
        image_row["uploader_registration"] = '2000-01-01'

    # quality status management
    qi_templates = [template for template in templates
                    if template.name.lower().strip() == "qualityimage"]
    if qi_templates:
        image_row["qi"] = 'qi'

    # finalist status management
    cats = [cat.title(withNamespace=False) for cat in textlib.getCategoryLinks(text)]
    if WLE_FINALIST_CAT in cats:
        image_row["finalist"] = 'finalist'

    image_rows.append(image_row)

images_df = pd.DataFrame(image_rows, columns=IMAGE_COLUMNS)

Image with no or wrong code (A pescar!! patiamarilla zambullendose en el Urumea.jpg)
Image with no or wrong code (A rio revuelto.jpg)


Retrieving --> 50 image descriptions downloaded


Image with no or wrong code (Alcarama Ayuntamiento Yanguas.jpg)
Image with no or wrong code (Alcarama Castillo Yanguas.jpg)
Image with no or wrong code (Alcarama Santa Maria Yanguas.jpg)
Image with no or wrong code (Alduides cementerio.jpg)


Retrieving --> 100 image descriptions downloaded


Image with no or wrong code (Ametlla de Segarra, vinyes de Tardor.jpg)


Retrieving --> 150 image descriptions downloaded


Image with no or wrong code (Barcos amarrados en Getaria.jpg)


Retrieving --> 200 image descriptions downloaded


Image with no or wrong code (Bassa d'Oles.jpg)
Image with no or wrong code (Bassa d'Oles2.jpg)
Image with no or wrong code (Begoña22.jpg)
Image with no or wrong code (Benijo I.jpg)
Image with no or wrong code (Benijo II.jpg)
Image with no or wrong code (Benijo III.jpg)
Image with no or wrong code (Benijo IV.jpg)
Image with no or wrong code (Bojurec.jpg)
Image with no or wrong code (Bojurec22.08.2009 006(1).jpg)
Image with no or wrong code (Bolas de paja.jpg)


Retrieving --> 250 image descriptions downloaded
Retrieving --> 300 image descriptions downloaded


Image with no or wrong code (Cameros Ribafrecha.jpg)


Retrieving --> 350 image descriptions downloaded


Image with no or wrong code (Camino del mar.jpg)
Image with no or wrong code (Campanillas.jpg)
Image with no or wrong code (Campanillas2.jpg)


Retrieving --> 400 image descriptions downloaded


Image with no or wrong code (Cardo Granada.jpg)
Image with no date (Caserios al amanecer en las faldas de Peñas de Aia.jpg)


Retrieving --> 450 image descriptions downloaded


Image with no or wrong code (Cazador, cazado.jpg)


Retrieving --> 500 image descriptions downloaded
Retrieving --> 550 image descriptions downloaded
Retrieving --> 600 image descriptions downloaded


Image with no or wrong code (Cuevas Herce.jpg)


Retrieving --> 650 image descriptions downloaded


Image with no or wrong code (Del Drago Milenario.jpg)


Retrieving --> 700 image descriptions downloaded


Image with no or wrong code (Donde duermen las gaviotas.jpg)
Image with no or wrong code (Ebro Riberia.jpg)
Image with no or wrong code (El mar estalla contra las rocas.jpg)


Retrieving --> 750 image descriptions downloaded


Image with no or wrong code (El verano se va.jpg)
Image with no or wrong code (Ermita de Sant Ermengol, Tiurana, la Noguera.jpg)


Retrieving --> 800 image descriptions downloaded
Retrieving --> 850 image descriptions downloaded
Retrieving --> 900 image descriptions downloaded


Image with no or wrong code (Estación de Valdesquí (19 de marzo de 2017, Rascafría).jpg)
Image with no or wrong code (Estalla la primavera en Cristina Enea.jpg)
Image with no or wrong code (Fallas reducido1.jpg)
Image with no or wrong code (Fallas reducido10.jpg)
Image with no or wrong code (Fallas reducido11.jpg)
Image with no or wrong code (Fallas reducido12.jpg)
Image with no or wrong code (Fallas reducido2.jpg)
Image with no or wrong code (Fallas reducido3.jpg)
Image with no or wrong code (Fallas reducido4.jpg)
Image with no or wrong code (Fallas reducido6.jpg)
Image with no or wrong code (Fallas reducido7.jpg)
Image with no or wrong code (Fallas reducido8.jpg)
Image with no or wrong code (Fallas reducido9.jpg)


Retrieving --> 950 image descriptions downloaded
Retrieving --> 1000 image descriptions downloaded
Retrieving --> 1050 image descriptions downloaded


Image with no or wrong code (Henry Moore llega a La Zurriola.jpg)
Image with no or wrong code (Herce villa.jpg)


Retrieving --> 1100 image descriptions downloaded


Image with no or wrong code (Kubos de Moneo, edificio Kursal.jpg)


Retrieving --> 1150 image descriptions downloaded


Image with no or wrong code (Lago pequeño de Anzola.jpg)
Image with no or wrong code (Lago serreta.jpg)


Retrieving --> 1200 image descriptions downloaded


Image with no or wrong code (Llega la tormenta al puerto donostiarra.jpg)


Retrieving --> 1250 image descriptions downloaded


Image with no or wrong code (Mar negro.jpg)


Retrieving --> 1300 image descriptions downloaded


Image with no or wrong code (Mascletà La Canyada1.jpg)
Image with no or wrong code (Mascletà La Canyada10.jpg)
Image with no or wrong code (Mascletà La Canyada11.jpg)
Image with no or wrong code (Mascletà La Canyada12.jpg)
Image with no or wrong code (Mascletà La Canyada13.jpg)
Image with no or wrong code (Mascletà La Canyada14.jpg)
Image with no or wrong code (Mascletà La Canyada15.jpg)
Image with no or wrong code (Mascletà La Canyada16.jpg)
Image with no or wrong code (Mascletà La Canyada17.jpg)
Image with no or wrong code (Mascletà La Canyada18.jpg)
Image with no or wrong code (Mascletà La Canyada19.jpg)
Image with no or wrong code (Mascletà La Canyada2.jpg)
Image with no or wrong code (Mascletà La Canyada20.jpg)
Image with no or wrong code (Mascletà La Canyada21.jpg)
Image with no or wrong code (Mascletà La Canyada22.jpg)
Image with no or wrong code (Mascletà La Canyada23.jpg)
Image with no or wrong code (Mascletà La Canyada3.jpg)
Image with no or wrong code (Mascletà La Canyada5.j

Retrieving --> 1350 image descriptions downloaded
Retrieving --> 1400 image descriptions downloaded
Retrieving --> 1450 image descriptions downloaded


Image with no or wrong code (Médano.jpg)


Retrieving --> 1500 image descriptions downloaded


Image with no or wrong code (Niebla en el puerto de Getaria.jpg)
Image with no or wrong code (Observando la mar brava en el espigon de La Zurriola.jpg)


Retrieving --> 1550 image descriptions downloaded
Retrieving --> 1600 image descriptions downloaded


Image with no or wrong code (Paraje de chopos en el margen del río Palancia a su paso por la localidad de Viver, Alto Palancia, Castellón.jpg)


Retrieving --> 1650 image descriptions downloaded


Image with no or wrong code (Paseo nuevo donostiarra atardecer.jpg)
Image with no or wrong code (Pedret.jpg)
Image with no date (Pedret.jpg)


Retrieving --> 1700 image descriptions downloaded
Retrieving --> 1750 image descriptions downloaded
Retrieving --> 1800 image descriptions downloaded


Image with no or wrong code (Presa de Contreras (28 de enero de 2017, Minglanilla-Villargordo del Cabriel) 02.jpg)
Image with no or wrong code (Presa de Contreras (28 de enero de 2017, Minglanilla-Villargordo del Cabriel).jpg)
Image with no or wrong code (Puente santa catalina en otoño.jpg)


Retrieving --> 1850 image descriptions downloaded


Image with no or wrong code (Rasos en color.jpg)
Image with no or wrong code (Rasos nevados.jpg)


Retrieving --> 1900 image descriptions downloaded


Image with no or wrong code (Rio Tormes en Salamanca bajo el puente romano.jpg)
Image with no or wrong code (Rio Tormes en Salamanca con pesquera y puente de hierro.jpg)


Retrieving --> 1950 image descriptions downloaded


Image with no or wrong code (Roncesvalles 02.jpg)
Image with no or wrong code (Ruta molinos de odiel 1.jpg)


Retrieving --> 2000 image descriptions downloaded


Image with no or wrong code (Río de Anzola.jpg)


Retrieving --> 2050 image descriptions downloaded


Image with no or wrong code (Santa Engracia Gurp Conca Tremp.jpg)


Retrieving --> 2100 image descriptions downloaded
Retrieving --> 2150 image descriptions downloaded
Retrieving --> 2200 image descriptions downloaded
Retrieving --> 2250 image descriptions downloaded


Image with no or wrong code (SS2016. Ciudad Europea de la Cultura. Inauguración. Luz y sonido.jpg)
Image with no or wrong code (Sunset in Puerto Colon.jpg)
Image Tejeda9.jpg with wrong date (02/03/2017-1-1)
Image with no or wrong code (Telaraña entre viñas.jpg)
Image with no or wrong code (Tenerife amazing.jpg)
Image with no or wrong code (Tenerife clouds.jpg)
Image with no or wrong code (Tenerife moon.jpg)
Image with no or wrong code (Tenerife neighbor island.jpg)


Retrieving --> 2300 image descriptions downloaded


Image with no or wrong code (Tenerife neighbor islands.jpg)
Image with no or wrong code (Tenerife ocean.jpg)
Image with no or wrong code (Tenerife peacocks.jpg)
Image with no or wrong code (Tenerife red sunset.jpg)
Image with no or wrong code (Tenerife ship.jpg)
Image with no or wrong code (Tenerife sunrise.jpg)
Image with no or wrong code (Tenerife sunset.jpg)
Image with no or wrong code (Tres colores.jpg)
Image with no or wrong code (Un paraíso prepirenaico.jpg)


Retrieving --> 2350 image descriptions downloaded


Image with no or wrong code (Valdorba Sto cristo Catalain.jpg)


Retrieving --> 2400 image descriptions downloaded


In [None]:
# Summary statistics over every column of the collected records
# (include="all" covers the non-numeric columns too).
images_df.describe(include="all")

### Log creation

In [None]:
# Serialise the records as semicolon-separated text with no header row and no
# index column, wrap the text in <pre> tags and save it as the log page.
buf = StringIO()
images_df.to_csv(buf, sep=';', header=False, index=None, encoding='utf-8')
csv_text = buf.getvalue()

db_page = pb.Page(commons_site, LOG_PAGE)
db_page.text = u'<pre>\n' + csv_text + u'</pre>'

edit_summary = "{1} {0} in Spain: Contribution log update".format(YEAR, TAG)
db_page.save(edit_summary)