In [1]:
#!/usr/bin/python
# -*- coding: latin-1 -*-
"""This notebook builds the CSV-like file describing the contributions to WLE in Spain in YEAR"""

import os, sys, inspect

try :
    import pywikibot as pb
    from pywikibot import pagegenerators, textlib
    import mwparserfromhell as mwh

except :
    current_folder = os.path.realpath(os.path.abspath(os.path.split(inspect.getfile(inspect.currentframe()))[0]))
    folder_parts = current_folder.split(os.sep)
    pywikibot_folder = os.sep.join(folder_parts[0:-1])

    if current_folder not in sys.path:
        sys.path.insert(0, current_folder)
    if pywikibot_folder not in sys.path:
        sys.path.insert(0, pywikibot_folder)

    import pywikibot as pb
    from pywikibot import pagegenerators, textlib
    import mwparserfromhell as mwh

from io import StringIO
import pandas as pd
import numpy as np
import re

from datetime import datetime, timedelta

In [2]:
YEAR                = 2017
TAG                 = 'WLE'
TAG_EXT             = 'Wiki Loves Earth'
TEMPLATE            = 'lic'

BASE_NAME           = "Commons:Wiki Loves in Spain/{1}/{0}".format(YEAR, TAG_EXT)
LOG_PAGE            = BASE_NAME + u"/Log"

WLE_CATEGORY          = "Category:Images from {1} {0} in Spain".format(YEAR, TAG_EXT)
WLE_FINALIST_CATEGORY = "Category:Evaluation of images from {1} {0} in Spain - Final".format(YEAR, TAG_EXT)

commons_site = pb.Site("commons", "commons")

In [3]:
cat_wle = pb.Category(commons_site, WLE_CATEGORY)
gen_wle = pagegenerators.CategorizedPageGenerator(cat_wle)

images_wle = [page.title(withNamespace=False) for page in gen_wle if page.is_filepage()]
len(images_wle)

2500

In [4]:
images_df = pd.DataFrame(
    columns=['image_title', 'lic_id', 'uploader', 'uploader_registration', 
             'timestamp', 'date', 'size', 'height', 'width', 'qi', 'finalist'])

uploaders = {}

for image_counter, image in enumerate(images_wle):
    if (image_counter != 0) and (image_counter % 50 == 0) :
        pb.output ('Retrieving --> %d image descriptions downloaded' %(image_counter))
    image_row = {'image_title': None, 
                 'lic_id': '', 
                 'uploader': None,
                 'uploader_registration': None,
                 'timestamp': None,
                 'date': None,
                 'size': 0,
                 'height': 0,
                 'width': 0,
                 'qi': None,
                 'finalist': None}

    page = pb.FilePage(commons_site, image)
    image_row["image_title"] = page.title(withNamespace=False)
    
    fileinfo = page.latest_file_info
    image_row["size"] = fileinfo['size']
    image_row["height"] = fileinfo['height']
    image_row["width"] = fileinfo['width']
    
    text = page.text
    wikicode = mwh.parse(text)
    templates = wikicode.filter_templates()

    info_templates = [template for template in wikicode.filter_templates() 
                      if template.name.lower().strip() == "information"]
    
    # code management
    wle_templates = [template for template in wikicode.filter_templates() 
                    if template.name.lower().strip() == TEMPLATE]
    if len(wle_templates) > 0:
        image_row["lic_id"] = wle_templates[0].get(1).value.strip()
    
    # date management
    if info_templates[0].has("date"):
        try :
            datetime_object = datetime.strptime(info_templates[0].get("date").value.strip().split(' ')[0], '%Y-%m-%d')
            image_row["date"] = info_templates[0].get("date").value.strip().split(' ')[0]
        except :
            print ("Image with wrong date ({0})".format(image_row["image_title"]))
            pass
    elif info_templates[0].has("Date"):
        try :
            datetime_object = datetime.strptime(info_templates[0].get("Date").value.strip().split(' ')[0], '%Y-%m-%d')
            image_row["date"] = info_templates[0].get("Date").value.strip().split(' ')[0]
        except :
            print ("Image with wrong date ({0})".format(image_row["image_title"]))
            pass
    else :
        print ("Image with no date ({0})".format(image_row["image_title"]))

    # uploader management
    creation = page.oldest_revision
    image_row["uploader"] = creation["user"]
    
    # uploader registration management
    if image_row["uploader"] not in uploaders :
        try:
            # Too old users do not have a user registration time
            user_registration = pb.User(commons_site, title=creation["user"]).registration()
            image_row["uploader_registration"] = user_registration.strftime("%Y-%m-%d")
        except :
            image_row["uploader_registration"] = '2006-01-01'
        uploaders[image_row["uploader"]] = image_row["uploader_registration"]
    else :
        image_row["uploader_registration"] = uploaders[image_row["uploader"]]
        
    # upload time management
    creation_time = creation.timestamp + timedelta(hours=2)
    image_row["timestamp"] = creation_time.strftime("%Y-%m-%d %H:%M:%S")
    
    # quality status management
    qi_templates = [template for template in wikicode.filter_templates() 
                   if template.name.lower().strip() == "qualityimage"]    
    if len(qi_templates) > 0:
        image_row["qi"] = 'qi'
        
    # finalist status management
    cats = [cat.title(withNamespace=True) for cat in textlib.getCategoryLinks(text)]
    if WLE_FINALIST_CATEGORY in cats:
        image_row["finalist"] = 'finalist'

    images_df = images_df.append(image_row, ignore_index=True)

Retrieving --> 50 image descriptions downloaded
Retrieving --> 100 image descriptions downloaded
Retrieving --> 150 image descriptions downloaded
Retrieving --> 200 image descriptions downloaded


Image with wrong date (Burgui.jpg)


Retrieving --> 250 image descriptions downloaded
Retrieving --> 300 image descriptions downloaded
Retrieving --> 350 image descriptions downloaded
Retrieving --> 400 image descriptions downloaded


Image with no date (Caserios al amanecer en las faldas de Peñas de Aia.jpg)


Retrieving --> 450 image descriptions downloaded
Retrieving --> 500 image descriptions downloaded


Image with wrong date (Cola del embalse de Picadas 001.jpg)
Image with wrong date (Cola del embalse de Picadas 002.jpg)


Retrieving --> 550 image descriptions downloaded
Retrieving --> 600 image descriptions downloaded
Retrieving --> 650 image descriptions downloaded
Retrieving --> 700 image descriptions downloaded
Retrieving --> 750 image descriptions downloaded
Retrieving --> 800 image descriptions downloaded
Retrieving --> 850 image descriptions downloaded
Retrieving --> 900 image descriptions downloaded


Image with wrong date (Faro de Cabo Vilán 3n 1996 - Camariñas.jpg)
Image with wrong date (Faro de Cabo Vilán en 1996 - Camariñas.jpg)


Retrieving --> 950 image descriptions downloaded


Image with wrong date (Foz de Arbaiun panorama.jpg)
Image with wrong date (Foz de Arbaiun rio.jpg)


Retrieving --> 1000 image descriptions downloaded


Image with wrong date (Gaztelugatxe (1).jpg)


Retrieving --> 1050 image descriptions downloaded
Retrieving --> 1100 image descriptions downloaded
Retrieving --> 1150 image descriptions downloaded


Image with wrong date (Ladera Irumugarrieta Malloak de Aralar.jpg)


Retrieving --> 1200 image descriptions downloaded


Image with wrong date (Lagunas de Laguardia 01.jpg)
Image with wrong date (Lagunas de Laguardia 02.jpg)
Image with wrong date (Lagunas de Laguardia 03.jpg)
Image with wrong date (Larra Pic Anie.jpg)


Retrieving --> 1250 image descriptions downloaded
Retrieving --> 1300 image descriptions downloaded
Retrieving --> 1350 image descriptions downloaded
Retrieving --> 1400 image descriptions downloaded
Retrieving --> 1450 image descriptions downloaded
Retrieving --> 1500 image descriptions downloaded
Retrieving --> 1550 image descriptions downloaded
Retrieving --> 1600 image descriptions downloaded
Retrieving --> 1650 image descriptions downloaded
Set iilimit = ['250']
Retrieving --> 1700 image descriptions downloaded


Image with no date (Pedret.jpg)


Retrieving --> 1750 image descriptions downloaded
Retrieving --> 1800 image descriptions downloaded


Image with wrong date (Prado junto a la Garganta Torinas y la CM-5006.jpg)
Image with wrong date (Puente de San Juan 001.jpg)
Image with wrong date (Puente de San Juan 002.jpg)
Image with wrong date (Puente de San Juan 003.jpg)
Image with wrong date (Puente de San Juan 004.jpg)
Image with wrong date (Puente de San Juan 005.jpg)
Image with wrong date (Puente viejo de Lanzahíta 001.JPG)
Image with wrong date (Puente viejo de Lanzahíta 002.JPG)
Image with wrong date (Puente viejo de Lanzahíta 003.JPG)
Image with wrong date (Puente viejo de Lanzahíta 004.JPG)
Image with wrong date (Puente viejo de Lanzahíta 005.JPG)
Image with wrong date (Puente viejo de Lanzahíta 006.JPG)
Image with wrong date (Puente viejo de Lanzahíta 007.JPG)


Retrieving --> 1850 image descriptions downloaded
Retrieving --> 1900 image descriptions downloaded
Retrieving --> 1950 image descriptions downloaded


Image with wrong date (Roncesvalles 01.jpg)
Image with wrong date (Roncesvalles 02.jpg)


Retrieving --> 2000 image descriptions downloaded


Image with wrong date (Río Tiétar 005.jpg)
Image with wrong date (Río Tiétar 006.jpg)
Image with wrong date (Río Tiétar 007.jpg)
Image with wrong date (Río Tiétar 008.jpg)
Image with wrong date (Río Tiétar 009.jpg)


Retrieving --> 2050 image descriptions downloaded
Retrieving --> 2100 image descriptions downloaded


Image with wrong date (San-Miguel-de-Aralar.jpg)


Retrieving --> 2150 image descriptions downloaded


Image with wrong date (Sierra Cantabria 01.jpg)


Retrieving --> 2200 image descriptions downloaded


Image with wrong date (Sierra Cantabria carboneras.jpg)
Image with wrong date (Sierra Cantabria Davalillo.jpg)
Image with wrong date (Sierra Cantabria octubre.jpg)
Image with wrong date (Sierra Cantabria Paso del Toro.jpg)
Image with wrong date (Sierra Entzia 01.jpg)
Image with wrong date (Sierra Entzia 02.jpg)


Retrieving --> 2250 image descriptions downloaded


Image with wrong date (Tejeda9.jpg)


Retrieving --> 2300 image descriptions downloaded
Retrieving --> 2350 image descriptions downloaded


Image with wrong date (Urbasa 01.jpg)
Image with wrong date (Urbasa 02.jpg)


Retrieving --> 2400 image descriptions downloaded
Retrieving --> 2450 image descriptions downloaded


In [5]:
images_df.describe(include="all")

Unnamed: 0,image_title,lic_id,uploader,uploader_registration,timestamp,date,size,height,width,qi,finalist
count,2500,2500.0,2500,2500,2500,2454,2500,2500,2500,0.0,137
unique,2500,311.0,191,75,1777,562,2500,486,417,0.0,1
top,Mascletà La Canyada13.jpg,,Fedoce1,2017-05-23,2017-05-30 18:36:33,2017-05-27,94207,3456,3648,,finalist
freq,1,103.0,192,411,3,91,1,159,171,,137


In [6]:
buf = StringIO()
images_df.to_csv(buf, index=None, sep=';', encoding='utf-8', header=False)

db_page = pb.Page(commons_site, LOG_PAGE)
db_page.text = u'<pre>\n' + buf.getvalue() + u'</pre>'
db_page.save("{1} {0} in Spain: Contribution log update".format(YEAR, TAG))

Page [[commons:Commons:Wiki Loves in Spain/Wiki Loves Earth/2017/Log]] saved
