In [None]:
#!/usr/bin/python
# -*- coding: latin-1 -*-
"""This notebook builds the CSV-like file describing the contributions to WLF in Spain in YEAR"""

import os, sys, inspect

# Import pywikibot and mwparserfromhell. If they cannot be imported from the
# default module search path, add this file's folder and its parent folder to
# sys.path and try once more.
try:
    import pywikibot as pb
    from pywikibot import pagegenerators, textlib
    import mwparserfromhell as mwh

except ImportError:
    # Only a failed import triggers the fallback; any other error (including
    # KeyboardInterrupt) propagates instead of being silently retried.
    current_folder = os.path.realpath(os.path.abspath(os.path.split(inspect.getfile(inspect.currentframe()))[0]))
    folder_parts = current_folder.split(os.sep)
    # Parent of the current folder (every path component except the last one).
    pywikibot_folder = os.sep.join(folder_parts[0:-1])

    if current_folder not in sys.path:
        sys.path.insert(0, current_folder)
    if pywikibot_folder not in sys.path:
        sys.path.insert(0, pywikibot_folder)

    # Retry; a second failure is raised to the caller.
    import pywikibot as pb
    from pywikibot import pagegenerators, textlib
    import mwparserfromhell as mwh

from io import StringIO
import pandas as pd
import numpy as np
import re

from datetime import datetime, timedelta

In [None]:
# Contest edition (year) substituted into the page and category names below.
YEAR                = 2017
# Short and long names of the contest; the short one is used in the edit
# summary, the long one in page and category titles.
TAG                 = 'WLF'
TAG_EXT             = 'Wiki Loves Folk'

# Base project page, formatted as "Commons:<TAG_EXT>/<YEAR>", and its "/Log"
# subpage, which is the page the contribution log is saved to.
BASE_WLF_NAME       = "Commons:{1}/{0}".format(YEAR, TAG_EXT)
LOG_PAGE            = BASE_WLF_NAME + u"/Log"

# Category whose file pages are listed, and the category that marks finalists.
WLF_CATEGORY          = "Category:Images from {1} {0} in Spain".format(YEAR, TAG_EXT)
WLF_FINALIST_CATEGORY = "Category:Images from {1} {0} in Spain (finalists)".format(YEAR, TAG_EXT)

# pywikibot Site object (code "commons", family "commons") used for every page access.
commons_site = pb.Site("commons", "commons")

In [None]:
# List the members of the contest category and keep the titles (without the
# namespace prefix) of those that are file pages.
cat_wlf = pb.Category(commons_site, WLF_CATEGORY)
gen_wlf = pagegenerators.CategorizedPageGenerator(cat_wlf)

images_wlf = [
    file_page.title(withNamespace=False)
    for file_page in filter(lambda member: member.is_filepage(), gen_wlf)
]

# Cell output: number of images found.
len(images_wlf)

In [None]:
# Build `images_df`: one row per image in `images_wlf`, describing the file
# (size, dimensions), its upload (uploader, timestamp, uploader registration
# date) and what its description page says (date, contest template argument,
# quality-image template, finalist category).

IMAGE_COLUMNS = ['image_title', 'wikidata_id', 'uploader', 'uploader_registration',
                 'timestamp', 'date', 'size', 'height', 'width', 'qi', 'finalist']


def _templates_named(templates, name):
    """Return the templates whose name, lower-cased and stripped, equals ``name``."""
    return [template for template in templates
            if template.name.lower().strip() == name]


# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append is quadratic in a loop and no longer exists in pandas >= 2.0.
image_rows = []

for image_counter, image in enumerate(images_wlf):
    # Progress message every 50 images.
    if (image_counter != 0) and (image_counter % 50 == 0):
        pb.output('Retrieving --> %d image descriptions downloaded' % (image_counter))
    image_row = {'image_title': None,
                 'wikidata_id': '',
                 'uploader': None,
                 'uploader_registration': None,
                 'timestamp': None,
                 'date': None,
                 'size': 0,
                 'height': 0,
                 'width': 0,
                 'qi': None,
                 'finalist': None}

    page = pb.FilePage(commons_site, image)
    image_row["image_title"] = page.title(withNamespace=False)

    # Metadata of the latest uploaded version of the file.
    fileinfo = page.latest_file_info
    image_row["size"] = fileinfo['size']
    image_row["height"] = fileinfo['height']
    image_row["width"] = fileinfo['width']

    # Parse the description page once and reuse the template list below.
    text = page.text
    wikicode = mwh.parse(text)
    templates = wikicode.filter_templates()

    info_templates = _templates_named(templates, "information")
    try:
        # Keep only the part of the "date" parameter before the first space.
        image_row["date"] = info_templates[0].get("date").value.strip().split(' ')[0]
    except (IndexError, ValueError):
        # No {{Information}} template (IndexError) or no "date" parameter
        # (ValueError): print the title so the page can be checked by hand.
        print(image_row["image_title"])

    wlf_templates = _templates_named(templates, "wlf")
    # A contest template without a first positional parameter leaves the
    # identifier empty instead of aborting the whole run.
    if wlf_templates and wlf_templates[0].has(1):
        image_row["wikidata_id"] = wlf_templates[0].get(1).value.strip()

    # The oldest revision of the file page identifies the uploader.
    creation = page.oldest_revision
    image_row["uploader"] = creation["user"]

    # NOTE(review): the +2 h shift presumably converts the UTC revision
    # timestamp to Spanish summer time (CEST) -- confirm.
    creation_time = creation.timestamp + timedelta(hours=2)
    image_row["timestamp"] = creation_time.strftime("%Y-%m-%d %H:%M:%S")

    try:
        # Too old users do not have a user registration time
        user_registration = pb.User(commons_site, title=creation["user"]).registration()
        image_row["uploader_registration"] = user_registration.strftime("%Y-%m-%d")
    except Exception:
        # Best effort: any lookup failure falls back to a fixed early date,
        # while KeyboardInterrupt/SystemExit are no longer swallowed.
        image_row["uploader_registration"] = '2006-01-01'

    qi_templates = _templates_named(templates, "qualityimage")
    if qi_templates:
        image_row["qi"] = 'qi'

    # Resolve category links against Commons explicitly so the titles carry
    # the Commons namespace name whatever the default pywikibot site is.
    cats = [cat.title(withNamespace=True)
            for cat in textlib.getCategoryLinks(text, site=commons_site)]
    if WLF_FINALIST_CATEGORY in cats:
        image_row["finalist"] = 'finalist'

    image_rows.append(image_row)

# `columns` fixes the column order and keeps the columns even with no rows.
images_df = pd.DataFrame(image_rows, columns=IMAGE_COLUMNS)

In [None]:
# Serialise the contribution table as ';'-separated text with neither a header
# row nor an index column, then publish it wrapped in <pre> tags on the
# contest's log page.
buf = StringIO()
# `index` is documented as a bool; pass False explicitly rather than relying
# on None being falsy.
images_df.to_csv(buf, index=False, sep=';', encoding='utf-8', header=False)

db_page = pb.Page(commons_site, LOG_PAGE)
db_page.text = u'<pre>\n' + buf.getvalue() + u'</pre>'
# Saving the page performs the edit, with this text as the edit summary.
db_page.save("{1} {0} in Spain: Contribution log update".format(YEAR, TAG))