In [None]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""This notebook creates the to-do list for TAG YEAR in COUNTRY"""

import inspect, os, sys

# Import pywikibot, falling back to a copy reachable from the folder of the
# running code (or from its parent folder) when it is not on sys.path.
try:
    import pywikibot as pb
except ImportError:
    # Only a failed import triggers the fallback: a bare ``except`` would also
    # swallow KeyboardInterrupt and errors unrelated to the module lookup.
    current_folder = os.path.realpath(
        os.path.abspath(os.path.split(inspect.getfile(inspect.currentframe()))[0])
    )
    folder_parts = current_folder.split(os.sep)
    pywikibot_folder = os.sep.join(folder_parts[:-1])  # parent of current_folder

    # Both candidates go to the front of sys.path; the parent folder is
    # inserted last and therefore takes precedence.
    if current_folder not in sys.path:
        sys.path.insert(0, current_folder)
    if pywikibot_folder not in sys.path:
        sys.path.insert(0, pywikibot_folder)

    import pywikibot as pb

In [None]:
import pandas as pd
from io import StringIO
from mako.template import Template
from datetime import datetime

In [None]:
# Campaign parameters: the contest edition this notebook works on.
YEAR                = 2017
TAG                 = 'WLE'
TAG_EXT             = 'Wiki Loves Earth'
COUNTRY             = "Spain"

# Country-level root page, shared by every yearly edition of the contest.
BASE_SITE_DB_NAME   = "Commons:Wiki Loves in {1}/{0}".format(TAG_EXT, COUNTRY)
SITE_DB_PAGE        = BASE_SITE_DB_NAME + "/Sites DB"

# Root page of this particular edition and the subpages used by the notebook.
BASE_NAME           = "{0}/{1}".format(BASE_SITE_DB_NAME, YEAR)
LOG_PAGE            = BASE_NAME + '/Log'
TODO_WLE_PAGE       = BASE_NAME + '/To-do'

# pywikibot site object for 'commons'; the pb.Page / pb.FilePage objects
# created further down in this notebook are all bound to it.
commons_site = pb.Site('commons', 'commons')

In [None]:
# Lookup table keyed by the community code found in the sites database.
# Each value is a two-item list:
#   [0] title of the Spanish-language annex page listing the sites of the region
#   [1] English name of the region
# Ceuta and Melilla share a single annex page.
annexes = {
    'ES-AN':      ['Anexo:Lugares de importancia comunitaria de Andalucía', 'Andalusia'],
    'ES-AR':      ['Anexo:Lugares de importancia comunitaria de Aragón', 'Aragon'],
    'ES-AS':      ['Anexo:Lugares de importancia comunitaria de Asturias', 'Asturias'],
    'ES-CB':      ['Anexo:Lugares de importancia comunitaria de Cantabria', 'Cantabria'],
    'ES-CM':      ['Anexo:Lugares de importancia comunitaria de Castilla-La Mancha', 'Castile-La Mancha'],
    'ES-CL':      ['Anexo:Lugares de importancia comunitaria de Castilla y León', 'Castile and León'],
    'ES-CT':      ['Anexo:Lugares de importancia comunitaria de Cataluña', 'Catalonia'],
    'ES-MD':      ['Anexo:Lugares de importancia comunitaria de la Comunidad de Madrid', 'Community of Madrid'],
    'ES-VC':      ['Anexo:Lugares de importancia comunitaria de la Comunidad Valenciana', 'Valencian Community'],
    'ES-EX':      ['Anexo:Lugares de importancia comunitaria de Extremadura', 'Extremadura'],
    'ES-IB':      ['Anexo:Lugares de importancia comunitaria de las Islas Baleares', 'Balearic Islands'],
    'ES-CN':      ['Anexo:Lugares de importancia comunitaria de las Islas Canarias', 'Canary Islands'],
    'ES-GA':      ['Anexo:Lugares de importancia comunitaria de Galicia', 'Galicia'],
    'ES-RI':      ['Anexo:Lugares de importancia comunitaria de La Rioja', 'La Rioja'],
    'ES-NC':      ['Anexo:Lugares de importancia comunitaria de Navarra', 'Navarre'],
    'ES-MC':      ['Anexo:Lugares de importancia comunitaria de la Región de Murcia', 'Region of Murcia'],
    'ES-PV':      ['Anexo:Lugares de importancia comunitaria del País Vasco', 'Basque Country'],
    'ES-CE':      ['Anexo:Lugares de importancia comunitaria de Ceuta y Melilla', 'Ceuta'],
    'ES-ML':      ['Anexo:Lugares de importancia comunitaria de Ceuta y Melilla', 'Melilla'],
    'ES-MAGRAMA': ['Anexo:Lugares de importancia comunitaria del MAGRAMA', 'MAGRAMA'],
}

In [None]:
# Folder management (templates, images...)
# Everything is resolved against the directory the notebook is run from.
cwd = os.getcwd()
templates_directory = os.path.join(cwd, 'templates')

In [None]:
# Load the WLE SCI (site of community importance) database page into a DataFrame.
pb.output('Retrieving --> WLE site of community importance list')
site_list_page = pb.Page(commons_site, SITE_DB_PAGE)

# Keep only the text between the first and the last line break, i.e. drop the
# first and the last line of the page (presumably markup wrapping the
# semicolon-separated rows -- TODO confirm against the page).
site_list_raw = site_list_page.text
first_break = site_list_raw.find('\n')
last_break = site_list_raw.rfind('\n')
site_list_text = StringIO(site_list_raw[first_break + 1:last_break])

# The page carries no header row, so the column names are supplied here.
site_columns = [
    "name", "code", "magrama_url", "community",
    "bio_region", "continent", "min_altitude",
    "max_altitude", "avg_altitude", "longitude",
    "latitude", "area", "marine_percentage",
    "marine_area", "image", "commons_cat", "wikidata_id",
]
site_df = pd.read_csv(site_list_text, sep=";", index_col=False, names=site_columns)

pb.output('Retrieved --> WLE site of community importance list')

# English region name, looked up from the community code of each site.
site_df["aut_com"] = site_df["community"].apply(lambda community: annexes[community][1])

In [None]:
# Number of sites that already have a Commons category
int(site_df['commons_cat'].notnull().sum())

In [None]:
# Number of sites that already have an image
int(site_df['image'].notnull().sum())

In [None]:
# Sites that lack a Commons category, an image, or both
lacks_cat_or_image = site_df[['commons_cat', 'image']].isnull().any(axis=1)
filtered_site_df = site_df[lacks_cat_or_image]

In [None]:
# Number of sites still lacking a category or an image
filtered_site_df.shape[0]

In [None]:
# Load the log page of the contest images into a DataFrame.
pb.output('Retrieving --> {1} {0} in {2} images list from cache'.format(YEAR, TAG, COUNTRY))
list_page = pb.Page(commons_site, LOG_PAGE)

# Drop the first and the last line of the page and parse what lies in between.
log_raw = list_page.text
log_start = log_raw.find('\n') + 1
log_end = log_raw.rfind('\n')
list_page_text = StringIO(log_raw[log_start:log_end])

# The log has no header row, so the column names are supplied here.
image_columns = [
    'image_title', 'code',
    'uploader', 'uploader_registration',
    'timestamp', 'date', 'size',
    'height', 'width', 'qi',
    'finalist',
]
images_df = pd.read_csv(list_page_text, sep=";", index_col=False, names=image_columns)
# Missing values become empty strings from here on.
images_df = images_df.fillna('')
pb.output('Retrieved --> {1} {0} in {2} images list from cache'.format(YEAR, TAG, COUNTRY))

total_images_length = images_df.shape[0]
total_images_length

In [None]:
# Images whose site code belongs to a site still lacking a category or an image.
# images_df was loaded with fillna(''), so an image without a code holds an empty
# string rather than NaN: a null check alone never filters anything. Blank codes
# are therefore excluded explicitly instead of relying on the membership test.
has_code = images_df['code'].notnull() & (images_df['code'] != '')
filtered_images_df = images_df[has_code & images_df['code'].isin(filtered_site_df['code'].values)]

In [None]:
# Distinct site codes that appear among the filtered images
pd.unique(filtered_images_df['code'])

In [None]:
# Sites lacking a category or an image for which the contest produced pictures.
# Missing values are blanked, then the annex page title of each site's
# community is attached as an extra 'annex' column.
missing_sites_df = (
    filtered_site_df[filtered_site_df['code'].isin(filtered_images_df['code'].unique())]
    .fillna('')
    .assign(annex=lambda sites: sites["community"].apply(lambda x: annexes[x][0]))
)

In [None]:
# For every contest image, collect the titles (without namespace) of the
# non-hidden categories it carries that do not exist as pages.
lost_cats = []
for image_counter, row in images_df.iterrows():
    page = pb.FilePage(commons_site, row["image_title"])
    # The value is not used afterwards; NOTE(review): presumably accessing
    # .text forces the page content to be fetched -- confirm before removing.
    text = page.text
    if image_counter and not image_counter % 50:
        pb.output('Reviewing --> %d image pages downloaded' % image_counter)

    # Materialise the category list before querying each category.
    cats = list(page.categories())
    for cat in cats:
        if cat.isHiddenCategory() or cat.exists():
            continue
        lost_cats.append(cat.title(withNamespace=False))

In [None]:
# Read the wiki template of the to-do page from the templates folder.
template_file = os.path.join(templates_directory, 'todo.wiki')
# The context manager closes the file even if reading raises.
with open(template_file, 'r', encoding="utf-8") as fh:
    template = fh.read()

In [None]:
# Distinct missing categories; sorted so the displayed list is stable between
# runs (plain set iteration order is not).
sorted(set(lost_cats))

In [None]:
# Values handed to the mako template of the to-do page.
# The dict is not called ``vars`` so that the builtin of that name stays usable.
render_time = datetime.now()
template_vars = {
    # Sorted: set iteration order changes between runs, which would alter the
    # rendered text even when the set of missing categories is unchanged.
    "lost_categories": sorted(set(lost_cats)),
    "missing_df": missing_sites_df,
    "todo_page": TODO_WLE_PAGE,
    "tag": TAG,
    "full_tag": TAG_EXT,
    "year": YEAR,
    "country": COUNTRY,
    # Month name, day without leading zero, year. Built without the "%-d"
    # strftime directive, which is a platform extension and not portable.
    "date": "{0:%B} {0.day}, {0:%Y}".format(render_time),
}
todo_text = Template(template).render(**template_vars)

In [None]:
# Publish the rendered to-do list, but only when it differs from what the page
# already holds (leading and trailing whitespace ignored).
todo_page = pb.Page(commons_site, TODO_WLE_PAGE)
page_is_outdated = todo_page.text.strip() != todo_text.strip()
if page_is_outdated:
    pb.output('Publishing --> {1} {0} in {2} To-do List'.format(YEAR, TAG, COUNTRY))
    todo_page.text = todo_text
    todo_page.save("{1} {0} in {2} to-do list".format(YEAR, TAG, COUNTRY))