In [None]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""This notebook finds the elements of TAG in COUNTRY with
no picture or category"""

import inspect, os, sys

# Import pywikibot, falling back to a local checkout when it is not installed.
try:
    import pywikibot as pb
except ImportError:
    # Only a missing package triggers the fallback; any other failure raised
    # while importing pywikibot is a real error and must surface.
    # Make both the folder holding this code and its parent importable, in
    # case pywikibot lives in one of them.
    current_folder = os.path.realpath(
        os.path.abspath(os.path.dirname(inspect.getfile(inspect.currentframe())))
    )
    # dirname keeps the filesystem root intact, whereas joining the split
    # parts yields '' for a top-level folder.
    pywikibot_folder = os.path.dirname(current_folder)

    # The parent folder is inserted last, so it ends up first in sys.path.
    for candidate_folder in (current_folder, pywikibot_folder):
        if candidate_folder not in sys.path:
            sys.path.insert(0, candidate_folder)

    import pywikibot as pb

In [None]:
import pandas as pd
from io import StringIO
from mako.template import Template

In [None]:
# Contest editions whose upload logs are read
YEARS = [2015, 2016, 2017]

# Contest identifiers: short tag, full name and country
TAG = 'WLE'
TAG_EXT = 'Wiki Loves Earth'
COUNTRY = "Spain"

# Yearly page pattern: {0} = year, {1} = contest name, {2} = country
BASE_NAME = "Commons:Wiki Loves in {2}/{1}/{0}"
# One upload-log page per edition
LOG_PAGES = [
    "{0}/Log".format(BASE_NAME.format(year, TAG_EXT, COUNTRY))
    for year in YEARS
]

# Year-independent pages: the sites database (input) and the report (output)
BASE_SITE_DB_NAME = "Commons:Wiki Loves in {1}/{0}".format(TAG_EXT, COUNTRY)
SITE_DB_PAGE = BASE_SITE_DB_NAME + "/Sites DB"
REPORT_WLE_PAGE = BASE_SITE_DB_NAME + '/Report'

# Site handle for Commons; every pb.Page in this notebook is built from it
commons_site = pb.Site('commons', 'commons')

In [None]:
# Every annex title shares this prefix; only the trailing region part differs
_ANNEX_PREFIX = u'Anexo:Lugares de importancia comunitaria '

# (community code, region part of the annex title, English community name)
_ANNEX_ROWS = [
    ('ES-AN', u'de Andalucía', 'Andalusia'),
    ('ES-AR', u'de Aragón', 'Aragon'),
    ('ES-AS', u'de Asturias', 'Asturias'),
    ('ES-CB', u'de Cantabria', 'Cantabria'),
    ('ES-CM', u'de Castilla-La Mancha', 'Castile-La Mancha'),
    ('ES-CL', u'de Castilla y León', u'Castile and León'),
    ('ES-CT', u'de Cataluña', 'Catalonia'),
    ('ES-MD', u'de la Comunidad de Madrid', 'Community of Madrid'),
    ('ES-VC', u'de la Comunidad Valenciana', 'Valencian Community'),
    ('ES-EX', u'de Extremadura', 'Extremadura'),
    ('ES-IB', u'de las Islas Baleares', 'Balearic Islands'),
    ('ES-CN', u'de las Islas Canarias', 'Canary Islands'),
    ('ES-GA', u'de Galicia', 'Galicia'),
    ('ES-RI', u'de La Rioja', 'La Rioja'),
    ('ES-NC', u'de Navarra', 'Navarre'),
    ('ES-MC', u'de la Región de Murcia', 'Region of Murcia'),
    ('ES-PV', u'del País Vasco', 'Basque Country'),
    # Ceuta and Melilla share a single annex page
    ('ES-CE', u'de Ceuta y Melilla', 'Ceuta'),
    ('ES-ML', u'de Ceuta y Melilla', 'Melilla'),
    ('ES-MAGRAMA', u'del MAGRAMA', 'MAGRAMA'),
]

# community code -> [annex page title, English community name]
annexes = {
    community_code: [_ANNEX_PREFIX + region_part, english_name]
    for community_code, region_part, english_name in _ANNEX_ROWS
}

In [None]:
# Download the WLE SCI (site of community importance) database page
pb.output('Retrieving --> WLE site of community importance list')
site_list_page = pb.Page(commons_site, SITE_DB_PAGE)

# Only the text between the first and the last line break is parsed, i.e. the
# first and last lines of the page are dropped (presumably wrapper markup
# around the CSV payload -- confirm against the page).
site_db_wikitext = site_list_page.text
payload_start = site_db_wikitext.find('\n') + 1
payload_end = site_db_wikitext.rfind('\n')
site_list_text = StringIO(site_db_wikitext[payload_start:payload_end])

# Column names for the semicolon-separated payload, which carries no header
site_columns = [
    "name", "code", "magrama_url", "community",
    "bio_region", "continent", "min_altitude",
    "max_altitude", "avg_altitude", "longitude",
    "latitude", "area", "marine_percentage",
    "marine_area", "image", "commons_cat", "wikidata_id",
]
site_df = pd.read_csv(
    site_list_text,
    sep=";",
    index_col=False,
    names=site_columns,
)

pb.output('Retrieved --> WLE site of community importance list')

# Second entry of the annexes lookup for each site's community code
site_df["aut_com"] = site_df["community"].apply(
    lambda community_code: annexes[community_code][1]
)

In [None]:
# Number of sites that already have a Commons category
len(site_df[site_df['commons_cat'].notnull()])

In [None]:
# Number of sites that already have an image
len(site_df[site_df['image'].notnull()])

In [None]:
# Keep the sites that lack a Commons category, an image, or both
lacks_category = site_df['commons_cat'].isnull()
lacks_image = site_df['image'].isnull()
filtered_site_df = site_df[lacks_category | lacks_image]

In [None]:
# Number of sites lacking a category or an image
filtered_site_df.shape[0]

In [None]:
# Retrieval of images: read every yearly upload log and stack them into one
# frame indexed by upload timestamp.
image_columns = ['image_title', 'code', 'uploader', 'uploader_registration',
                 'timestamp', 'date', 'size', 'height', 'width', 'qi', 'finalist']

pb.output('Retrieving --> {0} in {1} images list from cache'.format(TAG, COUNTRY))
yearly_frames = []
for log_page in LOG_PAGES:
    list_page = pb.Page(commons_site, log_page)
    # Only the text between the first and the last line break is parsed
    log_wikitext = list_page.text
    list_page_text = StringIO(
        log_wikitext[log_wikitext.find('\n') + 1:log_wikitext.rfind('\n')]
    )
    yearly_df = pd.read_csv(list_page_text,
                            sep=";",
                            index_col=False,
                            names=image_columns)
    yearly_frames.append(yearly_df)

# Concatenate once instead of growing the frame inside the loop: this avoids
# re-copying all rows per year and concatenating with an empty seed frame.
# pd.concat rejects an empty list, so fall back to an empty frame.
if yearly_frames:
    images_df = pd.concat(yearly_frames)
else:
    images_df = pd.DataFrame(columns=image_columns)
pb.output('Retrieved --> {0} in {1} images list from cache'.format(TAG, COUNTRY))

images_df['timestamp'] = pd.to_datetime(images_df['timestamp'], format="%Y-%m-%d %H:%M:%S")

# Index by upload time and drop the index label. Assigning None replaces
# `del images_df.index.name`, which raises AttributeError on recent pandas.
images_df = images_df.set_index("timestamp")
images_df.index.name = None

total_images_length = len(images_df)
total_images_length

In [None]:
# Images carrying a site code that belongs to one of the incomplete sites
incomplete_site_codes = filtered_site_df['code'].values
has_site_code = images_df['code'].notnull()
in_incomplete_site = images_df['code'].isin(incomplete_site_codes)
filtered_images_df = images_df[has_site_code & in_incomplete_site]

In [None]:
# Distinct site codes found among the filtered images
filtered_images_df['code'].unique()

In [None]:
# Incomplete sites that received at least one photograph; missing values are
# blanked so the report template can test string lengths
codes_with_images = filtered_images_df['code'].unique()
site_has_images = filtered_site_df['code'].isin(codes_with_images)
missing_sites_df = filtered_site_df[site_has_images].fillna('')

# First entry of the annexes lookup (the annex page title) per community code
missing_sites_df['annex'] = missing_sites_df["community"].apply(
    lambda community_code: annexes[community_code][0]
)

In [None]:
# Mako template for the wiki report: one table row per site in `df`, showing
# whether its category and image fields are filled in and linking to the
# category of photographs uploaded for its code. The contest tag, contest
# name and country are passed in rather than hard-coded, so the text follows
# the TAG / TAG_EXT / COUNTRY configuration.
template = """==Missing sites of community importance in ${tag}==
<center>
{| class="wikitable sortable" style="width:65%; font-size:89%; margin-top:0.5em;"
|- valign="middle"
! Number
! Site of community importance
! Autonomous community
! Has category in annexes
! Has image in annexes
! Uploaded photographs
% for index, row in df.iterrows() :
|-
| ${index+1}
| ${row['name']} (${row["code"]})
| [[:es:${row['annex']}|${row['aut_com']}]]
%if len (row['commons_cat']) > 0 :
| align="center" | {{OK}}<br/><small>[[:Category:${row['commons_cat']}|${row['commons_cat']}]]</small>
%else :
| align="center" | {{NotOK}}
%endif
%if len (row['image']) > 0 :
| align="center" | {{OK}}
%else :
| align="center" | {{NotOK}}
%endif
| [[:Category:Images of a site of community importance with code ${row['code']} from ${tag_ext} in ${country}|Category]]
% endfor
|}
</center>
"""

# Named `template_context` rather than `vars`, so the builtin vars() is not
# shadowed for the rest of the kernel session.
template_context = {
    # reset_index() gives a 0-based running index, used for row numbering
    "df": missing_sites_df.reset_index(),
    "tag": TAG,
    "tag_ext": TAG_EXT,
    "country": COUNTRY,
}
report_template = Template(template)
report_text = report_template.render(**template_context)

In [None]:
# Publish the rendered report, overwriting the text of the report page.
report_page = pb.Page(commons_site, REPORT_WLE_PAGE)
report_page.text = report_text
# Use COUNTRY instead of a hard-coded "Spain", like the other log messages
pb.output('Publishing --> {0} in {1} Report'.format(TAG, COUNTRY))
# The argument to save() is the edit summary
report_page.save("{0} in {1} report".format(TAG, COUNTRY))