In [None]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This notebook adds the WLE per contributor, per site and per autonomous 
community categories to all the files in the WLE log.
"""

import os, sys, inspect

try :
    import pywikibot as pb
    from pywikibot import pagegenerators, textlib
    import mwparserfromhell as mwh

except :
    current_folder = os.path.realpath(os.path.abspath(os.path.split(inspect.getfile(inspect.currentframe()))[0]))
    folder_parts = current_folder.split(os.sep)
    pywikibot_folder = os.sep.join(folder_parts[0:-1])

    if current_folder not in sys.path:
        sys.path.insert(0, current_folder)
    if pywikibot_folder not in sys.path:
        sys.path.insert(0, pywikibot_folder)

    import pywikibot as pb
    from pywikibot import pagegenerators, textlib
    import mwparserfromhell as mwh
    
import numpy as np
import pandas as pd
from io import StringIO
import re
from mako.template import Template

In [None]:
# Contest edition being processed plus the short and long contest names;
# these are interpolated into page titles, category names and edit summaries.
YEAR                = 2015
TAG                 = 'WLE'
TAG_EXT             = 'Wiki Loves Earth'

# With the values above this evaluates to
# "Commons:Wiki Loves in Spain/Wiki Loves Earth/2015".
BASE_NAME           = "Commons:Wiki Loves in Spain/{1}/{0}".format(YEAR, TAG_EXT)
# Page holding the ';'-separated log of uploaded images.
LOG_PAGE            = BASE_NAME + '/Log'
# NOTE(review): this title hard-codes 2016 while YEAR is 2015 -- confirm that
# the 2016 page is really the intended home of the site database.
BASE_SITE_DB_NAME   = "Commons:Wiki Loves Earth 2016 in Spain"
SITE_DB_PAGE        = BASE_SITE_DB_NAME + "/SCI DB"

# Site handle passed to every pywikibot Page/Category built in this notebook.
commons_site = pb.Site("commons", "commons")

In [None]:
# Lookup table from community code to the English name used in category
# titles. get_community() reads it to translate the `community` column of the
# site list.
# NOTE(review): 'ES-MAGRAMA' is not a regional code like the others --
# presumably it marks sites managed by the ministry; confirm.
communities = {
    'ES-AN': u'Andalusia',
    'ES-AR': u'Aragon',
    'ES-AS': u'Asturias',
    'ES-CB': u'Cantabria',
    'ES-CM': u'Castile-La Mancha',
    'ES-CL': u'Castile and León',
    'ES-CT': u'Catalonia',
    'ES-MD': u'Community of Madrid',
    'ES-VC': u'Valencian Community',
    'ES-EX': u'Extremadura',
    'ES-IB': u'Balearic Islands',
    'ES-CN': u'Canary Islands',
    'ES-GA': u'Galicia',
    'ES-RI': u'La Rioja',
    'ES-NC': u'Navarre',
    'ES-MC': u'Region of Murcia',
    'ES-CE': u'Ceuta',
    'ES-ML': u'Melilla',
    'ES-PV': u'Basque Country',
    'ES-MAGRAMA': u'MAGRAMA'
}

In [None]:
def get_community(x):
    """Translate a community code into its display name.

    Looks ``x`` up in the module-level ``communities`` dict.

    Parameters
    ----------
    x : object
        Community code, e.g. ``'ES-AN'``. Any value is accepted, including
        the NaN that pandas produces for empty cells.

    Returns
    -------
    str or float
        The matching name, or ``np.nan`` when ``x`` is not a known code or
        cannot be used as a dict key at all.
    """
    try:
        return communities[x]
    except (KeyError, TypeError):
        # KeyError: unknown code (this includes NaN from missing cells).
        # TypeError: unhashable value such as a list.
        # Everything else (NameError because the table cell was not run,
        # KeyboardInterrupt, ...) is deliberately allowed to propagate instead
        # of being turned into a silent NaN.
        return np.nan

### Retrieval of list of sites of community importance

In [None]:
# Download the site database page and parse it as a ';'-separated table.
# Only the text after the first newline and before the last newline is
# parsed, i.e. the page's first line and whatever follows the final newline
# are discarded.
pb.output('Retrieving --> WLE SCI list')
site_list_page = pb.Page(commons_site, SITE_DB_PAGE)
site_db_raw = site_list_page.text
site_db_start = site_db_raw.find('\n') + 1
site_db_end = site_db_raw.rfind('\n')
site_list_text = StringIO(site_db_raw[site_db_start:site_db_end])

# The table has no header row, so column names are supplied explicitly.
site_columns = [
    "name", "lic_id", "magrama_url", "community",
    "bio_region", "continent", "min_altitude",
    "max_altitude", "avg_altitude", "longitude",
    "latitude", "area", "marine_percentage",
    "marine_area", "image", "commons_cat", "wikidata_id",
]
site_df = pd.read_csv(
    site_list_text,
    sep=";",
    index_col=False,
    names=site_columns,
)
pb.output('Retrieved --> WLE SCI list')

# Human-readable community name for each site (NaN when the code is unknown).
site_df['community_name'] = site_df['community'].apply(get_community)

# Plain list of every site code in the database.
codes = site_df["lic_id"].tolist()

In [None]:
# Display summary statistics for every column of the site table (numeric and
# non-numeric alike) as a quick sanity check of the load above.
site_df.describe(include="all")

### Retrieval of the image log

In [None]:
# Download the image log page and parse it as a ';'-separated table. As with
# the site list, only the text between the first and the last newline of the
# page is handed to the parser.
pb.output('Retrieving --> {0} {1} images list from cache'.format(TAG, YEAR))
list_page = pb.Page(commons_site, LOG_PAGE)
log_raw = list_page.text
log_start = log_raw.find('\n') + 1
log_end = log_raw.rfind('\n')
list_page_text = StringIO(log_raw[log_start:log_end])

# The log carries no header row; one name per column, in page order.
image_columns = [
    'image_title', 'lic_id',
    'uploader', 'uploader_registration',
    'timestamp', 'date', 'size',
    'height', 'width', 'qi',
    'finalist',
]
images_df = pd.read_csv(
    list_page_text,
    sep=";",
    index_col=False,
    names=image_columns,
)

pb.output('Retrieved --> {0} {1} images list from cache'.format(TAG, YEAR))

In [None]:
# Attach the site columns (name, community, community_name, ...) to every
# image through its site code. A left join keeps images whose code is not in
# the site table; their site columns are NaN.
# NOTE(review): this rebinds images_df, so the cell is not re-run safe -- a
# second execution merges the site columns again and pandas suffixes the
# duplicated names (name_x / name_y), which the later row['name'] lookups do
# not expect. Reload the image log before re-running.
images_df = images_df.merge(site_df, on='lic_id', how="left")

In [None]:
# Substitution values for the all-editions per-site category page. The two
# None entries are placeholders for the site name and code.
site_vars = dict(lic_name=None, lic_code=None, tag=TAG_EXT)

# Mako source of the all-editions per-site category page: a link to the
# Natura 2000 data form of the site, the hidden-category marker and the parent
# category, sorted by site code.
site_cat_template = """'''Site of Community Importance''': ${lic_name} ([http://natura2000.eea.europa.eu/Natura2000/SDF.aspx?site=${lic_code} ${lic_code}])

{{hiddencat}}
[[Category:Images from ${tag} in Spain by site| ${lic_code}]]
"""

In [None]:
# Substitution values for the per-site, per-edition category page. The two
# None entries are placeholders for the site name and code.
yearly_site_vars = dict(lic_name=None, lic_code=None, tag=TAG_EXT, year=YEAR)

# Mako source of the per-site category page for a single edition; identical to
# the all-editions page except that the parent category carries the year.
yearly_site_cat_template = """'''Site of Community Importance''': ${lic_name} ([http://natura2000.eea.europa.eu/Natura2000/SDF.aspx?site=${lic_code} ${lic_code}])

{{hiddencat}}
[[Category:Images from ${tag} ${year} in Spain by site| ${lic_code}]]
"""

In [None]:
# Substitution values for the per-community category page; `aut_com` is a
# placeholder for the community name used as sort key.
autcom_vars = dict(aut_com=None, tag=TAG_EXT, year=YEAR)

# Mako source of the per-community category page: hidden-category marker plus
# the parent category, sorted by community name.
autcom_cat_template = """{{hiddencat}}
[[Category:Images from ${tag} ${year} in Spain by autonomous community| ${aut_com}]]"""

In [None]:
# Substitution values for the per-author category page; `author_name` is a
# placeholder for the author used as sort key.
author_vars = dict(author_name=None, tag=TAG_EXT, year=YEAR)

# Mako source of the per-author category page: hidden-category marker plus the
# parent category, sorted by author name.
author_cat_template = """{{hiddencat}}
[[Category:Images from ${tag} ${year} in Spain by author| ${author_name}]]"""

In [None]:
# Wikitext of a speedy-deletion tag for emptied categories. It is not
# referenced by any other cell visible in this notebook.
deletion_request = """{{speedydelete|1=Empty category, it's been moved. No longer needed}}"""

In [None]:
# Leading words of community names that need a definite article when used in
# a sentence ("in the Basque Country", "in the Region of Murcia", ...).
article_needed = [u'Region', u'Basque', u'Balearic', u'Canary', u'Valencian', u'Community']

counter = 0  # not used by this cell; kept in case another cell reads it


def _ensure_category(title, template_text, template_vars, summary):
    """Create the Commons category ``title`` unless it already exists.

    The new page body is ``template_text`` rendered by Mako with
    ``template_vars``; ``summary`` is the edit summary of the creation.
    Existing categories are left untouched.
    """
    category = pb.Category(commons_site, title)
    if not category.exists():
        category.text = Template(template_text).render(**template_vars)
        category.save(summary)


def _community_with_article(community_name):
    """Return ``community_name`` prefixed with 'the ' where English needs it.

    Returns None when ``community_name`` is not a string, which is what the
    left merge leaves (NaN) for sites without a known community.
    """
    if not isinstance(community_name, str):
        return None
    if any(community_name.startswith(word) for word in article_needed):
        return 'the ' + community_name
    return community_name


# Edit summaries do not depend on the image, so build them once.
yearly_site_summary = "{0} Spain {1}: site category creation".format(TAG, YEAR)
site_summary = "{0} Spain: site category creation".format(TAG)
community_summary = "{0} Spain {1}: autonomous community category creation".format(TAG, YEAR)
author_summary = "{0} Spain {1}: user category creation".format(TAG, YEAR)
classification_summary = "{1} {0} in Spain: Classification".format(YEAR, TAG)

# For every logged image: work out the per-site, per-community and per-author
# categories it belongs to, create any category page that is still missing,
# and append the categories the file page lacks.
for image_counter, row in images_df.iterrows():
    page = pb.Page(commons_site, row["image_title"], ns=6)
    text = page.text
    # NOTE(review): newer pywikibot releases spell this keyword `with_ns`;
    # confirm against the installed version before changing it.
    cats = [cat.title(withNamespace=False) for cat in page.categories()]

    if (image_counter != 0) and (image_counter % 50 == 0):
        pb.output('Reviewing --> %d image pages downloaded' % (image_counter))

    new_cats = []
    lic_id = row['lic_id']
    if isinstance(lic_id, str) and lic_id.startswith('ES'):
        # Site category for this edition.
        yearly_site_category = 'Images of a site of community importance with code {0} from {1} {2} in Spain'.format(lic_id, TAG_EXT, YEAR)
        if yearly_site_category not in cats:
            new_cats.append(yearly_site_category)
        _ensure_category(
            yearly_site_category,
            yearly_site_cat_template,
            dict(yearly_site_vars, lic_name=row['name'], lic_code=lic_id),
            yearly_site_summary,
        )

        # Site category across all editions.
        site_category = 'Images of a site of community importance with code {0} from {1} in Spain'.format(lic_id, TAG_EXT)
        if site_category not in cats:
            new_cats.append(site_category)
        _ensure_category(
            site_category,
            site_cat_template,
            dict(site_vars, lic_name=row['name'], lic_code=lic_id),
            site_summary,
        )

        # Community category. A site without a known community used to abort
        # the whole run silently (bare except + break); now only this one
        # category is skipped, with a message, and the run continues.
        aut_com_name = _community_with_article(row['community_name'])
        if aut_com_name is None:
            pb.output('-->Image ({0}): no autonomous community known for site {1}, community category skipped'.format(row['image_title'], lic_id))
        else:
            community_category = 'Images of a site of community importance in {0} from {1} {2} in Spain'.format(aut_com_name, TAG_EXT, YEAR)
            if community_category not in cats:
                new_cats.append(community_category)
            _ensure_category(
                community_category,
                autcom_cat_template,
                # The sort key is the bare name, without the article.
                dict(autcom_vars, aut_com=row['community_name']),
                community_summary,
            )
    else:
        print ("-->Image ({}) from an invalid location".format(row['image_title']))

    # Uploaders containing 'flickr' are reduced to the part after the first
    # space and tagged with a "(flickr)" suffix.
    uploader = row['uploader']
    if 'flickr' in uploader:
        author = '{0} (flickr)'.format(' '.join(uploader.split(' ')[1:]))
    else:
        author = uploader
    author_category = u'Images from {1} {2} in Spain by {0}'.format(author, TAG_EXT, YEAR)
    if author_category not in cats:
        new_cats.append(author_category)
    _ensure_category(
        author_category,
        author_cat_template,
        # Previously author_name was never filled in, so every new author
        # category was created with the sort key "None".
        dict(author_vars, author_name=author),
        author_summary,
    )

    if new_cats:
        cat_string = '\n'.join('[[Category:' + name + ']]' for name in new_cats)
        page.text = text + '\n' + cat_string
        page.save(classification_summary)