In [None]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
This notebook adds the WLE per contributor, per site and per autonomous 
community categories to all the files in the WLE log.
"""

import os, sys, inspect

try :
    import pywikibot as pb
    from pywikibot import textlib
    import mwparserfromhell as mwh

except :
    current_folder = os.path.realpath(os.path.abspath(os.path.split(inspect.getfile(inspect.currentframe()))[0]))
    folder_parts = current_folder.split(os.sep)
    pywikibot_folder = os.sep.join(folder_parts[0:-1])

    if current_folder not in sys.path:
        sys.path.insert(0, current_folder)
    if pywikibot_folder not in sys.path:
        sys.path.insert(0, pywikibot_folder)

    import pywikibot as pb
    from pywikibot import textlib
    import mwparserfromhell as mwh
    
import numpy as np
import pandas as pd
from io import StringIO
import re
from mako.template import Template
from datetime import datetime, timedelta

In [None]:
# --- Contest identification -------------------------------------------------
YEAR = 2016
TAG = 'WLE'
TAG_EXT = 'Wiki Loves Earth'
COUNTRY = "Spain"

# Lower-case name of the wikitext template looked for on every image page.
CODE_TEMPLATE = "lic"

# --- Commons pages holding the cached data ----------------------------------
# Per-edition base page and its image log.
BASE_NAME = "Commons:Wiki Loves in {2}/{1}/{0}".format(YEAR, TAG_EXT, COUNTRY)
LOG_PAGE = BASE_NAME + '/Log'
# Edition-independent base page and the sites database below it.
BASE_SITE_DB_NAME = "Commons:Wiki Loves in {1}/{0}".format(TAG_EXT, COUNTRY)
SITE_DB_PAGE = BASE_SITE_DB_NAME + "/Sites DB"

# --- Accepted upload window -------------------------------------------------
# Opens on 1 May at 00:00 and closes on 1 June at 01:00 of the contest year.
# NOTE(review): the extra hour past midnight looks like a grace period or a
# timezone offset -- confirm.
START_TIME = datetime(YEAR, 5, 1, 0, 0, 0)
END_TIME = datetime(YEAR, 6, 1, 1, 0, 0)

# pywikibot site object (code "commons", family "commons") through which every
# page, file page and category in this notebook is read or written.
commons_site = pb.Site("commons", "commons")

In [None]:
# Lookup table: community code as stored in the sites database -> English name
# of the autonomous community used when building category titles.
communities = {
    "ES-AN":      u'Andalusia',
    "ES-AR":      u'Aragon',
    "ES-AS":      u'Asturias',
    "ES-CB":      u'Cantabria',
    "ES-CM":      u'Castile-La Mancha',
    "ES-CL":      u'Castile and León',
    "ES-CT":      u'Catalonia',
    "ES-MD":      u'Community of Madrid',
    "ES-VC":      u'Valencian Community',
    "ES-EX":      u'Extremadura',
    "ES-IB":      u'Balearic Islands',
    "ES-CN":      u'Canary Islands',
    "ES-GA":      u'Galicia',
    "ES-RI":      u'La Rioja',
    "ES-NC":      u'Navarre',
    "ES-MC":      u'Region of Murcia',
    "ES-CE":      u'Ceuta',
    "ES-ML":      u'Melilla',
    "ES-PV":      u'Basque Country',
    # Not a region code: label used for sites attributed to MAGRAMA.
    "ES-MAGRAMA": u'MAGRAMA',
}

In [None]:
def get_community(x):
    """Translate a community code into its English name.

    Parameters
    ----------
    x : object
        Key to look up in the module-level ``communities`` mapping
        (e.g. ``'ES-AN'``).

    Returns
    -------
    The mapped community name, or ``np.nan`` when ``x`` is not a key of
    ``communities`` (this includes missing values such as NaN and
    unhashable inputs).
    """
    try:
        return communities[x]
    # Only "no such key" situations map to NaN; any other error (for example a
    # missing ``communities`` table) must surface instead of being hidden.
    except (KeyError, TypeError):
        return np.nan

### Retrieval of the list of sites of community importance

In [None]:
pb.output('Retrieving --> WLE SCI list')
site_list_page = pb.Page(commons_site, SITE_DB_PAGE)

# Keep only what lies between the first and the last line break of the page:
# the first line and whatever follows the final newline are dropped.
# NOTE(review): presumably those are wrapper markup around the CSV -- confirm.
site_db_wikitext = site_list_page.text
site_payload_start = site_db_wikitext.find('\n') + 1
site_payload_end = site_db_wikitext.rfind('\n')
site_list_text = StringIO(site_db_wikitext[site_payload_start:site_payload_end])

# The payload has no header row, so the column names are supplied here.
SITE_DB_COLUMNS = [
    "name", "lic_id", "magrama_url", "community",
    "bio_region", "continent",
    "min_altitude", "max_altitude", "avg_altitude",
    "longitude", "latitude",
    "area", "marine_percentage", "marine_area",
    "image", "commons_cat", "wikidata_id",
]
site_df = pd.read_csv(
    site_list_text,
    sep=";",
    index_col=False,
    names=SITE_DB_COLUMNS,
)
pb.output('Retrieved --> WLE SCI list')

# Human-readable community name (NaN when the code is not in `communities`).
site_df['community_name'] = site_df['community'].apply(get_community)

# Every site code present in the database; used to validate template codes.
codes = site_df["lic_id"].tolist()

In [None]:
# Display summary statistics for every column (numeric and non-numeric) of the
# sites table as a quick sanity check of the parsed data.
site_df.describe(include="all")

### Retrieval of the image log

In [None]:
pb.output('Retrieving --> {0} {1} images list from cache'.format(TAG, YEAR))
list_page = pb.Page(commons_site, LOG_PAGE)

# Keep only what lies between the first and the last line break of the page:
# the first line and whatever follows the final newline are dropped.
# NOTE(review): presumably those are wrapper markup around the CSV -- confirm.
log_wikitext = list_page.text
log_payload_start = log_wikitext.find('\n') + 1
log_payload_end = log_wikitext.rfind('\n')
list_page_text = StringIO(log_wikitext[log_payload_start:log_payload_end])

# The payload has no header row, so the column names are supplied here.
LOG_COLUMNS = [
    'image_title', 'lic_id',
    'uploader', 'uploader_registration',
    'timestamp', 'date', 'size',
    'height', 'width', 'qi',
    'finalist',
]
images_df = pd.read_csv(
    list_page_text,
    sep=";",
    index_col=False,
    names=LOG_COLUMNS,
)

# Parse upload timestamps so they can be compared with START_TIME / END_TIME.
images_df['timestamp'] = pd.to_datetime(images_df['timestamp'], format="%Y-%m-%d %H:%M:%S")
pb.output('Retrieved --> {0} {1} images list from cache'.format(TAG, YEAR))

In [None]:
# Attach the site metadata (name, community, ...) to every logged image via its
# site code. The guard makes the cell safe to re-run: merging a second time
# would duplicate the site columns with _x/_y suffixes and break later
# look-ups such as row['name'].
if 'community_name' not in images_df.columns:
    images_df = images_df.merge(site_df, on='lic_id', how="left")

In [None]:
# Mako template for the page text of an all-editions per-site category:
# a caption linking to the Natura 2000 data form of the site, the hidden-category
# marker and the parent "by site" category (sorted by site code).
site_cat_template = """'''Site of Community Importance''': ${lic_name} ([http://natura2000.eea.europa.eu/Natura2000/SDF.aspx?site=${lic_code} ${lic_code}])

{{hiddencat}}
[[Category:Images from ${tag} in ${country} by site| ${lic_code}]]
"""

# Rendering variables; the two site-specific entries are placeholders to be
# filled in before rendering.
site_vars = dict(
    lic_name=None,
    lic_code=None,
    country=COUNTRY,
    tag=TAG_EXT,
)

In [None]:
# Mako template for the page text of a per-site category of one edition:
# same caption as the all-editions variant, but the parent category carries
# the year.
yearly_site_cat_template = """'''Site of Community Importance''': ${lic_name} ([http://natura2000.eea.europa.eu/Natura2000/SDF.aspx?site=${lic_code} ${lic_code}])

{{hiddencat}}
[[Category:Images from ${tag} ${year} in ${country} by site| ${lic_code}]]
"""

# Rendering variables; the two site-specific entries are placeholders to be
# filled in before rendering.
yearly_site_vars = dict(
    lic_name=None,
    lic_code=None,
    country=COUNTRY,
    tag=TAG_EXT,
    year=YEAR,
)

In [None]:
# Mako template for the page text of a per-autonomous-community category:
# hidden-category marker plus the parent "by autonomous community" category,
# sorted by the community name.
autcom_cat_template = """{{hiddencat}}
[[Category:Images from ${tag} ${year} in ${country} by autonomous community| ${aut_com}]]"""

# Rendering variables; `aut_com` is a placeholder to be filled in before
# rendering.
autcom_vars = dict(
    aut_com=None,
    country=COUNTRY,
    tag=TAG_EXT,
    year=YEAR,
)

In [None]:
# Mako template for the page text of a per-author category: hidden-category
# marker plus the parent "by author" category, sorted by the author name.
author_cat_template = """{{hiddencat}}
[[Category:Images from ${tag} ${year} in ${country} by author| ${author_name}]]"""

# Rendering variables; `author_name` is a placeholder to be filled in before
# rendering.
author_vars = dict(
    author_name=None,
    country=COUNTRY,
    tag=TAG_EXT,
    year=YEAR,
)

In [None]:
# Categories that are never written explicitly into an image page when its
# category block is rebuilt. Placeholders: {0} = TAG_EXT, {1} = YEAR,
# {2} = COUNTRY.
# NOTE(review): presumably these are all added by templates on the page rather
# than by explicit links -- confirm.
implicit_cat_patterns = [
    "Sites of Community Importance in Spain with known IDs",
    "CC-BY-SA-4.0",
    "CC-BY-4.0",
    "GFDL",
    "Media with locations",
    "License migration redundant",
    "Self-published work",
    "Pages with maps",
    "Images with watermarks",
    "All media needing categories as of {1}",
    "Images from {0} {1}",
    "Images from {0} {1} in {2}",
]
implicit_cats = [
    pattern.format(TAG_EXT, YEAR, COUNTRY)
    for pattern in implicit_cat_patterns
]

In [None]:
# Maintenance categories that are stripped from every image before it is
# re-classified (the ones that still apply are added again afterwards).
# Placeholders: {0} = TAG_EXT, {1} = YEAR, {2} = COUNTRY.
c = [
    "Images from {0} {1} in {2} with a wrong code",
    "Images from {0} {1} in {2} without code",
    "Images from {0} {1} in {2} without valid template",
    "Uncategorized images from {0} {1} in {2}",
    "Unqualified images from {0} {1} in {2} (wrong submission time)",
    "Unqualified images from {0} {1} in {2} (too small)",
]
contest_args = (TAG_EXT, YEAR, COUNTRY)
to_remove_cats = [pattern.format(*contest_args) for pattern in c]
to_remove_cats

In [None]:
# Leading words of community names that take the article "the" in category
# titles: a community name starting with any of these gets "the " prepended.
autcom_with_article = ['Region', 'Basque', 'Balearic', 'Canary', 'Valencian', 'Community']

In [None]:
# Accumulator filled by the classification loop with the titles of non-hidden
# categories used on image pages whose category page does not exist.
lost_cats = []

In [None]:
# Images whose width * height is below this pixel count are tagged "too small".
MIN_PIXELS = 2000000

# --- Category titles that do not depend on the image -------------------------
TOO_SMALL_CAT = 'Unqualified images from {0} {1} in {2} (too small)'.format(TAG_EXT, YEAR, COUNTRY)
WRONG_TIME_CAT = 'Unqualified images from {0} {1} in {2} (wrong submission time)'.format(TAG_EXT, YEAR, COUNTRY)
TO_EVALUATE_CAT = "Images from {0} {1} in {2} to be evaluated".format(TAG_EXT, YEAR, COUNTRY)
UNCATEGORIZED_CAT = 'Uncategorized images from {0} {1} in {2}'.format(TAG_EXT, YEAR, COUNTRY)
NO_TEMPLATE_CAT = "Images from {0} {1} in {2} without valid template".format(TAG_EXT, YEAR, COUNTRY)
WRONG_CODE_CAT = "Images from {0} {1} in {2} with a wrong code".format(TAG_EXT, YEAR, COUNTRY)
NO_CODE_CAT = "Images from {0} {1} in {2} without code".format(TAG_EXT, YEAR, COUNTRY)

# If any of these is already on the page, a missing template is not flagged
# again with NO_TEMPLATE_CAT.
DISQUALIFIED_CATS = [
    "Unqualified images from {0} {1} in {2} (not from a site of community importance)".format(TAG_EXT, YEAR, COUNTRY),
    "Unqualified images from {0} {1} in {2} (not from {2})".format(TAG_EXT, YEAR, COUNTRY),
    "Unqualified images from {0} {1} in {2} (unidentified locations)".format(TAG_EXT, YEAR, COUNTRY),
]

# --- Edit summaries for automatically created category pages -----------------
AUTHOR_CAT_SUMMARY = "{0} {2} {1}: user category creation".format(TAG, YEAR, COUNTRY)
SITE_CAT_SUMMARY = "{0} {2} {1}: site category creation".format(TAG, YEAR, COUNTRY)
AUTCOM_CAT_SUMMARY = "{0} {2} {1}: autonomous community category creation".format(TAG, YEAR, COUNTRY)

# --- Hidden categories recomputed on every run --------------------------------
# Built from the configured contest/year/country (instead of a hard-coded
# edition) so that they always match the categories this notebook generates.
# The community pattern accepts any name, including hyphens and accented letters.
_tag_re = re.escape(TAG_EXT)
_country_re = re.escape(COUNTRY)
STALE_CAT_REGEXES = [
    re.compile(pattern.format(tag=_tag_re, year=YEAR, country=_country_re))
    for pattern in (
        r'Images of a site of community importance with code ES\d+ from {tag} {year} in {country}',
        r'Images of a site of community importance with code ES\d+ from {tag} in {country}',
        r'Images of a site of community importance in .+ from {tag} {year} in {country}',
        r'Images from {tag} {year} in {country} by .+',
    )
]

# Dated {{Uncategorized}} template: captures year, month name and day.
UNCATEGORIZED_RE = re.compile(r'\{\{Uncategorized\|year=(\d+)\|month=([a-zA-Z]*)\|day=(\d+)\}\}')


def _discard(items, value):
    """Remove the first occurrence of `value` from the list `items`, if any."""
    if value in items:
        items.remove(value)


def _add_once(items, value):
    """Append `value` to the list `items` unless it is already there."""
    if value not in items:
        items.append(value)


def _ensure_category(title, template_text, template_vars, summary):
    """Create the category page `title` from a Mako template if it is missing."""
    category = pb.Category(commons_site, title)
    if not category.exists():
        category.text = Template(template_text).render(**template_vars)
        category.save(summary)


for image_counter, row in images_df.iterrows():
    page = pb.FilePage(commons_site, row["image_title"])
    text = page.text
    if (image_counter != 0) and (image_counter % 50 == 0):
        pb.output('Reviewing --> %d image pages downloaded' % (image_counter))

    nocat_text = textlib.removeCategoryLinks(text)

    # Single pass over the current categories of the page.
    cat_titles = []            # every category currently on the page
    visible_cats = []          # non-hidden categories (kept as they are)
    cleaned_visible_cats = []  # non-hidden categories whose page exists
    cleaned_hidden_cats = []   # hidden categories that survive the clean-up
    for cat in page.categories():
        title = cat.title(withNamespace=False)
        cat_titles.append(title)
        if cat.isHiddenCategory():
            is_stale = (title in to_remove_cats or
                        any(regex.match(title) for regex in STALE_CAT_REGEXES))
            if not is_stale:
                cleaned_hidden_cats.append(title)
        else:
            visible_cats.append(title)
            if cat.exists():
                cleaned_visible_cats.append(title)
            else:
                lost_cats.append(title)

    wikicode = mwh.parse(text)
    wle_templates = [template for template in wikicode.filter_templates()
                     if template.name.lower().strip() == CODE_TEMPLATE]

    # authorship classification
    uploader = row['uploader']
    if 'flickr' in uploader:
        # drop the first word of the uploader name and mark the author as flickr
        author = '{0} (flickr)'.format(' '.join(uploader.split(' ')[1:]))
    else:
        author = uploader
    author_category = 'Images from {1} {2} in {3} by {0}'.format(author, TAG_EXT, YEAR, COUNTRY)
    cleaned_hidden_cats.append(author_category)
    # The author name is the sort key of the new category page; it has to be
    # passed explicitly because `author_vars` only holds a None placeholder.
    _ensure_category(author_category, author_cat_template,
                     dict(author_vars, author_name=author),
                     AUTHOR_CAT_SUMMARY)

    # size classification
    if page.latest_file_info["width"] * page.latest_file_info["height"] < MIN_PIXELS:
        print("too small ({})".format(row["image_title"]))
        cleaned_hidden_cats.append(TOO_SMALL_CAT)
        _discard(cleaned_hidden_cats, TO_EVALUATE_CAT)

    # date classification
    if row["timestamp"] > END_TIME or row["timestamp"] < START_TIME:
        print("uploaded too late or too soon ({})".format(row["image_title"]))
        cleaned_hidden_cats.append(WRONG_TIME_CAT)
        _discard(cleaned_hidden_cats, TO_EVALUATE_CAT)

    # uncategorized classification
    has_uncategorized_template = '{{Uncategorized|year=' in text
    if has_uncategorized_template:
        match = UNCATEGORIZED_RE.search(text)
        # The template may carry its parameters in another shape; in that case
        # there is no dated maintenance category to drop.
        if match is not None:
            uncat_year, uncat_month, uncat_day = match.groups()
            _discard(cleaned_hidden_cats,
                     'Media needing categories as of {0} {1} {2}'.format(uncat_day, uncat_month, uncat_year))
    if has_uncategorized_template or len(cleaned_visible_cats) == 0:
        cleaned_hidden_cats.append(UNCATEGORIZED_CAT)

    # site/autcom classification
    if len(wle_templates) == 0:
        print("No template ({})".format(row["image_title"]))
        if not any(cat in cleaned_hidden_cats for cat in DISQUALIFIED_CATS):
            cleaned_hidden_cats.append(NO_TEMPLATE_CAT)
    else:
        for template in wle_templates:
            # A template without its first parameter counts as "without code"
            # instead of aborting the whole run.
            code = template.get(1).value.strip() if template.has(1) else ''
            if code in codes:
                # Site data is looked up by the code found on the page, which is
                # what the created categories are about.
                site_row = site_df[site_df['lic_id'] == code].iloc[0]
                site_name = site_row['name']
                autcom_name = site_row['community_name']

                yearly_site_category = 'Images of a site of community importance with code {0} from {1} {2} in {3}'.format(code, TAG_EXT, YEAR, COUNTRY)
                _add_once(cleaned_hidden_cats, yearly_site_category)
                _ensure_category(yearly_site_category, yearly_site_cat_template,
                                 dict(yearly_site_vars, lic_name=site_name, lic_code=code),
                                 SITE_CAT_SUMMARY)

                site_category = 'Images of a site of community importance with code {0} from {1} in {3}'.format(code, TAG_EXT, YEAR, COUNTRY)
                _add_once(cleaned_hidden_cats, site_category)
                _ensure_category(site_category, site_cat_template,
                                 dict(site_vars, lic_name=site_name, lic_code=code),
                                 SITE_CAT_SUMMARY)

                if pd.isnull(autcom_name):
                    # community code of the site is not in `communities`
                    print("Unknown autonomous community for code {0}".format(code))
                else:
                    if autcom_name.startswith(tuple(autcom_with_article)):
                        autcom_label = 'the ' + autcom_name
                    else:
                        autcom_label = autcom_name
                    community_category = 'Images of a site of community importance in {0} from {1} {2} in {3}'.format(autcom_label, TAG_EXT, YEAR, COUNTRY)
                    _add_once(cleaned_hidden_cats, community_category)
                    # sort key is the community name without the article
                    _ensure_category(community_category, autcom_cat_template,
                                     dict(autcom_vars, aut_com=autcom_name),
                                     AUTCOM_CAT_SUMMARY)

            elif code.startswith('ES'):
                print("Code not from a LIC")
                cleaned_hidden_cats.append(WRONG_CODE_CAT)
            else:
                print("Invalid code")
                cleaned_hidden_cats.append(NO_CODE_CAT)

    # Rewrite the category block only when the resulting set differs from the
    # categories currently on the page.
    new_cats = set(visible_cats + cleaned_hidden_cats)
    changed_cats = set(cat_titles) ^ new_cats
    if len(changed_cats) > 0:
        print(changed_cats)
        # sorted() keeps the generated wikitext stable between runs
        cat_text = '\n'.join(['[[Category:{0}]]'.format(cat)
                              for cat in sorted(new_cats)
                              if cat not in implicit_cats])
        page_text = nocat_text + '\n' + cat_text
        page.text = page_text
        # Saving is disabled (dry run): the new text is only assigned, not written.
        #page.save("{1} {0} in {2}: Classification".format(YEAR, TAG, COUNTRY))

In [None]:
# List each missing (non-existent) visible category once, in alphabetical
# order so the report is stable between runs.
for lost_title in sorted(set(lost_cats)):
    print(lost_title)