In [None]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""This notebook reviews the contributions to WLF in Spain in YEAR"""

import os, sys, inspect

# Import pywikibot and mwparserfromhell. If they are not importable from the
# current environment, add this file's folder and its parent folder to
# sys.path and try once more.
try:
    import pywikibot as pb
    from pywikibot import pagegenerators, textlib
    import mwparserfromhell as mwh

except ImportError:
    # Only a failed import triggers the fallback: a bare `except` would also
    # swallow KeyboardInterrupt/SystemExit and unrelated errors raised while
    # the libraries initialise.
    current_folder = os.path.realpath(os.path.abspath(os.path.split(inspect.getfile(inspect.currentframe()))[0]))
    folder_parts = current_folder.split(os.sep)
    # Parent of the current folder (the last path component is dropped).
    pywikibot_folder = os.sep.join(folder_parts[0:-1])

    if current_folder not in sys.path:
        sys.path.insert(0, current_folder)
    if pywikibot_folder not in sys.path:
        sys.path.insert(0, pywikibot_folder)

    # Second attempt; if it fails again the ImportError propagates.
    import pywikibot as pb
    from pywikibot import pagegenerators, textlib
    import mwparserfromhell as mwh

In [None]:
import pandas as pd
from io import StringIO
import re

In [None]:
from modules.wmtools import additional_festivals_df

In [None]:
# --- Contest configuration -------------------------------------------------
# Edition under review, plus the short and long names of the contest that are
# interpolated into the page and category titles below.
YEAR                = 2016
TAG                 = 'WLF'
TAG_EXT             = 'Wiki Loves Folk'

# Commons project pages. BASE_WLF_NAME follows YEAR, whereas
# BASE_WLF2016_NAME is pinned to 2016, so the festival DB page is always the
# one under the 2016 base page regardless of YEAR.
BASE_WLF_NAME       = 'Commons:{1}/{0}'.format(YEAR, TAG_EXT)
BASE_WLF2016_NAME   = 'Commons:{1}/{0}'.format(2016, TAG_EXT)
LOG_PAGE            = BASE_WLF_NAME + "/Log"
FESTIVAL_DB_PAGE    = BASE_WLF2016_NAME + "/Festival DB"

# Category titles. Note that the main category is built from the long name
# (TAG_EXT) while the two "no ID" / "no valid ID" categories are built from
# the short tag (TAG).
WLF_CATEGORY          = "Category:Images from {1} {0} in Spain".format(YEAR, TAG_EXT)
NO_ID_CATEGORY        = "Category:Images from {1} {0} in Spain with no ID".format(YEAR, TAG)
WRONG_ID_CATEGORY     = "Category:Images from {1} {0} in Spain with no valid ID".format(YEAR, TAG)

# Site object used for every page/category lookup in the notebook.
commons_site = pb.Site("commons", "commons")

In [None]:
# Identifiers from the 'wikidata_id' column of the extra-festivals table
# imported from modules.wmtools; displayed as the cell output.
additional_festivals = additional_festivals_df.loc[:, 'wikidata_id'].values
additional_festivals

In [None]:
# Collect the full titles (namespace included) of every file page that sits
# directly in the contest category.
cat_wlf = pb.Category(commons_site, WLF_CATEGORY)
gen_wlf = pagegenerators.CategorizedPageGenerator(cat_wlf)

# title() includes the namespace by default, so the deprecated
# `withNamespace=True` keyword (renamed `with_ns`) is not needed; omitting it
# keeps the call valid under both the old and the new keyword spelling.
images_wlf = [page.title() for page in gen_wlf if page.is_filepage()]
len(images_wlf)

In [None]:
# Load the festival database stored as ';'-separated text on a Commons page.
pb.output('Retrieving --> {0} in Spain Festivals list from cache'.format(TAG))
festival_list_page = pb.Page(commons_site, FESTIVAL_DB_PAGE)

# Keep only what lies between the first and the last newline, i.e. drop the
# first and last lines of the page (presumably wrapper markup -- confirm
# against the page itself).
raw_db_text = festival_list_page.text
body_start = raw_db_text.find('\n') + 1
body_end = raw_db_text.rfind('\n')
festival_list_text = StringIO(raw_db_text[body_start:body_end])

# Column names are supplied here because they are passed explicitly to
# read_csv rather than read from the data.
festival_columns = [
    'name',
    'aut_com',
    'wikidata_id',
    'wikidata_timestamp',
    'category',
    'cat_timestamp',
    'image',
    'latitude',
    'longitude',
]
festivals_df = pd.read_csv(
    festival_list_text,
    sep=";",
    index_col=False,
    names=festival_columns,
)

# Array of festival identifiers used later to validate {{WLF}} templates.
festivals = festivals_df["wikidata_id"].values

In [None]:
def _festival_identifier(wlf_template):
    """Return the stripped first positional parameter of a {{WLF}} template.

    A template without that parameter (a bare ``{{WLF}}``) yields ``''``
    instead of raising ``ValueError`` from ``Template.get``; the empty string
    is then handled like any other unknown identifier.
    """
    if not wlf_template.has(1):
        return ''
    return wlf_template.get(1).value.strip()


def _ensure_category_page(title, wikitext, summary):
    """Create the Commons page `title` with `wikitext` if it does not exist yet."""
    category_page = pb.Page(commons_site, title)
    if not category_page.exists():
        category_page.text = wikitext
        category_page.save(summary)


# Every identifier accepted as valid: the festival DB plus the additional list.
valid_festival_ids = set(festivals) | set(additional_festivals)

# Review every contest image: validate its {{WLF}} identifier, make sure the
# author / festival tracking categories exist, and append to the file page
# whichever of those categories it is still missing.
for image_counter, image in enumerate(images_wlf):
    page = pb.Page(commons_site, image)
    text = page.text
    wikicode = mwh.parse(text)
    templates = wikicode.filter_templates()

    if (image_counter != 0) and (image_counter % 50 == 0):
        pb.output('Reviewing --> %d image pages downloaded' % (image_counter))

    # Categories already present in the wikitext, as full titles. The site is
    # passed explicitly so parsing does not depend on the default site, and
    # title() already includes the namespace by default.
    cats = [cat.title() for cat in textlib.getCategoryLinks(text, site=commons_site)]

    wlf_templates = [template for template in templates
                     if template.name.lower().strip() == u"wlf"]
    WLF_template_found = len(wlf_templates) > 0
    WLF_identifier = ''

    new_cats = []

    if WLF_template_found:
        WLF_identifier = _festival_identifier(wlf_templates[0])
        if WLF_identifier not in valid_festival_ids:
            print('Festival identifier not valid')
            if WRONG_ID_CATEGORY not in cats:
                new_cats.append(WRONG_ID_CATEGORY)
            # An invalid identifier is treated as absent from here on.
            WLF_identifier = ''
    elif NO_ID_CATEGORY not in cats:
        new_cats.append(NO_ID_CATEGORY)

    # Author category, named after the user of the oldest revision. The value
    # is read once and reused for both the title and the sort key.
    uploader = page.oldest_revision["user"]
    author_cat = u"Category:Images from {1} {0} in Spain by {2}".format(YEAR, TAG_EXT, uploader)
    # NOTE(review): "Authorshipt" in the edit summary looks like a typo; the
    # string is left untouched so summaries stay identical to earlier runs.
    _ensure_category_page(
        author_cat,
        '{{{{hiddencat}}}}\n[[Category:Images from {1} {0} in Spain by author|{2}]]'.format(YEAR, TAG_EXT, uploader),
        "{1} {0} in Spain: Authorshipt classification".format(YEAR, TAG))

    if author_cat not in cats:
        new_cats.append(author_cat)

    # Festival categories (per-year and global) only for a validated identifier.
    if WLF_identifier != '':
        fest_cat = 'Category:Images of festival with code {0} from {2} {1} in Spain'.format(WLF_identifier,
                                                                                             YEAR,
                                                                                             TAG_EXT)
        fest_cat_global = 'Category:Images of festival with code {0}'.format(WLF_identifier)

        _ensure_category_page(
            fest_cat,
            '{{{{hiddencat}}}}\n[[Category:Images from {1} {0} in Spain by festival| {2}]]'.format(YEAR,
                                                                                                   TAG_EXT,
                                                                                                   WLF_identifier),
            "{1} {0} in Spain: Festival classification".format(YEAR, TAG))
        _ensure_category_page(
            fest_cat_global,
            '{{{{hiddencat}}}}\n[[Category:Images from {0} in Spain by festival| {1}]]'.format(TAG_EXT,
                                                                                               WLF_identifier),
            "{} in Spain: Festival classification".format(TAG))

        if fest_cat not in cats:
            new_cats.append(fest_cat)
        if fest_cat_global not in cats:
            new_cats.append(fest_cat_global)

    if len(new_cats) > 0:
        cat_string = '\n'.join(['[[' + i + ']]' for i in new_cats])
        updated_text = text
        if WLF_template_found:
            # The "no ID" tag is stale only once a {{WLF}} template exists.
            # Stripping it unconditionally would remove a still-correct
            # category from images that have no template at all.
            updated_text = updated_text.replace('[[{0}]]\n'.format(NO_ID_CATEGORY), '')
        updated_text = updated_text + '\n' + cat_string
        page.text = updated_text
        page.save("{1} {0} in Spain: Classification".format(YEAR, TAG))