In [None]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""This notebook creates the map of TAG for all its 
editions, provided images have been properly 
categorized"""

import inspect, os, sys

# Prefer an importable pywikibot; if the import fails, add this file's folder
# and its parent folder to sys.path and try again.
try:
    import pywikibot as pb
    from pywikibot import pagegenerators

except ImportError:
    # Only a failed import should trigger the fallback. A bare ``except`` would
    # also swallow KeyboardInterrupt/SystemExit and mask unrelated errors.
    current_folder = os.path.realpath(os.path.abspath(os.path.split(inspect.getfile(inspect.currentframe()))[0]))
    folder_parts = current_folder.split(os.sep)
    # Parent folder of the current one (last path component dropped).
    pywikibot_folder = os.sep.join(folder_parts[:-1])

    for candidate_folder in (current_folder, pywikibot_folder):
        if candidate_folder not in sys.path:
            sys.path.insert(0, candidate_folder)

    import pywikibot as pb
    from pywikibot import pagegenerators

In [None]:
import pandas as pd
import numpy as np
from mako.template import Template
from io import StringIO
import random
import seaborn as sns

In [None]:
from geojson import Feature, Point, FeatureCollection
import geojson

In [None]:
from modules.wmtools import coordinate_shaker, additional_festivals_df

In [None]:
# Contest editions covered by the map.
YEARS               = [2016, 2017]

# Short and long names of the contest; used in page titles and log messages.
TAG                 = 'WLF'
TAG_EXT             = 'Wiki Loves Folk'

# Titles of the wiki pages that are read (yearly logs, festival DB) and
# written (map). The festival DB title is always built with the year 2016.
LOG_PAGES           = ['Commons:{1}/{0}/Log'.format(year, TAG_EXT) for year in YEARS]
FESTIVAL_DB_PAGE    = 'Commons:{1}/{0}/Festival DB'.format(2016, TAG_EXT)
MAP_WLF_PAGE        = 'Commons:{0}/Map'.format(TAG_EXT)

# Site object used for every page read and write in this notebook.
commons_site = pb.Site('commons', 'commons')

In [None]:
# One [page title, region name] pair per entry.
# The second element is what the following cell collects into `autcoms` and
# uses as the key of the marker colour lookup.
# The first element is not used in the visible cells; presumably it is the
# title of the Spanish Wikipedia annex listing that region's festivals
# (TODO confirm).
festival_annexes = [
    ['Anexo:Fiestas de interés turístico de Andalucía', 'Andalusia'],
    ['Anexo:Fiestas de interés turístico de Aragón', 'Aragon'],
    ['Anexo:Fiestas de interés turístico de Asturias', 'Asturias'],
    ['Anexo:Fiestas de interés turístico de Cantabria', 'Cantabria'],
    ['Anexo:Fiestas de interés turístico de Castilla-La Mancha', 'Castile-La Mancha'],
    ['Anexo:Fiestas de interés turístico de Castilla y León', 'Castile and León'],
    ['Anexo:Fiestas de interés turístico de Cataluña', 'Catalonia'],
    ['Anexo:Fiestas de interés turístico de la Comunidad de Madrid', 'Community of Madrid'],
    ['Anexo:Fiestas de interés turístico de la Comunidad Valenciana', 'Valencian Community'],
    ['Anexo:Fiestas de interés turístico de Extremadura', 'Extremadura'],
    ['Anexo:Fiestas de interés turístico de las Islas Baleares', 'Balearic Islands'],
    ['Anexo:Fiestas de interés turístico de las Islas Canarias', 'Canary Islands'],
    ['Anexo:Fiestas de interés turístico de Galicia', 'Galicia'],
    ['Anexo:Fiestas de interés turístico de La Rioja', 'La Rioja'],
    ['Anexo:Fiestas de interés turístico de Navarra', 'Navarre'],
    ['Anexo:Fiestas de interés turístico de la Región de Murcia', 'Region of Murcia'],
    ['Anexo:Fiestas y tradiciones del País Vasco', 'Basque Country']
]

In [None]:
# Region names are the second element of every `festival_annexes` entry.
autcoms = [annex[1] for annex in festival_annexes]
# Size the palette from the list itself rather than a hard-coded 17, so that
# adding or removing a region cannot leave the two out of step.
# The first character of every hex string (the leading '#') is dropped.
autcom_palette = [color[1:] for color in sns.color_palette('hls', len(autcoms)).as_hex()]
# Region name -> hex colour, looked up when the map markers are built.
autcom_colors = dict(zip(autcoms, autcom_palette))
autcom_colors

In [None]:
def to_geojson(row):
    """Build the GeoJSON feature (map marker) for one festival row.

    Reads the notebook-level ``images_df`` and ``autcom_colors``.

    The image shown in the marker is chosen among the images whose
    ``wikidata_id`` matches the row: images wider than tall are preferred,
    falling back to all of the festival's images when there is none. Within
    that set, images flagged ``qi`` win over images flagged ``finalist``,
    which win over the rest. The pick is a one-row sample with
    ``random_state=0``, so it is repeatable.

    Args:
        row: mapping (e.g. a DataFrame row) providing ``wikidata_id``,
            ``category``, ``aut_com``, ``additional``, ``longitude`` and
            ``latitude``.

    Returns:
        A ``Feature`` whose geometry is ``Point((longitude, latitude))`` and
        whose properties hold the marker title, description and styling.
    """
    festival_images = images_df[images_df['wikidata_id'] == row['wikidata_id']]
    landscape_images = festival_images[festival_images['width'] > festival_images['height']]
    candidates = landscape_images if len(landscape_images.index) > 0 else festival_images

    # Narrow to the best available quality tier; each flag column holds its
    # own name for flagged images.
    for flag in ('qi', 'finalist'):
        flagged = candidates[candidates[flag] == flag]
        if len(flagged) > 0:
            candidates = flagged
            break
    image = candidates.sample(1, random_state=0)['image_title'].values[0]

    properties = {"description": "[[File:{0}|150px]]".format(image),
                  "title": "[[:Category:Images of festival with code {0}|{1}]]".format(row['wikidata_id'], row['category']),
                  "marker-size": "small",
                  "marker-symbol": "circle",
                  "marker-color": autcom_colors[row['aut_com']]}
    # Festivals marked as additional get a different marker symbol.
    if row['additional'] == 'additional':
        properties['marker-symbol'] = 'circle-stroked'
    return Feature(geometry=Point((row['longitude'], row['latitude'])),
                   properties=properties)

In [None]:
pb.output('Retrieving --> {0} in Spain Festivals list from cache'.format(TAG))
festival_list_page = pb.Page(commons_site, FESTIVAL_DB_PAGE)

# Keep only the text between the first and the last line break of the page,
# i.e. drop its first and last lines, and parse the rest as ';'-separated data.
festival_db_raw = festival_list_page.text
festival_db_start = festival_db_raw.find('\n') + 1
festival_db_end = festival_db_raw.rfind('\n')
festival_list_text = StringIO(festival_db_raw[festival_db_start:festival_db_end])

festival_columns = ['name', 'aut_com',
                    'wikidata_id', 'wikidata_timestamp',
                    'category', 'cat_timestamp', 'image',
                    'latitude', 'longitude']
festivals_df = pd.read_csv(festival_list_text,
                           sep=";",
                           index_col=False,
                           names=festival_columns)
pb.output('Retrieved --> {0} in Spain Festivals list from cache'.format(TAG))

In [None]:
# Identifiers of every festival listed in the festival DB; used further down
# to select the images that belong to a listed festival.
valid_festivals = festivals_df['wikidata_id'].values
valid_festivals

In [None]:
# Flag every row of the imported additional-festivals frame.
# NOTE: this adds a column to the object imported from modules.wmtools.
additional_festivals_df['additional'] = 'additional'
# Identifiers of the additional festivals, used to select their images below.
additional_festivals = additional_festivals_df['wikidata_id'].values
additional_festivals

In [None]:
# Column names applied to the ';'-separated yearly image logs when they are
# parsed below.
image_columns = ['image_title', 'wikidata_id', 'uploader', 'time_to_upload',
                 'timestamp', 'size', 'height', 'width', 'qi', 'finalist']
# Empty frame with those columns; the yearly logs are concatenated onto it.
images_df = pd.DataFrame(columns=image_columns)

In [None]:
pb.output('Retrieving --> {0} in Spain images list from cache'.format(TAG))
# Parse every yearly log first and concatenate once afterwards. Starting from
# a fresh empty frame (instead of extending the existing `images_df`) keeps
# this cell re-runnable: it no longer appends to an already indexed frame.
yearly_dfs = []
for log_page in LOG_PAGES:
    list_page = pb.Page(commons_site, log_page)
    log_text = list_page.text
    # Keep only the text between the first and the last line break of the page.
    list_page_text = StringIO(log_text[log_text.find('\n') + 1:log_text.rfind('\n')])
    yearly_df = pd.read_csv(list_page_text,
                            sep=";",
                            index_col=False,
                            names=image_columns
                           ).fillna('')
    yearly_dfs.append(yearly_df)
images_df = pd.concat([pd.DataFrame(columns=image_columns)] + yearly_dfs)
pb.output('Retrieved --> {0} in Spain images list from cache'.format(TAG))

images_df['timestamp'] = pd.to_datetime(images_df['timestamp'], format="%Y-%m-%d %H:%M:%S")

images_df.set_index(["timestamp"], inplace=True)
# `del images_df.index.name` raises AttributeError on pandas >= 1.0, where
# `name` is a property without a deleter; assigning None clears it on every
# pandas version.
images_df.index.name = None

total_images_length = len(images_df)
total_images_length

In [None]:
# Left join: every image row is kept, with the festival columns attached
# wherever the wikidata_id is present in the festival DB.
images_extended_df = images_df.merge(festivals_df, how='left', on='wikidata_id')
len(images_extended_df.index)

In [None]:
# Number of images per festival, restricted to festivals listed in the DB.
is_valid_festival = images_extended_df['wikidata_id'].isin(valid_festivals)
images_per_festival = images_extended_df.loc[is_valid_festival, 'wikidata_id'].value_counts()
images_per_festival

In [None]:
# Turn the value_counts() Series into a two-column frame: wikidata_id, count.
# The names are set explicitly rather than renaming 'index'/'wikidata_id':
# pandas >= 2.0 already names the counts 'count' and the index 'wikidata_id',
# so the old rename mapping produced two 'count' columns and no 'wikidata_id'.
images_per_festival_df = images_per_festival.rename_axis('wikidata_id').reset_index(name='count')

In [None]:
# Attach the festival data to the counts and keep only the columns needed for
# the map; missing values become empty strings.
festival_map_columns = ['count', 'wikidata_id', 'category', 'aut_com', 'latitude', 'longitude']
images_per_festival_df = (
    images_per_festival_df
    .merge(festivals_df, on='wikidata_id')[festival_map_columns]
    .fillna('')
)

# lexsort uses its LAST key as the primary one: descending count first,
# then category in ascending order.
festival_sort_order = np.lexsort([images_per_festival_df['category'],
                                  -images_per_festival_df['count']])
images_per_festival_df = images_per_festival_df.iloc[festival_sort_order]

# Show category names with spaces instead of underscores.
images_per_festival_df['category'] = images_per_festival_df['category'].map(
    lambda category: category.replace('_', ' ')
)
images_per_festival_df.head()

In [None]:
# Same pipeline as for the listed festivals, applied to the additional ones.
images_per_additional_festival = images_extended_df[images_extended_df['wikidata_id'].isin(additional_festivals)]['wikidata_id'].value_counts()
# Name the columns explicitly instead of renaming 'index'/'wikidata_id':
# pandas >= 2.0 already names the counts 'count' and the index 'wikidata_id',
# so the old rename mapping produced two 'count' columns and no 'wikidata_id'.
images_per_additional_festival_df = images_per_additional_festival.rename_axis('wikidata_id').reset_index(name='count')
images_per_additional_festival_df = pd.merge(images_per_additional_festival_df, additional_festivals_df, on='wikidata_id')[['count', 'wikidata_id', 'category', 'aut_com', 'latitude', 'longitude']].fillna('')
# lexsort uses its LAST key as the primary one: descending count, then category.
images_per_additional_festival_df = images_per_additional_festival_df.iloc[np.lexsort([images_per_additional_festival_df['category'], -images_per_additional_festival_df['count']])]
images_per_additional_festival_df['category'] = images_per_additional_festival_df['category'].map(lambda x: x.replace('_', ' '))
# The column selection above drops any existing 'additional' column, so the
# flag is set again here.
images_per_additional_festival_df['additional'] = 'additional'
images_per_additional_festival_df.head()

In [None]:
# Stack listed and additional festivals into one frame; rows coming from the
# listed festivals have no 'additional' value, which fillna turns into ''.
# ignore_index=True gives every row a unique label. Without it both frames
# carry overlapping labels (0, 1, 2, ...), and the label-based 'dup_index'
# computed in the next cell (idxmin) can be equal for festivals at different
# coordinates, wrongly flagging them as duplicates.
images_per_combined_festival_df = pd.concat(
    [images_per_festival_df, images_per_additional_festival_df],
    ignore_index=True
).fillna('')
images_per_combined_festival_df.head()

In [None]:
# For every (latitude, longitude) group, store the index label that idxmin
# returns for the group, so rows at the same coordinates share a 'dup_index'.
# NOTE(review): idxmin yields index LABELS, so this relies on the frame
# having unique index labels.
latitude_by_location = images_per_combined_festival_df.groupby(['latitude', 'longitude'])['latitude']
images_per_combined_festival_df['dup_index'] = latitude_by_location.transform('idxmin')
# 'dup' is True for every row except the first one with a given 'dup_index'.
images_per_combined_festival_df['dup'] = images_per_combined_festival_df.duplicated(subset=['dup_index'])
images_per_combined_festival_df.head()

In [None]:
# Run the imported coordinate_shaker helper on every row.
# NOTE(review): the helper lives in modules.wmtools and is not visible here;
# presumably it moves the coordinates of rows flagged 'dup' so that their
# markers do not overlap - confirm against the helper.
images_per_combined_festival_df['coordinates'] = images_per_combined_festival_df.apply(coordinate_shaker, axis=1)
# Expand each result into separate columns and write them back as
# latitude/longitude (assumes the helper returns a (latitude, longitude)
# pair, in that order - TODO confirm).
images_per_combined_festival_df[['latitude', 'longitude']] = images_per_combined_festival_df['coordinates'].apply(pd.Series)
images_per_combined_festival_df.head()

In [None]:
# Build one GeoJSON feature per festival row.
images_per_combined_festival_df['geojson'] = images_per_combined_festival_df.apply(to_geojson, axis=1)

In [None]:
# Gather the per-festival features into a single FeatureCollection and
# serialise it, keeping non-ASCII characters as they are.
features = list(images_per_combined_festival_df['geojson'])
feature_collection = FeatureCollection(features)
dump = geojson.dumps(
    feature_collection,
    ensure_ascii=False,
    indent=2,
)
#print(dump)

In [None]:
# Wikitext of the map page: a heading, one sentence naming the first and last
# edition, and a <mapframe> wrapping the GeoJSON dump.
template = """=== WLF contributions map ===
The map below includes all the contributions, by festival, for all the editions of the contest (${years[0]}-${years[-1]}).
<mapframe text="Festivals" latitude="39" longitude="-4" zoom="5" width="800" height="600" align="center"> 
${map}
</mapframe>
"""
t = Template(template)
# The values are passed as keyword arguments instead of through a global dict
# named `vars`, which shadowed the builtin vars() for the rest of the session.
map_text = t.render(map=dump, years=YEARS)

In [None]:
# Replace the content of the map page with the rendered wikitext and save it.
# NOTE(review): the log message and the edit summary both say "statistics"
# although this cell publishes the map - confirm that this is intended.
edit_summary = "{0} in Spain statistics".format(TAG)
maps_page = pb.Page(commons_site, MAP_WLF_PAGE)
maps_page.text = map_text
pb.output('Publishing --> {0} in Spain Statistics'.format(TAG))
maps_page.save(edit_summary)