In [55]:
#!/usr/bin/python
# -*- coding: latin-1 -*-
"""This notebook creates the map of TAG for all its 
editions, provided images have been properly 
categorized"""

import inspect, os, sys

try:
    import pywikibot as pb
    from pywikibot import pagegenerators

except ImportError:
    # pywikibot is not importable as an installed package: add the folder of
    # the running code and its parent folder to sys.path, then retry.
    # Only ImportError triggers the fallback, so interrupts and unrelated
    # failures are no longer silently swallowed.
    current_folder = os.path.realpath(os.path.abspath(os.path.split(inspect.getfile(inspect.currentframe()))[0]))
    folder_parts = current_folder.split(os.sep)
    pywikibot_folder = os.sep.join(folder_parts[:-1])

    if current_folder not in sys.path:
        sys.path.insert(0, current_folder)
    if pywikibot_folder not in sys.path:
        sys.path.insert(0, pywikibot_folder)

    import pywikibot as pb
    from pywikibot import pagegenerators

In [56]:
import pandas as pd
import numpy as np
from mako.template import Template
from io import StringIO
import random
import seaborn as sns

In [57]:
from geojson import Feature, Point, FeatureCollection
import geojson

In [58]:
# Contest editions (years) covered by the map.
YEARS               = [2016, 2017]

# Short tag and full name of the contest.
TAG                 = 'WLF'
TAG_EXT             = 'Wiki Loves Folk'

# Page titles: one upload log per edition, the festival database
# (the 2016 page is used) and the page where the map is published.
LOG_PAGES           = ['Commons:{1}/{0}/Log'.format(year, TAG_EXT) for year in YEARS]
FESTIVAL_DB_PAGE    = 'Commons:{1}/{0}/Festival DB'.format(2016, TAG_EXT)
MAP_WLF_PAGE        = 'Commons:{0}/Map'.format(TAG_EXT)

# Site object used for every page read/write below.
commons_site = pb.Site('commons', 'commons')

In [59]:
# Each entry is [annex page title, English name of the autonomous community].
# In this notebook only the second element is read: it provides the keys of
# the marker-colour mapping (autcom_colors), so the order of the entries
# determines which palette colour each community gets.
# NOTE(review): the 'Anexo:' titles are presumably Spanish Wikipedia list
# pages -- confirm against the notebook that builds the festival DB.
festival_annexes = [
    ['Anexo:Fiestas de interés turístico de Andalucía', 'Andalusia'],
    ['Anexo:Fiestas de interés turístico de Aragón', 'Aragon'],
    ['Anexo:Fiestas de interés turístico de Asturias', 'Asturias'],
    ['Anexo:Fiestas de interés turístico de Cantabria', 'Cantabria'],
    ['Anexo:Fiestas de interés turístico de Castilla-La Mancha', 'Castile-La Mancha'],
    ['Anexo:Fiestas de interés turístico de Castilla y León', 'Castile and León'],
    ['Anexo:Fiestas de interés turístico de Cataluña', 'Catalonia'],
    ['Anexo:Fiestas de interés turístico de la Comunidad de Madrid', 'Community of Madrid'],
    ['Anexo:Fiestas de interés turístico de la Comunidad Valenciana', 'Valencian Community'],
    ['Anexo:Fiestas de interés turístico de Extremadura', 'Extremadura'],
    ['Anexo:Fiestas de interés turístico de las Islas Baleares', 'Balearic Islands'],
    ['Anexo:Fiestas de interés turístico de las Islas Canarias', 'Canary Islands'],
    ['Anexo:Fiestas de interés turístico de Galicia', 'Galicia'],
    ['Anexo:Fiestas de interés turístico de La Rioja', 'La Rioja'],
    ['Anexo:Fiestas de interés turístico de Navarra', 'Navarre'],
    ['Anexo:Fiestas de interés turístico de la Región de Murcia', 'Region of Murcia'],
    ['Anexo:Fiestas y tradiciones del País Vasco', 'Basque Country']
]

In [60]:
# Size (in degrees) of the offset applied to markers that share a location.
shake_step=0.01

def shaker (row) :
    """Return the (latitude, longitude) pair for a row, nudged if it is a duplicate.

    Rows whose 'dup' field is falsy are returned unchanged. For duplicated
    rows exactly one of the two coordinates is moved by +/- shake_step; the
    axis and the sign are drawn from a generator seeded with the length of
    row['category'], so the result is reproducible for a given row.

    Parameters:
        row: mapping-like object with 'latitude', 'longitude', 'dup' and
            'category' entries (e.g. a DataFrame row).

    Returns:
        tuple: (latitude, longitude) after the optional offset.
    """
    coordinates = (row['latitude'], row['longitude'])
    if row['dup']:
        # A private generator keeps the result deterministic per row without
        # reseeding (and thereby disturbing) the module-level random state.
        rng = random.Random(len(row['category']))
        term_selector = rng.randint(0, 1)
        sign_selector = 1 if rng.random() < 0.5 else -1
        shake = [0.0] * 2
        shake[term_selector] = sign_selector * shake_step
        # The offset must stay an ordered sequence: turning it into a set
        # loses the position chosen by term_selector, so the shift could be
        # applied to the wrong coordinate.
        coordinates = tuple(x + y for x, y in zip(shake, coordinates))

    return coordinates

In [61]:
# Community names, in the order they are listed in festival_annexes.
autcoms = [annex[1] for annex in festival_annexes]
# One 'hls' colour per community. The palette size follows the number of
# communities instead of a hard-coded 17, so adding an annex cannot run past
# the end of the palette. The first character of each hex string is dropped
# (the mapping stores bare 'rrggbb' values).
autcom_palette = [color[1:] for color in sns.color_palette('hls', len(autcoms)).as_hex()]
autcom_colors = dict(zip(autcoms, autcom_palette))
autcom_colors

{'Andalusia': 'db5f57',
 'Aragon': 'db8d57',
 'Asturias': 'dbbc57',
 'Balearic Islands': '578ddb',
 'Basque Country': 'db577e',
 'Canary Islands': '575edb',
 'Cantabria': 'ccdb57',
 'Castile and León': '6edb57',
 'Castile-La Mancha': '9ddb57',
 'Catalonia': '57db6e',
 'Community of Madrid': '57db9d',
 'Extremadura': '57bcdb',
 'Galicia': '7e57db',
 'La Rioja': 'ad57db',
 'Navarre': 'db57db',
 'Region of Murcia': 'db57ac',
 'Valencian Community': '57dbcc'}

In [62]:
def to_geojson (row) :
    """Build the GeoJSON point feature for one festival row.

    The thumbnail is sampled (with a fixed random_state) from the rows of the
    global images_df that share the row's 'wikidata_id': rows flagged 'qi'
    are preferred, then rows flagged 'finalist', then any row.
    NOTE(review): 'qi' presumably stands for quality image -- confirm.

    Parameters:
        row: festival row providing 'wikidata_id', 'category', 'aut_com',
            'latitude' and 'longitude'.

    Returns:
        Feature: point at (longitude, latitude) whose properties carry the
        thumbnail wikitext, a link title and the marker style; the marker
        colour is looked up in autcom_colors by row['aut_com'].
    """
    festival_images = images_df[images_df['wikidata_id'] == row['wikidata_id']]

    # Narrow the candidates by priority, falling back when a tier is empty.
    candidates = festival_images[festival_images['qi'] == 'qi']
    if len(candidates) == 0 :
        candidates = festival_images[festival_images['finalist'] == 'finalist']
    if len(candidates) == 0 :
        candidates = festival_images
    image = candidates.sample(1, random_state=0)['image_title'].values[0]

    location = Point((row['longitude'], row['latitude']))
    marker_properties = {
        "description": "[[File:{0}|150px]]".format(image),
        "title": "[[:Category:Images of festival with code {0}|{1}]]".format(row['wikidata_id'], row['category']),
        "marker-size": "small",
        "marker-symbol": "circle",
        "marker-color": autcom_colors[row['aut_com']]
    }
    return Feature(geometry=location, properties=marker_properties)

In [63]:
pb.output('Retrieving --> {0} in Spain Festivals list from cache'.format(TAG))
festival_list_page = pb.Page(commons_site, FESTIVAL_DB_PAGE)

# Keep only what lies between the first and the last newline of the page
# text, i.e. drop its first and last lines before parsing it as CSV.
festival_db_text = festival_list_page.text
festival_list_text = StringIO(
    festival_db_text[festival_db_text.find('\n') + 1:festival_db_text.rfind('\n')]
)

# Column names for the semicolon-separated festival list (no header row is read).
festival_db_columns = ['name', 'aut_com',
                       'wikidata_id', 'wikidata_timestamp',
                       'category', 'cat_timestamp', 'image',
                       'latitude', 'longitude']
festivals_df = pd.read_csv(festival_list_text,
                           sep=";",
                           index_col=False,
                           names=festival_db_columns)
pb.output('Retrieved --> {0} in Spain Festivals list from cache'.format(TAG))

Retrieving --> WLF in Spain Festivals list from cache
Retrieved --> WLF in Spain Festivals list from cache


In [64]:
# Wikidata ids of every festival in the festival DB; used further down to
# keep only images attached to a known festival.
valid_festivals = festivals_df.loc[:, 'wikidata_id'].values
valid_festivals

array(['Q2939698', 'Q9075883', 'Q1469338', ..., 'Q23655957', 'Q23655971',
       'Q23655987'], dtype=object)

In [65]:
# Column layout shared by the yearly image logs.
image_columns = [
    'image_title',
    'wikidata_id',
    'uploader',
    'time_to_upload',
    'timestamp',
    'qi',
    'finalist',
]
# Empty frame with that layout; the yearly logs are appended to it below.
images_df = pd.DataFrame(columns=image_columns)

In [66]:
pb.output('Retrieving --> {0} in Spain images list from cache'.format(TAG))
yearly_dfs = []
for log_page in LOG_PAGES:
    list_page = pb.Page(commons_site, log_page)
    # Drop the first and last lines of the page text before parsing it as CSV.
    list_page_text = StringIO(list_page.text[list_page.text.find('\n') + 1:list_page.text.rfind('\n')])
    yearly_df = pd.read_csv(list_page_text,
                            sep=";",
                            index_col=False,
                            names=image_columns
                           ).fillna('')
    yearly_dfs.append(yearly_df)
# Concatenate once, after the loop, instead of re-copying the accumulated
# frame on every iteration.
images_df = pd.concat([images_df] + yearly_dfs)
pb.output('Retrieved --> {0} in Spain images list from cache'.format(TAG))

images_df['timestamp'] = pd.to_datetime(images_df['timestamp'], format="%Y-%m-%d %H:%M:%S")

images_df.set_index(["timestamp"], inplace=True)
# Clear the index name by assignment: `del images_df.index.name` raises
# AttributeError on pandas >= 1.0, where Index.name is a property without a
# deleter. Assignment works on old and new versions alike.
images_df.index.name = None

total_images_length = len(images_df)
total_images_length

Retrieving --> WLF in Spain images list from cache
Retrieved --> WLF in Spain images list from cache


4162

In [67]:
# Left join: every image row is kept and gains the columns of its festival
# (matched on 'wikidata_id').
images_extended_df = images_df.merge(festivals_df, on='wikidata_id', how='left')
images_extended_df.shape[0]

4162

In [68]:
# Number of images per festival, counting only images whose wikidata id
# appears in the festival DB.
belongs_to_known_festival = images_extended_df['wikidata_id'].isin(valid_festivals)
images_per_festival = images_extended_df.loc[belongs_to_known_festival, 'wikidata_id'].value_counts()
images_per_festival

Q1143768     750
Q23657061    609
Q9075883     435
Q9075892     415
Q23541308    376
Q6124533     282
Q9075846      58
Q23309364     57
Q5685566      57
Q23662660     54
Q23663119     44
Q3092925      41
Q23663100     40
Q8355038      33
Q5861751      29
Q3290365      29
Q21479929     28
Q9075868      23
Q26756828     21
Q23199370     20
Q17627979     19
Q23660893     18
Q23663127     16
Q23660880     14
Q23663151     14
Q23663335     14
Q23453336     14
Q23310072     14
Q23453538     14
Q23662027     13
            ... 
Q5963246       1
Q23662787      1
Q5855501       1
Q2838024       1
Q20541714      1
Q21483881      1
Q20109030      1
Q6135210       1
Q3092923       1
Q5815728       1
Q23310019      1
Q3444111       1
Q4895460       1
Q7401479       1
Q5660198       1
Q23657558      1
Q6024342       1
Q23663003      1
Q17636451      1
Q9066149       1
Q17305074      1
Q23661532      1
Q5752915       1
Q6124477       1
Q5858861       1
Q5836853       1
Q23662185      1
Q21003051     

In [69]:
# Turn the value_counts() Series into a two-column frame
# ['wikidata_id', 'count'].
# The axis and the value column are named explicitly rather than renaming
# the columns produced by reset_index(): those default names changed in
# pandas 2.0 (the index is already called 'wikidata_id' and the values
# 'count'), where the old rename map produced two 'count' columns and no
# 'wikidata_id' column.
images_per_festival_df = images_per_festival.rename_axis('wikidata_id').reset_index(name='count')

In [70]:
# Attach festival metadata to the counts and keep only the columns the map needs.
map_columns = ['count', 'wikidata_id', 'category', 'aut_com', 'latitude', 'longitude']
counts_with_festival = pd.merge(images_per_festival_df, festivals_df, on='wikidata_id')
images_per_festival_df = counts_with_festival[map_columns].fillna('')

# np.lexsort treats its last key as the primary one: order by descending
# count, ties broken by category name.
row_order = np.lexsort([images_per_festival_df['category'], -images_per_festival_df['count']])
images_per_festival_df = images_per_festival_df.iloc[row_order]

# Category names use underscores in place of spaces; make them readable.
images_per_festival_df['category'] = images_per_festival_df['category'].map(lambda name: name.replace('_', ' '))
images_per_festival_df.head()

Unnamed: 0,count,wikidata_id,category,aut_com,latitude,longitude
0,750,Q1143768,Falles de València,Valencian Community,39.466667,-0.375
1,609,Q23657061,Bodas de Isabel de Segura,Aragon,40.343611,-1.107222
2,435,Q9075883,Holy Week in Málaga,Andalusia,36.716667,-4.416667
3,415,Q9075892,Holy Week in Zaragoza,Aragon,41.65,-0.883333
4,376,Q23541308,Holy Week in Teruel,Aragon,40.343611,-1.107222


In [71]:
# Give every row the index label that 'idxmin' returns for its
# (latitude, longitude) group, then flag every row after the first one
# carrying the same label as a duplicate location.
rows_by_location = images_per_festival_df.groupby(['latitude', 'longitude'])
images_per_festival_df['dup_index'] = rows_by_location['latitude'].transform('idxmin')
images_per_festival_df['dup'] = images_per_festival_df.duplicated(subset=['dup_index'])
images_per_festival_df.head()

Unnamed: 0,count,wikidata_id,category,aut_com,latitude,longitude,dup_index,dup
0,750,Q1143768,Falles de València,Valencian Community,39.466667,-0.375,0.0,False
1,609,Q23657061,Bodas de Isabel de Segura,Aragon,40.343611,-1.107222,1.0,False
2,435,Q9075883,Holy Week in Málaga,Andalusia,36.716667,-4.416667,2.0,False
3,415,Q9075892,Holy Week in Zaragoza,Aragon,41.65,-0.883333,3.0,False
4,376,Q23541308,Holy Week in Teruel,Aragon,40.343611,-1.107222,1.0,True


In [72]:
# Compute the (possibly nudged) coordinates of every row, keep them as a
# tuple column and write them back over the latitude/longitude columns.
shaken_coordinates = images_per_festival_df.apply(shaker, axis=1)
images_per_festival_df['coordinates'] = shaken_coordinates
images_per_festival_df[['latitude', 'longitude']] = images_per_festival_df['coordinates'].apply(pd.Series)
images_per_festival_df.head()

Unnamed: 0,count,wikidata_id,category,aut_com,latitude,longitude,dup_index,dup,coordinates
0,750,Q1143768,Falles de València,Valencian Community,39.466667,-0.375,0.0,False,"(39.46666666666667, -0.375)"
1,609,Q23657061,Bodas de Isabel de Segura,Aragon,40.343611,-1.107222,1.0,False,"(40.343611111111116, -1.1072222222222223)"
2,435,Q9075883,Holy Week in Málaga,Andalusia,36.716667,-4.416667,2.0,False,"(36.71666666666667, -4.416666666666667)"
3,415,Q9075892,Holy Week in Zaragoza,Aragon,41.65,-0.883333,3.0,False,"(41.65, -0.8833333333333333)"
4,376,Q23541308,Holy Week in Teruel,Aragon,40.343611,-1.117222,1.0,True,"(40.343611111111116, -1.1172222222222223)"


In [73]:
# One GeoJSON feature per festival row.
images_per_festival_df['geojson'] = images_per_festival_df.apply(to_geojson, axis=1)

In [74]:
# Collect the per-festival features and serialise them as indented GeoJSON
# text, keeping non-ASCII characters unescaped.
features = list(images_per_festival_df['geojson'])
feature_collection = FeatureCollection(features)
dump = geojson.dumps(
    feature_collection,
    ensure_ascii=False,
    indent=2,
)

In [77]:
# Wikitext of the map page: a heading, a sentence with the range of editions
# and a <mapframe> wrapping the GeoJSON dump. The contest tag is injected
# from TAG instead of being hard-coded in the heading.
template = """=== ${tag} contributions map ===
The map below includes all the contributions, by festival, for all the editions of the contest (${years[0]}-${years[-1]}).
<mapframe text="Festivals" latitude="39" longitude="-4" zoom="5" width="800" height="600" align="center"> 
${map}
</mapframe>
"""
# Named `template_vars` rather than `vars` so the built-in vars() is not
# shadowed for the rest of the session.
template_vars = {
    "map": dump,
    "years": YEARS,
    "tag": TAG
}
t = Template(template)
map_text = t.render(**template_vars)
map_text

'=== WLF contributions map ===\nThe map below includes all the contributions, by festival, for all the editions of the contest (2016-2017).\n<mapframe text="Festivals" latitude="39" longitude="-4" zoom="5" width="800" height="600" align="center"> \n{\n  "type": "FeatureCollection",\n  "features": [\n    {\n      "type": "Feature",\n      "geometry": {\n        "type": "Point",\n        "coordinates": [\n          -0.375,\n          39.46666666666667\n        ]\n      },\n      "properties": {\n        "description": "[[File:Fallas2016 Peinado 02.jpg|150px]]",\n        "title": "[[:Category:Images of festival with code Q1143768|Falles de València]]",\n        "marker-size": "small",\n        "marker-symbol": "circle",\n        "marker-color": "57dbcc"\n      }\n    },\n    {\n      "type": "Feature",\n      "geometry": {\n        "type": "Point",\n        "coordinates": [\n          -1.1072222222222223,\n          40.343611111111116\n        ]\n      },\n      "properties": {\n        "

In [78]:
# Replace the content of the map page with the rendered wikitext and save it.
edit_summary = "{0} in Spain statistics".format(TAG)
maps_page = pb.Page(commons_site, MAP_WLF_PAGE)
maps_page.text = map_text
pb.output('Publishing --> {0} in Spain Statistics'.format(TAG))
maps_page.save(edit_summary)

Publishing --> WLF in Spain Statistics
Page [[Commons:Wiki Loves Folk/Map]] saved
