In [23]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""This notebook creates the map of TAG for all its 
editions, provided images have been properly 
categorized"""

import inspect, os, sys

try:
    import pywikibot as pb
    from pywikibot import pagegenerators

except ImportError:
    # Only a failed import triggers the fallback; a bare ``except`` would also
    # swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
    # Fallback: make the folder holding this file and its parent folder
    # importable, then retry (presumably pywikibot lives in the parent
    # folder -- confirm against the deployment layout).
    current_folder = os.path.realpath(os.path.abspath(os.path.split(inspect.getfile(inspect.currentframe()))[0]))
    folder_parts = current_folder.split(os.sep)
    pywikibot_folder = os.sep.join(folder_parts[:-1])

    # Inserted in this order so the parent folder ends up first on sys.path.
    for extra_path in (current_folder, pywikibot_folder):
        if extra_path not in sys.path:
            sys.path.insert(0, extra_path)

    import pywikibot as pb
    from pywikibot import pagegenerators

In [24]:
import pandas as pd
import numpy as np
from mako.template import Template
from io import StringIO
import random
import seaborn as sns

In [25]:
from geojson import Feature, Point, FeatureCollection
import geojson

In [26]:
from modules.wmtools import coordinate_shaker, additional_festivals_df

In [27]:
# Contest editions covered by the map.
YEARS = [2016, 2017]

# Short and long names of the contest.
TAG = 'WLF'
TAG_EXT = 'Wiki Loves Folk'

# Commons page titles: one image log per edition, the festival database
# (taken from the 2016 edition) and the page that receives the map.
LOG_PAGES = ['Commons:{1}/{0}/Log'.format(year, TAG_EXT) for year in YEARS]
FESTIVAL_DB_PAGE = 'Commons:{1}/{0}/Festival DB'.format(2016, TAG_EXT)
MAP_WLF_PAGE = 'Commons:{0}/Map'.format(TAG_EXT)

# Site object used for every page read/write in this notebook.
commons_site = pb.Site('commons', 'commons')

In [28]:
# Pairs of [annex page title, English name of the autonomous community].
# Only the second element of each pair is used in this notebook (to build the
# per-community colour table); the titles are presumably es.wikipedia annex
# pages -- confirm before relying on them.
festival_annexes = [
    ['Anexo:Fiestas de interés turístico de Andalucía', 'Andalusia'],
    ['Anexo:Fiestas de interés turístico de Aragón', 'Aragon'],
    ['Anexo:Fiestas de interés turístico de Asturias', 'Asturias'],
    ['Anexo:Fiestas de interés turístico de Cantabria', 'Cantabria'],
    ['Anexo:Fiestas de interés turístico de Castilla-La Mancha', 'Castile-La Mancha'],
    ['Anexo:Fiestas de interés turístico de Castilla y León', 'Castile and León'],
    ['Anexo:Fiestas de interés turístico de Cataluña', 'Catalonia'],
    ['Anexo:Fiestas de interés turístico de la Comunidad de Madrid', 'Community of Madrid'],
    ['Anexo:Fiestas de interés turístico de la Comunidad Valenciana', 'Valencian Community'],
    ['Anexo:Fiestas de interés turístico de Extremadura', 'Extremadura'],
    ['Anexo:Fiestas de interés turístico de las Islas Baleares', 'Balearic Islands'],
    ['Anexo:Fiestas de interés turístico de las Islas Canarias', 'Canary Islands'],
    ['Anexo:Fiestas de interés turístico de Galicia', 'Galicia'],
    ['Anexo:Fiestas de interés turístico de La Rioja', 'La Rioja'],
    ['Anexo:Fiestas de interés turístico de Navarra', 'Navarre'],
    ['Anexo:Fiestas de interés turístico de la Región de Murcia', 'Region of Murcia'],
    ['Anexo:Fiestas y tradiciones del País Vasco', 'Basque Country']
]

In [29]:
# Autonomous community names, in the order they appear in festival_annexes.
autcoms = [annex[1] for annex in festival_annexes]
# One 'hls' colour per community. The palette size follows the list length
# (instead of a hard-coded 17) so adding or removing a community cannot leave
# the two out of step. as_hex() yields '#rrggbb'; the leading '#' is dropped.
autcom_palette = [hex_color[1:] for hex_color in sns.color_palette('hls', len(autcoms)).as_hex()]
# Community name -> marker colour (hex digits without '#').
autcom_colors = dict(zip(autcoms, autcom_palette))
autcom_colors

{'Andalusia': 'db5f57',
 'Aragon': 'db8d57',
 'Asturias': 'dbbc57',
 'Balearic Islands': '578ddb',
 'Basque Country': 'db577e',
 'Canary Islands': '575edb',
 'Cantabria': 'ccdb57',
 'Castile and León': '6edb57',
 'Castile-La Mancha': '9ddb57',
 'Catalonia': '57db6e',
 'Community of Madrid': '57db9d',
 'Extremadura': '57bcdb',
 'Galicia': '7e57db',
 'La Rioja': 'ad57db',
 'Navarre': 'db57db',
 'Region of Murcia': 'db57ac',
 'Valencian Community': '57dbcc'}

In [30]:
def _choose_festival_image(candidates):
    """Return the title of one image taken from ``candidates``.

    Images whose 'qi' column equals 'qi' are preferred, then images whose
    'finalist' column equals 'finalist', then any image. The draw uses a fixed
    random_state so the same image is chosen on every run.
    """
    for flag in ('qi', 'finalist'):
        flagged = candidates[candidates[flag] == flag]
        if len(flagged.index) > 0:
            return flagged.sample(1, random_state=0)['image_title'].values[0]
    return candidates.sample(1, random_state=0)['image_title'].values[0]


def to_geojson(row):
    """Build the GeoJSON feature (map marker) of one festival.

    The marker popup shows one image of the festival, read from the global
    ``images_df``; the marker colour is looked up in the global
    ``autcom_colors`` by the festival's autonomous community.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'wikidata_id', 'category', 'aut_com', 'additional',
        'latitude' and 'longitude'.

    Returns
    -------
    geojson.Feature
        A point at (longitude, latitude) carrying the marker properties.

    Raises
    ------
    ValueError
        If ``images_df`` holds no image for the festival.
    KeyError
        If ``row['aut_com']`` has no entry in ``autcom_colors``.
    """
    of_festival = images_df['wikidata_id'] == row['wikidata_id']
    wider_than_tall = images_df['width'] > images_df['height']

    # Images wider than tall are preferred; fall back to every image of the
    # festival when there is none.
    candidates = images_df[of_festival & wider_than_tall]
    if len(candidates.index) == 0:
        candidates = images_df[of_festival]
    if len(candidates.index) == 0:
        # Fail with an explicit message instead of the generic sampling error.
        raise ValueError('No images found for festival {0}'.format(row['wikidata_id']))

    image = _choose_festival_image(candidates)

    # Festivals flagged as additional get a stroked marker instead of a filled one.
    marker_symbol = 'circle-stroked' if row['additional'] == 'additional' else 'circle'
    properties = {"description": "[[File:{0}|150px]]".format(image),
                  "title": "[[:Category:Images of festival with code {0}|{1}]]".format(row['wikidata_id'], row['category']),
                  "marker-size": "small",
                  "marker-symbol": marker_symbol,
                  "marker-color": autcom_colors[row['aut_com']]}

    # GeoJSON positions are (longitude, latitude), in that order.
    return Feature(geometry=Point((row['longitude'], row['latitude'])),
                   properties=properties)

In [31]:
pb.output('Retrieving --> {0} in Spain Festivals list from cache'.format(TAG))
festival_list_page = pb.Page(commons_site, FESTIVAL_DB_PAGE)

# Keep only what lies between the first and the last line break of the page
# text (presumably the first and last lines are wrapper markup -- confirm).
festival_db_wikitext = festival_list_page.text
first_break = festival_db_wikitext.find('\n')
last_break = festival_db_wikitext.rfind('\n')
festival_list_text = StringIO(festival_db_wikitext[first_break + 1:last_break])

# The remaining text is parsed as header-less, semicolon-separated rows.
festival_columns = ['name', 'aut_com',
                    'wikidata_id', 'wikidata_timestamp',
                    'category', 'cat_timestamp', 'image',
                    'latitude', 'longitude']
festivals_df = pd.read_csv(festival_list_text, sep=";", index_col=False, names=festival_columns)
pb.output('Retrieved --> {0} in Spain Festivals list from cache'.format(TAG))

Retrieving --> WLF in Spain Festivals list from cache
Retrieved --> WLF in Spain Festivals list from cache


In [32]:
# Wikidata ids of every festival listed in the festival DB; used further down
# to keep only the images that belong to one of these festivals.
valid_festivals = festivals_df['wikidata_id'].values
valid_festivals

array(['Q2939698', 'Q9075883', 'Q1469338', ..., 'Q23655957', 'Q23655971',
       'Q23655987'], dtype=object)

In [33]:
# Flag every row of the extra-festivals table imported from modules.wmtools.
# Note: this writes a column into the imported object itself.
additional_festivals_df['additional'] = 'additional'
# Wikidata ids of those extra festivals; used further down to select their images.
additional_festivals = additional_festivals_df['wikidata_id'].values
additional_festivals

array(['Q47120057', 'Q47119796', 'Q47119479', 'Q12175110', 'Q47128911',
       'Q47129466', 'Q47128318', 'Q5861379'], dtype=object)

In [34]:
# Column names given to the header-less image logs when they are parsed.
image_columns = ['image_title', 'wikidata_id', 'uploader', 'time_to_upload',
                 'timestamp', 'size', 'height', 'width', 'qi', 'finalist']
# Empty frame with those columns; the yearly logs are appended to it below.
images_df = pd.DataFrame(columns=image_columns)

In [35]:
pb.output('Retrieving --> {0} in Spain images list from cache'.format(TAG))
# Parse every yearly log first and concatenate once afterwards, instead of
# growing the frame with one concat per loop iteration.
yearly_dfs = []
for log_page in LOG_PAGES:
    list_page = pb.Page(commons_site, log_page)
    # Keep only what lies between the first and the last line break of the page text.
    list_page_text = StringIO(list_page.text[list_page.text.find('\n') + 1:list_page.text.rfind('\n')])
    yearly_df = pd.read_csv(list_page_text,
                            sep=";",
                            index_col=False,
                            names=image_columns
                           ).fillna('')
    yearly_dfs.append(yearly_df)
# Start from a fresh empty frame rather than from the current images_df, so
# re-running this cell does not stack the logs on top of an earlier run.
images_df = pd.concat([pd.DataFrame(columns=image_columns)] + yearly_dfs)
pb.output('Retrieved --> {0} in Spain images list from cache'.format(TAG))

images_df['timestamp'] = pd.to_datetime(images_df['timestamp'], format="%Y-%m-%d %H:%M:%S")

# Index the images by upload timestamp and leave the index unnamed.
images_df = images_df.set_index(["timestamp"])
# `del images_df.index.name` is refused by recent pandas releases; assigning
# None clears the name on old and new versions alike.
images_df.index.name = None

total_images_length = len(images_df)
total_images_length

Retrieving --> WLF in Spain images list from cache
Retrieved --> WLF in Spain images list from cache


4163

In [36]:
# Left join: every image row is kept and gains the columns of its festival
# (matched on the Wikidata id) when the festival is in the festival DB.
images_extended_df = images_df.merge(festivals_df, how='left', on='wikidata_id')
len(images_extended_df)

4163

In [37]:
# Number of images per festival, restricted to festivals found in the festival DB.
in_festival_db = images_extended_df['wikidata_id'].isin(valid_festivals)
images_per_festival = images_extended_df.loc[in_festival_db, 'wikidata_id'].value_counts()
images_per_festival

Q1143768     750
Q23657061    609
Q9075883     435
Q9075892     415
Q23541308    376
Q6124533     282
Q9075846      59
Q23309364     57
Q5685566      57
Q23662660     54
Q23663119     44
Q3092925      41
Q23663100     40
Q8355038      33
Q3290365      29
Q5861751      29
Q21479929     28
Q9075868      23
Q26756828     21
Q23199370     20
Q17627979     19
Q23660893     18
Q23663127     16
Q23453538     14
Q23663335     14
Q23660880     14
Q23310072     14
Q23663151     14
Q23453336     14
Q5044119      13
            ... 
Q5858861       1
Q4895460       1
Q23663003      1
Q20109030      1
Q23661532      1
Q7401479       1
Q23662185      1
Q5752915       1
Q5836853       1
Q21003051      1
Q5855501       1
Q5660198       1
Q23310019      1
Q5815728       1
Q20105477      1
Q5400462       1
Q2886090       1
Q23657558      1
Q20105952      1
Q6124477       1
Q23662787      1
Q6024342       1
Q9066149       1
Q20014639      1
Q21483881      1
Q3444111       1
Q17636451      1
Q20541714     

In [38]:
# Turn the counts Series into a two-column frame ['wikidata_id', 'count'].
# The axis and the values are named explicitly instead of renaming whatever
# reset_index() produces: pandas >= 2.0 already names the value_counts result
# 'count' and its index 'wikidata_id', so the former rename mapping would have
# produced two 'count' columns and no 'wikidata_id' there.
images_per_festival_df = images_per_festival.rename_axis('wikidata_id').reset_index(name='count')

In [39]:
# Attach the festival details to the counts and keep only the columns needed for the map.
summary_columns = ['count', 'wikidata_id', 'category', 'aut_com', 'latitude', 'longitude']
images_per_festival_df = pd.merge(images_per_festival_df, festivals_df, on='wikidata_id')[summary_columns].fillna('')

# np.lexsort treats its last key as the primary one: rows are ordered by
# descending count, ties broken by category.
festival_order = np.lexsort([images_per_festival_df['category'], -images_per_festival_df['count']])
images_per_festival_df = images_per_festival_df.iloc[festival_order]

# Category names use underscores in the DB; show them with spaces.
images_per_festival_df['category'] = images_per_festival_df['category'].map(lambda name: name.replace('_', ' '))
images_per_festival_df.head()

Unnamed: 0,count,wikidata_id,category,aut_com,latitude,longitude
0,750,Q1143768,Falles de València,Valencian Community,39.466667,-0.375
1,609,Q23657061,Bodas de Isabel de Segura,Aragon,40.343611,-1.107222
2,435,Q9075883,Holy Week in Málaga,Andalusia,36.716667,-4.416667
3,415,Q9075892,Holy Week in Zaragoza,Aragon,41.65,-0.883333
4,376,Q23541308,Holy Week in Teruel,Aragon,40.343611,-1.107222


In [40]:
# Number of images per additional festival (festivals outside the festival DB
# that are listed in additional_festivals_df).
is_additional = images_extended_df['wikidata_id'].isin(additional_festivals)
images_per_additional_festival = images_extended_df[is_additional]['wikidata_id'].value_counts()

# Two-column frame ['wikidata_id', 'count']. The axis and the values are named
# explicitly: pandas >= 2.0 already names the value_counts result 'count' and
# its index 'wikidata_id', so renaming the output of a plain reset_index()
# would have produced two 'count' columns and no 'wikidata_id' there.
images_per_additional_festival_df = images_per_additional_festival.rename_axis('wikidata_id').reset_index(name='count')

# Attach the festival details and keep only the columns needed for the map.
images_per_additional_festival_df = pd.merge(images_per_additional_festival_df, additional_festivals_df, on='wikidata_id')[['count', 'wikidata_id', 'category', 'aut_com', 'latitude', 'longitude']].fillna('')

# np.lexsort treats its last key as the primary one: descending count, ties broken by category.
images_per_additional_festival_df = images_per_additional_festival_df.iloc[np.lexsort([images_per_additional_festival_df['category'], -images_per_additional_festival_df['count']])]
images_per_additional_festival_df['category'] = images_per_additional_festival_df['category'].map(lambda x: x.replace('_', ' '))

# Marks these rows so that the map can draw them with a different marker.
images_per_additional_festival_df['additional'] = 'additional'
images_per_additional_festival_df.head()

Unnamed: 0,count,wikidata_id,category,aut_com,latitude,longitude,additional
0,81,Q5861379,Festivals of Saint Vincent Ferrer in the city ...,Valencian Community,39.466667,-0.375,additional
1,30,Q47119796,Holy Week in Aranjuez,Community of Madrid,40.033333,-3.602778,additional
2,28,Q12175110,Gran Fira de València,Valencian Community,39.466667,-0.375,additional
3,13,Q47120057,Falles de Dénia,Valencian Community,38.840278,0.108611,additional
4,7,Q47119479,"Fiesta de la Virgen de Agosto y San Roque, Req...",Valencian Community,39.488538,-1.102308,additional


In [41]:
# Stack the regular and the additional festivals into one table.
# ignore_index=True gives the result a fresh 0..n-1 index: both inputs carry
# their own 0-based labels, and keeping them would leave repeated labels that
# make the label-based duplicate detection (idxmin) and the index-aligned
# column assignments further down ambiguous.
# Regular festivals have no 'additional' value; fillna('') leaves it empty for them.
images_per_combined_festival_df = pd.concat([images_per_festival_df, images_per_additional_festival_df], ignore_index=True).fillna('')
images_per_combined_festival_df.head()

Unnamed: 0,additional,aut_com,category,count,latitude,longitude,wikidata_id
0,,Valencian Community,Falles de València,750,39.466667,-0.375,Q1143768
1,,Aragon,Bodas de Isabel de Segura,609,40.343611,-1.107222,Q23657061
2,,Andalusia,Holy Week in Málaga,435,36.716667,-4.416667,Q9075883
3,,Aragon,Holy Week in Zaragoza,415,41.65,-0.883333,Q9075892
4,,Aragon,Holy Week in Teruel,376,40.343611,-1.107222,Q23541308


In [42]:
coordinate_columns = ['latitude', 'longitude']

# Index label of the first festival that sits on the same coordinates.
images_per_combined_festival_df['dup_index'] = images_per_combined_festival_df.groupby(coordinate_columns)['latitude'].transform('idxmin')

# A row is a duplicate when an earlier row already uses the same coordinates.
# This is computed from the coordinates themselves rather than from
# 'dup_index': index labels may repeat in this table, in which case unrelated
# festivals would share a 'dup_index' and be flagged by mistake.
images_per_combined_festival_df['dup'] = images_per_combined_festival_df.duplicated(subset=coordinate_columns)
images_per_combined_festival_df.head()

Unnamed: 0,additional,aut_com,category,count,latitude,longitude,wikidata_id,dup_index,dup
0,,Valencian Community,Falles de València,750,39.466667,-0.375,Q1143768,0.0,False
1,,Aragon,Bodas de Isabel de Segura,609,40.343611,-1.107222,Q23657061,1.0,False
2,,Andalusia,Holy Week in Málaga,435,36.716667,-4.416667,Q9075883,2.0,False
3,,Aragon,Holy Week in Zaragoza,415,41.65,-0.883333,Q9075892,3.0,False
4,,Aragon,Holy Week in Teruel,376,40.343611,-1.107222,Q23541308,1.0,True


In [43]:
# coordinate_shaker (from modules.wmtools) is called once per row and returns a
# coordinate pair; presumably it moves rows flagged as duplicates slightly so
# their markers do not overlap -- confirm in modules.wmtools.
shaken_coordinates = images_per_combined_festival_df.apply(coordinate_shaker, axis=1)
images_per_combined_festival_df['coordinates'] = shaken_coordinates

# Split each pair back into the latitude and longitude columns.
images_per_combined_festival_df[['latitude', 'longitude']] = shaken_coordinates.apply(pd.Series)
images_per_combined_festival_df.head()

Unnamed: 0,additional,aut_com,category,count,latitude,longitude,wikidata_id,dup_index,dup,coordinates
0,,Valencian Community,Falles de València,750,39.466667,-0.375,Q1143768,0.0,False,"(39.46666666666667, -0.375)"
1,,Aragon,Bodas de Isabel de Segura,609,40.343611,-1.107222,Q23657061,1.0,False,"(40.343611111111116, -1.1072222222222223)"
2,,Andalusia,Holy Week in Málaga,435,36.716667,-4.416667,Q9075883,2.0,False,"(36.71666666666667, -4.416666666666667)"
3,,Aragon,Holy Week in Zaragoza,415,41.65,-0.883333,Q9075892,3.0,False,"(41.65, -0.8833333333333333)"
4,,Aragon,Holy Week in Teruel,376,40.341676,-1.09946,Q23541308,1.0,True,"(40.34167573594632, -1.0994598564120144)"


In [44]:
# One GeoJSON feature per festival row.
images_per_combined_festival_df['geojson'] = images_per_combined_festival_df.apply(to_geojson, axis=1)

In [45]:
# Gather the per-festival features into one FeatureCollection and serialise it.
features = list(images_per_combined_festival_df['geojson'])
feature_collection = FeatureCollection(features)
# ensure_ascii=False keeps non-ASCII characters as they are instead of \u escapes.
dump = geojson.dumps(feature_collection, indent=2, ensure_ascii=False)

In [46]:
# Wikitext of the map page (Mako template). The heading takes the contest tag
# from the configuration instead of repeating it literally.
template = """=== ${contest_tag} contributions map ===
The map below includes all the contributions, by festival, for all the editions of the contest (${years[0]}-${years[-1]}).
<mapframe text="Festivals" latitude="39" longitude="-4" zoom="5" width="800" height="600" align="center"> 
${map}
</mapframe>
"""
# Values substituted into the template. The dict is deliberately not called
# `vars`, which would shadow the builtin of that name for the rest of the session.
template_context = {
    "contest_tag": TAG,
    "map": dump,
    "years": YEARS
}
t = Template(template)
map_text = t.render(**template_context)

In [47]:
# Replace the whole content of the map page with the rendered wikitext and save it.
edit_summary = "{0} in Spain statistics".format(TAG)
maps_page = pb.Page(commons_site, MAP_WLF_PAGE)
maps_page.text = map_text
pb.output('Publishing --> {0} in Spain Statistics'.format(TAG))
maps_page.save(edit_summary)

Publishing --> WLF in Spain Statistics
Page [[commons:Commons:Wiki Loves Folk/Map]] saved
