In [1]:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""This notebook creates the statistics of TAG in Spain in YEAR"""

import inspect, os, sys

try:
    import pywikibot as pb
    from pywikibot import pagegenerators, textlib
    from pywikibot.specialbots import UploadRobot

except ImportError:
    # pywikibot is not importable from the current sys.path: retry after adding
    # this file's folder and its parent folder to sys.path.
    # Only ImportError is handled, so any other failure raised while importing
    # pywikibot is reported as-is instead of being masked by the retry.
    current_folder = os.path.realpath(os.path.abspath(os.path.split(inspect.getfile(inspect.currentframe()))[0]))
    folder_parts = current_folder.split(os.sep)
    pywikibot_folder = os.sep.join(folder_parts[:-1])

    if current_folder not in sys.path:
        sys.path.insert(0, current_folder)
    if pywikibot_folder not in sys.path:
        sys.path.insert(0, pywikibot_folder)

    import pywikibot as pb
    from pywikibot import pagegenerators, textlib
    from pywikibot.specialbots import UploadRobot

import mwparserfromhell as mwh

In [2]:
from modules.wmtools import upload_to_commons, \
                            get_registration_time, \
                            heat_color, \
                            get_project_name, \
                            wrap_label, \
                            coordinate_shaker, \
                            additional_festivals_df

In [3]:
import pandas as pd
import numpy as np
from mako.template import Template
from io import StringIO, BytesIO
from datetime import datetime, timedelta
from urllib.parse import urlencode
import requests
import json
from itertools import groupby
from operator import itemgetter
from functools import reduce
import math
import random

In [4]:
from geojson import Feature, Point, FeatureCollection
import geojson

In [5]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.ticker import MaxNLocator
import seaborn as sns

# Apply seaborn's "darkgrid" style to the figures created in this notebook.
sns.set_style("darkgrid")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [6]:
# Contest edition and naming used to build page titles, plot titles and file names.
YEAR                = 2016
TAG                 = 'WLF'
TAG_EXT             = 'Wiki Loves Folk'

# Commons page names. BASE_WLF_NAME follows YEAR; BASE_WLF2016_NAME is pinned to
# 2016 and is only used for the festival database page below.
BASE_WLF_NAME       = 'Commons:{1}/{0}'.format(YEAR, TAG_EXT)
BASE_WLF2016_NAME   = 'Commons:{1}/{0}'.format(2016, TAG_EXT)
LOG_PAGE            = BASE_WLF_NAME + '/Log'
STATISTICS_PAGE     = BASE_WLF_NAME + '/Stats'
FESTIVAL_DB_PAGE    = BASE_WLF2016_NAME + '/Festival DB'
GALLERY_QI          = BASE_WLF_NAME + '/QI'

# NOTE(review): VALID_NAMESPACES is not referenced in the visible part of the
# notebook; presumably it filters the global-usage results — confirm.
VALID_NAMESPACES    = ['0', '4', '100', '104']
# Threshold (in days between registration and first upload) below which an
# uploader is counted as new.
DAYS_BEFORE_REGISTRATION = 15

WLF_FINALIST_CATEGORY = "Category:Images from {1} {0} in Spain (finalists)".format(YEAR, TAG_EXT)

# pywikibot handle for Wikimedia Commons, used for every page read below.
commons_site = pb.Site('commons', 'commons')

In [7]:
# MediaWiki API endpoint of Wikimedia Commons and the base parameters of a
# "globalusage" query.
# NOTE(review): "titles" is left as None here; presumably it is filled in per
# request further down the notebook — confirm.
MW_API_BASE_URL     = 'https://commons.wikimedia.org/w/api.php'
MW_API_QUERY_STRING = {"action": "query",
                       "format": "json",
                       "gulimit": "500",
                       "prop": "globalusage",
                       "guprop": "url|namespace",
                       "titles": None
                      }

In [8]:
# Figure sizes as [width, height], passed to plt.subplots(figsize=...).
figsize=[15., 10.]
figsize_half=[8., 10.]
figsize_high=[15., 30.]
figsize_low=[15., 6.]

In [9]:
# Both working folders are resolved against the directory the notebook runs in.
cwd = os.getcwd()
images_directory, templates_directory = (os.path.join(cwd, folder) for folder in ('images', 'templates'))

# Only the images folder is created on demand; the templates folder is merely
# referenced here.
if os.path.exists(images_directory) is False:
    os.makedirs(images_directory)

In [10]:
# Date on which the notebook is run, formatted as YYYY-MM-DD.
now = (datetime.now().strftime("%Y-%m-%d"))

In [None]:
def get_campaign (row, year=None):
    """Classify an image into a contest campaign according to its upload date.

    THIS IS A YEAR-DEPENDENT FUNCTION: the month/day boundaries of the three
    campaigns are hard-coded below.

    Parameters
    ----------
    row : mapping-like
        Object indexable by ``'timestamp'`` (e.g. a DataFrame row); the value
        must be comparable with ``datetime`` objects.
    year : int, optional
        Year the campaign windows are built for. Defaults to the module-level
        ``YEAR`` constant.

    Returns
    -------
    str
        ``"spring"``, ``"summer"`` or ``"autumn"`` when the timestamp falls
        strictly inside the corresponding window, ``""`` otherwise.
    """
    if year is None:
        year = YEAR

    # Every campaign closes two hours after midnight of its end date.
    grace = timedelta(hours=2)
    campaigns = (
        ("spring", datetime(year, 4, 1), datetime(year, 5, 1) + grace),
        ("summer", datetime(year, 8, 25), datetime(year, 9, 6) + grace),
        ("autumn", datetime(year, 11, 25), datetime(year, 12, 16) + grace),
    )

    timestamp = row['timestamp']
    for campaign, start, end in campaigns:
        # Both bounds are exclusive.
        if (timestamp > start) and (timestamp < end):
            return campaign
    return ""

In [None]:
def expand_itemid (_list):
    """Pair every Wikidata id in ``_list`` with its category from ``festivals_df``.

    Returns a list of ``{"itemid": ..., "category": ...}`` dictionaries sorted
    by category. The category is the first ``festivals_df`` match for the id,
    so every id is expected to be present in that frame.
    """
    entries = []
    for item_id in _list:
        matches = festivals_df[festivals_df['wikidata_id'] == item_id]
        entries.append({"itemid": item_id, "category": matches['category'].values[0]})
    if entries:
        entries.sort(key=lambda entry: entry['category'])
    return entries

def decode_list (_list) :
    """Return a shallow copy of ``_list``, or ``[]`` when it cannot be sliced.

    Values that do not support slicing (e.g. the NaN left in a column for rows
    without a list, or ``None``) are replaced by an empty list.
    """
    try:
        return _list[:]
    except Exception:
        # Not a sliceable value. `Exception` rather than a bare `except` so
        # that KeyboardInterrupt / SystemExit are not swallowed.
        return []

In [None]:
def to_geojson (row) :
    """Build the GeoJSON point feature that represents one festival on the map.

    ``row`` must provide ``wikidata_id``, ``category``, ``aut_com``,
    ``additional``, ``longitude`` and ``latitude``. A representative image is
    picked from ``images_df`` for the festival:

    * images wider than tall are preferred; if the festival has none, all of
      its images are considered;
    * within that set, images flagged ``qi`` are preferred, then images flagged
      ``finalist``, then any image;
    * the pick is a one-row sample with a fixed ``random_state``, so it is
      reproducible between runs.

    Returns a ``geojson.Feature`` whose properties carry the wikitext for the
    thumbnail and title, plus the marker styling (colour looked up in
    ``autcom_colors`` by autonomous community).
    """
    festival_images_df = images_df[images_df['wikidata_id'] == row['wikidata_id']]

    # Prefer landscape pictures, falling back to every picture of the festival.
    candidates_df = festival_images_df[festival_images_df['width'] > festival_images_df['height']]
    if len(candidates_df.index) == 0:
        candidates_df = festival_images_df

    qi_df = candidates_df[candidates_df['qi'] == 'qi']
    finalist_df = candidates_df[candidates_df['finalist'] == 'finalist']
    if len(qi_df) > 0 :
        pool_df = qi_df
    elif len(finalist_df) > 0 :
        pool_df = finalist_df
    else :
        pool_df = candidates_df
    image = pool_df.sample(1, random_state=0)['image_title'].values[0]

    properties = {"description": "[[File:{0}|150px]]".format(image),
                  "title": "[[:Category:Images of festival with code {0} from {2} {1} in Spain|{3}]]".format(row['wikidata_id'], YEAR, TAG_EXT, row['category']),
                  "marker-size": "small",
                  "marker-symbol": "circle",
                  "marker-color": autcom_colors[row['aut_com']]}
    # Festivals flagged as additional get a hollow marker.
    if row['additional'] == 'additional':
        properties['marker-symbol'] = 'circle-stroked'
    feature = Feature(geometry=Point((row['longitude'], row['latitude'])),
                      properties=properties
                     )
    return feature

In [None]:
# Image description: wikitext template for the file page of every uploaded plot.
# Braces are doubled so that they survive str.format(); the positional fields are
# {2} (English description), {3} (date) and {0} {1} (category name parts).
# NOTE(review): the template is formatted inside upload_to_commons (not visible
# here); presumably {0}/{1} receive the tag name and year — confirm.
plot_description = """== {{{{int:filedesc}}}} ==
{{{{Information
|description={{{{en|{2}}}}}
|source={{{{own}}}}
|author=[[User:Discasto|Discasto]]
|date={3}
}}}}

{{{{Created with Matplotlib}}}}

== {{{{int:license-header}}}} ==
{{{{self|cc-by-sa-4.0}}}}

[[Category:Photos by User:Discasto]]
[[Category:{0} {1} in Spain]]"""

In [None]:
# One [annex page title, autonomous community name] pair per community.
# The order of this list also fixes the colour assigned to each community in
# the palette cell that follows.
festival_annexes = [
    ['Anexo:Fiestas de interés turístico de Andalucía', 'Andalusia'],
    ['Anexo:Fiestas de interés turístico de Aragón', 'Aragon'],
    ['Anexo:Fiestas de interés turístico de Asturias', 'Asturias'],
    ['Anexo:Fiestas de interés turístico de Cantabria', 'Cantabria'],
    ['Anexo:Fiestas de interés turístico de Castilla-La Mancha', 'Castile-La Mancha'],
    ['Anexo:Fiestas de interés turístico de Castilla y León', 'Castile and León'],
    ['Anexo:Fiestas de interés turístico de Cataluña', 'Catalonia'],
    ['Anexo:Fiestas de interés turístico de la Comunidad de Madrid', 'Community of Madrid'],
    ['Anexo:Fiestas de interés turístico de la Comunidad Valenciana', 'Valencian Community'],
    ['Anexo:Fiestas de interés turístico de Extremadura', 'Extremadura'],
    ['Anexo:Fiestas de interés turístico de las Islas Baleares', 'Balearic Islands'],
    ['Anexo:Fiestas de interés turístico de las Islas Canarias', 'Canary Islands'],
    ['Anexo:Fiestas de interés turístico de Galicia', 'Galicia'],
    ['Anexo:Fiestas de interés turístico de La Rioja', 'La Rioja'],
    ['Anexo:Fiestas de interés turístico de Navarra', 'Navarre'],
    ['Anexo:Fiestas de interés turístico de la Región de Murcia', 'Region of Murcia'],
    ['Anexo:Fiestas y tradiciones del País Vasco', 'Basque Country']
]
# Series mapping community name -> annex page title.
annexes = pd.DataFrame(data=festival_annexes, columns=['annex', 'aut_com']).set_index(['aut_com'])['annex']
annexes

In [None]:
# Autonomous community names, in the order of `festival_annexes`.
autcoms = [i[1] for i in festival_annexes]
# One 'hls' colour per community, as hex strings without the leading '#'.
# The palette size follows the list so that adding or removing a community
# cannot leave the two out of step.
autcom_palette = [i[1:] for i in sns.color_palette('hls', len(autcoms)).as_hex()]
# Community name -> marker colour (read by to_geojson).
autcom_colors = dict(zip(autcoms, autcom_palette))
autcom_colors

In [None]:
# Load the festival database stored as ';'-separated text on a Commons page.
pb.output('Retrieving --> {0} in Spain Festivals list from cache'.format(TAG))
festival_list_page = pb.Page(commons_site, FESTIVAL_DB_PAGE)
# Keep only the text between the first and the last newline, i.e. drop the
# first and last lines of the page (presumably wrapper markup — confirm).
festival_list_text = StringIO(festival_list_page.text[festival_list_page.text.find('\n') + 1:festival_list_page.text.rfind('\n')])
# The data carries no header row, so column names are supplied explicitly.
festivals_df = pd.read_csv(festival_list_text, 
                           sep=";", 
                           index_col=False, 
                           names=['name', 'aut_com', 
                                  'wikidata_id', 'wikidata_timestamp', 
                                  'category', 'cat_timestamp', 'image',
                                  'latitude', 'longitude']
                          )
pb.output('Retrieved --> {0} in Spain Festivals list from cache'.format(TAG))

In [None]:
# Preview of the festival database.
festivals_df.head()

In [None]:
# Number of festivals in the database.
festival_length = len(festivals_df.index)
festival_length

In [None]:
# Wikidata ids of the catalogued festivals; used below to select valid images.
valid_festivals = festivals_df['wikidata_id'].values
valid_festivals

In [None]:
# Flag every row of the additional-festival frame (imported from
# modules.wmtools) and collect its Wikidata ids.
# Note that this adds a column to the imported frame in place.
additional_festivals_df['additional'] = 'additional'
additional_festivals = additional_festivals_df['wikidata_id'].values
additional_festivals

In [None]:
# Preview of the additional festivals.
additional_festivals_df.head()

In [None]:
# Load the image log stored as ';'-separated text on a Commons page.
pb.output('Retrieving --> {1} {0} in Spain images list from cache'.format(YEAR, TAG))
list_page = pb.Page(commons_site, LOG_PAGE)
# Keep only the text between the first and the last newline of the page.
list_page_text = StringIO(list_page.text[list_page.text.find('\n') + 1:list_page.text.rfind('\n')])
# No header row in the data: column names are supplied explicitly and missing
# values are replaced by empty strings.
images_df = pd.read_csv(list_page_text,
                            sep=";",
                            index_col=False,
                            names=['image_title', 'wikidata_id', 'uploader', 'uploader_registration', 
                                   'timestamp', 'date', 'size', 'height', 'width', 'qi', 'finalist']
                       ).fillna('')
pb.output('Retrieved --> {1} {0} in Spain images list from cache'.format(YEAR, TAG))

images_df['timestamp'] = pd.to_datetime(images_df['timestamp'], format="%Y-%m-%d %H:%M:%S")
# Whole days elapsed between the uploader's registration and the upload.
images_df['time_to_upload'] = images_df.apply(lambda row: (row['timestamp'] - pd.to_datetime(row['uploader_registration'], format="%Y-%m-%d")).days, axis=1)

# The campaign is derived from the 'timestamp' column, so this must run before
# that column is moved into the index.
images_df['campaign'] = images_df.apply(get_campaign, axis=1)

images_df.set_index(["timestamp"], inplace=True)
# Clear the index name. Assigning None works on every pandas version, whereas
# `del images_df.index.name` fails where Index.name is a property.
images_df.index.name = None

total_images_length = len(images_df)
total_images_length

In [None]:
# Titles of the images flagged 'qi' in the log.
qi_list = images_df[images_df['qi'] == 'qi']['image_title']
qi_list

In [None]:
# Number of images flagged 'qi'.
qi_length = len(qi_list)
qi_length

In [None]:
# THIS IS YEAR-DEPENDENT
# Images per campaign, forced into chronological order. Images outside every
# campaign (campaign == "") are dropped by the reindex.
campaign_count = images_df['campaign'].value_counts()
campaign_count = campaign_count.reindex(['spring', 'summer', 'autumn'])
campaign_count

In [None]:
# Bar chart: number of photographs per campaign.
fig, ax = plt.subplots(figsize=figsize)

p = sns.barplot(ax=ax, x=campaign_count.index, y=campaign_count.values)
p.set_xlabel("Campaigns", fontsize=18)
p.set_ylabel("# Photographs", fontsize=18)

p.set_title(label='{1} {0} in Spain: Contributions by campaign'.format(YEAR, TAG), fontsize=20)
p.tick_params(labelsize=14)

# THIS IS YEAR-DEPENDENT
# Labels follow the spring/summer/autumn order fixed by the reindex above.
p.set_xticklabels(['Spring', 'Summer', 'Autumn'])

# Write each bar's value 20 data units above its top, centred on the bar.
for patch in p.patches:
    height = patch.get_height()
    p.text(patch.get_x() + patch.get_width()/2.,
            height + 20,
            '{:1.0f}'.format(height),
            ha="center",
            fontsize=14)
    
# Project helper from modules.wmtools; receives the axes, the target file name,
# the description text and the wikitext template (presumably saves and uploads
# the figure — see the helper for details).
upload_to_commons(p,
                  "{1} {0} in Spain - Contributions by campaign.png".format(YEAR, TAG),
                  '{1} {0} in Spain: Contributions by campaign'.format(YEAR, TAG_EXT),
                  plot_description,
                  YEAR,
                  TAG_EXT
                 )

In [None]:
# Smallest registration-to-upload delay (in days) of every uploader, i.e. the
# account age at their first contribution to the contest.
# The column is selected before aggregating so that min() is not computed on
# the other (partly string-filled) columns.
uploaders = images_df.groupby('uploader')['time_to_upload'].min()
uploaders

In [None]:
# Number of uploaders per account age (days), listed from oldest to newest.
time_to_upload = uploaders.value_counts().sort_index(ascending=False)
time_to_upload

In [None]:
# Uploaders per account-age bucket (days): [0, 15], (15, 365], (365, 730],
# (730, 3650], (3650, 5000]. Sorted by bucket rather than by frequency, so the
# order matches the tick labels applied in the plot below.
age = pd.cut(uploaders, bins=[0, 15, 365, 730, 3650, 5000], include_lowest=True).value_counts().sort_index()

In [None]:
# Horizontal bar chart: number of contestants per account-age bucket.
fig, ax = plt.subplots(figsize=figsize)

p = sns.barplot(ax=ax, y=age.index, x=age.values)
p.set_xlabel("# Contestants", fontsize=18)
p.set_ylabel("Age (time from registration)", fontsize=18)

p.set_title(label='{1} {0} in Spain: Contestant age'.format(YEAR, TAG), fontsize=20)
p.tick_params(labelsize=14)

# One label per bucket of `age`, from the youngest to the oldest accounts.
# The first label previously read 'new )' with an unbalanced parenthesis.
p.set_yticklabels(['Less than 15 days\n(new)', 
                   'Between 15 days\nand one year', 
                   'Between one\nand two years', 
                   'Between two\nand ten years', 
                   'More than ten years'])

# Write each bar's value one data unit to the right of its end.
for patch in ax.patches:
    ax.text(patch.get_width() + 1,
            patch.get_y() + patch.get_height()/2.,
            '{:1.0f}'.format(patch.get_width()),
            ha="center",
            fontsize=14)

# Project helper from modules.wmtools (see the helper for what it does with
# the figure).
upload_to_commons(p,
                  "{1} {0} in Spain - Contestant age.png".format(YEAR, TAG),
                  '{1} {0} in Spain: Contestant age. Time from registration to first contribution to contest.'.format(YEAR, TAG_EXT),
                  plot_description,
                  YEAR,
                  TAG_EXT
                 )

In [None]:
# Number of distinct uploaders.
authors_length = len(uploaders.index)
authors_length

In [None]:
# Images per uploader, as a Series named 'images'.
images_per_uploader = images_df['uploader'].value_counts()
images_per_uploader = images_per_uploader.rename('images')
images_per_uploader

In [None]:
# New uploaders
# Uploaders whose first contest upload came fewer than
# DAYS_BEFORE_REGISTRATION days after registering.
days_before_registration = DAYS_BEFORE_REGISTRATION
new_uploaders = uploaders[uploaders<days_before_registration].index
new_uploaders

In [None]:
new_uploaders_length = len(new_uploaders)
new_uploaders_length

In [None]:
# From here on `new_uploaders` is no longer an index of names but the image
# counts of those uploaders.
new_uploaders = images_per_uploader[new_uploaders]
new_uploaders

In [None]:
# Recompute the per-uploader counts and order them by number of images
# (descending), ties broken by uploader name (np.lexsort uses its last key as
# the primary one).
images_per_uploader = images_df['uploader'].value_counts()
images_per_uploader = images_per_uploader.rename('images')
images_per_uploader = images_per_uploader.iloc[np.lexsort([images_per_uploader.index, -images_per_uploader.values])]
images_per_uploader

In [None]:
# THIS PARAMETER IS YEAR-DEPENDENT AND COMES FROM MANUAL INSPECTION
TOP_UPLOADERS = 13

In [None]:
images_per_uploader.size

In [None]:
# Number of uploaders below the top TOP_UPLOADERS ...
remaining_images_per_uploader_count = images_per_uploader[TOP_UPLOADERS:].count()
remaining_images_per_uploader_count

In [None]:
# ... and the total number of images they uploaded.
remaining_images_per_uploader_sum = images_per_uploader[TOP_UPLOADERS:].sum()
remaining_images_per_uploader_sum

In [None]:
# Top TOP_UPLOADERS uploaders, ordered by image count (descending) and name,
# followed by a single "Rest (n)" entry aggregating everybody else.
remaining_images_per_uploader = images_per_uploader[:TOP_UPLOADERS]
remaining_images_per_uploader = remaining_images_per_uploader.iloc[np.lexsort([remaining_images_per_uploader.index, -remaining_images_per_uploader.values])]
# pd.concat is used because Series.append is no longer available in pandas 2.x.
remaining_images_per_uploader = pd.concat([remaining_images_per_uploader,
                                           pd.Series(data={"Rest ({0})".format(remaining_images_per_uploader_count): remaining_images_per_uploader_sum})])
remaining_images_per_uploader

In [None]:
# Bar chart: photographs per top uploader, plus the aggregated "Rest" bar.
fig, ax = plt.subplots(figsize=figsize)

p = sns.barplot(ax=ax, x=remaining_images_per_uploader.index, y=remaining_images_per_uploader.values)
p.set_xlabel("Contributors", fontsize=18)
p.set_ylabel("# Photographs", fontsize=18)

p.set_title(label='{1} {0} in Spain: Top uploaders'.format(YEAR, TAG), fontsize=20)
p.tick_params(labelsize=14)

# Uploader names are written vertically.
p.set_xticklabels(p.get_xticklabels(), rotation=90)

# Write each bar's value 10 data units above its top.
for patch in p.patches:
    height = patch.get_height()
    p.text(patch.get_x() + patch.get_width()/2.,
            height + 10,
            '{:1.0f}'.format(height),
            ha="center",
            fontsize=13)

# Project helper from modules.wmtools (see the helper for what it does with
# the figure).
upload_to_commons(p,
                  "{1} {0} in Spain - Top authors.png".format(YEAR, TAG),
                  'Top {2} contributors to {1} {0} in Spain'.format(YEAR, TAG_EXT, TOP_UPLOADERS),
                  plot_description,
                  YEAR,
                  TAG_EXT
                 )

In [None]:
# Distinct uploader names.
images_df['uploader'].unique()

In [None]:
images_df['uploader'].unique().size

In [None]:
# Images whose Wikidata id belongs to a catalogued festival.
valid_images_length = len(images_df[images_df['wikidata_id'].isin(valid_festivals)].index)
valid_images_length

In [None]:
images_df[images_df['wikidata_id'].isin(valid_festivals)]['wikidata_id'].unique()

In [None]:
# Number of distinct catalogued festivals photographed by each uploader.
festivals_per_uploader_df = images_df[images_df['wikidata_id'].isin(valid_festivals)].\
                                    groupby(['uploader']).\
                                    agg({"wikidata_id": pd.Series.nunique}).\
                                    sort_values('wikidata_id', ascending=False)
festivals_per_uploader = festivals_per_uploader_df["wikidata_id"]
festivals_per_uploader = festivals_per_uploader.rename('festivals')
# Order by festival count (descending), ties broken by uploader name.
festivals_per_uploader = festivals_per_uploader.iloc[np.lexsort([festivals_per_uploader.index, -festivals_per_uploader.values])]
festivals_per_uploader

In [None]:
# THIS PARAMETER IS YEAR-DEPENDENT AND COMES FROM MANUAL INSPECTION
TOP_UPLOADERS_BY_FESTIVAL = 17

In [None]:
images_df[images_df['wikidata_id'].isin(valid_festivals)]['wikidata_id'].unique()

In [None]:
# Number of catalogued festivals with at least one image.
wlf_festivals_length = images_df[images_df['wikidata_id'].isin(valid_festivals)]['wikidata_id'].unique().size
wlf_festivals_length

In [None]:
# Bar chart: number of festivals covered by each of the top
# TOP_UPLOADERS_BY_FESTIVAL uploaders.
fig, ax = plt.subplots(figsize=figsize)

p = sns.barplot(ax=ax, 
                x=festivals_per_uploader[:TOP_UPLOADERS_BY_FESTIVAL].index, 
                y=festivals_per_uploader[:TOP_UPLOADERS_BY_FESTIVAL].values
               )
p.set_xlabel("Contributors", fontsize=18)
p.set_ylabel("# Festivals", fontsize=18)

p.set_title(label='{1} {0} in Spain: Top uploaders by festival'.format(YEAR, TAG), fontsize=20)
p.tick_params(labelsize=14)

p.set_xticklabels(p.get_xticklabels(), rotation=90)
# Festival counts are integers: keep the y ticks on whole numbers.
p.yaxis.set_major_locator(MaxNLocator(integer=True))

# Write each bar's value just above its top.
for patch in p.patches:
    height = patch.get_height()
    p.text(patch.get_x() + patch.get_width()/2.,
            height + 0.1,
            '{:1.0f}'.format(height),
            ha="center",
            fontsize=13)

# NOTE(review): apart from the number, the description text is the same as the
# one used for the "Top authors" plot and does not mention festivals — confirm
# this is intended.
upload_to_commons(p,
                  "{1} {0} in Spain - Top authors by festival.png".format(YEAR, TAG),
                  'Top {2} contributors to {1} {0} in Spain'.format(YEAR, TAG_EXT, TOP_UPLOADERS_BY_FESTIVAL),
                  plot_description,
                  YEAR,
                  TAG_EXT
                 )

In [None]:
# Per-uploader table with the number of festivals and photographs. Uploaders
# without any catalogued festival get 0 festivals.
uploaders_df = pd.concat([festivals_per_uploader, images_per_uploader], axis=1).fillna(0)
uploaders_df.columns=['Festivals', 'Photographs']
uploaders_df['Festivals'] = uploaders_df['Festivals'].astype(int)
# Order by number of photographs (descending), ties broken by uploader name.
uploaders_df = uploaders_df.iloc[np.lexsort([uploaders_df.index, -uploaders_df['Photographs']])]
uploaders_df

In [None]:
# Bar chart: photographs per top uploader, coloured by the number of festivals
# each one covered.
fig, ax = plt.subplots(figsize=figsize)

p = sns.barplot(ax=ax, 
                x=uploaders_df[:TOP_UPLOADERS].index, 
                y=uploaders_df[:TOP_UPLOADERS]['Photographs'], 
                hue=uploaders_df[:TOP_UPLOADERS]['Festivals'],
                dodge=False)
p.set_xlabel("Contributors", fontsize=18)
p.set_ylabel("# Photographs", fontsize=18)

p.set_title(label='{1} {0} in Spain: Top uploaders\nby number of photographs and festivals'.format(YEAR, TAG), fontsize=20)

p.tick_params(labelsize=14)
p.set_xticklabels(p.get_xticklabels(), rotation=90)

# Reverse the order in which the hue entries appear in the legend.
handles, labels = p.get_legend_handles_labels()
handles.reverse()
labels.reverse()

legend = plt.legend(loc='center right', 
                    title='Number of\nfestivals', 
                    fontsize=14,
                    labels=labels,
                    handles=handles)
plt.setp(legend.get_title(), fontsize=16)

# Annotate the bars; patches whose height is NaN are skipped.
for patch in p.patches:
    height = patch.get_height()
    if not math.isnan(height):
        p.text(patch.get_x() + patch.get_width()/2.,
                height + 15,
                '{:1.0f}'.format(height),
                ha="center",
                fontsize=13)

# Project helper from modules.wmtools (see the helper for what it does with
# the figure).
upload_to_commons(p,
                  "{1} {0} in Spain - Top authors (2).png".format(YEAR, TAG),
                  'Top {2} contributors to {1} {0} in Spain with contribution to festivals'.format(YEAR, TAG_EXT, TOP_UPLOADERS),
                  plot_description,
                  YEAR,
                  TAG_EXT
                 )

In [None]:
# Images per Wikidata id, over the whole contest.
images_df['wikidata_id'].value_counts()

In [None]:
# THIS IS YEAR-DEPENDENT
# Same count, restricted to the autumn campaign.
images_df[images_df["campaign"] == "autumn"]['wikidata_id'].value_counts()

In [None]:
# THIS IS YEAR-DEPENDENT
# Same count, restricted to the summer campaign.
images_df[images_df["campaign"] == "summer"]['wikidata_id'].value_counts()

In [None]:
# THIS IS YEAR-DEPENDENT
# Same count, restricted to the spring campaign.
images_df[images_df["campaign"] == "spring"]['wikidata_id'].value_counts()

In [None]:
# THIS IS YEAR-DEPENDENT
# Daily number of uploads during the autumn campaign.
upload_ts = images_df[images_df['campaign'] == 'autumn']['image_title'].resample('d').count()
# The uploads of 6 December are added to 5 December and the 6 December entry is
# removed (the reason is not stated in the notebook).
upload_ts[datetime(YEAR, 12, 5)] = upload_ts[datetime(YEAR, 12, 5)] + upload_ts[datetime(YEAR, 12, 6)]
upload_ts.drop(datetime(YEAR, 12, 6), inplace=True)
# Adding an all-zero series covering 25 November .. 15 December (21 days)
# aligns both indexes; dates missing on either side become NaN and then 0.
# NOTE(review): a date present only in upload_ts (e.g. 16 December, which
# get_campaign still accepts until 02:00) would therefore end up as 0 rather
# than keeping its count — verify against the data.
upload_ts = pd.Series([0]*21, index=pd.date_range(datetime(YEAR, 11, 25), periods=21, freq='D')) + upload_ts
upload_ts = upload_ts.fillna(0).astype(int)
upload_ts

In [None]:
# THIS IS YEAR-DEPENDENT
# Bar chart: uploads per day during the autumn campaign.
fig, ax = plt.subplots(figsize=figsize)

# Plain matplotlib bars (one palette colour per day); `p` is therefore the
# object returned by ax.bar, not an Axes.
p = ax.bar(upload_ts.index.to_pydatetime(), 
       upload_ts.values,
       color=sns.color_palette("Blues_d", 21))
ax.xaxis.set_major_locator(mdates.AutoDateLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
ax.set_xlabel('Date', fontsize=18)
ax.set_ylabel("# Photographs", fontsize=18)
ax.set_title(label='Photographs uploaded to {1} {0} in Spain\n(Autumn campaign)'.format(YEAR, TAG), fontsize=20)

ax.tick_params(labelsize=14)
plt.xticks(rotation=90)

# Annotate only the days that have uploads.
for patch in ax.patches:
    height = patch.get_height()
    if height > 0 :
        ax.text(patch.get_x() + patch.get_width()/2.,
                height + 2,
                '{:1.0f}'.format(height),
                ha="center",
                fontsize=13)

# Here the pyplot module itself is handed to the helper instead of an Axes.
# NOTE(review): the description is formatted with TAG, whereas the other plots
# use TAG_EXT for their description — confirm which one is intended.
upload_to_commons(plt,
                  "{1} {0} in Spain - Uploads by day (Autumn campaign).png".format(YEAR, TAG),
                  'Images uploaded to {1} {0} in Spain (Autumn campaign) by day'.format(YEAR, TAG),
                  plot_description,
                  YEAR,
                  TAG_EXT
                 )

In [None]:
# THIS IS YEAR-DEPENDENT
# Daily number of uploads during the spring campaign.
upload_ts = images_df[images_df['campaign'] == 'spring']['image_title'].resample('d').count()
# Adding an all-zero series covering 1 .. 30 April aligns both indexes; dates
# missing on either side become NaN and then 0.
# NOTE(review): a date present only in upload_ts (e.g. 1 May, which
# get_campaign still accepts until 02:00) would end up as 0 — verify.
upload_ts = pd.Series([0]*30, index=pd.date_range(datetime(YEAR, 4, 1), periods=30, freq='D')) + upload_ts
upload_ts = upload_ts.fillna(0).astype(int)
upload_ts

In [None]:
# THIS IS YEAR-DEPENDENT
# Bar chart: uploads per day during the spring campaign.
fig, ax = plt.subplots(figsize=figsize)

# Plain matplotlib bars (one palette colour per day); `p` is therefore the
# object returned by ax.bar, not an Axes.
p = ax.bar(upload_ts.index.to_pydatetime(), 
       upload_ts.values,
       color=sns.color_palette("Blues_d", 30))
ax.xaxis.set_major_locator(mdates.AutoDateLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
ax.set_xlabel('Date', fontsize=18)
ax.set_ylabel("# Photographs", fontsize=18)
ax.set_title(label='Photographs uploaded to {1} {0} in Spain\n(Spring campaign)'.format(YEAR, TAG), fontsize=20)
# Fixed y range, chosen by hand for this year's data.
ax.set_ylim([0, 260])

ax.tick_params(labelsize=14)
plt.xticks(rotation=90)

# Annotate only the days that have uploads.
for patch in ax.patches:
    height = patch.get_height()
    if height > 0 :
        ax.text(patch.get_x() + patch.get_width()/2.,
                height + 3,
                '{:1.0f}'.format(height),
                ha="center",
                fontsize=13)

# Here the pyplot module itself is handed to the helper instead of an Axes.
# NOTE(review): the description is formatted with TAG, whereas the other plots
# use TAG_EXT for their description — confirm which one is intended.
upload_to_commons(plt,
                  "{1} {0} in Spain - Uploads by day (Spring campaign).png".format(YEAR, TAG),
                  'Images uploaded to {1} {0} in Spain (Spring campaign) by day'.format(YEAR, TAG),
                  plot_description,
                  YEAR,
                  TAG_EXT
                 )

In [None]:
# Attach the festival data to every image; images without a matching festival
# are kept (left join). The merge result gets a fresh integer index, so the
# timestamp index of images_df is not carried over.
images_extended_df = pd.merge(images_df, festivals_df, on='wikidata_id', how='left')
len(images_extended_df.index)

In [None]:
# Images of catalogued festivals per uploader, ordered by count (descending)
# and uploader name.
valid_images_per_uploader = images_df[images_df['wikidata_id'].isin(valid_festivals)]['uploader'].value_counts()
valid_images_per_uploader = valid_images_per_uploader.rename('valid_images')
valid_images_per_uploader = valid_images_per_uploader.iloc[np.lexsort([valid_images_per_uploader.index, -valid_images_per_uploader.values])]
valid_images_per_uploader

In [None]:
# For every uploader, the list of catalogued festivals they photographed, as
# {"itemid", "category"} dictionaries built by expand_itemid. Empty ids are
# filtered out before the lookup.
# rename() is called without inplace=True: with inplace=True it is documented
# to return None, which is not what should be assigned here.
festival_list_per_uploader = images_extended_df[images_extended_df['wikidata_id'].isin(valid_festivals)]\
                                               .groupby('uploader')['wikidata_id']\
                                               .apply(set)\
                                               .apply(lambda x: filter(None, x))\
                                               .apply(lambda x: expand_itemid(x))\
                                               .rename('festival_list')
festival_list_per_uploader

In [None]:
# One row per contestant: images, images of catalogued festivals, number of
# festivals and the festival list.
# rename_axis names the index explicitly before it becomes a column, so the
# 'contestant' column exists whatever name the index carried after the concat.
authors_df = pd.concat([images_per_uploader, valid_images_per_uploader, festivals_per_uploader, festival_list_per_uploader], axis=1)\
               .sort_values(by='images', ascending=False)\
               .rename_axis('contestant')\
               .reset_index()

# Contestants missing from one of the inputs get 0 in the numeric columns.
authors_df[['images', 'valid_images', 'festivals']] = authors_df[['images', 'valid_images', 'festivals']]\
                                                                        .fillna(0)\
                                                                        .astype('int')

# Order by number of images (descending), ties broken by contestant name.
authors_df = authors_df.iloc[np.lexsort([authors_df['contestant'], -authors_df['images']])]
# get_registration_time comes from modules.wmtools and is called once per
# contestant.
authors_df['registration_string'] = authors_df['contestant'].map(get_registration_time)
# Contestants without catalogued festivals have NaN here; turn it into [].
authors_df['festival_list'] = authors_df['festival_list'].map(decode_list)
authors_df

In [None]:
# Number of images per catalogued festival (most photographed first).
images_per_festival = images_extended_df[images_extended_df['wikidata_id'].isin(valid_festivals)]['wikidata_id'].value_counts()
images_per_festival

In [None]:
# THIS PARAMETER IS YEAR-DEPENDENT AND COMES FROM MANUAL INSPECTION
# May be set to the number of festivals with more than 15 pictures
TOP_FESTIVALS = 14

In [None]:
# Two-column frame ('wikidata_id', 'count') built from the per-festival counts.
# Index and values are named explicitly, so the result does not depend on the
# names value_counts() assigns (these differ between pandas versions).
images_per_festival_df = images_per_festival.rename_axis('wikidata_id').reset_index(name='count')

In [None]:
# Add category, autonomous community and coordinates to the counts; missing
# values become empty strings.
images_per_festival_df = pd.merge(images_per_festival_df, festivals_df, on='wikidata_id')[['count', 'wikidata_id', 'category', 'aut_com', 'latitude', 'longitude']].fillna('')
# Order by image count (descending), ties broken by category name.
images_per_festival_df = images_per_festival_df.iloc[np.lexsort([images_per_festival_df['category'], -images_per_festival_df['count']])]
# Category names are shown with spaces instead of underscores.
images_per_festival_df['category'] = images_per_festival_df['category'].map(lambda x: x.replace('_', ' '))
images_per_festival_df.head()

In [None]:
# Same per-festival table, for the additional festivals.
images_per_additional_festival = images_extended_df[images_extended_df['wikidata_id'].isin(additional_festivals)]['wikidata_id'].value_counts()
# Index and values are named explicitly, so the result does not depend on the
# names value_counts() assigns (these differ between pandas versions).
images_per_additional_festival_df = images_per_additional_festival.rename_axis('wikidata_id').reset_index(name='count')
images_per_additional_festival_df = pd.merge(images_per_additional_festival_df, additional_festivals_df, on='wikidata_id')[['count', 'wikidata_id', 'category', 'aut_com', 'latitude', 'longitude']].fillna('')
# Order by image count (descending), ties broken by category name.
images_per_additional_festival_df = images_per_additional_festival_df.iloc[np.lexsort([images_per_additional_festival_df['category'], -images_per_additional_festival_df['count']])]
images_per_additional_festival_df['category'] = images_per_additional_festival_df['category'].map(lambda x: x.replace('_', ' '))
# Flag read by to_geojson to draw these festivals with a different marker.
images_per_additional_festival_df['additional'] = 'additional'
images_per_additional_festival_df.head()

In [None]:
# Catalogued and additional festivals in a single frame. The 'additional'
# column is empty for catalogued festivals.
# ignore_index=True gives every row a unique label: both inputs carry their own
# 0..n-1 labels, and the duplicate-coordinate detection that follows identifies
# rows by index label.
images_per_combined_festival_df = pd.concat([images_per_festival_df, images_per_additional_festival_df], ignore_index=True).fillna('')
images_per_combined_festival_df.head()

In [None]:
# Festivals sharing the same coordinates: every row gets the index label of one
# row of its (latitude, longitude) group, and all rows but the first one
# carrying a given label are marked as duplicates.
images_per_combined_festival_df['dup_index'] = images_per_combined_festival_df.groupby(['latitude', 'longitude'])['latitude'].transform('idxmin')    
images_per_combined_festival_df['dup'] = images_per_combined_festival_df.duplicated(subset=['dup_index'])
images_per_combined_festival_df.head()

In [None]:
# coordinate_shaker (modules.wmtools) returns the coordinates to use for each
# row; they are split back into the latitude / longitude columns.
# NOTE(review): presumably it moves the rows flagged 'dup' slightly so that
# their markers do not overlap — confirm in the helper.
images_per_combined_festival_df['coordinates'] = images_per_combined_festival_df.apply(coordinate_shaker, axis=1)
images_per_combined_festival_df[['latitude', 'longitude']] = images_per_combined_festival_df['coordinates'].apply(pd.Series)
images_per_combined_festival_df.head()

In [None]:
# One GeoJSON feature per festival.
images_per_combined_festival_df['geojson'] = images_per_combined_festival_df.apply(lambda row: to_geojson(row), axis=1)

In [None]:
# Serialise all features as one FeatureCollection, keeping non-ASCII
# characters as they are.
features = images_per_combined_festival_df['geojson'].tolist()
feature_collection = FeatureCollection(features)
dump = geojson.dumps(feature_collection, ensure_ascii=False, indent=2)
#print(dump)

In [None]:
# Number of rows of images_per_festival_df per autonomous community, largest first.
festivals_per_autcom = (
    images_per_festival_df
    .groupby(['aut_com'])
    .count()
    .sort_values(by='count', ascending=False)
    .reset_index()[['aut_com', 'count']]
)
# Wrap the community names (width 14) before they are used as axis labels.
festivals_per_autcom['aut_com'] = festivals_per_autcom['aut_com'].map(lambda label: wrap_label(label, 14))
festivals_per_autcom

In [None]:
# Number of autonomous communities that appear in festivals_per_autcom.
aut_coms = festivals_per_autcom.shape[0]
aut_coms

In [None]:
# THIS IS YEAR-DEPENDENT AND RELIES ON MANUAL INSPECTION
# Zero-count rows that are appended to the per-community tables further down.
remaining_autcoms = pd.DataFrame(
    data={'aut_com': ['La Rioja'], 'count': [0]},
    index=[16]
)
remaining_autcoms

In [None]:
# Add the zero-count communities. pd.concat replaces DataFrame.append, which
# is deprecated and no longer exists in pandas 2.0.
festivals_per_autcom = pd.concat([festivals_per_autcom, remaining_autcoms])
festivals_per_autcom

In [None]:
# Bar chart of festivals_per_autcom ('count' per 'aut_com'), uploaded to Commons.
fig, ax = plt.subplots(1, 1, figsize=figsize)

p = sns.barplot(x='aut_com', y='count', data=festivals_per_autcom, ax=ax)
p.set_title(label='{1} {0} in Spain: Festivals per autonomous community'.format(YEAR, TAG), fontsize=20)
p.set_xlabel("Autonomous community", fontsize=18)
p.set_ylabel("# Festivals", fontsize=18)
p.tick_params(labelsize=14)

p.set_xticklabels(p.get_xticklabels(), rotation=90)

# Print each bar's value just above it, centred on the bar.
for patch in p.patches:
    height = patch.get_height()
    p.text(x=patch.get_x() + patch.get_width() / 2,
           y=height + 0.2,
           s='{:1.0f}'.format(height),
           ha="center",
           fontsize=13)

upload_to_commons(
    p,
    "{1} {0} in Spain - Festivals per autonomous community.png".format(YEAR, TAG),
    'Catalogued festivals per autonomous community in {1} {0} in Spain'.format(YEAR, TAG_EXT),
    plot_description,
    YEAR,
    TAG_EXT
)

In [None]:
# Keep the first TOP_FESTIVALS rows (count and category only), ordered by
# descending count with ties broken by category.
compact_images_per_festival = images_per_festival_df[:TOP_FESTIVALS].loc[:, ['count', 'category']]
compact_images_per_festival = compact_images_per_festival.iloc[
    np.lexsort([compact_images_per_festival['category'],
                -compact_images_per_festival['count']])
]
compact_images_per_festival

In [None]:
# Photographs in the festivals beyond the first TOP_FESTIVALS rows. Only the
# 'count' column is summed: summing the whole frame also aggregates the text
# columns, which is wasted work and can fail on recent pandas versions.
images_per_festival_df[TOP_FESTIVALS:]['count'].sum()

In [None]:
# Number of festivals beyond the first TOP_FESTIVALS rows.
images_per_festival_df[TOP_FESTIVALS:].shape[0]

In [None]:
# Single row aggregating every festival beyond the first TOP_FESTIVALS rows.
# Only the 'count' column is summed, so the text columns of the frame are
# never aggregated (that can fail on recent pandas versions).
others_row = pd.DataFrame(data={'category': 'Other festivals ({0})'.format(len(images_per_festival_df[TOP_FESTIVALS:].index)),
                                'count': images_per_festival_df[TOP_FESTIVALS:]['count'].sum()},
                          index=[TOP_FESTIVALS+1])

In [None]:
# Single row counting the images whose wikidata_id is the empty string.
no_id_row = pd.DataFrame(
    data={
        'category': 'Unknown/invalid',
        'count': int((images_extended_df['wikidata_id'] == '').sum()),
    },
    index=[TOP_FESTIVALS + 2],
)

In [None]:
# Add the "other festivals" and "unknown/invalid" rows after the top festivals.
# pd.concat replaces DataFrame.append, which is deprecated and no longer
# exists in pandas 2.0.
compact_images_per_festival = pd.concat([compact_images_per_festival, others_row, no_id_row])
# Wrap the labels so they fit under the bars.
compact_images_per_festival['category'] = compact_images_per_festival['category'].map(wrap_label)
compact_images_per_festival

In [None]:
# Bar chart of compact_images_per_festival ('count' per 'category'), uploaded to Commons.
fig, ax = plt.subplots(figsize=figsize)

p = sns.barplot(ax=ax, data=compact_images_per_festival, x='category', y='count')
p.set_xlabel("Festivals", fontsize=18)
p.set_ylabel("# Photographs", fontsize=18)

# The number of festivals shown comes from TOP_FESTIVALS; the title and the
# upload description previously hard-coded two different figures (14 and 12).
p.set_title(label='{1} {0} in Spain: Top {2} festivals'.format(YEAR, TAG, TOP_FESTIVALS), fontsize=20)
p.tick_params(labelsize=14)

p.set_xticklabels(p.get_xticklabels(), rotation=90)

# Print each bar's value just above it, centred on the bar.
for patch in p.patches:
    height = patch.get_height()
    p.text(patch.get_x() + patch.get_width()/2.,
            height + 10,
            '{:1.0f}'.format(height),
            ha="center",
            fontsize=13)

upload_to_commons(p,
                  "{1} {0} in Spain - Top festivals.png".format(YEAR, TAG),
                  'Top {2} festivals in {1} {0} in Spain'.format(YEAR, TAG_EXT, TOP_FESTIVALS),
                  plot_description,
                  YEAR,
                  TAG_EXT
                 )

In [None]:
# Photographs per autonomous community, largest first. Only 'count' is
# aggregated: summing every column of the frame also tries to add up the text
# columns, which can fail on recent pandas versions.
images_per_autcom = images_per_festival_df.groupby(['aut_com'])['count'].\
                     sum().\
                     sort_values(ascending=False).\
                     reset_index()
# Add the zero-count communities. pd.concat replaces DataFrame.append, which
# is deprecated and no longer exists in pandas 2.0.
images_per_autcom = pd.concat([images_per_autcom, remaining_autcoms])
images_per_autcom

In [None]:
# Bar chart of images_per_autcom ('count' per 'aut_com'), uploaded to Commons.
fig, ax = plt.subplots(1, 1, figsize=figsize)

p = sns.barplot(x='aut_com', y='count', data=images_per_autcom, ax=ax)
p.set_title(label='{1} {0} in Spain: Photographs per autonomous community'.format(YEAR, TAG), fontsize=20)
p.set_xlabel("Autonomous Community", fontsize=18)
p.set_ylabel("# Photographs", fontsize=18)
p.tick_params(labelsize=14)

p.set_xticklabels(p.get_xticklabels(), rotation=90)

# Print each bar's value just above it, centred on the bar.
for patch in p.patches:
    height = patch.get_height()
    p.text(x=patch.get_x() + patch.get_width() / 2,
           y=height + 10,
           s='{:1.0f}'.format(height),
           ha="center",
           fontsize=13)

upload_to_commons(
    p,
    "{1} {0} in Spain - Photographs per autonomous community.png".format(YEAR, TAG),
    'Photographs per autonomous community in {1} {0} in Spain'.format(YEAR, TAG_EXT),
    plot_description,
    YEAR,
    TAG_EXT
)

In [None]:
# usage management
# Query the API in batches of n titles and collect, per image, how many times
# it is used in each wiki (only uses in VALID_NAMESPACES are counted).
n = 5 # number of images to ask for each time. Must have a lower value in heavy-used scenarios
list_df = [images_df[i:i+n] for i in range(0, images_df.shape[0], n)]
usage_dict = {}

counter = 0  # number of images with at least one valid use
for df in list_df :
    # All the titles of a batch travel in one request, separated by '|'.
    query_string_items = ['File:{0}'.format(image_title) for image_title in df["image_title"]]
    raw_api_query_string = '|'.join(query_string_items)
    MW_API_QUERY_STRING["titles"] = raw_api_query_string

    r = requests.post(MW_API_BASE_URL, data=urlencode(MW_API_QUERY_STRING))
    response = r.text

    # A malformed or unexpected response only costs this batch. Catching the
    # specific exceptions (instead of a bare except) keeps KeyboardInterrupt
    # and programming errors visible.
    try:
        response_dict = json.loads(response)
        pages = response_dict["query"]["pages"]
    except (ValueError, KeyError, TypeError) as error:
        print ('Error found: {0!r}'.format(error))
        continue

    for value in pages.values():
        try:
            # A page without 'globalusage' is treated as unused, so it no
            # longer aborts the remaining pages of its batch.
            wiki_counts = {}
            for item in value.get('globalusage', []):
                if item['ns'] in VALID_NAMESPACES:
                    wiki_counts[item['wiki']] = wiki_counts.get(item['wiki'], 0) + 1
            if wiki_counts:
                counter += 1
                title = value['title']
                # Strip the namespace prefix only, not later occurrences of 'File:'.
                if title.startswith('File:'):
                    title = title[len('File:'):]
                # Keys are inserted in sorted wiki order.
                summary_dict = {wiki: wiki_counts[wiki] for wiki in sorted(wiki_counts)}
                usage_dict[title] = summary_dict
        except (KeyError, TypeError) as error:
            print ('Error found: {0!r}'.format(error))

In [None]:
# unique images used: one row of usage_df per image, one column per wiki
usage_df = pd.DataFrame(usage_dict).T
total_unique = usage_df.count(axis='columns').count()
total_unique

In [None]:
# unique images used with campaign indicator
# The index of usage_df (image titles) becomes the 'image_title' join column.
usage_campaign_df = (
    usage_df
    .reset_index()
    .rename(columns={'index': 'image_title'})
    .merge(images_df[['image_title', 'campaign']],
           how='inner',
           on='image_title')
)

In [None]:
# summary table: per wiki, the total number of uses and the number of
# distinct images used
usages_df = pd.concat([usage_df.sum(), usage_df.count()], axis=1)
usages_df.columns = ['usages', 'unique']
usages_df = (
    usages_df
    .astype({'usages': int})
    .sort_values(by=['unique'], axis=0, ascending=False)
)
# Re-index the table by the name that get_project_name returns for each wiki.
usages_df = usages_df.assign(name=usages_df.index.to_series().map(get_project_name)).set_index(['name'])
# Final order: descending 'unique', ties broken by project name.
usages_df = usages_df.iloc[np.lexsort([usages_df.index, -usages_df['unique']])]
usages_df

In [None]:
# THIS PARAMETER IS YEAR-DEPENDENT AND COMES FROM MANUAL INSPECTION
# May be set to the number of projects with more than 1 picture
# Number of leading rows of usages_df that are reported individually; the
# rows after them are summed into a single "Other projects" row.
TOP_PROJECTS = 11

In [None]:
# Collapse every project after the first TOP_PROJECTS into one summary row.
remaining_df = pd.DataFrame(usages_df.iloc[TOP_PROJECTS:].sum()).transpose()
other_projects_num = len(usages_df.index)-TOP_PROJECTS
remaining_df.index=['Other projects ({})'.format(other_projects_num)]
top_df = usages_df.iloc[:TOP_PROJECTS]
# pd.concat replaces DataFrame.append, which is deprecated and no longer
# exists in pandas 2.0.
reduced_usages_df = pd.concat([top_df, remaining_df])
reduced_usages_df

In [None]:
# Bar chart of the 'unique' column of reduced_usages_df per project, uploaded to Commons.
fig, ax = plt.subplots(figsize=figsize)

# reset_index() exposes the project names as the 'index' column.
p = sns.barplot(ax=ax, x='index', y='unique', data=reduced_usages_df.reset_index())
p.set_xlabel("Project", fontsize=18)
p.set_ylabel("# Photographs", fontsize=18)

p.set_title(label='Unique photographs from {2} {0} in Spain used in Wikimedia projects\n({1})'.format(YEAR, now, TAG), fontsize=20)
p.tick_params(labelsize=14)

p.set_xticklabels(p.get_xticklabels(), rotation=90)

# Print each bar's value just above it, centred on the bar.
for patch in p.patches:
    height = patch.get_height()
    p.text(patch.get_x() + patch.get_width()/2.,
            height + 1,
            '{:1.0f}'.format(height),
            ha="center",
            fontsize=13)

# The number of projects in the description comes from TOP_PROJECTS; it used
# to be a hard-coded 12 that did not follow the constant.
upload_to_commons(p,
                  "{1} {0} in Spain - Unique photographs used in WMF projects.png".format(YEAR, TAG),
                  'Unique photographs from {1} {0} in Spain used in WMF projects: top {2} projects'.format(YEAR, TAG_EXT, TOP_PROJECTS),
                  plot_description,
                  YEAR,
                  TAG_EXT
                 )

In [None]:
# Bar chart of the 'usages' column of reduced_usages_df per project, uploaded to Commons.
fig, ax = plt.subplots(figsize=figsize)

# reset_index() exposes the project names as the 'index' column.
p = sns.barplot(ax=ax, x='index', y='usages', data=reduced_usages_df.reset_index())
p.set_xlabel("Project", fontsize=18)
p.set_ylabel("# Uses", fontsize=18)

p.set_title(label='Uses of photographs from {2} {0} in Spain in Wikimedia projects\n({1})'.format(YEAR, now, TAG), fontsize=20)
p.tick_params(labelsize=14)

p.set_xticklabels(p.get_xticklabels(), rotation=90)

# Print each bar's value just above it, centred on the bar.
for patch in p.patches:
    height = patch.get_height()
    p.text(patch.get_x() + patch.get_width()/2.,
            height + 1,
            '{:1.0f}'.format(height),
            ha="center",
            fontsize=13)

# The number of projects in the description comes from TOP_PROJECTS; it used
# to be a hard-coded 12 that did not follow the constant.
upload_to_commons(p,
                  "{1} {0} in Spain - Uses of photographs in WMF projects.png".format(YEAR, TAG),
                  'Uses of photographs from {1} {0} in Spain in WMF projects: top {2} projects'.format(YEAR, TAG_EXT, TOP_PROJECTS),
                  plot_description,
                  YEAR,
                  TAG_EXT
                 )

In [None]:
# Quality images gallery
# Mako template of the gallery page: one gallery entry per item of QI_list.
template = """This page lists the ${len(QI_list)} '''[[Commons:Quality Images|quality images]]''' uploaded as part of the [[Commons:${tag}|${tag}]] contest in ${year} in Spain.

<gallery>
% for image in QI_list:
${image}
% endfor
</gallery>

'''Statistics generation date''': {{subst:CURRENTMONTHNAME}} {{subst:CURRENTDAY}}, {{subst:CURRENTYEAR}}

[[Category:${tag} in Spain| Quality]]

"""
# Named qi_template_vars rather than `vars` so the builtin vars() is not
# shadowed for the rest of the notebook.
qi_template_vars = {
    "QI_list": qi_list.values,
    "tag": TAG_EXT,
    "year": YEAR
}
t = Template(template)
qi_gallery_text = t.render(**qi_template_vars)

In [None]:
# Publish the quality images gallery only when the rendered text differs
# from what the page already contains.
qi_page = pb.Page(commons_site, GALLERY_QI)
if qi_page.text != qi_gallery_text:
    qi_page.text = qi_gallery_text
    # The log line and the edit summary now name the quality images gallery;
    # they used to say "featured articles gallery".
    pb.output('Publishing --> {1} {0} in Spain quality images gallery'.format(YEAR, TAG))
    qi_page.save("{1} {0} in Spain quality images gallery".format(YEAR, TAG))

#### Finalists

In [None]:
# Titles (without namespace) of the file pages in the finalist category.
cat_wlf = pb.Category(commons_site, WLF_FINALIST_CATEGORY)
gen_wlf = pagegenerators.CategorizedPageGenerator(cat_wlf)

finalist_images_wlf = [
    member.title(withNamespace=False)
    for member in gen_wlf
    if member.is_filepage()
]
finalist_images_count = len(finalist_images_wlf)
finalist_images_count

In [None]:
# Finalist photographs per uploader, by descending count and then by name.
finalist_images_df = images_extended_df.loc[images_extended_df['image_title'].isin(finalist_images_wlf)]
finalist_authors = finalist_images_df.loc[:, 'uploader'].value_counts()
finalist_authors = finalist_authors.iloc[
    np.lexsort([finalist_authors.index, -finalist_authors.values])
]
finalist_authors

In [None]:
# Number of distinct uploaders with at least one finalist photograph.
finalist_authors_count = finalist_authors.shape[0]
finalist_authors_count

In [None]:
# Horizontal bar chart of finalist photographs per uploader, uploaded to Commons.
fig, ax = plt.subplots(1, 1, figsize=figsize_high)

p = sns.barplot(x=finalist_authors.values,
                y=finalist_authors.index.map(lambda name: wrap_label(name, 20)),
                ax=ax)
p.set_title(label='{1} {0} in Spain: Finalists'.format(YEAR, TAG), fontsize=20)
p.set_xlabel("# Photographs", fontsize=18)
p.set_ylabel("Contestants", fontsize=18)
p.tick_params(labelsize=14)

# Print each bar's value to the right of the bar, at mid height.
for patch in ax.patches:
    ax.text(x=patch.get_width() + 1,
            y=patch.get_y() + patch.get_height() / 2,
            s='{:1.0f}'.format(patch.get_width()),
            ha="center",
            fontsize=14)

upload_to_commons(
    p,
    "{1} {0} in Spain - Finalists.png".format(YEAR, TAG),
    'Top contributors reaching the final round of {1} {0} in Spain.'.format(YEAR, TAG_EXT),
    plot_description,
    YEAR,
    TAG_EXT
)

In [None]:
# Finalist photographs per festival category, by descending count and then by name.
finalist_festivals = finalist_images_df.loc[:, 'category'].value_counts()
finalist_festivals = finalist_festivals.iloc[
    np.lexsort([finalist_festivals.index, -finalist_festivals.values])
]
finalist_festivals

In [None]:
# Horizontal bar chart of finalist photographs per festival, uploaded to Commons.
fig, ax = plt.subplots(1, 1, figsize=figsize_high)

p = sns.barplot(x=finalist_festivals.values,
                y=finalist_festivals.index.map(lambda name: wrap_label(name, 25)),
                ax=ax)
p.set_title(label='{1} {0} in Spain: Festivals in the final round'.format(YEAR, TAG), fontsize=20)
p.set_xlabel("# Photographs", fontsize=18)
p.set_ylabel("Festivals", fontsize=18)
p.tick_params(labelsize=14)

# Print each bar's value to the right of the bar, at mid height.
for patch in ax.patches:
    ax.text(x=patch.get_width() + 1,
            y=patch.get_y() + patch.get_height() / 2,
            s='{:1.0f}'.format(patch.get_width()),
            ha="center",
            fontsize=14)

upload_to_commons(
    p,
    "{1} {0} in Spain - Finalist festivals.png".format(YEAR, TAG),
    'Top festivals in the final round of {1} {0} in Spain.'.format(YEAR, TAG_EXT),
    plot_description,
    YEAR,
    TAG_EXT
)

In [None]:
# Finalist photographs per campaign, most frequent first.
finalist_campaigns = finalist_images_df.loc[:, 'campaign'].value_counts()
finalist_campaigns

In [None]:
# Horizontal bar chart of finalist photographs for the first three campaigns
# of finalist_campaigns, uploaded to Commons.
fig, ax = plt.subplots(figsize=figsize_low)

p = sns.barplot(ax=ax, y=finalist_campaigns[:3].index, x=finalist_campaigns[:3].values)
p.set_xlabel("# Photographs", fontsize=18)
# This axis lists campaigns; the label used to read "Festivals" (copy-paste slip).
p.set_ylabel("Campaigns", fontsize=18)

p.set_title(label='{1} {0} in Spain: Campaigns in the final round'.format(YEAR, TAG), fontsize=20)
p.tick_params(labelsize=14)

# Print each bar's value to the right of the bar, at mid height.
for patch in ax.patches:
    ax.text(patch.get_width() + 1,
            patch.get_y() + patch.get_height()/2.,
            '{:1.0f}'.format(patch.get_width()),
            ha="center",
            fontsize=14)

# NOTE(review): these labels are positional, while value_counts() orders the
# campaigns by frequency -- confirm that the order of finalist_campaigns
# matches Autumn/Spring/Summer for this year's data.
p.set_yticklabels(['Autumn', 'Spring', 'Summer'])

upload_to_commons(p,
                  "{1} {0} in Spain - Finalist campaigns.png".format(YEAR, TAG),
                  'Campaigns in the final round of {1} {0} in Spain.'.format(YEAR, TAG_EXT),
                  plot_description,
                  YEAR,
                  TAG_EXT
                 )

In [None]:
# Finalist photographs per autonomous community, by descending count and then by name.
finalist_autcoms = finalist_images_df.loc[:, 'aut_com'].value_counts()
finalist_autcoms = finalist_autcoms.iloc[
    np.lexsort([finalist_autcoms.index, -finalist_autcoms.values])
]
finalist_autcoms

In [None]:
# Horizontal bar chart of finalist photographs per autonomous community,
# uploaded to Commons.
fig, ax = plt.subplots(figsize=figsize)

p = sns.barplot(ax=ax, y=finalist_autcoms.index.map(lambda x: wrap_label(x, 14)), x=finalist_autcoms.values)
p.set_xlabel("# Photographs", fontsize=18)
# This axis lists autonomous communities; the label used to read "Festivals"
# (copy-paste slip).
p.set_ylabel("Autonomous communities", fontsize=18)

p.set_title(label='{1} {0} in Spain: Autonomous communities in the final round'.format(YEAR, TAG), fontsize=20)
p.tick_params(labelsize=14)

# Print each bar's value to the right of the bar, at mid height.
for patch in ax.patches:
    ax.text(patch.get_width() + 1,
            patch.get_y() + patch.get_height()/2.,
            '{:1.0f}'.format(patch.get_width()),
            ha="center",
            fontsize=14)

upload_to_commons(p,
                  "{1} {0} in Spain - Finalist autonomous communities.png".format(YEAR, TAG),
                  'Spanish autonomous communities in the final round of {1} {0} in Spain.'.format(YEAR, TAG_EXT),
                  plot_description,
                  YEAR,
                  TAG_EXT
                 )

In [None]:
# Load the Mako template of this year's statistics page.
template_file = os.path.join(templates_directory, 'wlf{0}.wiki'.format(YEAR))
# The context manager closes the file even if read() raises.
with open(template_file, 'r', encoding = "utf-8") as fh:
    template = fh.read()

In [None]:
# Values handed to the statistics page template.
# Named stats_template_vars rather than `vars` so the builtin vars() is not
# shadowed for the rest of the notebook.
stats_template_vars = {
    "images_length": total_images_length,
    "valid_images_length": valid_images_length,
    # NOTE(review): hard-coded to 0 -- confirm this is intentional.
    "festival_images_length": 0,
    "qi_length": qi_length,
    "gallery_quality_images": GALLERY_QI,
    "wlf_festivals_length": wlf_festivals_length,
    "authors_length": authors_length,
    "new_uploaders_length": new_uploaders_length,
    "festival_length": festival_length,
    "aut_coms": aut_coms,
    "authors_df": authors_df,
    "images_per_festival_df": images_per_festival_df,
    "usages_df": usages_df,
    "total_unique": total_unique,
    "new_uploaders": new_uploaders,
    "new_uploaders_sum": new_uploaders.sum(),
    "campaign_count": campaign_count,
    "finalist_images_count": finalist_images_count,
    "finalist_authors_count": finalist_authors_count,
    # Maps the second element of each festival_annexes entry to the first.
    "annexes": {i[1]: i[0] for i in festival_annexes},
    "map": dump,
    "year": YEAR,
    "tag": TAG,
    "full_tag": TAG_EXT,
    "base": BASE_WLF_NAME
}
t = Template(template)
# The (misspelled) name statisticts_text is kept because the publishing cell
# reads it.
statisticts_text = t.render(**stats_template_vars)

In [None]:
# Publish the rendered statistics text to the statistics page on Commons.
# Unlike the quality images gallery cell, the page is saved without first
# comparing the new text with the current one.
stats_page = pb.Page(commons_site, STATISTICS_PAGE)
stats_page.text = statisticts_text
pb.output('Publishing --> {1} {0} in Spain Statistics'.format(YEAR, TAG))
# The string passed to save() is the edit summary.
stats_page.save("{1} {0} in Spain statistics".format(YEAR, TAG))