# Create tool for informative infographics from structured information from Wikimedia projects - Task A




### Wikimedia API documentation

Here you can find the documentation for the Wikimedia API: https://wikimedia.org/api/rest_v1.

You do not need to read every function there, but focus on the *PageViews data* section.


### Completing the gaps

Based on knowledge of Python and the documentation for the Wikimedia API, I would implement below:
* A function to get a list of the most viewed articles in the Portuguese Wikipedia for the month of January, ordered from the most viewed to the least one;
  * Be aware that there are false positives, so you will need to remove rows of data of other projects other than the Portuguese Wikipedia;
* A function to get a dataframe of the most viewed articles in the Portuguese Wikipedia for the period of January 1st, 2024 and February 29th, 2024
  * The result dataframe should have the folowing structure:
    * Each row represents an article (A);
    * Each column header (besides the articles names) represents a day of this period (D);
    * The cells values store the visualization of the article A on the date D.
 

**Objective:** *(2) coding functions about the most viewed articles in the Portuguese Wikipedia.*

In [3]:
# TODO: add libraries as necessary
import pandas as pd
import requests


In [4]:
# Helper functions 
from datetime import datetime, timedelta
from collections import defaultdict

def get_all_days_between_start_and_end(start_date, end_date):
    days_list = []
    # create a loop to generate turples for each day
    current_date = start_date
    
    while current_date <= end_date:
        # append a tuple containing (year, month, day)
        days_list.append((current_date.year, current_date.strftime('%m'), current_date.strftime('%d')))
    
        # increment the current date by one day
        current_date += timedelta(days=1)
    return days_list

def combine_remove_duplicates_and_sort_list (combined_list):
    # Use a defaultdict to sum the views for each unique article
    views_dict = defaultdict(int)
    for entry in combined_list:
        views_dict[entry['article']] += entry['views']

    # Convert the dictionary back to a list of dictionaries
    result = [{'article': article, 'views': views} for article, views in views_dict.items()]

    # Sort the result based on views in descending order
    result.sort(key=lambda x: x['views'], reverse=True)

    return result
    


In [5]:
# To return a list of articles names instead of JSON objects

def combine_remove_duplicates_and_sort_list(combined_list):
    # Use a defaultdict to sum the views for each unique article
    views_dict = defaultdict(int)
    for entry in combined_list:
        views_dict[entry['article']] += entry['views']

    # Convert the dictionary back to a list of articles
    result = [{'article': article, 'views': views} for article, views in views_dict.items()]

    # Sort the result based on views in descending order
    result.sort(key=lambda x: x['views'], reverse=True)

    # Return a list of articles instead of a list of dictionaries
    return [article['article'] for article in result]


In [6]:
# TODO: add parameters as necessary and execute this block
# function to filter out fals positives from articles
def is_article(article_title):
    # Check if the article title does not have a prefix followed by a colon (':')
    # and if it is not equal to 'Wikipedia:What_is_an_article?'
    return ':' not in article_title and article_title != 'Wikipedia:What_is_an_article%3F'

def most_viewed_ptwiki_jan():
    start_date = datetime(2024, 1, 1)
    end_date = datetime(2024, 1, 30)
    days_in_jan = get_all_days_between_start_and_end(start_date, end_date)

    combined_articles = []

    for date in days_in_jan:
        articles = []
        path = f'https://wikimedia.org/api/rest_v1/metrics/pageviews/top/pt.wikipedia.org/all-access/{date[0]}/{date[1]}/{date[2]}'
        response = requests.get(path, headers={"User-Agent": "user_agent", "accept": "application/json"})
        items = response.json()["items"]
        articles = items[0]["articles"]
        articles = [{'article': entry['article'], 'views': entry['views']} for entry in articles if is_article(entry['article'])]

        combined_articles += articles

    merged_articles = combine_remove_duplicates_and_sort_list(combined_articles)
        
    # return a sorted list of the most viewed articles in the Portuguese Wikipedia from the top to the bottom
    return merged_articles


In [7]:
# TODO: add parameters as necessary and execute this block

def is_article(article_title):
    # Check if the article title does not have a prefix followed by a colon (':')
    return ':' not in article_title

def most_viewed_ptwiki_jan_feb_per_day():
    # return a dataframe of the most viewed articles in the Portuguese Wikipedia for the first two months of the year, by day
    start_date = datetime(2024, 1, 1)
    end_date = datetime(2024, 2, 29)
    
    days_in_jan_to_feb = get_all_days_between_start_and_end(start_date, end_date)

    data_for_jan_feb = {}

    for date_tuple in days_in_jan_to_feb:
        articles = []
        path = f'https://wikimedia.org/api/rest_v1/metrics/pageviews/top/pt.wikipedia.org/all-access/{date_tuple[0]}/{date_tuple[1]}/{date_tuple[2]}'
        response = requests.get(path, headers={"User-Agent": "user_agent", "accept": "application/json"})
        items = response.json().get("items", [])
        if items:
            articles = items[0].get("articles", [])
            # Filter out articles with title prefix followed by a colon
            articles = [{'article': article['article'], 'views': article['views']} for article in articles if is_article(article['article'])]

        date_object = datetime(*map(int, date_tuple))
        formatted_date = date_object.strftime("%b %d, %Y")
        data_for_jan_feb[formatted_date] = articles 

    df = pd.DataFrame()

    # Iterate over each date and its associated articles
    for date, articles in data_for_jan_feb.items():
        temp_df = pd.DataFrame({article["article"]: article["views"] for article in articles}, index=[date])
        df = pd.concat([df, temp_df], axis=0, sort=False)

    # If needed, fill NaN values with 0
    df = df.fillna(0)
    df = df.transpose()
    df = df.reset_index()
    df.columns = ["article"] + list(data_for_jan_feb.keys())

    df.set_index("article", inplace=True)
    
    return df


In [8]:
# TODO: add parameters as necessary and execute this block
top_viewed_list = most_viewed_ptwiki_jan()
top_viewed_list

['XXx',
 'Fotos_dos_Mamonas_Assassinas_mortos',
 'Voo_Força_Aérea_Uruguaia_571',
 'Facebook',
 'Zagallo',
 'Renascer',
 'Porno_Graffitti',
 'ChatGPT',
 'Yasmin_Brunet',
 'Cleópatra',
 'AMBEV',
 'Griselda_Blanco',
 'Renascer_(2024)',
 'Copa_São_Paulo_de_Futebol_Júnior',
 'YouTube',
 'Napoleão_Bonaparte',
 'Sony_Channel',
 'Rodriguinho_(cantor)',
 'Brasil',
 'Twitter',
 'TV_Globo',
 'João_Carreiro_&_Capataz',
 'Ano-novo',
 'Jeffrey_Epstein',
 'Canal_Brasil',
 'Domingos_Brazão',
 'Cristiano_Ronaldo',
 'Instagram',
 'Mamonas_Assassinas',
 'Louis_Joseph_César_Ducornet',
 'Copa_São_Paulo_de_Futebol_Júnior_de_2024',
 'Big_Brother_Brasil_24',
 'Campeonato_Africano_das_Nações',
 'Franz_Beckenbauer',
 'Marcinho_VP',
 'Carlos_Alberto_Parreira',
 'Dorival_Júnior',
 'Thiago_Carpini',
 'Robert_Oppenheimer',
 'Fernando_Parrado',
 'Vanessa_Lopes',
 'Wanessa_Camargo',
 'Mortes_em_janeiro_de_2024',
 'Jogo_do_bicho',
 'São_Paulo',
 'Roberto_Canessa',
 'Portugal',
 'Terra_e_Paixão',
 'Fundação_Padre_Anchi

In [9]:
# TODO: add parameters as necessary and execute this block
top_viewed_dataframe = most_viewed_ptwiki_jan_feb_per_day()
top_viewed_dataframe

Unnamed: 0_level_0,"Jan 01, 2024","Jan 02, 2024","Jan 03, 2024","Jan 04, 2024","Jan 05, 2024","Jan 06, 2024","Jan 07, 2024","Jan 08, 2024","Jan 09, 2024","Jan 10, 2024",...,"Feb 20, 2024","Feb 21, 2024","Feb 22, 2024","Feb 23, 2024","Feb 24, 2024","Feb 25, 2024","Feb 26, 2024","Feb 27, 2024","Feb 28, 2024","Feb 29, 2024"
article,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ano-novo,112043.0,21164.0,2617.0,1548.0,1008.0,704.0,621.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Véspera_de_ano-novo,29027.0,4011.0,1891.0,1206.0,775.0,651.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ipira,16014.0,2457.0,831.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Facebook,13195.0,14024.0,14480.0,14609.0,13584.0,12304.0,12509.0,14081.0,15977.0,14702.0,...,15625.0,15662.0,14481.0,13224.0,12502.0,11664.0,15940.0,17258.0,15484.0,15548.0
2024,9245.0,4104.0,2624.0,2213.0,1670.0,1948.0,1592.0,1608.0,1396.0,1374.0,...,1009.0,905.0,928.0,969.0,855.0,883.0,985.0,1031.0,981.0,1085.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Yin-yang,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,540.0
Mulher,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,539.0
Soraia_Chaves,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,537.0
Aurora_polar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,537.0


### Data visualization

Here you can find the documentation for a library for a Bar Chart Race library: https://pypi.org/project/bar-chart-race.

Read and understood the documentation and used this library to create a function that display an animated race chart of the data frame produced in the section before (*top_viewed_dataframe*).

**Objective:** *(3) to create an animated visualization of the data gathered.*

In [10]:

!pip install --upgrade bar-chart-race



In [11]:
# TODO: add libraries as necessary
import bar_chart_race as bcr

In [12]:
# Download a static FFmpeg build and add it to PATH.
%run 'util/load-ffmpeg.ipynb'
print('Done!')

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 39.4M  100 39.4M    0     0   129M      0 --:--:-- --:--:-- --:--:--  129M
env: PATH=/srv/paws/pwb:/srv/paws/bin:/srv/paws:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/srv/openrefine:/srv/julia/bin:./ffmpeg-6.1-amd64-static
./ffmpeg-6.1-amd64-static/ffmpeg
Done!


In [13]:
def dataframe_to_race_chart(df=top_viewed_dataframe.head(10), n_bars=10): 
    fixed_order = list(df.columns)
    fixed_order = fixed_order[:n_bars]
    # fixed_order.reverse()
    
    bcr_html = bcr.bar_chart_race(
        df=df,
        title='Article Views Over Time',
        orientation='h',  
        sort='desc', 
        n_bars=n_bars,  # Number of bars to display
        steps_per_period=20, # Animation steps per period
        period_length=1000, # Length of each period in milliseconds
        figsize=(8, 5),  # Figure size
        bar_label_size=7,  # Adjust font size of the labels
        label_bars=True,  # Show labels on the bars
        fixed_order=fixed_order,  # Fix the order of bars to prevent flickering
    )

    display(bcr_html)


In [14]:
dataframe_to_race_chart()

  df_values.iloc[:, 0] = df_values.iloc[:, 0].fillna(method='ffill')
  ax.set_yticklabels(self.df_values.columns)
  ax.set_xticklabels([max_val] * len(ax.get_xticks()))
