# Step 1: Data Acquisition

In [1]:
import pandas as pd

**Note: The following cells are the professor's work unless specific lines are denoted otherwise**

In [2]:
# 
# These are standard python modules
import json, time, urllib.parse
#
# The 'requests' module is not a standard Python module. You will need to install this with pip/pip3 if you do not already have it
import requests

LO: Added my UW email below, changed start and end dates

In [3]:
#########
#
#    CONSTANTS
#
#    Shared configuration for every Pageviews API request made in this notebook:
#    endpoint URL pieces, throttling, request headers, and the default request parameters.
#

# The REST API 'pageviews' URL - this is the common URL/endpoint for all 'pageviews' API requests
API_REQUEST_PAGEVIEWS_ENDPOINT = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/'

# This is a parameterized string that specifies what kind of pageviews request we are going to make
# In this case it will be a 'per-article' based request. The string is a format string so that we can
# replace each parameter with an appropriate value before making the request
API_REQUEST_PER_ARTICLE_PARAMS = 'per-article/{project}/{access}/{agent}/{article}/{granularity}/{start}/{end}'

# The Pageviews API asks that we not exceed 100 requests per second, we add a small delay to each request
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
# Seconds to sleep before each request: the 1/100 s budget per request minus the assumed latency
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

# When making a request to the Wikimedia API they ask that you include your email address which will allow them
# to contact you if something happens - such as - your code exceeding rate limits - or some other error
REQUEST_HEADERS = {
    'User-Agent': '<obrienl@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2023',
}

# This is just a list of English Wikipedia article titles that we can use for example requests
ARTICLE_TITLES = [ 'Bison', 'Northern flicker', 'Red squirrel', 'Chinook salmon', 'Horseshoe bat' ]

# This template is used to map parameter values into the API_REQUEST_PER_ARTICLE_PARAMS portion of an API request. The dictionary has a
# field/key for each of the required parameters. In the example, below, we only vary the article name, so the majority of the fields
# can stay constant for each request. Of course, these values *could* be changed if necessary.
ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE = {
    "project":     "en.wikipedia.org",
    "access":      "desktop",      # default access type; other access types need their own copy of this template
    "agent":       "user",
    "article":     "",             # this value will be set/changed before each request
    "granularity": "monthly",
    "start":       "2015070100",   # start of the requested range: 1 July 2015 (presumably a YYYYMMDDHH stamp - confirm against the API docs)
    "end":         "2023093000"    # end of the requested range: 30 September 2023 (same format as "start")
}


The API request will be made using one procedure. The idea is to make this reusable. The procedure is parameterized, but relies on the constants above for the important parameters. The underlying assumption is that this will be used to request data for a set of article pages. Therefore the parameter most likely to change is the article_title.

In [4]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_pageviews_per_article(article_title = None, 
                                  endpoint_url = API_REQUEST_PAGEVIEWS_ENDPOINT, 
                                  endpoint_params = API_REQUEST_PER_ARTICLE_PARAMS, 
                                  request_template = ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE,
                                  headers = REQUEST_HEADERS):
    """Request pageview data for a single article and return the decoded JSON reply.

    Parameters:
        article_title:    title of the article to look up. When omitted, the 'article'
                          value already present in request_template is used instead.
        endpoint_url:     base URL that the formatted parameters are appended to.
        endpoint_params:  format string with one {placeholder} per key of request_template.
        request_template: dict of values substituted into endpoint_params. It is only
                          read - the caller's dict is never modified.
        headers:          HTTP headers sent with the request.

    Returns:
        Whatever response.json() produces for the reply, or None when the request or
        the JSON decoding raised (the exception is printed, not re-raised). The HTTP
        status is not checked, so an error reply is returned as-is.

    Raises:
        Exception: if no article title is available from either source.
    """
    # Work on a private copy: the default template is a module-level dict shared by every
    # call, and writing the (encoded) title into it would leak state from one call to the next
    request_params = dict(request_template)

    # article title can be passed as a parameter to the call or supplied in the request_template
    if article_title:
        request_params['article'] = article_title

    if not request_params['article']:
        raise Exception("Must supply an article title to make a pageviews request.")

    # Titles are supposed to have spaces replaced with "_" and be URL encoded.
    # safe='' makes quote() encode "/" too - by default it is left alone, and a title
    # containing "/" would then be read as extra path segments of the request URL
    request_params['article'] = urllib.parse.quote(request_params['article'].replace(' ','_'), safe='')

    # now, create a request URL by combining the endpoint_url with the parameters for the request
    request_url = endpoint_url+endpoint_params.format(**request_params)

    # make the request
    try:
        # we'll wait first, to make sure we don't exceed the limit in the situation where an exception
        # occurs during the request processing - throttling is always a good practice with a free
        # data source like Wikipedia - or other community sources
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        # deliberate best-effort: report the problem and let the caller decide what to do with None
        print(e)
        json_response = None
    return json_response


**end of direct copy of Prof's example**

## Get List of Article Names

Reference for reading the Excel file with pandas (`read_excel`): https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html

In [5]:
# Load the article list from the Excel workbook into a DataFrame
df_articles = pd.read_excel('thank_the_academy.AUG.2023.csv.xlsx')
# Last expression of the cell, so the summary is displayed: a quick check of the
# row count and uniqueness of the `name` and `url` columns
df_articles.describe()

Unnamed: 0,name,url
count,1359,1359
unique,1359,1359
top,Everything Everywhere All at Once,https://en.wikipedia.org/wiki/Everything_Every...
freq,1,1


In [6]:
# The `name` column holds the article titles; the request loops iterate over this Series
article_names = df_articles['name']

## Pull Monthly Desktop Access

References:
- https://www.geeksforgeeks.org/append-to-json-file-using-python/#
- Things I learned from example:
    - the structure of the returned data from the API

In [7]:
# Request parameters for the "mobile-app" access type. Same fields as the desktop
# template; only "access" differs.
page_params_mobile_app = dict(
    project="en.wikipedia.org",
    access="mobile-app",
    agent="user",
    article="",               # placeholder - set/changed before each request
    granularity="monthly",
    start="2015070100",       # same date range as the desktop request
    end="2023093000",
)

# Request parameters for the "mobile-web" access type.
page_params_mobile_web = dict(
    project="en.wikipedia.org",
    access="mobile-web",
    agent="user",
    article="",               # placeholder - set/changed before each request
    granularity="monthly",
    start="2015070100",       # same date range as the desktop request
    end="2023093000",
)

In [None]:
# Collect the monthly desktop pageviews of every article, keyed by article title
articles_data = {}

# Number of articles actually stored (skipped articles are not counted)
article_count = 0

# For each article
for title in article_names:

    # Get data from API for article. Only the title is passed: the default request
    # template (ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE) already asks for "desktop" access
    article_views = request_pageviews_per_article(title)

    # The request helper returns None when the call raised, and a reply without 'items'
    # cannot be iterated - skip the article instead of crashing part-way through the run
    if not article_views or 'items' not in article_views:
        print("------ Skipped {}: no pageview items returned ----------".format(title))
        continue

    # Modify the retrieved data -------------!!!!!!!!!!!! Got idea for this form of iterating from Prof example
    # Rebuild the list of months, copying every entry without its 'access' key
    # (key removal learned from https://www.geeksforgeeks.org/python-ways-to-remove-a-key-from-dictionary/)
    month_list = [
        {key: value for key, value in month.items() if key != 'access'}
        for month in article_views['items']
    ]

    # Store in a dictionary of articles - once per article, after all months are collected
    articles_data[title] = month_list

    article_count += 1
    print("------ Added {}: {} ----------".format(article_count,title))

# Save to JSON file - Reference: https://www.geeksforgeeks.org/writing-to-file-in-python/
# The with-block closes the file even if serialization fails
with open(r"academy_monthly_desktop_201507-202309.json", "w") as my_file:
    json.dump(articles_data, my_file, indent=4) # !!!!!borrowed part of a line of code from Prof example

------ Added 1: Everything Everywhere All at Once ----------
------ Added 2: All Quiet on the Western Front (2022 film) ----------
------ Added 3: The Whale (2022 film) ----------
------ Added 4: Top Gun: Maverick ----------
------ Added 5: Black Panther: Wakanda Forever ----------
------ Added 6: Avatar: The Way of Water ----------
------ Added 7: Women Talking (film) ----------
------ Added 8: Guillermo del Toro's Pinocchio ----------
------ Added 9: Navalny (film) ----------
------ Added 10: The Elephant Whisperers ----------
------ Added 11: An Irish Goodbye ----------
------ Added 12: The Boy, the Mole, the Fox and the Horse (film) ----------
------ Added 13: RRR (film) ----------
------ Added 14: CODA (2021 film) ----------
------ Added 15: Dune (2021 film) ----------
------ Added 16: The Eyes of Tammy Faye (2021 film) ----------
------ Added 17: No Time to Die ----------
------ Added 18: The Windshield Wiper ----------
------ Added 19: The Long Goodbye (Riz Ahmed album) --------

------ Added 170: Midnight in Paris ----------
------ Added 171: The Help (film) ----------
------ Added 172: A Separation ----------
------ Added 173: The Fantastic Flying Books of Mr. Morris Lessmore ----------
------ Added 174: The Shore (2011 film) ----------
------ Added 175: Undefeated (2011 film) ----------
------ Added 176: The Muppets (film) ----------
------ Added 177: Saving Face (2012 film) ----------
------ Added 178: Beginners ----------
------ Added 179: Rango (2011 film) ----------
------ Added 180: The King's Speech ----------
------ Added 181: Inception ----------
------ Added 182: The Social Network ----------
------ Added 183: The Fighter ----------
------ Added 184: Toy Story 3 ----------
------ Added 185: Alice in Wonderland (2010 film) ----------
------ Added 186: Black Swan (film) ----------
------ Added 187: In a Better World ----------
------ Added 188: The Lost Thing ----------
------ Added 189: God of Love (film) ----------
------ Added 190: The Wolfman (201

## Pull Mobile Data

In [None]:
# Collect the monthly mobile pageviews of every article, keyed by article title.
# Mobile traffic is requested as two access types (mobile-app and mobile-web), and
# their views are added together month by month.
articles_data = {}

# Number of articles actually stored (skipped articles are not counted)
article_count = 0

# For each article
for title in article_names:

    # Get data from API for article, one request per mobile access type. The parameter
    # dicts are passed as request_template because they hold the values substituted
    # into the request URL (endpoint_params is the format string itself)
    mobile_responses = [
        request_pageviews_per_article(title, request_template=page_params_mobile_app),
        request_pageviews_per_article(title, request_template=page_params_mobile_web),
    ]

    # Modify the retrieved data -------------!!!!!!!!!!!! Got idea for this form of iterating from Prof example
    # NOTE(review): assumes every item carries 'timestamp' and 'views' keys, as the
    # Pageviews API documents for per-article replies - confirm against a real response
    views_by_month = {}
    for response in mobile_responses:
        # The request helper returns None when the call raised, and a reply without
        # 'items' has nothing to add - ignore it rather than crash the whole run
        if not response or 'items' not in response:
            continue
        for month in response['items']:
            timestamp = month['timestamp']
            if timestamp not in views_by_month:
                # First time this month is seen: copy the entry without its 'access' key
                # (key removal learned from https://www.geeksforgeeks.org/python-ways-to-remove-a-key-from-dictionary/)
                entry = {key: value for key, value in month.items() if key != 'access'}
                entry['views'] = 0
                views_by_month[timestamp] = entry
            views_by_month[timestamp]['views'] += month['views']

    if not views_by_month:
        print("------ Skipped {}: no pageview items returned ----------".format(title))
        continue

    # Reconstruct list of months, ordered by timestamp (sorted as strings; chronological
    # as long as the timestamps are fixed-width date stamps)
    month_list = [views_by_month[timestamp] for timestamp in sorted(views_by_month)]

    # Store in a dictionary of articles - once per article, after all months are collected
    articles_data[title] = month_list

    article_count += 1
    print("------ Added {}: {} ----------".format(article_count,title))

# Save to JSON file - Reference: https://www.geeksforgeeks.org/writing-to-file-in-python/
# Written to its own "mobile" file so the desktop results are not overwritten;
# the with-block closes the file even if serialization fails
with open(r"academy_monthly_mobile_201507-202309.json", "w") as my_file:
    json.dump(articles_data, my_file, indent=4) # !!!!!borrowed part of a line of code from Prof example