In [1]:
# Makes the plots appear within the notebook
%matplotlib inline

# Two fundamental packages for doing data manipulation
import numpy as np                   # http://www.numpy.org/
import pandas as pd                  # http://pandas.pydata.org/

#  for plotting data
import matplotlib.pyplot as plt      # http://matplotlib.org/

# Package to save/load Python structures in the "Pickle" format
import pickle

# For using Regular Expression string searches
import re



# Some setup for display, helper libs, etc.
from copy import deepcopy
# Notebook-wide pandas settings: show wider/longer frames before truncating,
# and switch off the chained-assignment check (option value None = no warning).
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 110)
pd.set_option("mode.chained_assignment", None)

# Exercise: JSON

Load the following simple json object:

In [2]:
import json

# Raw JSON text of the example document (kept verbatim).
raw_product_json = '''
{
    "title": "Product",
    "schema": "http://json-schema.org/draft-04/schema#",
    "description": "A product from Acme's catalog",
    "type": "object",
    "properties": {
        "id": {
            "description": "The unique identifier for a product",
            "type": "integer"
        }
    },
    "required": ["id", "price", "range", "color"]
}'''

# Parse the text into nested Python dicts/lists.
ex_json = json.loads(raw_product_json)

## E1: Access the value of the key 'schema' and then of 'description'

In [3]:
# Top-level lookup: the value stored under the key 'schema'.
ex_json['schema']

'http://json-schema.org/draft-04/schema#'

In [4]:
# Nested lookup: 'properties' -> 'id' -> 'description'.
# NOTE(review): the document also has a top-level 'description' key
# (ex_json['description']); confirm which one the exercise is asking for.
ex_json['properties']['id']['description']

'The unique identifier for a product'

## E2: Access the list item on the 2nd position of the "required" key

In [5]:
# List indices are zero-based, so [2] selects the third entry of "required".
# NOTE(review): if "2nd position" means the second element, this should be
# ex_json['required'][1] instead; confirm the intended reading of the exercise.
ex_json['required'][2]

'range'

## E3: Access the content/value of the *second* key/value pair

The right answer is: **DON'T**. Positional access makes no sense in a key-value based format like JSON, unless the object contains specific lists with predefined ordering rules that you'd like to access. If you have to, do the following, but the results can be very inconsistent if you parse larger amounts of (especially unseen) JSON objects. 

In [6]:
# Materialise the dict's values as a list so they can be indexed by position;
# index 1 is the second value in the dict's iteration order.
list(ex_json.values())[1]

'http://json-schema.org/draft-04/schema#'

## E4: Print out the values for all the keys under 'id' using a 'for' loop

In [7]:
# Walk the values of the 'id' entry directly instead of looking each one up by key.
id_properties = ex_json['properties']['id']
for value in id_properties.values():
    print(value)

The unique identifier for a product
integer


# Wikipedia Link API

## Oleg's Links

Get all the links to other pages contained in the article "Oleg Rykhlevich" and save them in a DF with link title and link namespace (ns) as the columns

See the API parameters / call here: https://www.mediawiki.org/wiki/API:Links

In [8]:

import requests

# Base URL of the English Wikipedia Web API.
api_url = 'https://en.wikipedia.org/w/api.php'

# Query parameters: the links (prop=links) of one article, returned as JSON,
# asking for the largest batch size the API allows.
some_params = dict(
    action='query',
    titles='Oleg Rykhlevich',
    prop='links',
    pllimit='max',
    format='json',
)

def get_page_data_from_wp_api_simple(our_params):
    """Yield the link entries the Wikipedia API returns for the queried page(s).

    Sends ``our_params`` as the query string of a GET request to ``api_url``
    and walks ``result['query']['pages']``. When a response contains a
    ``continue`` element, its values are merged into the parameters and the
    request is repeated, so link lists that span several batches are returned
    completely instead of being cut off after the first response.

    Parameters
    ----------
    our_params : dict
        Query parameters for the API call. The dict is copied and never
        modified.

    Yields
    ------
    Each element of a page's ``'links'`` list, unchanged. Pages without a
    ``'links'`` entry contribute nothing.

    Raises
    ------
    requests.HTTPError
        If the server answers with an HTTP error status.
    KeyError
        If a response has no ``'query'``/``'pages'`` element.
    """
    request_params = dict(our_params)  # private copy: continuation tokens must not leak into the caller's dict

    while True:
        response = requests.get(url=api_url, params=request_params, timeout=30)
        response.raise_for_status()  # fail loudly on HTTP errors instead of trying to parse an error page
        result = response.json()

        pages = result['query']['pages']  # sub-element 'pages' of the element 'query'
        for page_data in pages.values():  # only one page if called with a single article title
            for link_data in page_data.get('links', []):
                yield link_data

        if 'continue' not in result:
            break  # no further batches announced by the API
        request_params.update(result['continue'])  # resume where this batch stopped



# Drain the generator into a list so pandas receives all link records at once.
page_link_list = [link for link in get_page_data_from_wp_api_simple(some_params)]

# One row per link record; the last expression is displayed as the cell output.
pagelinks_df = pd.DataFrame(page_link_list)
pagelinks_df

Unnamed: 0,ns,title
0,0,1996 Summer Olympics
1,0,1997 European Aquatics Championships
2,0,Belarus
3,0,Freestyle swimming
4,0,LEN European Aquatics Championships
5,0,Russian language
6,0,"Seville, Spain"
7,0,Summer Olympics
8,0,Swimmer
9,0,Swimming (sport)


## Oleg and the Magellanic Catalogue of Stars

Get the links for both 'Oleg Rykhlevich' and 'Magellanic Catalogue of Stars' and put them in one DF

In [9]:
# Same query as before, but for two articles at once: titles are separated by '|'.
some_params = dict(
    action='query',
    titles='Oleg Rykhlevich|Magellanic Catalogue of Stars',
    prop='links',
    pllimit='max',
    format='json',
)

# Collect the link records of both articles into a single list.
page_link_list = [link for link in get_page_data_from_wp_api_simple(some_params)]

# Combined frame for both articles; displayed as the cell output.
pagelinks_df_both = pd.DataFrame(page_link_list)
pagelinks_df_both

Unnamed: 0,ns,title
0,0,Astronomical catalog
1,0,Catalogues of Fundamental Stars
2,0,ESO
3,0,Large Magellanic Cloud
4,0,PPM Star Catalogue
5,0,Photographic magnitude
6,0,Schmidt camera
7,0,Small Magellanic Cloud
8,4,Wikipedia:Stub
9,10,Template:Astronomical-catalogue-stub


## A merger

Get only the links for 'Magellanic Catalogue of Stars' as a DF and merge it afterwards with the Oleg-only DF you created in 2.1, so that you are left with only one DF that contains all the linked pages from both articles. Bonus: Add a column to indicate where each row comes from. 

In [10]:
# Query parameters for the links of the 'Magellanic Catalogue of Stars' article only.
some_params = dict(
    action='query',
    titles='Magellanic Catalogue of Stars',
    prop='links',
    pllimit='max',
    format='json',
)

def get_page_data_from_wp_api_simple(our_params):
    """Yield the link entries the Wikipedia API returns for the queried page(s).

    Sends ``our_params`` as the query string of a GET request to ``api_url``
    and walks ``result['query']['pages']``. When a response contains a
    ``continue`` element, its values are merged into the parameters and the
    request is repeated, so link lists that span several batches are returned
    completely instead of being cut off after the first response.

    Parameters
    ----------
    our_params : dict
        Query parameters for the API call. The dict is copied and never
        modified.

    Yields
    ------
    Each element of a page's ``'links'`` list, unchanged. Pages without a
    ``'links'`` entry contribute nothing.

    Raises
    ------
    requests.HTTPError
        If the server answers with an HTTP error status.
    KeyError
        If a response has no ``'query'``/``'pages'`` element.
    """
    request_params = dict(our_params)  # private copy: continuation tokens must not leak into the caller's dict

    while True:
        response = requests.get(url=api_url, params=request_params, timeout=30)
        response.raise_for_status()  # fail loudly on HTTP errors instead of trying to parse an error page
        result = response.json()

        pages = result['query']['pages']  # sub-element 'pages' of the element 'query'
        for page_data in pages.values():  # only one page if called with a single article title
            for link_data in page_data.get('links', []):
                yield link_data

        if 'continue' not in result:
            break  # no further batches announced by the API
        request_params.update(result['continue'])  # resume where this batch stopped

# Gather all link records for the Magellanic article in one step.
page_link_list = list(get_page_data_from_wp_api_simple(some_params))

pagelinks_df_M = pd.DataFrame(page_link_list)

# Outer merge keeps the rows of both frames; indicator=True adds a '_merge'
# column that records which frame each row came from. Only this final
# expression is displayed as the cell output.
pagelinks_df.merge(pagelinks_df_M, how='outer', indicator=True)

Unnamed: 0,ns,title,_merge
0,0,1996 Summer Olympics,left_only
1,0,1997 European Aquatics Championships,left_only
2,0,Belarus,left_only
3,0,Freestyle swimming,left_only
4,0,LEN European Aquatics Championships,left_only
5,0,Russian language,left_only
6,0,"Seville, Spain",left_only
7,0,Summer Olympics,left_only
8,0,Swimmer,left_only
9,0,Swimming (sport),left_only
