# Notebook for experimenting with Wikimedia APIs, how to extract charts, etc. 

In [2]:
import json
from bs4 import BeautifulSoup
import requests
import spacy
from spacy.language import Language
from spacy_language_detection import LanguageDetector

import requests
from PIL import Image

In [3]:
CHART_DICT = {
    'title': "", 
    'type': "", 
    'url': "", 
    'image_path': "", 
    'source': "",
    'descriptions': "", 
    'wikipedia_pages': []
}

COMMONS_CHART_URL = "https://commons.wikimedia.org/{}"


In [5]:
def get_lang_detector(nlp, name):
    return LanguageDetector(seed=42)  # We use the seed 42


nlp_model = spacy.load("en_core_web_sm")
Language.factory("language_detector", func=get_lang_detector)
nlp_model.add_pipe('language_detector', last=True)


<spacy_language_detection.spacy_language_detector.LanguageDetector at 0x2723dba1a60>

In [6]:
page = "https://commons.wikimedia.org/wiki/Category:Horizontal_bar_charts"
CHART_TYPE = "horizontal_bar_charts"

# load website
response = requests.get(page)
page_content = BeautifulSoup(response.content, "html.parser")
chart_image_list = []

In [7]:
def get_english_description(chart_page): 
    for td_item in chart_page.find_all("td", {"class": "description"}): 
        for div_item in td_item.find_all("div"):       
            doc = nlp_model(div_item.text)
            for i, sent in enumerate(doc.sents):
                # if language not english and probability > 0.5 stop 
                if sent._.language["language"]!="en" and sent._.language["score"]>0.5:
                    return None
                else: 
                    description = div_item.text
                    return description
                

def get_wiki_links(chart_page): 
    wiki_links = []
    page_items = chart_page.find_all("div", {"id": "mw-imagepage-section-globalusage"})
    
    if page_items and len(page_items)>0:
        for wiki_link_item in page_items[0].find_all("a"):
            wiki_links.append(wiki_link_item["href"])
            
    return wiki_links


def extract_chart(chart_link): 
    # access single chart image page, e.g. "https://commons.wikimedia.org/wiki/File:1_guadeloupe_pesticides.jpg"
    chart_response = requests.get(chart_link)
    chart_page = BeautifulSoup(chart_response.content, "html.parser")
    
    # check if English chart
    description = get_english_description(chart_page)
    if not description or description is None: 
        print(f"{chart_link} non-english chart!")
        return None
    
    # extract Wikipedia pages using this chart
    wiki_links = get_wiki_links(chart_page)
        
    # save chart image locally
    image_path = ""
        
    # save chart dict 
    chart_dict_copy = CHART_DICT.copy()
    chart_dict_copy["title"] = chart_page.title.text
    chart_dict_copy["type"] = CHART_TYPE
    chart_dict_copy["url"] = chart_link
    chart_dict_copy["image_path"] = image_path
    chart_dict_copy["description"] = description
    chart_dict_copy["wikipedia_pages"] = wiki_links
    
    return chart_dict_copy
    

In [18]:
# iterate over images on overview page
for item in page_content.find_all('li', {"class": "gallerybox"})[:1]:

    # for each image retrieve link to its page
    chart_link = COMMONS_CHART_URL.format(item.find_all('a')[1]["href"])
    
    chart_response = requests.get(chart_link)
    chart_page = BeautifulSoup(chart_response.content, "html.parser")
    
    
    # extract and save chart (content)
#     chart_dict_copy = extract_chart(chart_link)
    
#     if chart_dict_copy:
#         chart_image_list.append(chart_dict_copy)

# continue with "next" page


In [59]:
chart_link = "https://commons.wikimedia.org//wiki/File:1881_Occupational_Structure_of_Frostenden.png"

In [60]:
chart_response = requests.get(chart_link)
chart_page = BeautifulSoup(chart_response.content, "html.parser")
chart_page

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>File:1881 Occupational Structure of Frostenden.png - Wikimedia Commons</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":true,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"c4fcfdbb-b8da-41ad-a110-02e900330de2","wgCSPNonce":false,"wgCanonicalNamespace":"File","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":6,"wgPageName":"File:1881_Occupational_Structure_of_Frostenden.png","wgTitle":"1881 Occupational Structure of Frostenden.png","wgCurRevisionId":461237249,"wgRevisionId":461237249,"wgArticleId":57073346,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CC-BY-SA-4.0","Self-published work",

In [61]:
for img_item in chart_page.find_all("img"):
    if "File:" in img_item["alt"]:
        print(img_item["src"])


https://upload.wikimedia.org/wikipedia/commons/b/b8/1881_Occupational_Structure_of_Frostenden.png?20170313212635


In [86]:
# response = requests.get(chart_page.find_all("img")[0]["src"])
response = requests.get("https://upload.wikimedia.org/wikipedia/commons/c/c8/WM_1.png")

with open(r"C:\Users\k20116188\PycharmProjects\chartfc_dataset_wikicommons\data\local.png", 'wb') as f:
    f.write(response.content)

In [53]:
source = "https://commons.wikimedia.org//wiki/File:1881_Occupational_Orders_in_Poughill,_Devon.png"
"".join(source.split("File:")[1:])

'1881_Occupational_Orders_in_Poughill,_Devon.png'

In [24]:
page_content.find_all('li', {"class": "gallerybox"})[0].find_all('a')[1]["href"]

'/wiki/File:%22Americans_more_likely_to_saw_growing_diversity_makes_their_country_a_better_place_to_live%22_(2016),_Pew_Research.png'

In [25]:
href = page_content.find_all('li', {"class": "gallerybox"})[0].find_all('a')[1]["href"]

In [26]:
chart_link = COMMONS_CHART_URL.format(href)

In [73]:
chart_link = "https://commons.wikimedia.org/wiki/File:1_guadeloupe_pesticides.jpg"

In [74]:
chart_link

'https://commons.wikimedia.org/wiki/File:1_guadeloupe_pesticides.jpg'

In [75]:
chart_response = requests.get(chart_link)
chart_page = BeautifulSoup(chart_response.content, "html.parser")


In [76]:
chart_page

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>File:1 guadeloupe pesticides.jpg - Wikimedia Commons</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":true,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"cc67fee5-28c3-497d-93ae-1aa450d94d9f","wgCSPNonce":false,"wgCanonicalNamespace":"File","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":6,"wgPageName":"File:1_guadeloupe_pesticides.jpg","wgTitle":"1 guadeloupe pesticides.jpg","wgCurRevisionId":659605967,"wgRevisionId":659605967,"wgArticleId":2789216,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Images which should use wikicharts","Graph images that should use vector graphics","JPG ima

In [93]:
for wiki_link_item in chart_page.find_all("div", {"id": "mw-imagepage-section-globalusage"})[0].find_all("a"): 
    print(wiki_link_item["href"])

<a class="external" href="https://fr.wikipedia.org/wiki/G%C3%A9ographie_de_la_Guadeloupe">Géographie de la Guadeloupe</a>
https://fr.wikipedia.org/wiki/G%C3%A9ographie_de_la_Guadeloupe


In [85]:
next_page_link = ""

for link in page_content.find_all("a"): 
    if link.text.strip() == "next page": 
        next_page_link = link
        
next_page_link

<a href="/w/index.php?title=Category:Horizontal_bar_charts&amp;filefrom=Wikipedia+top+25+week+June+3+June+9%2C+2018.png#mw-category-media" title="Category:Horizontal bar charts">next page</a>

In [84]:
COMMONS_CHART_URL.format(next_page_link["href"])

'https://commons.wikimedia.org//w/index.php?title=Category:Horizontal_bar_charts&filefrom=Wikipedia+top+25+week+June+3+June+9%2C+2018.png#mw-category-media'

### How-to WikiMedia Commons API

In [93]:

file = 'File:The_Blue_Marble.jpg'

headers = {
  # 'Authorization': 'Bearer YOUR_ACCESS_TOKEN',
  'User-Agent': 'YOUR_APP_NAME (YOUR_EMAIL_OR_CONTACT_PAGE)'
}

base_url = 'https://api.wikimedia.org/core/v1/commons/file/'
url = base_url + file
response = requests.get(url, headers=headers)

In [159]:
file = '"Americans more likely to saw growing diversity makes their country a better place to live" (2016), Pew Research.png'
url = 'https://api.wikimedia.org/core/v1/commons/file/' + file

headers = {
#   'Authorization': 'Bearer YOUR_ACCESS_TOKEN',
  'User-Agent': 'mubashara.akhtar@kcl.ac.uk'
}

response = requests.get(url, headers=headers)
data = response.json()
print(data)

data["original"]["url"]

{'title': '"Americans more likely to saw growing diversity makes their country a better place to live" (2016), Pew Research.png', 'file_description_url': '//commons.wikimedia.org/wiki/File:%22Americans_more_likely_to_saw_growing_diversity_makes_their_country_a_better_place_to_live%22_(2016),_Pew_Research.png', 'latest': {'timestamp': '2020-05-24T09:24:37Z', 'user': {'id': 1547075, 'name': 'Illegitimate Barrister'}}, 'preferred': {'mediatype': 'BITMAP', 'size': 15477, 'width': 420, 'height': 474, 'duration': None, 'url': 'https://upload.wikimedia.org/wikipedia/commons/e/e6/%22Americans_more_likely_to_saw_growing_diversity_makes_their_country_a_better_place_to_live%22_%282016%29%2C_Pew_Research.png'}, 'original': {'mediatype': 'BITMAP', 'size': 15477, 'width': 420, 'height': 474, 'duration': None, 'url': 'https://upload.wikimedia.org/wikipedia/commons/e/e6/%22Americans_more_likely_to_saw_growing_diversity_makes_their_country_a_better_place_to_live%22_%282016%29%2C_Pew_Research.png'}, 'th

'https://upload.wikimedia.org/wikipedia/commons/e/e6/%22Americans_more_likely_to_saw_growing_diversity_makes_their_country_a_better_place_to_live%22_%282016%29%2C_Pew_Research.png'

In [112]:
url = "https://upload.wikimedia.org/wikipedia/commons/thumb/7/78/The_Blue_Marble.jpg/1023px-The_Blue_Marble.jpg"
response = requests.get(data["preferred"]["url"], headers=headers)

with open(r"C:\Users\k20116188\PycharmProjects\chartfc_dataset_wikicommons\data\local.png", 'wb') as f:
    f.write(response.content)


In [119]:
headers = {
    'User-Agent': 'My User Agent 1.0'
}
picture_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/e/ea/Van_Gogh_-_Starry_Night_-_Google_Art_Project.jpg/2728px-Van_Gogh_-_Starry_Night_-_Google_Art_Project.jpg"
r = requests.get(picture_url, headers=headers, stream=True)
r.status_code == 200

True

### Wikimedia Commons API: get all images in a category

In [141]:
category_url = "https://commons.wikimedia.org/w/api.php?action=query&list=categorymembers&cmtitle=Category:Horizontal_bar_charts&cmlimit=500&cmtype=file&format=json"
r = requests.get(category_url, headers=headers, stream=True)
r

<Response [200]>

In [145]:
response_json = r.json()
response_json

{'batchcomplete': '',
 'query': {'categorymembers': [{'pageid': 90591760,
    'ns': 6,
    'title': 'File:"Americans more likely to saw growing diversity makes their country a better place to live" (2016), Pew Research.png'},
   {'pageid': 5979050,
    'ns': 6,
    'title': 'File:040209 MC CR Una idea Img2.jpg'},
   {'pageid': 2789216, 'ns': 6, 'title': 'File:1 guadeloupe pesticides.jpg'},
   {'pageid': 116967696,
    'ns': 6,
    'title': 'File:1850-2019 Cumulative greenhouse gas emissions by region - bar chart - IPCC AR6 WG3 - Fig SPM.2b.svg'},
   {'pageid': 56090464,
    'ns': 6,
    'title': 'File:1881 data for occupations of males and females in Hessett. .png'},
   {'pageid': 48303014,
    'ns': 6,
    'title': 'File:1881 Occupation structure graph of Whinburgh and Westfield.jpg'},
   {'pageid': 58106443,
    'ns': 6,
    'title': 'File:1881 Occupational Orders in Poughill, Devon.png'},
   {'pageid': 57073346,
    'ns': 6,
    'title': 'File:1881 Occupational Structure of Frostend

In [149]:
response_json["query"]["categorymembers"][0]

{'pageid': 90591760,
 'ns': 6,
 'title': 'File:"Americans more likely to saw growing diversity makes their country a better place to live" (2016), Pew Research.png'}

In [148]:
response_json["query"]["categorymembers"]

[{'pageid': 90591760,
  'ns': 6,
  'title': 'File:"Americans more likely to saw growing diversity makes their country a better place to live" (2016), Pew Research.png'},
 {'pageid': 5979050, 'ns': 6, 'title': 'File:040209 MC CR Una idea Img2.jpg'},
 {'pageid': 2789216, 'ns': 6, 'title': 'File:1 guadeloupe pesticides.jpg'},
 {'pageid': 116967696,
  'ns': 6,
  'title': 'File:1850-2019 Cumulative greenhouse gas emissions by region - bar chart - IPCC AR6 WG3 - Fig SPM.2b.svg'},
 {'pageid': 56090464,
  'ns': 6,
  'title': 'File:1881 data for occupations of males and females in Hessett. .png'},
 {'pageid': 48303014,
  'ns': 6,
  'title': 'File:1881 Occupation structure graph of Whinburgh and Westfield.jpg'},
 {'pageid': 58106443,
  'ns': 6,
  'title': 'File:1881 Occupational Orders in Poughill, Devon.png'},
 {'pageid': 57073346,
  'ns': 6,
  'title': 'File:1881 Occupational Structure of Frostenden.png'},
 {'pageid': 56535285,
  'ns': 6,
  'title': 'File:1881 occupational structure of Ripple,

In [154]:
image_file_url = response_json["query"]["categorymembers"][0]["title"]
url = 'https://api.wikimedia.org/core/v1/commons/file/' + image_file_url

response = requests.get(url, headers=headers)
data = response.json()

data["preferred"]["url"]

'https://upload.wikimedia.org/wikipedia/commons/e/e6/%22Americans_more_likely_to_saw_growing_diversity_makes_their_country_a_better_place_to_live%22_%282016%29%2C_Pew_Research.png'