In [47]:
import requests
from imageio import imread
from urllib.parse import unquote
import unidecode
import re
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
from fuzzywuzzy import fuzz, process
import matplotlib.pyplot as plt

In [13]:
def get_wikipedia_names(url="",name="",lang="en"):
    '''
    Find the different possible names of a wikipedia entity.
    Right now it is only tested on organizations gotten from ror db

    The page is located with the wikipedia search api; a search hit is only
    accepted when its title is close enough to the requested subject according
    to the fuzzywuzzy scores. The langlinks of the accepted page are then
    requested and returned as they come from the api.

    Parameters
    ----------
    url : str
        The wikipedia url if it is available. Only the page title is taken from
        it (fragment, query string and trailing slash are dropped). The
        language is NOT read from the url, it always comes from lang.
    name : str
        The name or keywords to search for over the wikipedia api.
        Ignored when url is given.
    lang : str
        The iso-639 lang code used to build the language endpoint
        (https://<lang>.wikipedia.org/w/api.php) of the search

    Returns
    -------
    data : dict or None
        * {"response":[],"names":[]} when there is nothing to search for
          (neither url nor name given, or the url contains no page title).
        * None when the search response has no "query" key or when no search
          hit is similar enough to the subject.
        * Otherwise the json response of the wikipedia request with
          prop=langlinks (up to 500 langlinks; continuation is not followed).

    '''
    if url:
        # Keep only the page title: drop "#fragment" and "?query", prefer what
        # follows "/wiki/" (titles may contain "/"), else the last path segment.
        page = url.split("#", 1)[0].split("?", 1)[0]
        if "/wiki/" in page:
            page = page.split("/wiki/", 1)[1]
        else:
            page = page.rstrip("/").rsplit("/", 1)[-1]
        subject = unquote(page.strip("/")).replace("_", " ")
    elif name:
        subject = name
    else:
        subject = ""
    if not subject:
        return {"response":[],"names":[]}

    base = 'https://'+lang+'.wikipedia.org/w/api.php'
    timeout = 30  # seconds; without it a stalled connection blocks forever

    #searching entire wikipedia
    print("Searching ",subject)
    params = {
            'action':'query',
            'format':'json',
            'list':'search',
            'srsearch':subject
        }

    data = requests.get(base, params=params, timeout=timeout).json()
    if "query" not in data:
        # the api answered with an error payload instead of results
        return None

    #searching among the results and checking twice with fuzzywuzzy
    wanted = subject.lower()
    pageid = None
    for reg in data["query"].get("search", []):
        title = reg["title"].lower()
        score = fuzz.ratio(title, wanted)
        matched = score > 90
        if not matched and score > 50:
            # middling plain ratio: fall back to the more tolerant scorers,
            # each one guarded by a stricter threshold
            partial = fuzz.partial_ratio(title, wanted)
            matched = partial > 95 or (
                partial > 80 and fuzz.token_set_ratio(title, wanted) > 98
            )
        if matched:
            pageid = int(reg["pageid"])
            break  # the first acceptable hit wins

    if pageid is None:
        return None

    #retrieving the actual page's langlinks
    params = {
            'action': 'query',
            'format': 'json',
            'pageids': pageid,
            'prop': 'langlinks',
            'lllimit':500,
        }
    response = requests.get(base, params=params, timeout=timeout)
    return response.json()

In [14]:
get_wikipedia_names?

[0;31mSignature:[0m [0mget_wikipedia_names[0m[0;34m([0m[0murl[0m[0;34m=[0m[0;34m''[0m[0;34m,[0m [0mname[0m[0;34m=[0m[0;34m''[0m[0;34m,[0m [0mlang[0m[0;34m=[0m[0;34m'en'[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Find the different possible names of a wikipedia entity.
Right now it is only tested on organizations gotten from ror db

Parameters
----------
url : str
    The wikipedia url if is available
name : str
    The name of keywords to do the search over wikipedia api
lang : str
    The iso-639 lang code to fix the language endpooint of the search language
    
Returns
-------
data : dict
    The response of the wikipedia requests with the langlinks of the prop params
    
[0;31mFile:[0m      /tmp/ipykernel_3545795/3214475295.py
[0;31mType:[0m      function


In [9]:
# Example: search the Spanish wikipedia by free-text name; the cell displays the raw langlinks response.
get_wikipedia_names(name="universidad de antioquia",lang="es")

Searching  universidad de antioquia


{'batchcomplete': '',
 'query': {'pages': {'120427': {'pageid': 120427,
    'ns': 0,
    'title': 'Universidad de Antioquia',
    'langlinks': [{'lang': 'ca', '*': "Universitat d'Antioquia"},
     {'lang': 'ceb', '*': 'University of Antioquia'},
     {'lang': 'de', '*': 'Universidad de Antioquia'},
     {'lang': 'en', '*': 'University of Antioquia'},
     {'lang': 'eo', '*': 'Universitato de Antjokio'},
     {'lang': 'et', '*': 'Antioquia Ülikool'},
     {'lang': 'fr', '*': "Université d'Antioquia"},
     {'lang': 'gd', '*': 'Oilthigh Antioquia'},
     {'lang': 'ja', '*': 'アンティオキア大学'},
     {'lang': 'nl', '*': 'Universiteit van Antioquia'},
     {'lang': 'pl', '*': 'Uniwersytet Antioquia'},
     {'lang': 'sv', '*': 'Antioquias universitet'},
     {'lang': 'tl', '*': 'Unibersidad ng Antioquia'},
     {'lang': 'uz', '*': 'Antiokiya universiteti'}]}}}}

In [10]:
# Example: start from a wikipedia url; only the page title is taken from it, the endpoint language comes from lang.
get_wikipedia_names(url="https://en.wikipedia.org/wiki/University_of_Rovira_i_Virgili",lang="en")

Searching  University of Rovira i Virgili


{'batchcomplete': '',
 'query': {'pages': {'106170': {'pageid': 106170,
    'ns': 0,
    'title': 'University of Rovira i Virgili',
    'langlinks': [{'lang': 'ar', '*': 'جامعة روبيرا الأول بيرجيلي'},
     {'lang': 'arz', '*': 'جامعة روبيرا الاول بيرجيلى'},
     {'lang': 'be', '*': 'Універсітэт імя Антоніа Равіра-і-Вірджылі'},
     {'lang': 'ca', '*': 'Universitat Rovira i Virgili'},
     {'lang': 'el', '*': 'Πανεπιστήμιο Ροβίρα ι Μπιρζίλι'},
     {'lang': 'es', '*': 'Universidad Rovira i Virgili'},
     {'lang': 'eu', '*': 'Rovira i Virgili Unibertsitatea'},
     {'lang': 'fr', '*': 'Université Rovira i Virgili'},
     {'lang': 'gl', '*': 'Universidade Rovira i Virgili'},
     {'lang': 'it', '*': 'Università Rovira i Virgili'},
     {'lang': 'ja', '*': 'ルビーラ・イ・ビルジーリ大学'},
     {'lang': 'pt', '*': 'Universidade Rovira i Virgili'},
     {'lang': 'zh', '*': '罗维拉-威尔吉利大学'}]}}}}

In [82]:
def get_logo_wikipedia(url="",name="",lang="en"):
    '''
    Find an image (logo, shield or flag) of a wikipedia page.
    Right now it is only tested for the logos of organizations gotten from ror db

    The page is located with the wikipedia search api; a search hit is only
    accepted when its title is close enough to the requested subject according
    to the fuzzywuzzy scores. The files used by that page are then listed and
    the first one whose name contains one of the keywords
    "flag", "escudo", "logo", "shield" or "bandera" is looked up with
    prop=imageinfo.

    Parameters
    ----------
    url : str
        The wikipedia url if it is available. Only the page title is taken from
        it (fragment, query string and trailing slash are dropped). The
        language is NOT read from the url, it always comes from lang.
    name : str
        The name or keywords to search for over the wikipedia api.
        Ignored when url is given.
    lang : str
        The iso-639 lang code used to build the language endpoint
        (https://<lang>.wikipedia.org/w/api.php) of the search

    Returns
    -------
    data : dict or None
        * {"response":[],"names":[]} when there is nothing to search for
          (neither url nor name given, or the url contains no page title).
        * None when the search response has no "query" key, when no search
          hit is similar enough to the subject, when the page has no file
          matching the keywords, or when the image lookup fails (the error is
          printed).
        * Otherwise the json response of the wikipedia request with
          prop=imageinfo and iiprop=url for the selected file.

    '''
    if url:
        # Keep only the page title: drop "#fragment" and "?query", prefer what
        # follows "/wiki/" (titles may contain "/"), else the last path segment.
        page = url.split("#", 1)[0].split("?", 1)[0]
        if "/wiki/" in page:
            page = page.split("/wiki/", 1)[1]
        else:
            page = page.rstrip("/").rsplit("/", 1)[-1]
        subject = unquote(page.strip("/")).replace("_", " ")
    elif name:
        subject = name
    else:
        subject = ""
    if not subject:
        return {"response":[],"names":[]}

    base = 'https://'+lang+'.wikipedia.org/w/api.php'
    timeout = 30  # seconds; without it a stalled connection blocks forever
    keywords = ("flag", "escudo", "logo", "shield", "bandera")

    #searching entire wikipedia
    print("Searching ",subject)
    params = {
            'action':'query',
            'format':'json',
            'list':'search',
            'srsearch':subject
        }

    data = requests.get(base, params=params, timeout=timeout).json()
    if "query" not in data:
        # the api answered with an error payload instead of results
        return None

    #searching among the results and checking twice with fuzzywuzzy
    wanted = subject.lower()
    pageid = None
    for reg in data["query"].get("search", []):
        title = reg["title"].lower()
        score = fuzz.ratio(title, wanted)
        matched = score > 90
        if not matched and score > 50:
            # middling plain ratio: fall back to the more tolerant scorers,
            # each one guarded by a stricter threshold
            partial = fuzz.partial_ratio(title, wanted)
            matched = partial > 95 or (
                partial > 80 and fuzz.token_set_ratio(title, wanted) > 98
            )
        if matched:
            pageid = int(reg["pageid"])
            break  # the first acceptable hit wins

    if pageid is None:
        return None

    #retrieving the files used by the actual page
    params = {
        'action': 'query',
        'format': 'json',
        'pageids': pageid,
        'prop': 'images',
        'imlimit': 500,  # the api default is 10, which can hide the logo
    }
    data = requests.get(base, params=params, timeout=timeout).json()

    try:
        page = data.get("query", {}).get("pages", {}).get(str(pageid), {})
        title = ""
        for img in page.get("images", []):  # pages without files have no "images" key
            lowered = img["title"].lower()
            if "commons" in lowered: #avoid the wikipedia logo
                continue
            if any(keyword in lowered for keyword in keywords):
                title = img["title"]
                break
        if not title:
            # nothing looks like a logo: do not query imageinfo with an empty title
            return None

        print("title: ",title)
        params = {
            'action': 'query',
            'format': 'json',
            'titles': title,
            'prop': 'imageinfo',
            'iiprop':"url"
        }
        response = requests.get(base, params=params, timeout=timeout)
        return response.json()
    except Exception as e:
        # best effort: report the problem and signal "no logo" to the caller
        print("@@@@")
        print("Function error: ", e)
        print(data)
        print("@@@@")
        return None

In [83]:
# Example: search the English wikipedia by free-text name; the cell displays the raw imageinfo response.
get_logo_wikipedia(name="atletico nacional",lang="en")

Searching  atletico nacional
title:  File:Escudo Atlético Nacional.png


{'continue': {'iistart': '2015-06-08T02:52:40Z', 'continue': '||'},
 'query': {'pages': {'-1': {'ns': 6,
    'title': 'File:Escudo Atlético Nacional.png',
    'missing': '',
    'known': '',
    'imagerepository': 'shared',
    'imageinfo': [{'url': 'https://upload.wikimedia.org/wikipedia/commons/9/9a/Escudo_de_Atl%C3%A9tico_Nacional.png',
      'descriptionurl': 'https://commons.wikimedia.org/wiki/File:Escudo_de_Atl%C3%A9tico_Nacional.png',
      'descriptionshorturl': 'https://commons.wikimedia.org/w/index.php?curid=40813953'}]}}}}

In [84]:
# Example: start from a wikipedia url; lang is passed explicitly because the endpoint is built from lang, not from the url host.
get_logo_wikipedia(url="https://es.wikipedia.org/wiki/Universidad_de_Antioquia",lang="es")

Searching  Universidad de Antioquia
title:  Archivo:Escudo-UdeA.svg


{'continue': {'iistart': '2008-02-13T05:32:03Z', 'continue': '||'},
 'query': {'pages': {'-1': {'ns': 6,
    'title': 'Archivo:Escudo-UdeA.svg',
    'missing': '',
    'known': '',
    'imagerepository': 'shared',
    'imageinfo': [{'url': 'https://upload.wikimedia.org/wikipedia/commons/f/fb/Escudo-UdeA.svg',
      'descriptionurl': 'https://commons.wikimedia.org/wiki/File:Escudo-UdeA.svg',
      'descriptionshorturl': 'https://commons.wikimedia.org/w/index.php?curid=3547086'}]}}}}