In [1]:
import os
import re
import socket
import urllib
import urllib.parse
from datetime import datetime

import geoip2.database
import geoip2.errors
import pandas as pd
import pycountry
import requests
from oaipmh.client import Client
from oaipmh.error import NoRecordsMatchError
from oaipmh.metadata import MetadataRegistry, oai_dc_reader
from tld import get_tld
from tqdm.notebook import tqdm

In [2]:
def find_oai_endpoint(url, timeout=30):
    ''' Finds an OAI endpoint given a base OJS-install URL

    Sends an ``Identify`` request to each candidate location below ``url``
    and returns the first one that answers with HTTP 200 and a ``text/xml``
    content type.

    Parameters:
        url: base URL of the install; leading/trailing slashes are stripped.
        timeout: seconds to wait on each candidate before moving on to the
            next one, so a single unresponsive host cannot hang the notebook.

    Returns:
        The URL that answered, with the ``verb=Identify`` parameter removed,
        or False when none of the candidates looks like an OAI endpoint.
    '''
    base_url = url.strip("/")
    verb_param = {'verb': 'Identify'}

    # all the combinations of possible OAI URLs, given the base url
    candidates = [
        base_url + "/index.php/index/oai",
        base_url + "/index/oai",
        base_url + "/index.php?page=oai",
        base_url + "?page=oai",
    ]

    for candidate in candidates:
        try:
            response = requests.get(candidate, params=verb_param, timeout=timeout)
        except requests.RequestException:
            # connection problem, timeout, bad URL, ... -> move on to next URL
            continue

        # a response without a Content-Type header simply does not qualify
        content_type = response.headers.get('Content-Type', '')
        if response.status_code == 200 and content_type.startswith('text/xml'):
            # take out the verb parameter before returning
            return response.url.replace('verb=Identify', '').strip('&?')

    return False

In [3]:
# Shared metadata registry handed to every OAI client created below.
oai_registry = MetadataRegistry()
oai_registry.registerReader('oai_dc', oai_dc_reader)

# Use this to determine if the OAI URL is valid. If this returns False, do not include it on our list
def get_oai_identity(oai_url):
    ''' Finds an OAI repository identity given an OAI URL

    Parameters:
        oai_url: URL of the OAI endpoint to query.

    Returns:
        Whatever ``Client.identify()`` returns on success, or False when
        building the client or the identify request fails for any reason.
    '''
    try:
        client = Client(oai_url, oai_registry)
        return client.identify()
    except Exception as err:
        # Not every failure carries a ``code`` attribute, so read it
        # defensively instead of raising a second, unrelated AttributeError.
        code = getattr(err, 'code', None)
        if code in [404, 500]: # Might need to expand this list?
            print(err)
        else:
            # some other error: still reported so failures are not invisible
            print('{}: {!r}'.format(oai_url, err))

    return False

In [13]:
# NOTE(review): these entries are lower-case and are compared against whole
# chunks of the country name (the split below does not break on spaces), so
# they may never actually filter anything -- confirm the intent.
country_stop_words = ['islands', 'saint', 'and', 'republic', 'virgin', 'united', 'south', 'of', 'new', 'the']

# Splits on every character that is not a word character, a space or '&'.
# Raw string: the backslashes must reach the regex engine untouched, and a
# plain string literal triggers invalid-escape warnings for this pattern.
_country_name_splitter = re.compile(r'[^\w\ \&]')

# Maps a stem of each country name to that country's alpha-2 code.
nationality_prefixes = {}
for country in pycountry.countries:
    chunks = [x for x in _country_name_splitter.split(country.name) if x not in country_stop_words]
    if not chunks:
        continue  # nothing usable left in this name

    # first chunk, with spaces/'s' and then vowels trimmed from both ends
    nat = chunks[0].strip(' s').strip('aeiou')

    # a few that get confused
    if nat == 'Austr':
        nat = 'Austria'
    elif nat == 'Austral':
        nat = 'Australia'
    elif nat == 'Niger' and country.alpha_2 == 'NG':
        # 'Nigeria' trims down to 'Niger' and would collide with Niger itself
        nat = 'Nigeri'

    # very short stems are skipped
    if len(nat) > 4:
        nationality_prefixes[nat] = country.alpha_2

def get_country_from_name(journal_name, prefixes=None):
    ''' Guesses a country code from a nationality stem found in a journal name

    Every stem in ``prefixes`` is searched for case-insensitively inside
    ``journal_name``; when several stems match, the longest one wins.

    Parameters:
        journal_name: the journal title to inspect.
        prefixes: optional mapping of stem -> country code. Defaults to the
            module-level ``nationality_prefixes``.

    Returns:
        The upper-cased code of the longest matching stem, or False when
        ``journal_name`` is not a string or no stem matches.
    '''
    if prefixes is None:
        prefixes = nationality_prefixes

    if not isinstance(journal_name, str):
        return False

    # lower-case once instead of once per stem
    haystack = journal_name.lower()
    matches = [nat for nat in prefixes if nat.lower() in haystack]
    if not matches:
        return False

    longest_match = max(matches, key=len)
    return prefixes[longest_match].upper()

In [74]:
def get_country_from_tld(url):
    ''' Guesses an upper-case country code from the top-level domain of a URL

    Parameters:
        url: the URL to inspect.

    Returns:
        'GB' for ``uk``, 'US' for ``edu``, a fixed code for the known
        ``*jol.info`` sites, otherwise the last label of the TLD upper-cased.
        None when the TLD cannot be determined or an ``info`` URL is not one
        of the known sites.

    NOTE(review): generic TLDs (``com``, ``org``, ...) still come back
    upper-cased as before; callers should validate the result if they need
    a real country code.
    '''
    # special handling for the JOLS
    jol_countries = {
        'vjol': 'VN',
        'banglajol': 'BD',
        'ajol': 'ZA',
        'nepjol': 'NP',
        'philjol': 'PH',
        'mongoliajol': 'MN',
        # 'lamjol': need to figure out what to do with these
    }

    try:
        # get_tld can return a multi-label suffix such as 'co.uk'; only the
        # last label is used to pick the country.
        suffix = get_tld(url).rsplit('.', 1)[-1].lower()
    except Exception:
        return None

    if suffix == 'info':
        # Longest marker first: 'mongoliajol' and 'banglajol' both contain
        # 'ajol' and must not be reported as 'ZA'.
        for marker in sorted(jol_countries, key=len, reverse=True):
            if marker in url:
                return jol_countries[marker]
        return None

    if suffix == 'uk':
        return 'GB'
    if suffix == 'edu':
        return 'US'
    return suffix.upper()

In [12]:
# Country database reader shared by every lookup in this notebook.
geoIpReader = geoip2.database.Reader('data/GeoLite2-Country/GeoLite2-Country.mmdb')

def get_country_from_domain(url):
    ''' Guesses a country code from where the URL's host resolves to

    The host name of ``url`` is resolved to an IP address, which is then
    looked up in ``geoIpReader``.

    Parameters:
        url: the URL whose host should be located.

    Returns:
        The upper-cased ISO code reported for the host's IP address, or
        False when the URL has no host, the host does not resolve, or the
        database has no country for the address.
    '''
    # .hostname (unlike .netloc) leaves out any port or credentials, which
    # would otherwise make the DNS lookup fail
    domain = urllib.parse.urlparse(url).hostname
    if not domain:
        return False

    try:
        ip = socket.gethostbyname(domain)
        alpha_2 = geoIpReader.country(ip).country.iso_code
    except (OSError, UnicodeError):
        # host name could not be resolved (or could not be encoded)
        return False
    except geoip2.errors.AddressNotFoundError:
        # address is not present in the database
        return False

    # iso_code can be empty when no country is recorded for the address
    if not alpha_2:
        return False
    return alpha_2.upper()

In [77]:
# Base URL of the install whose journals are inspected in the cells below.
url = 'http://ajol.info'
oai_url = find_oai_endpoint(url)

# find_oai_endpoint reports failure with False; stop here rather than let
# that value flow into the OAI client and the country helpers.
if not oai_url:
    raise RuntimeError('No OAI endpoint found for ' + url)

In [70]:
# Registry that tells the client how to read 'oai_dc' records.
oai_registry = MetadataRegistry()
oai_registry.registerReader('oai_dc', oai_dc_reader)

# Client bound to the endpoint discovered for `url`.
client = Client(oai_url, oai_registry)

# (journalInitial, journalName) pairs, keeping only the sets whose spec
# contains no ':'.
journalSets = [
    (set_spec, set_name)
    for set_spec, set_name, _set_description in client.listSets()
    if ':' not in set_spec
]

In [78]:
# Both of these depend only on the endpoint, so they are computed once.
country_from_tld = get_country_from_tld(oai_url)
country_from_domain = get_country_from_domain(oai_url)

# Publication year whose records are counted for every journal.
year = 2019

for journal_init, journal_name in journalSets:
    # NOTE(review): this break ends the loop before any journal is processed,
    # so nothing below runs. It is kept because removing it starts one
    # request per journal; delete it to actually run the harvest.
    break
    try:
        # `until` covers the whole of Dec 31 rather than stopping at midnight
        ids = client.listIdentifiers(
            metadataPrefix='oai_dc',
            set=journal_init,
            from_=datetime(year, 1, 1),
            until=datetime(year, 12, 31, 23, 59, 59),
        )
        # `not`, not `~`: bitwise NOT of a bool is always truthy, which
        # counted deleted records as well
        num_records = sum(1 for i in ids if not i.isDeleted())

        country_from_name = get_country_from_name(journal_name)
        print(oai_url, journal_init, journal_name, year, num_records, country_from_tld, country_from_domain, country_from_name)
    except NoRecordsMatchError:
        # nothing published in this journal for the requested period
        pass