In [32]:
import pandas as pd

import re
import os
import urllib
import requests
import sqlite3 as lite
from xml.etree import ElementTree as ET

from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

from tqdm.notebook import tqdm

In [13]:
def domain_filter(url):
    ''' Decide whether the host of `url` is worth considering.

        Returns False when the host contains 'localhost' (any case), when it
        is an IP address (IPv4 or IPv6, with or without port or credentials),
        or when `url` is not a string (e.g. NaN from an empty CSV cell).
        Returns True otherwise, including for URLs without a network location.

        Consider also filtering things with 'demo' or 'test'
    '''
    import ipaddress  # local import: only this helper needs it

    # Missing values reach this function through Series.map as float NaN.
    if not isinstance(url, str):
        return False

    parsed = urllib.parse.urlparse(url)
    netloc = parsed.netloc
    if 'localhost' in netloc.lower():
        return False

    # Digits separated only by dots/colons, e.g. "10.0.0.1:8080".
    if netloc.replace('.', '').replace(':', '').isnumeric():
        return False

    # `hostname` drops credentials, the port and IPv6 brackets, so literals
    # such as "[::1]:8080" or "user:pw@10.0.0.1" are recognised as well.
    host = parsed.hostname
    if not host:
        return True
    try:
        ipaddress.ip_address(host)
    except ValueError:
        return True  # not an IP literal: a regular domain name
    return False

In [14]:
def find_oai_endpoint(url, timeout=30):
    ''' Finds an OAI endpoint given a base OJS-install URL.

        Sends an OAI `Identify` request to each of the usual OJS endpoint
        locations and returns the first one that answers with HTTP 200 and a
        `text/xml` content type.

        url     -- base URL of the OJS install (surrounding slashes are ignored)
        timeout -- seconds to wait for each candidate before giving up on it

        Returns the endpoint URL with the `verb=Identify` parameter removed,
        or False when no candidate responds like an OAI endpoint.
    '''
    base = url.strip("/")
    verb_param = {'verb': 'Identify'}
    # all the combinations of possible OAI URLs, given the base url
    urls_to_try = [
        base + "/index.php/index/oai",
        base + "/index/oai",
        base + "/index.php?page=oai",
        base + "?page=oai",
    ]

    for url_to_try in urls_to_try:
        try:
            # Without a timeout a single unresponsive host stalls the whole scan.
            response = requests.get(url_to_try, params=verb_param, timeout=timeout)
        except Exception:
            # Best effort: any failure on this candidate just moves us on to the
            # next one. KeyboardInterrupt is not an Exception, so the loop can
            # still be interrupted from the notebook.
            continue

        content_type = response.headers.get('Content-Type', '')
        if response.status_code == 200 and content_type.startswith('text/xml'):
            # take out the verb parameter before returning
            return response.url.replace('verb=Identify', '').strip('&?')

    return False

In [None]:
# Metadata registry handed to every OAI client; oai_dc_reader is registered
# under the 'oai_dc' key.
oai_registry = MetadataRegistry()
oai_registry.registerReader('oai_dc', oai_dc_reader)

def get_oai_identity(oai_url):
    ''' Finds an OAI repository identity given an OAI URL.

        Returns whatever `Client.identify()` returns for the endpoint.
        Any failure is re-raised to the caller; errors carrying a `code`
        attribute of 404 or 500 are printed first.
    '''
    try:
        return Client(oai_url, oai_registry).identify()
    except Exception as err:
        # Not every exception has a `.code` attribute; reading it directly
        # would raise AttributeError and hide the real error.
        if getattr(err, 'code', None) in (404, 500):  # Might need to expand this list?
            print(err)
        # some other error, but probably still OK to ignore?
        raise

In [20]:
# Read the log-derived installs and keep only the rows whose base_url
# passes domain_filter.
ojs_in_logs = (
    pd.read_csv('data/2020/ojs_in_logs.csv')
    .pipe(lambda logs: logs[logs.base_url.map(domain_filter)])
)
    

In [19]:
# Scratch check of the candidate OAI URLs built for one installation.
# The sample line must stay active: with it commented out, `url` only exists
# as leftover kernel state and the cell fails after a kernel restart.
url = ojs_in_logs.sample().iloc[0].base_url
print(url)
url = url.strip("/")
verb_param = {'verb': 'Identify'}
# all the combinations of possible OAI URLs, given the base url
urls_to_try = [
    url + "/index.php/index/oai",
    url + "/index/oai",
    url + "/index.php?page=oai",
    url + "?page=oai",
]
print(urls_to_try)

http://publikationen.soziologie.de
['http://publikationen.soziologie.de/index.php/index/oai', 'http://publikationen.soziologie.de/index/oai', 'http://publikationen.soziologie.de/index.php?page=oai', 'http://publikationen.soziologie.de?page=oai']


In [6]:
# Read data/ojs_in_logs_urlmap.csv into a DataFrame.
# NOTE(review): `oai_in_logs` is not referenced by any other cell visible here.
oai_in_logs = pd.read_csv('data/ojs_in_logs_urlmap.csv')

In [None]:
# Print the admin e-mails, earliest datestamp and repository name of `identity`.
# NOTE(review): `identity` is not assigned in any visible cell; presumably it
# is the result of get_oai_identity(...) from an earlier session. Confirm and
# define it before this cell.
print(identity.adminEmails())
print(identity.earliestDatestamp())
print(identity.repositoryName())

In [21]:
# Load the installs known to the database and normalise their OAI URLs:
# drop the "verb=identify" parameter and any '&' / '?' left at either end.
ojs_in_db = pd.read_csv('data/2020/ojs_in_db.csv')
ojs_in_db.loc[:, 'oai_url'] = [
    raw_url.replace("verb=identify", "").strip('&?')
    for raw_url in ojs_in_db.oai_url
]

Shell command used to extract the OAI URLs from the tracker logs (writes `urls_in_tracker.csv`):

    cat *log | sed -En "s/^.*\[(.*) -0700\].*ojs-version\.xml\?id=([^&]+)&oai=([^ ]+).*$/\1,\2,\3/gp" | sort | uniq > urls_in_tracker.csv

In [None]:
# Display 10 random rows whose oai_url contains 'page=oai'.
# NOTE(review): sample(10) needs at least 10 matching rows.
ojs_in_db[ojs_in_db.oai_url.str.contains('page=oai')].sample(10)

In [55]:
# Beacon hits extracted from the tracker logs (headerless CSV with three
# columns: timestamp, beacon id, OAI URL).
ojs_in_tracker = pd.read_csv('data/2020/logs/urls_in_tracker.csv', header=None)
ojs_in_tracker.columns = ['datetime', 'id', 'oai_url']

# Columns are replaced with df[col] = ... rather than df.loc[:, col] = ...:
# since pandas 2.0 the .loc form writes into the existing object column, so the
# parsed timestamps would not end up as a datetime64 column.
# NOTE(review): only the literal '2020:' is rewritten before parsing, so
# timestamps from any other year are passed to to_datetime unchanged.
ojs_in_tracker['datetime'] = pd.to_datetime(
    ojs_in_tracker['datetime'].str.replace('2020:', '2020 ', regex=False)
)

# Percent-decode the URLs, then give protocol-relative ones ('//host/...')
# an explicit scheme.
ojs_in_tracker['oai_url'] = (
    ojs_in_tracker['oai_url']
    .map(urllib.parse.unquote)
    .map(lambda x: 'http:' + x if x.startswith('//') else x)
)

ojs_in_tracker = ojs_in_tracker[ojs_in_tracker['oai_url'].map(domain_filter)]

# Keep only the most recent hit for each OAI URL.
ojs_in_tracker = ojs_in_tracker.loc[ojs_in_tracker.groupby('oai_url')['datetime'].idxmax()]

In [59]:
# Persist the tracker table without its index column; a later cell reads this
# file back.
ojs_in_tracker.to_csv('data/2020/ids_in_tracker.csv', index=False)

In [24]:
# Read data/foundurls.txt as CSV.
# NOTE(review): this rebinds `ojs_in_logs`, which an earlier cell loaded from
# data/2020/ojs_in_logs.csv; from here on the name refers to this file, whose
# `oai_url` column is used when the URL lists are merged.
ojs_in_logs = pd.read_csv('data/foundurls.txt')

In [27]:
# Distinct OAI URLs seen in any of the three sources (database, tracker, logs).
# The list comes from a set, so its order is unspecified.
oai_urls = list(
    set(ojs_in_db.oai_url).union(
        ojs_in_tracker.oai_url,
        ojs_in_logs.oai_url,
    )
)

In [36]:
# Each pattern captures the text content of one XML element (everything up to
# the next '<'): adminEmail, repositoryName and earliestDatestamp.
ADMIN_RE = re.compile(r'<adminEmail>([^<]+)<')
REPO_RE = re.compile(r'<repositoryName>([^<]+)<')
TIME_RE = re.compile(r'<earliestDatestamp>([^<]+)<')

# NOTE(review): `res` is not assigned in any visible cell; presumably it is a
# requests response to an OAI Identify call. Confirm and define it first.

# First adminEmail match in the response body, or None when there is none.
m = ADMIN_RE.search(res.text)

In [111]:
# Beacon records keyed by OAI URL. The two remaining columns are renamed by
# position to `date_in_beacon` and `beacon_id`.
ojs_in_tracker = pd.read_csv('data/2020/ids_in_tracker.csv').set_index('oai_url')
ojs_in_tracker.columns = ['date_in_beacon', 'beacon_id']

# Parse the beacon timestamps on the tracker table itself. The previous version
# read `df.date_in_beacon` before `df` existed and stored the result in
# `ojs_in_db`, so it only ran against leftover kernel state.
ojs_in_tracker['date_in_beacon'] = pd.to_datetime(ojs_in_tracker['date_in_beacon'])

# Assign the filled column back: an in-place fillna on `frame.column` is not
# guaranteed to modify the frame.
ojs_in_tracker['beacon_id'] = ojs_in_tracker['beacon_id'].fillna('')

# Identify responses collected per OAI URL; missing values become ''.
df = pd.read_csv('data/foundoai.txt').set_index('oai_url').fillna('')

# Stand-in identifier for an installation: contact, name and time joined by ';'.
df['makeshift_id'] = df[['contact', 'name', 'time']].apply(lambda row: ';'.join(row), axis=1)

In [112]:
# Attach the beacon columns by OAI URL (index-on-index join), then turn the
# index back into an ordinary column.
df = df.join(ojs_in_tracker).reset_index()

In [113]:
# Order rows by beacon date, then collapse each makeshift_id group to its
# last entry.
df = (
    df.sort_values('date_in_beacon')
    .groupby('makeshift_id')
    .last()
)

In [116]:
# The ten beacon_id values that occur on the most rows of df, with their counts.
df.groupby('beacon_id').size().nlargest(10)

beacon_id
5bff9aaebf90f    23
5eb26e02d9a0d    17
5677dacb07cbc    11
56e10670c3cbb    10
56519a9433bcc     9
5e59e9f73316e     8
5d19cdac5c693     7
53d674dc81834     6
542e3039b0c19     6
5c4b18fece111     6
dtype: int64