In [1]:
import re, json
import requests
import my_keys


# Objective

Access the API for AIF automatically so as to get the data in json format.µ

## Open a requests session 

- open session 
- recover csrf token 

In [5]:
s = requests.session()

url = 'https://aif.centre-mersenne.org/ojs/login'
r = s.get(url)

csrf_p = re.compile('name="csrfToken" value="(.*?)"')
mm = csrf_p.search(r.text)
token = mm.group(1)

we now need to build the parameter dictionnary

- login details are stored in my private cache my_keys.py

In [6]:
params = my_keys.keys['aif']
params['csrfToken'] = token
params['remember'] = 1
params

{'username': 'mcshane',
 'password': 'caca4herve',
 'csrfToken': '3ce95439be733a14d7e24bca189e49b8',
 'remember': 1}

the login now is trivial 

In [7]:
url = 'https://aif.centre-mersenne.org/ojs/login/signIn' 
r = s.post(url,params= params)

So I found the request in by inspecting network traffic in Chrome.

- This yields the raw json which is more convenient than scraping 
- I don't understand why the **status[]** is set twice

In [13]:
url = 'https://aif.centre-mersenne.org/ojs/index.php/AIF/api/v1/_submissions?'
params2 = 'status%5B%5D=4&status%5B%5D=3&assignedTo=37&orderBy=lastModified&searchPhrase=&count=20&offset=0&_=1554444524714'

#decode but Chrome does this for you too
from urllib.parse import unquote
unquote(unquote(params2))

In [17]:
params2 = {'status[]': 4,
           'assignedTo': 37,
           'orderBy' : 'lastModified',
           'searchPhrase' : '',
           'count' : 20,
           'offset' : 0, 
           '_' : 1554444524714}
params2

{'status[]': 4,
 'assignedTo': 37,
 'orderBy': 'lastModified',
 'searchPhrase': '',
 'count': 20,
 'offset': 0,
 '_': 1554444524714}

send the request and dump the result for future reference

In [19]:
r = s.get(url, params=params2)
with open('aif.html','w') as fp:
    fp.write(r.text)

decode the json and inspect the dictionnary

In [21]:
dd = json.loads(r.text)
dd.keys()

this was the code I used to scrape the page I was dumping by hand before

In [21]:
pp = re.compile('(\{"items".*?)\);',re.DOTALL)
#px = re.compile('pkp.registry.init\((.*?)\);',re.DOTALL)
data = re.sub('=\n','',data)
mm = pp.search(data)
txt = mm.group(1)

dd = json.loads( txt )

In [23]:
arts = dd['items']

In [24]:
len(arts)

20

In [25]:
arts[0].keys()

dict_keys(['id', 'fullTitle', 'status', 'submissionProgress', 'stages', 'reviewRounds', 'reviewAssignments', 'locale', 'urlWorkflow', 'urlAuthorWorkflow', 'urlEditorialWorkflow', '_href', 'lastModified', 'authorString', 'urlPublished'])

## Finally

Import to a Pandas dataframe as it looks prettier

In [41]:
import pandas as pd

fields = ['authorString', 'fullTitle']

def get_title(x):
    tt = x['fullTitle']['en_US']
    if tt != '' : return tt
    return x['fullTitle']['fr_FR']

stuff ={ff: [] for ff in fields }
    
for x in arts:
    for ff in fields:
        if ff != 'fullTitle':
            stuff[ff].append(x[ff])
        else:
            stuff[ff].append(get_title(x))


In [44]:
df = pd.DataFrame.from_dict(stuff)
df

Unnamed: 0,authorString,fullTitle
0,Cleon S. Barroso,Asymptotic monotone basic sequences and unit s...
1,Chol Park,A classification of Breuil modules
2,Antoine Ducros,Réduction en famille d'espaces affinoïdes
3,Christopher-Lloyd SIMON,Topology and enumeration of plane algebraic cu...
4,"Javier Oswaldo Rodríguez Velásquez, Signed Esp...",Probabilistic prediction of consecutive prime ...
5,Liviu Ornea,Hopf surfaces in locally conformally Kähler ma...
6,nasreen kausar,Dr Characterizations of non-associative ordere...
7,"Ilias Aarab, Mohamed Ali Tagmouti, Adnan Bakka...",The Conjecture 6P
8,"Beata Hejmej, Janusz Gwoździewicz",On Abhyankar-Moh irreducibility criterion for ...
9,Feng Rong,Classification of holomorphic endomorphisms of...


Unnamed: 0,authorString,fullTitle
0,Cleon S. Barroso,Asymptotic monotone basic sequences and unit s...
1,Chol Park,A classification of Breuil modules
2,Antoine Ducros,Réduction en famille d'espaces affinoïdes
3,Christopher-Lloyd SIMON,Topology and enumeration of plane algebraic cu...
4,"Javier Oswaldo Rodríguez Velásquez, Signed Esp...",Probabilistic prediction of consecutive prime ...
5,Liviu Ornea,Hopf surfaces in locally conformally Kähler ma...
6,nasreen kausar,Dr Characterizations of non-associative ordere...
7,"Ilias Aarab, Mohamed Ali Tagmouti, Adnan Bakka...",The Conjecture 6P
8,"Beata Hejmej, Janusz Gwoździewicz",On Abhyankar-Moh irreducibility criterion for ...
9,Feng Rong,Classification of holomorphic endomorphisms of...


In [89]:
arts[0]['fullTitle']

{'en_US': 'A note on the irrationality of certain hyperbolic, trigonometric, and logarithmic values',
 'fr_FR': ''}