In [1]:
from urllib2 import urlopen
from urllib import urlencode
from urllib import quote_plus
# import urllib.parse
import json
import pprint

class CoreApiRequestor:

    def __init__(self, endpoint, api_key):
        self.endpoint = endpoint
        self.api_key = api_key
        #defaults
        self.pagesize = 100
        self.page = 1

    def parse_response(self, decoded):
        res = []
        for item in decoded['data']:
            doi = None
            if 'identifiers' in item:
                for identifier in item['identifiers']:
                    if identifier and identifier.startswith('doi:'):
                        doi = identifier
                        break
            res.append([item['title'], doi])
        return res

    def request_url(self, url):
        print(url)
        response = urlopen(url)
        html = response.read()
        return html

    def get_method_query_request_url(self,method,query,fullText,page):
        if (fullText):
            fullText = 'true'
        else:
            fullText = 'false'
        params = {
            'apiKey':self.api_key,
            'page':page,
            'pageSize':self.pagesize,
            'fulltext':fullText
        }
        return self.endpoint + method + '/' + quote_plus(query) + '?' + urlencode(params)

    def get_up_to_20_pages_of_query(self,method,query,fulltext):
        url = self.get_method_query_request_url(method,query,fulltext,1)
        all_articles=[]
        resp = self.request_url(url)
        result = json.loads(resp.decode('utf-8'))
        all_articles.append(result)
        if (result['totalHits']>100):
            numOfPages = int(result['totalHits']/self.pagesize)  #rounds down
            if (numOfPages>20):
                numOfPages=20
            for i in range(2,numOfPages):
                url = self.get_method_query_request_url(method,query,False,i)
                print(url)
                resp =self.request_url(url)
                all_articles.append(json.loads(resp.decode('utf-8')))
        return all_articles
    
    def get_repository_articles_fulltext_as_dict(self,repository_id):
        return ""
    
    def get_search_repository_request_url(self,repoQuery,page=1,pageSize=10):
        params={
            'apiKey':self.api_key,
            'page':page,
            'pageSize':pageSize
        }
        return self.endpoint + "/repositories/search/"+quote_plus(repoQuery)+'?'+urlencode(params)
    
    def search_repository_ids_by_name(self,repoName):
        discoverRepoUrl = self.get_search_repository_request_url(repoName)
        resp = self.request_url(discoverRepoUrl)
        result = json.loads(resp.decode('utf-8'))
        for item in result['data']:
            if 'name' in item:
                name = item['name']
            if 'id' in item:
                id = item['id']
            repos[id]=name
        return repos
    
    def get_count_articles_of_repository_url(self,repoId,withFullText):
        ft = 'false'
        if(withFullText):
            ft = 'true'
                
        params = {
            'apiKey':self.api_key,
            'fulltext':ft
        }    
        return self.endpoint + "/articles/search/repositories.id:"+repoId+'?'+urlencode(params)
    
    def count_articles_of_repository(self,repoId,withFullText=False):
        countArticlesOfRepoUrl = self.get_count_articles_of_repository_url(repoId,withFullText)
        print(countArticlesOfRepoUrl)
        resp = self.request_url(countArticlesOfRepoUrl)
        result = json.loads(resp.decode('utf-8'))
        return result['totalHits']
    
    def get_url_of_download_articles_of_repository(self,repoId,fullText,page,pageSize):
        params={
            'apiKey':self.api_key,
            'page':page,
            'pageSize':pageSize,
            'fulltext':fullText
               }
        return self.endpoint + "/articles/search/repositories.id:"+str(repoId)+'?'+urlencode(params)
    
    def download_articles_of_repository(self, repoId, fulltext=True, page=1, pageSize=100):
        url = self.get_url_of_download_articles_of_repository(repoId,fulltext,page,pageSize)
        all_articles=[]
        resp = self.request_url(url)
        result = json.loads(resp.decode('utf-8'))
        if (result['totalHits']>100):
            numOfPages = int(result['totalHits']/self.pagesize)  #rounds down
            if (numOfPages>5):
                numOfPages=5
            for i in range(2,numOfPages):
                url = self.get_url_of_download_articles_of_repository(repoId,fulltext,page,pageSize)
                print(url)
                resp =self.request_url(url)
                all_articles.append(json.loads(resp.decode('utf-8')))
        return all_articles

In [2]:
'''
Initialise parameters
'''
# init 
endpoint = 'https://core.ac.uk/api-v2'

'''
********************************************
Add your own api key below
'''
api_key =""
# or get it from a config file
file = open("api_key.secret","r") 
api_key=file.read()
'''
********************************************
'''
'''
Create your api object
'''
api = CoreApiRequestor(endpoint,api_key)

In [3]:
api.get_url_of_download_articles_of_repository(39,True,1,100)

'https://core.ac.uk/api-v2/articles/search/repositories.id:39?fulltext=True&apiKey=20hIsS1F5j4D2C2iXrg4Wxf7VTp4Xt1j&page=1&pageSize=100'

In [4]:
url = api.get_url_of_download_articles_of_repository(1,True,39,12)

In [5]:
import pickle

In [6]:
repoId=86
'''
Use page size 50 
-
store as pickle every 200 articles (4 pages)
in the end you will have in the cwd files:
all_articles_on1_1.pkl
all_articles_on1_2.pkl
all_articles_on1_3.pkl
...

'''
all_articles=[]
for i in range(1,60):
    url = api.get_url_of_download_articles_of_repository(repoId,True,i,50)
    response = urlopen(url)
    resp = response.read()
    result = json.loads(resp.decode('utf-8'))
    all_articles.append(result)
    if (i%4==0):
        pickle.dump(all_articles,open('all_articles_on'+str(repoId)+'_'+str(i/4)+'.pkl','wb'),pickle.HIGHEST_PROTOCOL)
        all_articles=[]

In [7]:
try:
    articles = pickle.load(open('all_articles_on86_1.pkl','rb'))
except (OSError,IOError) as e:
    print(e)

In [8]:
articles[0]['data'][0]

{u'authors': [u'Mootien, Namasoondrum P.',
  u'Warren, James P.',
  u'Morris, Dick',
  u'Enoch, Marcus P.'],
 u'contributors': [],
 u'datePublished': u'2013',
 u'description': u'Aviation globally is characterised by significant change and consequently the future of the sector has always been difficult to predict. This study adopts a systemic approach based on findings from exploratory interviews with UK aviation academics to: determine the roles of stakeholders in the air transport system; report the current issues facing the sector; explore how these issues interact and impact on the stakeholders in the system; and speculate on the future implications. Six core stakeholders are identified: airlines, airports, consumers, manufacturers, governing institutions and interest groups. Nine core issues are reported, namely: local environment, climate change, peak oil, the state of the economy, social norms, demographics, disruptive events, national (or international) regulations and capacity.

In [9]:
text = articles[0]['data'][0]['fullText']

In [10]:
import re
match = re.findall(r'[\w\.-]+@[\w\.-]+', text)

In [11]:
match

[u'james.warren@open.ac.uk']

In [10]:
import re
all_emails = []
for page_pickle in range(1,10):
    try:
        articles = pickle.load(open('all_articles_on39_'+str(page_pickle)+'.pkl','rb'))
        for page in range(0,len(articles)):
            for item in articles[page]['data']:
                text = item['fullText']
                match = re.findall(r'[\w\.-]+@[\w\.-]+', text)
                print(match)
                all_emails.append(match)
    except (OSError,IOError) as e:
        print(e)

[u'j.k.knox@sms.ed.ac.uk', u'MOOCs@Edinburgh', u'78233395@N02', u'MOOCs@Edinburgh']
[u'crfr@ed.ac.uk']
[u'sarah.macpherson@ed.ac.uk']
[]
[u'crfr@ed.ac.uk']
[u'ibethune@epcc.ed.ac.uk', u'ibethune@epcc.ed.ac.uk']
[]
[u'crfr@ed.ac.uk']
[]
[u'h.lam@ed.ac.uk']
[]
[u'g.pennetta@ed.ac.uk', u'journals.permission@oup.com']
[]
[]
[u'crfr@ed.ac.uk']
[u'crfr@ed.ac.uk']
[u'thomas.bak@ed.ac.uk']
[u'crfr@ed.ac.uk']
[]
[u'e-Learning@Ed']
[u'h.lam@ed.ac.uk']
[u'crfr@ed.ac.uk']
[u'h.lam@ed.ac.uk']
[u'crfr@ed.ac.uk']
[]
[u'j.gales@ed.ac.uk', u'c.maluk@ed.ac.uk', u'luke.bisby@ed.ac.uk']
[u'h.lam@ed.ac.uk']
[]
[u'crfr@ed.ac.uk']
[u'ibethune@epcc.ed.ac.uk']
[u'crfr@ed.ac.uk']
[u'helpdesk@hector.ac.uk.']
[u'd.ahiaga-dagbui@ed.ac.uk', u'simon.smith@ed.ac.uk']
[u'Giovanni.terrasi@empa.ch', u'c.maluk@ed.ac.uk', u'luke.bisby@ed.ac.uk', u'erich.hugi@empa.ch', u'kanik@sacac.ch']
[]
[]
[]
[u's1135017@sms.ed.ac.uk', u's.abrahams@ed.ac.uk', u's1135017@sms.ed.ac.uk', u's.abrahams@ed.ac.uk']
[u'Martine.Manuel@ed.ac.uk'

In [9]:
len(all_emails)

200

In [11]:
type(all_emails)

list