In [1]:
from urllib2 import urlopen
from urllib import urlencode
from urllib import quote_plus
# import urllib.parse
import json
import pprint

class CoreApiRequestor:

    def __init__(self, endpoint, api_key):
        self.endpoint = endpoint
        self.api_key = api_key
        #defaults
        self.pagesize = 100
        self.page = 1

    def parse_response(self, decoded):
        res = []
        for item in decoded['data']:
            doi = None
            if 'identifiers' in item:
                for identifier in item['identifiers']:
                    if identifier and identifier.startswith('doi:'):
                        doi = identifier
                        break
            res.append([item['title'], doi])
        return res

    def request_url(self, url):
        print(url)
        response = urlopen(url)
        html = response.read()
        return html

    def get_method_query_request_url(self,method,query,fullText,page):
        if (fullText):
            fullText = 'true'
        else:
            fullText = 'false'
        params = {
            'apiKey':self.api_key,
            'page':page,
            'pageSize':self.pagesize,
            'fulltext':fullText
        }
        return self.endpoint + method + '/' + quote_plus(query) + '?' + urlencode(params)

    def get_up_to_20_pages_of_query(self,method,query,fulltext):
        url = self.get_method_query_request_url(method,query,fulltext,1)
        all_articles=[]
        resp = self.request_url(url)
        result = json.loads(resp.decode('utf-8'))
        all_articles.append(result)
        if (result['totalHits']>100):
            numOfPages = int(result['totalHits']/self.pagesize)  #rounds down
            if (numOfPages>20):
                numOfPages=20
            for i in range(2,numOfPages):
                url = self.get_method_query_request_url(method,query,False,i)
                print(url)
                resp =self.request_url(url)
                all_articles.append(json.loads(resp.decode('utf-8')))
        return all_articles
    
    def get_repository_articles_fulltext_as_dict(self,repository_id):
        return ""
    
    def get_search_repository_request_url(self,repoQuery,page=1,pageSize=10):
        params={
            'apiKey':self.api_key,
            'page':page,
            'pageSize':pageSize
        }
        return self.endpoint + "/repositories/search/"+quote_plus(repoQuery)+'?'+urlencode(params)
    
    def search_repository_ids_by_name(self,repoName):
        discoverRepoUrl = self.get_search_repository_request_url(repoName)
        resp = self.request_url(discoverRepoUrl)
        result = json.loads(resp.decode('utf-8'))
        for item in result['data']:
            if 'name' in item:
                name = item['name']
            if 'id' in item:
                id = item['id']
            repos[id]=name
        return repos
    
    def get_count_articles_of_repository_url(self,repoId,withFullText):
        ft = 'false'
        if(withFullText):
            ft = 'true'
                
        params = {
            'apiKey':self.api_key,
            'fulltext':ft
        }    
        return self.endpoint + "/articles/search/repositories.id:"+repoId+'?'+urlencode(params)
    
    def count_articles_of_repository(self,repoId,withFullText=False):
        countArticlesOfRepoUrl = self.get_count_articles_of_repository_url(repoId,withFullText)
        print(countArticlesOfRepoUrl)
        resp = self.request_url(countArticlesOfRepoUrl)
        result = json.loads(resp.decode('utf-8'))
        return result['totalHits']
    
    def get_url_of_download_articles_of_repository(self,repoId,fullText,page,pageSize):
        params={
            'apiKey':self.api_key,
            'page':page,
            'pageSize':pageSize,
            'fulltext':fullText
               }
        return self.endpoint + "/articles/search/repositories.id:"+str(repoId)+'?'+urlencode(params)
    
    def download_articles_of_repository(self, repoId, fulltext=True, page=1, pageSize=100):
        url = self.get_url_of_download_articles_of_repository(repoId,fulltext,page,pageSize)
        all_articles=[]
        resp = self.request_url(url)
        result = json.loads(resp.decode('utf-8'))
        if (result['totalHits']>100):
            numOfPages = int(result['totalHits']/self.pagesize)  #rounds down
            if (numOfPages>5):
                numOfPages=5
            for i in range(2,numOfPages):
                url = self.get_url_of_download_articles_of_repository(repoId,fulltext,page,pageSize)
                print(url)
                resp =self.request_url(url)
                all_articles.append(json.loads(resp.decode('utf-8')))
        return all_articles

In [2]:
'''
Initialise parameters
'''
# init 
endpoint = 'https://core.ac.uk/api-v2'

'''
********************************************
Add your own api key below
'''
api_key =""
# or get it from a config file
file = open("api_key.secret","r") 
api_key=file.read()
'''
********************************************
'''
'''
Create your api object
'''
api = CoreApiRequestor(endpoint,api_key)

In [3]:
api.get_url_of_download_articles_of_repository(1,True,1,100)

'https://core.ac.uk/api-v2/articles/search/repositories.id:1?fulltext=True&apiKey=nTo627BU8jPNth4EbsrDue9IXWzAfZiY&page=1&pageSize=100'

In [10]:
url = api.get_url_of_download_articles_of_repository(1,True,1,12)

In [11]:
import pickle

In [24]:
repoId=1
'''
Use page size 50 
-
store as pickle every 200 articles (4 pages)
in the end you will have in the cwd files:
all_articles_on1_1.pkl
all_articles_on1_2.pkl
all_articles_on1_3.pkl
...

'''
all_articles=[]
for i in range(1,40):
    url = api.get_url_of_download_articles_of_repository(repoId,True,i,50)
    response = urlopen(url)
    resp = response.read()
    result = json.loads(resp.decode('utf-8'))
    all_articles.append(result)
    if (i%4==0):
        pickle.dump(all_articles,open('all_articles_on'+str(repoId)+'_'+str(i/4)+'.pkl','wb'),pickle.HIGHEST_PROTOCOL)
        all_articles=[]

In [22]:
try:
    articles = pickle.load(open('all_articles_on1_1.pkl','rb'))
except (OSError,IOError) as e:
    print(e)

In [85]:
articles[0]['data'][0]

{u'authors': [u'Lenartowicz, Malgorzata',
  u'Kennedy, Christine',
  u'Hayes, Helen',
  u'McArdle, Harry J'],
 u'contributors': [u'University of Aberdeen, Medicine, Medical Sciences & Nutrition, RINH'],
 u'datePublished': u'2015-02',
 u'description': u'Acknowledgments The authors\u2019 work is supported by Scottish Government (Rural and Environmental Scientific and Analytical Services). We are grateful to Ms Val Stevens for analytical and technical assistance and to the Biological Resource Facility staff for husbandry and maintenance of the experimental animals. The authors declare no conflicts of interest. Open Access This article is distributed under the terms of the Creative Commons Attribution License which permits any use, distribution, and reproduction in any medium, provided the original author(s) and the source are credited.Peer reviewedPublisher PD',
 u'doi': u'10.1007/s10534-014-9802-z',
 u'fullText': u'Transcriptional regulation of copper metabolism genes\nin the liver of fe

In [27]:
text = articles[0]['data'][0]['fullText']

In [28]:
import re
match = re.findall(r'[\w\.-]+@[\w\.-]+', text)

In [29]:
match

[u'L.V.Burgin@leeds.ac.uk',
 u'Lorna.ramsay@education.gsi.gov.uk',
 u'r.aspden@abdn.ac.uk']

In [17]:
import pickle 
import re
def get_all_emails_of_repository(repoId):
    all_emails = []
    for page_pickle in range(1,13):
        try:
            articles = pickle.load(open('all_articles_on'+str(repoId)+'_'+str(page_pickle)+'.pkl','rb'))
            for page in range(0,len(articles)):
                for item in articles[page]['data']:
                    text = item['fullText']
                    match = re.findall(r'[\w\.-]+@[\w\.-]+', text)
#                     print(match)
                    all_emails.append(match)
        except (OSError,IOError) as e:
            print(e)
    return all_emails

In [18]:
email_of_1 = get_all_emails_of_repository(1)

[Errno 2] No such file or directory: 'all_articles_on1_10.pkl'
[Errno 2] No such file or directory: 'all_articles_on1_11.pkl'
[Errno 2] No such file or directory: 'all_articles_on1_12.pkl'


In [19]:
email_of_39 = get_all_emails_of_repository(39)

In [20]:
email_of_86 = get_all_emails_of_repository(86)

In [21]:
len(all_emails)

NameError: name 'all_emails' is not defined

In [23]:
def flatten_list(unevenList):
    flat_list=[]
    for sublist in unevenList:
        for item in sublist:
            flat_list.append(item)
    return flat_list

In [24]:
email_of_1_flat = flatten_list(email_of_1)
len(email_of_1_flat)

2936

In [25]:
email_of_39_flat = flatten_list(email_of_39)
email_of_86_flat = flatten_list(email_of_86)

In [27]:
#how many authors from aberdeen's articles are from aberdeen?
abdn_ctn=0
for i in email_of_1_flat:
    if i.endswith("abdn.ac.uk"):
        abdn_ctn +=1
print(abdn_ctn,len(email_of_1_flat))

(1132, 2936)


In [28]:
import pandas as pd

In [30]:
world_domains = pd.read_json('world_universities_and_domains.json')

In [33]:
world_domains.head()

Unnamed: 0,alpha_two_code,country,domain,name,web_page
0,US,United States,calbaptist.edu,California Baptist University,http://www.calbaptist.edu/
1,US,United States,stevenson.edu,Stevenson University,http://www.stevenson.edu/
2,US,United States,mbl.edu,Marine Biological Laboratory,http://www.mbl.edu/
3,US,United States,maryville.edu,Maryville University,http://www.maryville.edu/
4,US,United States,aii.edu,The Art Institutes,http://www.aii.edu/


In [34]:
world_domains.tail()

Unnamed: 0,alpha_two_code,country,domain,name,web_page
9450,US,United States,ewc.wy.edu,Eastern Wyoming College,http://ewc.wy.edu
9451,US,United States,lccc.wy.edu,Laramie County Community College,http://www.lccc.wy.edu
9452,US,United States,northwestcollege.edu,Northwest College,http://www.northwestcollege.edu
9453,US,United States,sheridan.edu,Sheridan College,http://www.sheridan.edu
9454,US,United States,wwcc.wy.edu,Western Wyoming Community College,http://www.wwcc.wy.edu


In [47]:
emails_df = pd.DataFrame.from_records({'email':email_of_1_flat+email_of_39_flat+email_of_86_flat})

In [48]:
emails_df

Unnamed: 0,email
0,L.V.Burgin@leeds.ac.uk
1,Lorna.ramsay@education.gsi.gov.uk
2,r.aspden@abdn.ac.uk
3,david.mcghee@abdn.ac.uk
4,j.mercer@abdn.ac.uk
5,michelle.murphy@abdn.ac.uk
6,Sarah.Harris@igmm.ed.ac.uk.
7,T.Gillingwater@ed.ac.uk.
8,jiabao.he@abdn.ac.uk
9,s.schwarzkopf@gmail.com


In [49]:
def extract_domain(email):
    return email.split("@")[1]

In [50]:
extract_domain("mackenzie@abdn.ac.uk")

'abdn.ac.uk'

In [52]:
emails_df['domain']=emails_df.apply(lambda row: extract_domain(row['email']),axis=1)
emails_df

Unnamed: 0,email,domain
0,L.V.Burgin@leeds.ac.uk,leeds.ac.uk
1,Lorna.ramsay@education.gsi.gov.uk,education.gsi.gov.uk
2,r.aspden@abdn.ac.uk,abdn.ac.uk
3,david.mcghee@abdn.ac.uk,abdn.ac.uk
4,j.mercer@abdn.ac.uk,abdn.ac.uk
5,michelle.murphy@abdn.ac.uk,abdn.ac.uk
6,Sarah.Harris@igmm.ed.ac.uk.,igmm.ed.ac.uk.
7,T.Gillingwater@ed.ac.uk.,ed.ac.uk.
8,jiabao.he@abdn.ac.uk,abdn.ac.uk
9,s.schwarzkopf@gmail.com,gmail.com


In [53]:
merged_domains = pd.merge(emails_df,world_domains , left_on='domain',right_on="domain")

In [54]:
merged_domains

Unnamed: 0,email,domain,alpha_two_code,country,name,web_page
0,L.V.Burgin@leeds.ac.uk,leeds.ac.uk,GB,United Kingdom,University of Leeds,http://www.leeds.ac.uk/
1,T.R.A.Ensor@leeds.ac.uk,leeds.ac.uk,GB,United Kingdom,University of Leeds,http://www.leeds.ac.uk/
2,T.R.A.Ensor@leeds.ac.uk,leeds.ac.uk,GB,United Kingdom,University of Leeds,http://www.leeds.ac.uk/
3,n.alwan@leeds.ac.uk,leeds.ac.uk,GB,United Kingdom,University of Leeds,http://www.leeds.ac.uk/
4,j.d.walley@leeds.ac.uk,leeds.ac.uk,GB,United Kingdom,University of Leeds,http://www.leeds.ac.uk/
5,r.m.wilkie@leeds.ac.uk,leeds.ac.uk,GB,United Kingdom,University of Leeds,http://www.leeds.ac.uk/
6,d.miller@leeds.ac.uk,leeds.ac.uk,GB,United Kingdom,University of Leeds,http://www.leeds.ac.uk/
7,pscplab@leeds.ac.uk,leeds.ac.uk,GB,United Kingdom,University of Leeds,http://www.leeds.ac.uk/
8,s.r.kingsbury@leeds.ac.uk,leeds.ac.uk,GB,United Kingdom,University of Leeds,http://www.leeds.ac.uk/
9,d.g.jayne@leeds.ac.uk,leeds.ac.uk,GB,United Kingdom,University of Leeds,http://www.leeds.ac.uk/


In [69]:
# merged_domains.groupby(['domain']).agg(['count']).reset_index('count').sort_values(['count'], ascending=False)
merged_domains.groupby(['domain']).count().sort(['email'],ascending=False)#.reset_index('count').sort_values(['count'], ascending=False)

Unnamed: 0_level_0,email,alpha_two_code,country,name,web_page
domain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
abdn.ac.uk,1126,1126,1126,1126,1126
open.ac.uk,1108,1108,1108,1108,1108
ed.ac.uk,685,685,685,685,685
ncl.ac.uk,69,69,69,69,69
nottingham.ac.uk,39,39,39,39,39
unsw.edu.au,26,26,26,26,26
cam.ac.uk,24,24,24,24,24
dundee.ac.uk,24,24,24,24,24
ucl.ac.uk,23,23,23,23,23
kcl.ac.uk,21,21,21,21,21


In [89]:
import textrazor
#pip install textrazor
file = open("article_19787393.txt","r") 
testarticle=file.read()

textrazor.api_key = "eba9d80209aaad9627277d21437d2e0f56ef9c1e6e16332f221af063"

client = textrazor.TextRazor(extractors=["entities", "topics"])
response = client.analyze(testarticle)

for entity in response.entities():
    print entity.id, entity.relevance_score, entity.confidence_score, entity.freebase_types

Species 0.3804 2.214 [u'/biology/organism_classification', u'/biology/organism_classification_rank']
Regeneration (biology) 0.5071 1.426 []
Natural environment 0.5846 1.335 [u'/book/book_subject', u'/law/legal_subject', u'/organization/organization_sector']
Pascal (unit) 0.3716 7.744 [u'/time_series/unit', u'/type/unit', u'/symbols/namesake', u'/measurement_unit/pressure_unit']
Pascal (unit) 0.3716 7.744 [u'/time_series/unit', u'/type/unit', u'/symbols/namesake', u'/measurement_unit/pressure_unit']
Pascal (unit) 0.3716 7.744 [u'/time_series/unit', u'/type/unit', u'/symbols/namesake', u'/measurement_unit/pressure_unit']
Pascal (unit) 0.3716 7.744 [u'/time_series/unit', u'/type/unit', u'/symbols/namesake', u'/measurement_unit/pressure_unit']
Pascal (unit) 0.3716 7.744 [u'/time_series/unit', u'/type/unit', u'/symbols/namesake', u'/measurement_unit/pressure_unit']
Pascal (unit) 0.3716 7.744 [u'/time_series/unit', u'/type/unit', u'/symbols/namesake', u'/measurement_unit/pressure_unit']
Pasc