In [141]:
from alphabet_detector import AlphabetDetector
import pandas as pd
import requests
import json
from sqlalchemy import create_engine
from sqlalchemy import text as sql_text

# Inputs for the MAK POST request, including the API key
HEADERS = {
    'Ocp-Apim-Subscription-Key': 'a9a9efa851b44d5bbd6c841215a99e00',
    'Content-Type': 'application/x-www-form-urlencoded'
}

# Fields to return from MAK
FIELDS = ["Id","Ti","D","AA.AuN","AA.AuId","F.FId","L","C.CN","E",
          "J.JId","AA.AfId","CC","ECC","AA.AfN","J.JN"]


class TitleProcessor(AlphabetDetector):
    '''Processes a pure utf-8 title into something ready for a MAK query.'''
    def process_title(self, title):
        # Get replace non-alphanums (allowing foreign characters)
        result = "".join([x
                          if len(self.detect_alphabet(x)) > 0
                          or x.isnumeric()
                          else " " for x in title.lower()])
        # Replace double-spaces with single-spaces
        while "  " in result:
            result = result.replace("  "," ")        
        return result


'''Find matches to titles from the MAK database.

    raw_titles: A list of titles in the form (id, title)
    call_limit: The maximum number of MAK API calls. 
                NB: Nesta's allowance is 10,000 per month.
'''
def mak_from_titles(raw_titles, call_limit, optional_columns):

    # Make arXiv titles match MAK title format (strip non-alphanums,
    # allowing foreign chars)
    tp = TitleProcessor()
    titles = [(pid,tp.process_title(t)) for pid,t in raw_titles]
    # Maximum of title_count titles, returning query_count results
    title_count = 600
    title_offset = 0
    query_count = 1000

    # Count the number of calls for book-keeping
    calls = 0

    # Iterate until done
    data = []
    while title_offset < len(titles):
        # A soft limit so that we don't overrun the API limit
        if calls >= call_limit:
            break
        calls += 1

        # Get the index of the final title
        last_title = title_offset+title_count
        # Python indexing [n:None] will return n --> end
        if last_title > len(titles):
            last_title = None
        # Get the title subset for this query
        titles_subset = titles[title_offset:last_title]
        title_offset += title_count        

        # Generate the MAK query (OR statement of titles (Ti))
        expr = ["Ti='"+t+"'" for _,t in titles_subset]
        print("Posting",len(expr),"queries")
        expr = ','.join(expr)
        expr = "expr=OR("+expr+")"
        
        # Write and launch the query
        query = expr+"&count="+str(query_count)+"&attributes="+",".join(FIELDS)
        r = requests.post('https://westus.api.cognitive.microsoft.com/academic/v1.0/evaluate',
                          data=query.encode("utf-8"), headers=HEADERS)
        try:
            js = r.json()
        except json.decoder.JSONDecodeError as err:
            print("Error with status code ",r.status_code)
            print(r.text)
            raise err
        # Print out some stats
        print("Got",len(js["entities"]),"results")
        
        # Append the results to the output
        for pid,t in titles_subset:
            # Flag in case no match is found
            matched = False
            for row in js["entities"]:                
                if t != row["Ti"]:
                    continue
                matched = True
                break
            # Default in case no match is found
            if not matched:
                data.append(dict(pid=pid,title=t,matched=False))
                continue
            # If a match was found, extract info        
            insts = list(set(author["AfN"] for author in row["AA"] if "AfN" in author))

            # Convert "extended metadata" (E) to json, then extract arxiv IDs
            arxiv_sources = []
            if "E" in row:
                if type(row["E"]) is not dict:
                    row["E"] = json.loads(row["E"])
                if 'S' in row["E"]:
                    for source in row["E"]["S"]:
                        if "U" not in source:
                            continue
                        if not source['U'].startswith("https://arxiv.org/"):
                            continue
                        arxiv_sources.append(source['U'])
                # Add then mandatory fields
            data_row = dict(pid=pid,title=t, institutes=insts, arxiv_sources=arxiv_sources,
                            citations=row["CC"], date=row["D"], matched=True)            
            # Then add optional fields
            for long, short in optional_columns.items():                
                second = None
                if "." in short:
                    short, second = short.split(".")
                if short in row:
                    if second is None:
                        data_row[long] = row[short]
                    elif second in row[short]:
                        data_row[long] = row[short][second]
            data.append(data_row)
            
    # Print summary statistics
    nmatch = 0 
    nboth = 0
    for row in data:
        if not row["matched"]:
            continue
        nmatch += 1
        if row["citations"] > 0 and len(row["institutes"]) > 0:
            nboth += 1
    print("Made",calls,"calls")
    print("Got",nmatch,"matches from",len(data),"queries, of which",
          nboth,"contained both institutes and citation information")
    # Done
    return data

# Stolen from https://stackoverflow.com/a/434328/1571593
def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

# Execute IN statements in chunks
def execute_IN_in_chunks(con, query, chunkable, chunk_size):
    output = []
    for chunk in chunker(chunkable, chunk_size):
        result = con.execute(sql_text(query), values=tuple(chunk))
        output += result.fetchall()
    return output

## Step 1: Match CS arxiv articles to MAG API

In [143]:
# df_arxiv = pd.read_json("data/cs_arxiv.json", orient="records")
# raw_titles = [(row.id, row.raw_title) for _, row in df_arxiv.iterrows()]

optional_columns = dict(language="L", full_title="E.DN",
                        conference="CN", journal="E.BV", doi="E.DOI")

data = mak_from_titles(raw_titles, call_limit=300, optional_columns=optional_columns)
df_magapi = pd.DataFrame(data)

Posting 600 queries
Got 613 results
Posting 600 queries
Got 595 results
Posting 600 queries
Got 436 results
Posting 600 queries
Got 552 results
Posting 600 queries
Got 611 results
Posting 600 queries
Got 647 results
Posting 600 queries
Got 643 results
Posting 600 queries
Got 636 results
Posting 600 queries
Got 652 results
Posting 600 queries
Got 643 results
Posting 600 queries
Got 628 results
Posting 600 queries
Got 636 results
Posting 600 queries
Got 659 results
Posting 600 queries
Got 617 results
Posting 600 queries
Got 635 results
Posting 600 queries
Got 645 results
Posting 600 queries
Got 631 results
Posting 600 queries
Got 619 results
Posting 600 queries
Got 590 results
Posting 600 queries
Got 535 results
Posting 600 queries
Got 578 results
Posting 600 queries
Got 616 results
Posting 600 queries
Got 603 results
Posting 600 queries
Got 593 results
Posting 600 queries
Got 626 results
Posting 600 queries
Got 638 results
Posting 600 queries
Got 604 results
Posting 600 queries
Got 618 

In [167]:
df_magapi.to_json("data/magapi_arxiv_match.json", orient="records")

In [200]:
len(df_magapi)

168527

In [145]:
df_magapi.head()

Unnamed: 0,arxiv_sources,citations,date,doi,full_title,institutes,journal,language,matched,pid,title
0,[https://arxiv.org/abs/0704.0002],17.0,2009-05-01,10.1007/s00373-008-0834-4,Sparsity-certifying Graph Decompositions,"[smith college, university of massachusetts am...",Graphs and Combinatorics,en,True,oai:arXiv.org:0704.0002,sparsity certifying graph decompositions
1,,,,,,,,,False,oai:arXiv.org:0704.0046,a limit relation for entropy and channel capac...
2,[https://arxiv.org/abs/0704.0047],8.0,2003-02-01,10.1108/00022660310457248,Intelligent location of simultaneously active ...,[university of ljubljana],Aircraft Engineering and Aerospace Technology,en,True,oai:arXiv.org:0704.0047,intelligent location of simultaneously active ...
3,[],0.0,2007-01-01,,Intelligent location of simultaneously active ...,[university of ljubljana],arXiv preprint arXiv:0704.0050,en,True,oai:arXiv.org:0704.0050,intelligent location of simultaneously active ...
4,,,,,,,,,False,oai:arXiv.org:0704.0062,on line viterbi algorithm and its relationship...


## Step 2a: Match MAG API to MAG DB on DOI

In [146]:
# Open a DB connections
with open('/Users/jklinger/Nesta-AWS/AWS-RDS-config/open-academic-graph.config') as f:
        host, port, _, user, password = f.read().split(':')
database_uri = 'postgresql://{}:{}@{}/{}'.format(user, password, host, "microsoft_academic_graph")
con = create_engine(database_uri)

# 
query = '''select paper from microsoft_academic_graph where ((((paper ->> 'doi'::text))::character varying(255))) in :values;'''
dois = df_magapi.loc[~pd.isnull(df_magapi.doi),"doi"]

papers = execute_IN_in_chunks(con, query, dois, 500)


In [169]:
df_papers = pd.DataFrame([p[0] for p in papers]).add_prefix('oag_')
df_papers.head()

Unnamed: 0,oag_abstract,oag_authors,oag_doc_type,oag_doi,oag_fos,oag_id,oag_issue,oag_keywords,oag_lang,oag_n_citation,oag_page_end,oag_page_start,oag_publisher,oag_references,oag_title,oag_url,oag_venue,oag_volume,oag_year
0,In most discussions about information and know...,"[{'org': 'GRDS-EBSI, Université de Montréal, C...",Journal,10.1002/asi.v60:9,"[Natural language processing, Design, Social s...",f65c2f5a-648f-4dcc-a974-f841a7d59f4b,9,"[lenguaje natural, langage naturel, conception...",en,50.0,1906,1895,"Wiley Subscription Services, Inc., A Wiley Com...","[01671f95-823d-41d3-96e0-78d17875260d, 02601ff...",Intertextual semantics: A semantics for inform...,[http://onlinelibrary.wiley.com/doi/10.1002/as...,Journal of the Association for Information Sci...,60,2009
1,Many algorithms have been implemented for the ...,[{'org': 'Arab Academy for Banking and Financi...,Journal,10.1002/asi.v60:9,"[Natural language processing, Speech recogniti...",efc8ba6d-0902-4492-8925-4c852235574d,9,"[search result, rocchio and naive bayes, estud...",en,50.0,1844,1836,"Wiley Subscription Services, Inc., A Wiley Com...","[01e036ec-11c7-4251-98cc-13d11b59d0f0, 2082b5a...",A comparison of text-classification techniques...,[http://onlinelibrary.wiley.com/doi/10.1002/as...,Journal of the Association for Information Sci...,60,2009
2,,[{'org': 'Royal School of Library and Informat...,Journal,10.1002/asi.v60:9,"[Computer Science, Interactive media, Informat...",d2b585ff-07b7-4a44-a184-872f91906fa4,9,[interactive information retrieval],en,,1945,1944,"Wiley Subscription Services, Inc., A Wiley Com...",,Interactive Information Retrieval in Digital E...,[http://onlinelibrary.wiley.com/doi/10.1002/as...,Journal of the Association for Information Sci...,60,2009
3,,"[{'org': '435 East 70th Street, Apartment 30D,...",Journal,10.1002/asi.v60:9,,c89cd70c-9336-4b78-b3de-eb3b26d33c1c,9,,en,50.0,1942,1942,"Wiley Subscription Services, Inc., A Wiley Com...","[14d853c3-57d7-4163-ba84-f9e8fa911a38, 50429db...",Design: The vision and the plans: Additional r...,"[http://dl.acm.org/citation.cfm?id=1598912, ht...",Journal of the Association for Information Sci...,60,2009
4,The universe of information has been enriched ...,"[{'org': 'Department of Information Science, B...",Journal,10.1002/asi.v60:9,"[Method, Web query classification, Computer Sc...",b4dcacf4-265c-47ef-ba2f-57e8d027c3cd,9,"[dynamic change, infometrie, web evolve 2008, ...",en,50.0,1740,1730,"Wiley Subscription Services, Inc., A Wiley Com...","[06a3f364-4630-402c-8ed7-7f0ede34ff1a, 0b489d9...",A method for measuring the evolution of a topi...,[http://journal.webscience.org/32/1/WebEvolve2...,Journal of the Association for Information Sci...,60,2009


In [181]:
len(df_papers), len(set(df_papers.oag_doi)), len(df_papers.drop_duplicates(subset="oag_doi"))

(63964, 59948, 59948)

## Step 3: Join it all together and save

In [196]:
_df = df_papers.drop_duplicates(subset="oag_doi")
condition = ~pd.isnull(df_magapi.doi)
df_magapi_magdb = _df.join(df_magapi.loc[condition].set_index("doi").add_prefix("mag_"), on="oag_doi")
df_magapi_magdb.rename(columns = {'mag_pid': 'arxiv_id'}, inplace=True)

In [197]:
df_magapi_magdb.head()

Unnamed: 0,oag_abstract,oag_authors,oag_doc_type,oag_doi,oag_fos,oag_id,oag_issue,oag_keywords,oag_lang,oag_n_citation,...,mag_arxiv_sources,mag_citations,mag_date,mag_full_title,mag_institutes,mag_journal,mag_language,mag_matched,arxiv_id,mag_title
0,In most discussions about information and know...,"[{'org': 'GRDS-EBSI, Université de Montréal, C...",Journal,10.1002/asi.v60:9,"[Natural language processing, Design, Social s...",f65c2f5a-648f-4dcc-a974-f841a7d59f4b,9,"[lenguaje natural, langage naturel, conception...",en,50.0,...,[https://arxiv.org/abs/0812.4332?context=cs],161.0,2009-09-01,Content-based and algorithmic classifications ...,"[university of amsterdam, university of sussex]",Journal of the Association for Information Sci...,en,True,oai:arXiv.org:0812.4332,content based and algorithmic classifications ...
28,In this paper we investigate the effects of te...,"[{'org': 'Department of Finance, The Chinese U...",Journal,10.1002/asmb.v25:3,"[Financial economics, Econometrics, Characteri...",ed6b3532-1de5-40a4-b811-5f05d9e618c3,3,"[high order moments, markov switching, mixing ...",en@@@zh_cht,50.0,...,[],0.0,2009-05-01,Assessment and propagation of input uncertaint...,[instituto de estudios superiores de administr...,Applied Stochastic Models in Business and Indu...,en,True,oai:arXiv.org:0704.1768,assessment and propagation of input uncertaint...
38,Map the vertices of a graph to (not necessaril...,[{'org': '(Chargée de Recherches du F.R.S.—FNR...,Journal,10.1002/jgt.20554,"[Graph power, Petersen graph, Unit disk graph,...",d1c359e7-bcb1-4433-b77b-a3e393eec667,3,"[packing non overlapping unit discs, circular ...",en,50.0,...,[https://arxiv.org/abs/0812.4346?context=cs],4.0,2011-11-01,The plane-width of graphs,"[university of toronto, university of primorsk...",Journal of Graph Theory,en@@@fr,True,oai:arXiv.org:0812.4346,the plane width of graphs
39,A new notion of partition-determined functions...,"[{'org': 'Department of Statistics, Yale Unive...",Journal,10.1002/rsa.20385,"[Combinatorics, Number theory, Discrete mathem...",f78e174e-8566-451d-8113-7ee891750c18,4,"[cardinality inequalities, entropy inequalitie...",en,50.0,...,[https://arxiv.org/abs/0901.0055?context=cs],24.0,2012-07-01,Entropy and set cardinality inequalities for p...,"[yale university, georgia institute of technol...",Random Structures and Algorithms,en,True,oai:arXiv.org:0901.0055,entropy and set cardinality inequalities for p...
40,Abstract#R##N##R##N#Biometrics make human iden...,"[{'org': 'Sagem Sécurité, Osny, France', 'name...",Journal,10.1002/sec.206,"[Identification, Computer Science, Bloom filte...",6b66e31d-a3ba-4075-a16a-cc9f500bb2f0,5,"[locality sensitive hashing, bloom filter, bio...",en,50.0,...,[https://arxiv.org/abs/0901.1062?context=cs],28.0,2011-01-05,Identification with encrypted biometric data,[sagem],Security and Communication Networks,en,True,oai:arXiv.org:0901.1062,identification with encrypted biometric data


In [198]:
df_magapi_magdb.to_json("data/magapi_oag_arxiv_match.json", orient="records")

In [199]:
len(df_magapi_magdb)

61701