This will extract year, citation, and abstract for the research data 
Input: .txt file of research
Output: {ID: (year, citation, abstract)}

In [None]:
#id  string: S2 generated research paper ID
#title  string: Research paper title.
#paperAbstract  string: Extracted abstract of the paper
#entities  list: S2 extracted list of relevant entities or topics.
#s2Url  string: URL to S2 research paper details page
#s2PdfUrl  string: URL to PDF on S2 if available.
#pdfUrls  list: URLs related to this PDF scraped from the web.
#authors  list: List of authors with an S2 generated author ID and name.
#inCitations  list: List of S2 paperId's which cited this paper.
#outCitations  list: List of paperId's which this paper cited.
#year  int: Year this paper was published as integer.
#venue  string: Extracted venue published.
#journalName  string: Name of the journal that published this paper.
#journalVolume  string: The volume of the journal where this paper was published.
#journalPages  string: The pages of the journal where this paper was published.
#sources  list: Identifies papers sourced from DBLP or Medline.
#doi  string: Digital Object Identifier registered at doi.org.
#doiUrl  string: DOI link for registered objects.
#pmid  string: Unique identifier used by PubMed.

In [20]:
import os
import re

cwd = os.getcwd() # get the current working directory 
with open(cwd + '/sample-S2-records.txt') as myfile:
    data = myfile.read().splitlines() # a list with each line as an entry

In [17]:
first_entry = data[0] # first entry
print(data[0])

{"entities":["Epithelial ovarian cancer","Excision","Extraction","Hospital admission","Malignant neoplasm of ovary","Morbidity - disease rate","Neoadjuvant Therapy","Neoplasms","Overall Survival","Patients","Postoperative Complications","Residual Tumor","SLC13A5 gene","Stage IV Ovarian Carcinoma","Tumor Debulking","intensive care unit","ovarian neoplasm","stage IV childhood Hodgkin's lymphoma"],"journalVolume":"19","journalPages":"959-965","pmid":"21994038v1","year":2011,"outCitations":["166ac4b6f694c68dafdc912ca0c336b4c444fd9e","f0223f8d1920009d1afccffe2d4129f2211711cf","aaee9e127e63a4ee8baae2bc8a960f4a42afce78","03029e4427cfe66c3da6257979dc2d5b6eb3a0e4","d14434966fd87e94c97ed88938ea3dd5282d7652","d6fa05d67f9a6fc3256d05c81a4c55c472e78b0c","e9b9d08937fd2e603d5f8106ccaf81589dfa9cdf","943c32265fa79b20b925f6cc450db19b21c47bc0","07407593b49bc95f8d0c3c3ab912d78564db3302","fedaf6c7ea8b58e501b667603b8a23f1756ce375","ce84de64d8388258243244bbb4aef290e6022d30","399e7acd3fb9fa3f7bcf75e378e6176f7f

In [19]:
print("Length of data:", len(data))

Length of data: 102


In [161]:
# looking for inCitations, year, and paperAbstract
# The label is the descriptor. Ex. "year", "id", "abstract"
# The tag is the unique identifier. Ex "2011", "4cbba8127c8747a3b2cfb9c1f48c43e5c15e323e"
parenthesis = "\"" # string literal for "

paper_mappings = {}

# finding the id tag 
paper_id_tag = "\"id\"" # find the first occurence of "id"
id_label_start = first_entry.find(paper_id_tag) # this is the index that the id label starts
id_label_end = id_tag_start + 4 # this is the index that the id label ends 
id_tag_start = id_label_start + 6 # this is the index that the id tag starts. Always be 6. 
id_tag_end = first_entry.find(parenthesis, id_tag_start) - 1 #this is the index that the id tag ends
id_tag = first_entry[id_tag_start:id_tag_end] # id tag string 

# find year 
paper_year_tag = "\"year\"" # find the first occurence of "year"
year_label_start = first_entry.find(paper_year_tag) # index that the year label starts 
year_label_end = year_label_start + 7 # index that the year label ends 
year_tag_start = year_label_start + 7 # index that the year tag starts 
year_tag_end = first_entry.find(parenthesis, year_tag_start) - 1 # this is the index that the year tag ends
year_tag = first_entry[year_tag_start:year_tag_end] # year tag string

# find number of citations 
# check edge cases where "inCitations":[], "inCitations":[aienf23f9j023], "inCitations":[anea39rjna3rj, a3r98a3hrn9a38hr]
# we'll count by commas, but work the edge cases
bracket = r"]" # look for "]"
paper_citation_tag = "\"inCitations\"" # find the occurence of "inCitations"
citation_label_start = first_entry.find(paper_citation_tag) # index that the citation label starts 
citation_label_end = citation_label_start + 15 # index that the citation label ends
citation_tag_start = citation_label_start + 15 # index that the citation tag starts
citation_tag_end = first_entry.find(bracket, citation_tag_start)  # this is the index that the citation tag ends 
citation_list = first_entry[citation_tag_start:citation_tag_end].split(",")
num_citations = len(citation_list)

# find abstract 
paper_abstract_tag = "\"paperAbstract\""
abstract_label_start = first_entry.find(paper_abstract_tag)
abstract_label_end = abstract_label_start + 17
abstract_tag_start = abstract_label_start + 17 # the start of the abstract tag 
abstract_tag_end = first_entry.find(parenthesis, abstract_tag_start) # the end of the abstract tag
abstract_tag = first_entry[abstract_tag_start:abstract_tag_end]

# mapping = {id:(year, number of citations, abstract)}
paper_mappings[id_tag] = (year_tag, num_citations, abstract_tag)

In [162]:
print(paper_mappings)

{'4cbba8127c8747a3b2cfb9c1f48c43e5c15e323': ('2011', 27, 'Primary debulking surgery (PDS) has historically been the standard treatment for advanced ovarian cancer. Recent data appear to support a paradigm shift toward neoadjuvant chemotherapy with interval debulking surgery (NACT-IDS). We hypothesized that stage IV ovarian cancer patients would likely benefit from NACT-IDS by achieving similar outcomes with less morbidity. Patients with stage IV epithelial ovarian cancer who underwent primary treatment between January 1, 1995 and December 31, 2007, were identified. Data were retrospectively extracted. Each patient record was evaluated to subclassify stage IV disease according to the sites of tumor dissemination at the time of diagnosis. The Kaplan–Meier method was used to compare overall survival (OS) data. A total of 242 newly diagnosed stage IV epithelial ovarian cancer patients were included in the final analysis; 176 women (73%) underwent PDS, 45 (18%) NACT-IDS, and 21 (9%) chemoth

In [185]:
# mapping ids to (year, number of citations, and abstract tag)
# TO DO 
# some research papers don't have years! Write an exception for this 
# some research papers may have no citations! Write exceptiosn for this 
# some research papers just don't have any abstracts lol. Fix this. 

paper_mappings = {} # set up the dictionary to store information 

# setting up important variables 
parenthesis = "\"" # string literal for "
bracket = r"]" # look for "]"
paper_id_tag = "\"id\"" # find the first occurence of "id"
paper_year_tag = "\"year\"" # find the first occurence of "year"
paper_citation_tag = "\"inCitations\"" # find the occurence of "inCitations"
paper_abstract_tag = "\"paperAbstract\""

# loop through each line of the data 
for line in data: 
    # look for the labels "id", "year", "inCitations", and "paperAbstract"
    id_label_start = line.find(paper_id_tag) # this is the index that the id label starts
    year_label_start = line.find(paper_year_tag) # index that the year label starts 
    citation_label_start = line.find(paper_citation_tag) # index that the citation label starts 
    abstract_label_start = line.find(paper_abstract_tag) # index that the abstrat label starts 
    
    # look for the tag of each label 
    id_tag_start = id_label_start + 6 # this is the index that the id tag starts. Always be 6. 
    year_tag_start = year_label_start + 7 # index that the year tag starts 
    citation_tag_start = citation_label_start + 15 # index that the citation tag starts
    abstract_tag_start = abstract_label_start + 17 # the start of the abstract tag 
    
    # look for the last index of each tag 
    id_tag_end = line.find(parenthesis, id_tag_start) - 1 #this is the index that the id tag ends
    year_tag_end = line.find(parenthesis, year_tag_start) - 1 # this is the index that the year tag ends
    citation_tag_end = line.find(bracket, citation_tag_start)  # this is the index that the citation tag ends 
    abstract_tag_end = line.find(parenthesis, abstract_tag_start) # the end of the abstract tag
    
    # extract the tag
    id_tag = line[id_tag_start:id_tag_end] # id tag string 
    year_tag = line[year_tag_start:year_tag_end] # year tag string
    citation_list = line[citation_tag_start:citation_tag_end].split(",") # make it a list, count number of entries
    num_citations = len(citation_list) # number of citations 
    abstract_tag = line[abstract_tag_start:abstract_tag_end] # abstract tag string 
    
    if id_tag not in paper_mappings: 
        paper_mappings[id_tag] = (year_tag, num_citations, abstract_tag)

In [189]:
print("Useful information for last entry")
print()
print("ID label index start:",id_label_start)
print("Year label index start:",year_label_start)
print("Citation label index start:",citation_label_start)
print("Abstract label index start:",abstract_label_start)
print()
print("ID tag index start:", id_tag_start)
print("Year tag index start:", year_tag_start)
print("Citation tag index start:", citation_tag_start)
print("Abstract tag index start:", abstract_tag_start)
print()
print("ID tag index end:", id_tag_end)
print("Year tag index end:", year_tag_end)
print("Citation tag index end:", citation_tag_end)
print("Abstract tag index end:", abstract_tag_end)
print()
print("ID tag:", id_tag)
print("Year tag:", year_tag)
print("Citation tag:", num_citations)
print("Abstract tag:", abstract_tag)
print()
print("Number of mappings in paper_mappings", len(paper_mappings))


Useful information for last entry

ID label index start: 1743
Year label index start: 248
Citation label index start: 2295
Abstract label index start: 1861

ID tag index start: 1749
Year tag index start: 255
Citation tag index start: 2310
Abstract tag index start: 1878

ID tag index end: 1788
Year tag index end: 259
Citation tag index end: 2825
Abstract tag index end: 2293

ID tag: 043f91aa4e90077b17231adc6e6e98ceb8e88f6
Year tag: 2007
Citation tag: 12
Abstract tag: We introduce a Sumii-Pierce-Koutavas-Wand-style bisimulation for Pitts and Stark’s nucalculus, a simply-typed lambda calculus with fresh name generation. This bisimulation coincides with contextual equivalence and provides a usable and elementary method for establishing all the subtle equivalences given by Stark [29]. We also describe the formalization of soundness and of the examples in the Coq proof assistant.

Number of mappings in paper_mappings 102


In [192]:
print(paper_mappings.keys())

dict_keys(['4cbba8127c8747a3b2cfb9c1f48c43e5c15e323', '4c61478345166be0d917854bd5e5f42a6ade236', '34ca6d85db744543ddc27d74d7f225b13c66b95', '3316b8b97c1e17ac93f220f4b64842905c40cd9', '58ff17c7d8ca006731facf7771761946350db06', 'f487c60cc4b4637584f89d4bc892aae4eb8c8a1', '69662bd2a2f5ff9cf2b0230bcc852290eadfbfc', '3f1b5aa320422a4df10ed248898125e6fd3440d', '453ae606c4c5f2dd278b8892b1eed9ecd5154ec', 'c739d07173f366ba99a948c7888459c5d70adcf', 'cb61fc1ebdeb5835460c18044d331388d5b1067', '50ef31b58a30dfefb624db6f72cda7bc242cde5', '10b5e8d1ab6f8002c89b2bfe4002a22c4f6ea9d', 'ccc67f73db54afc3f016ba5449d30235b7c256b', '7e1dc0e805fbb13112da580c37c50d117944afe', '7331b32342d54e97ca7e32f01b9e1ae94ed563d', '177e662ac662c21b02f28d8bf6c35ef98487411', 'ac1ccf2f8d373a4964b127ba651a21b75e184be', '06f3d20b2c9191b4f03761a61312a1c71345a00', 'b048c1886c86e65fdfa489038372f938cf62a59', 'a19381d3b56fbe1b7e5426c92ee1e140cd752e5', '213a87540ca26cc97a356689654dd4338cb327a', 'dabb12f12f5ea266754c7b966af3f588a885ff2', 

In [198]:
# testing years 
keys = list(paper_mappings.keys())
for key in keys: 
    if paper_mappings[key][0] == "tie":
        print(key)
    print(paper_mappings[key][0])


2011
1990
1976
1996
1995
2009
2004
2010
2009
1989
1989
2013
2006
2006
2011
1981
2001
2004
1993
2001
2015
2014
2016
1981
2006
2016
1987
2015
1956
2017
2008
1990
2014
1975
2006
6739f08784213707b6330dfd4594adee325deb2
tie
2001
2013
2006
2014
1992
1955
1986
2011
2014
2012
2002
1986
2012
2014
2010
2013
1990
2014
2018
2006
2016
2009
2005
1966
2013
1997
2011
1967
2007
1937
1996
2001
1944
2004
2007
2001
2012
2004
2010
2012
2002
2009
2014
2013
2018
1968
2015
2014
2018
2009
1989
2017
2006
2017
2014
2002
1998
2011
2014
2015
2013
2009
2012
2005
d169494be0045e845470ae04a84f022021fbb5b
tie
2007


In [201]:
# testing citations
keys = list(paper_mappings.keys())
for key in keys: 
    print(paper_mappings[key][1])


27
24
1
20
1
1
26
15
1
1
1
1
1
17
1
1
4
64
25
9
7
20
2
8
6
1
5
1
1
1
18
18
1
4
4
1
1
1
1
9
13
1
30
2
1
1
6
1
1
14
14
1
1
1
1
1
1
3
13
1
1
1
1
1
2
1
1
1
1
1
1
1
7
1
7
1
4
4
8
2
1
1
3
4
1
1
1
1
1
2
5
1
12
1
6
1
12
4
1
12
1
12


In [204]:
# testing keys
keys = list(paper_mappings.keys())
for key in keys: 
    print(len(paper_mappings[key][2]))

1698
0
0
762
0
0
628
603
360
0
840
1675
477
384
939
0
1497
1605
1808
908
1379
1910
813
1648
790
3255
1312
0
0
514
2242
779
0
729
1253
230
0
1381
2053
904
857
0
1404
1278
993
344
735
1500
493
1026
1092
642
369
3118
1270
0
1117
1847
1073
0
0
108
1250
0
1126
749
0
913
0
1342
453
0
1084
461
1385
488
990
0
2081
1277
707
0
1153
968
1163
895
0
1434
803
1206
364
0
0
1308
1922
474
1380
1085
0
1202
413
415


In [205]:
# testing keys
keys = list(paper_mappings.keys())
for key in keys: 
    print(paper_mappings[key][2])

Primary debulking surgery (PDS) has historically been the standard treatment for advanced ovarian cancer. Recent data appear to support a paradigm shift toward neoadjuvant chemotherapy with interval debulking surgery (NACT-IDS). We hypothesized that stage IV ovarian cancer patients would likely benefit from NACT-IDS by achieving similar outcomes with less morbidity. Patients with stage IV epithelial ovarian cancer who underwent primary treatment between January 1, 1995 and December 31, 2007, were identified. Data were retrospectively extracted. Each patient record was evaluated to subclassify stage IV disease according to the sites of tumor dissemination at the time of diagnosis. The Kaplan–Meier method was used to compare overall survival (OS) data. A total of 242 newly diagnosed stage IV epithelial ovarian cancer patients were included in the final analysis; 176 women (73%) underwent PDS, 45 (18%) NACT-IDS, and 21 (9%) chemotherapy only. The frequency of achieving complete resection 