This will extract year, citation, and abstract for the research data 
Input: .txt file of research
Output: {ID: (year, citation, abstract)}

In [None]:
#id  string: S2 generated research paper ID
#title  string: Research paper title.
#paperAbstract  string: Extracted abstract of the paper
#entities  list: S2 extracted list of relevant entities or topics.
#s2Url  string: URL to S2 research paper details page
#s2PdfUrl  string: URL to PDF on S2 if available.
#pdfUrls  list: URLs related to this PDF scraped from the web.
#authors  list: List of authors with an S2 generated author ID and name.
#inCitations  list: List of S2 paperId's which cited this paper.
#outCitations  list: List of paperId's which this paper cited.
#year  int: Year this paper was published as integer.
#venue  string: Extracted venue published.
#journalName  string: Name of the journal that published this paper.
#journalVolume  string: The volume of the journal where this paper was published.
#journalPages  string: The pages of the journal where this paper was published.
#sources  list: Identifies papers sourced from DBLP or Medline.
#doi  string: Digital Object Identifier registered at doi.org.
#doiUrl  string: DOI link for registered objects.
#pmid  string: Unique identifier used by PubMed.

In [20]:
import os
import re

cwd = os.getcwd() # get the current working directory 
with open(cwd + '/sample-S2-records.txt') as myfile:
    data = myfile.read().splitlines() # a list with each line as an entry

In [17]:
first_entry = data[0] # first entry
print(data[0])

{"entities":["Epithelial ovarian cancer","Excision","Extraction","Hospital admission","Malignant neoplasm of ovary","Morbidity - disease rate","Neoadjuvant Therapy","Neoplasms","Overall Survival","Patients","Postoperative Complications","Residual Tumor","SLC13A5 gene","Stage IV Ovarian Carcinoma","Tumor Debulking","intensive care unit","ovarian neoplasm","stage IV childhood Hodgkin's lymphoma"],"journalVolume":"19","journalPages":"959-965","pmid":"21994038v1","year":2011,"outCitations":["166ac4b6f694c68dafdc912ca0c336b4c444fd9e","f0223f8d1920009d1afccffe2d4129f2211711cf","aaee9e127e63a4ee8baae2bc8a960f4a42afce78","03029e4427cfe66c3da6257979dc2d5b6eb3a0e4","d14434966fd87e94c97ed88938ea3dd5282d7652","d6fa05d67f9a6fc3256d05c81a4c55c472e78b0c","e9b9d08937fd2e603d5f8106ccaf81589dfa9cdf","943c32265fa79b20b925f6cc450db19b21c47bc0","07407593b49bc95f8d0c3c3ab912d78564db3302","fedaf6c7ea8b58e501b667603b8a23f1756ce375","ce84de64d8388258243244bbb4aef290e6022d30","399e7acd3fb9fa3f7bcf75e378e6176f7f

In [19]:
print("Length of data:", len(data))

Length of data: 102


In [161]:
# looking for inCitations, year, and paperAbstract
# The label is the descriptor. Ex. "year", "id", "abstract"
# The tag is the unique identifier. Ex "2011", "4cbba8127c8747a3b2cfb9c1f48c43e5c15e323e"
parenthesis = "\"" # string literal for "

paper_mappings = {}

# finding the id tag 
paper_id_tag = "\"id\"" # find the first occurence of "id"
id_label_start = first_entry.find(paper_id_tag) # this is the index that the id label starts
id_label_end = id_tag_start + 4 # this is the index that the id label ends 
id_tag_start = id_label_start + 6 # this is the index that the id tag starts. Always be 6. 
id_tag_end = first_entry.find(parenthesis, id_tag_start) - 1 #this is the index that the id tag ends
id_tag = first_entry[id_tag_start:id_tag_end] # id tag string 

# find year 
paper_year_tag = "\"year\"" # find the first occurence of "year"
year_label_start = first_entry.find(paper_year_tag) # index that the year label starts 
year_label_end = year_label_start + 7 # index that the year label ends 
year_tag_start = year_label_start + 7 # index that the year tag starts 
year_tag_end = first_entry.find(parenthesis, year_tag_start) - 1 # this is the index that the year tag ends
year_tag = first_entry[year_tag_start:year_tag_end] # year tag string

# find number of citations 
# check edge cases where "inCitations":[], "inCitations":[aienf23f9j023], "inCitations":[anea39rjna3rj, a3r98a3hrn9a38hr]
# we'll count by commas, but work the edge cases
bracket = r"]" # look for "]"
paper_citation_tag = "\"inCitations\"" # find the occurence of "inCitations"
citation_label_start = first_entry.find(paper_citation_tag) # index that the citation label starts 
citation_label_end = citation_label_start + 15 # index that the citation label ends
citation_tag_start = citation_label_start + 15 # index that the citation tag starts
citation_tag_end = first_entry.find(bracket, citation_tag_start)  # this is the index that the citation tag ends 
citation_list = first_entry[citation_tag_start:citation_tag_end].split(",")
num_citations = len(citation_list)

# find abstract 
paper_abstract_tag = "\"paperAbstract\""
abstract_label_start = first_entry.find(paper_abstract_tag)
abstract_label_end = abstract_label_start + 17
abstract_tag_start = abstract_label_start + 17 # the start of the abstract tag 
abstract_tag_end = first_entry.find(parenthesis, abstract_tag_start) # the end of the abstract tag
abstract_tag = first_entry[abstract_tag_start:abstract_tag_end]

# mapping = {id:(year, number of citations, abstract)}
paper_mappings[id_tag] = (year_tag, num_citations, abstract_tag)

In [162]:
print(paper_mappings)

{'4cbba8127c8747a3b2cfb9c1f48c43e5c15e323': ('2011', 27, 'Primary debulking surgery (PDS) has historically been the standard treatment for advanced ovarian cancer. Recent data appear to support a paradigm shift toward neoadjuvant chemotherapy with interval debulking surgery (NACT-IDS). We hypothesized that stage IV ovarian cancer patients would likely benefit from NACT-IDS by achieving similar outcomes with less morbidity. Patients with stage IV epithelial ovarian cancer who underwent primary treatment between January 1, 1995 and December 31, 2007, were identified. Data were retrospectively extracted. Each patient record was evaluated to subclassify stage IV disease according to the sites of tumor dissemination at the time of diagnosis. The Kaplan–Meier method was used to compare overall survival (OS) data. A total of 242 newly diagnosed stage IV epithelial ovarian cancer patients were included in the final analysis; 176 women (73%) underwent PDS, 45 (18%) NACT-IDS, and 21 (9%) chemoth

In [163]:
# mapping ids to (year, number of citations, and abstract tag)

paper_mappings = {} # set up the dictionary to store information 

# setting up important variables 
parenthesis = "\"" # string literal for "
bracket = r"]" # look for "]"
paper_id_tag = "\"id\"" # find the first occurence of "id"
paper_year_tag = "\"year\"" # find the first occurence of "year"
paper_citation_tag = "\"inCitations\"" # find the occurence of "inCitations"
paper_abstract_tag = "\"paperAbstract\""

# loop through each line of the data 
for line in data: 
    # look for the labels 
    id_label_start = line.find(paper_id_tag) # this is the index that the id label starts
    year_label_start = line.find(paper_year_tag) # index that the year label starts 
    
    
    
    