---
title: "CAS 570 Proposal"
author: "Nicole Brewer"
toc: false
number-sections: false
highlight-style: pygments
format:
  html: 
    code-fold: true
    html-math-method: katex
  pdf:
    geometry: 
      - top=30mm
      - left=30mm
jupyter: python3
---

In [1]:
class Publication():
    
    def __init__(self, id, doi, code_avail, authors, documentation, platforms, sponsors):
        self.id = id
        self.doi = doi
        self.code_avail = code_avail
        self.authors = authors
        self.documentation = documentation
        self.platforms = platforms
        self.sponsors = sponsors
        self.eid = None # elsi

In [2]:
import openpyxl

# Open Workbook
wb = openpyxl.load_workbook(filename='Catalogdatabase-till2018b.xlsx', data_only=True)

In [45]:
sheet = wb['publication']
ncol = sheet.max_column  
nrow = sheet.max_row - 2 # there is an extraneous cell at the bottom of the sheet that gives us two extra columns

In [46]:
display_through = 11
[(i, sheet.cell(row = 1, column = i).value) for i in range(1, print_through)]

[(1, 'id'),
 (2, 'citations'),
 (3, 'mod'),
 (4, 'title'),
 (5, 'abstract'),
 (6, 'short_title'),
 (7, 'contact_email'),
 (8, 'email_sent_count'),
 (9, 'contact_author_name'),
 (10, 'is_primary'),
 (11, 'doi'),
 (12, 'series_text'),
 (13, 'series_title'),
 (14, 'series'),
 (15, 'issue'),
 (16, 'volume'),
 (17, 'pages'),
 (18, 'author_names'),
 (19, 'number authors'),
 (20, 'year_published'),
 (21, 'container__issn'),
 (22, 'container__name'),
 (23, 'code_archival_status'),
 (24, 'code'),
 (25, 'Documentation (AORML)'),
 (26, 'Documentation (Flow charts)'),
 (27, 'Documentation (Mathematical description)'),
 (28, 'Documentation (None)'),
 (29, 'Documentation (ODD)'),
 (30, 'Documentation (Ontologies)'),
 (31, 'Documentation (Other Narrative)'),
 (32, 'Documentation (Pseudocode)'),
 (33, 'Documentation (UML)'),
 (34, 'Platform (.NET)'),
 (35, 'Platform (ALMaSS)'),
 (36, 'Platform (Agent Analyst)'),
 (37, 'Platform (AnyLogic)'),
 (38, 'Platform (ArcGIS)'),
 (39, 'Platform (C#)'),
 (40, 'P

In [47]:
DOC_COLS = (25, 34)
PLAT_COLS = (34, 85)
SPON_COLS = (85, 136)
STRING_COLS = (1, 11) # id, doi
BOOL_COLS = (24, ) # code_avail
AUTH_COL = 18

In [48]:
# remove extra text surrounding type names
# ex 'Documentation (UML)' -> 'UML'
document_types = [sheet.cell(row = 1, column = i).value for i in range(*DOC_COLS)]
document_types = {i: s[s.find("(")+1:s.rfind(")")] for i, s in enumerate(document_types)} 
platform_types = [sheet.cell(row = 1, column = i).value for i in range(*PLAT_COLS)]
platform_types = {i: s[s.find("(")+1:s.rfind(")")] for i, s in enumerate(platform_types)}
sponsor_types = [sheet.cell(row = 1, column = i).value for i in range(*SPON_COLS)]
sponsor_types = {i: s[s.find("(")+1:s.rfind(")")] for i, s in enumerate(sponsor_types)}

In [49]:
def createTypeListFromBinaryEntries(indicies, types):
    ret = [bool(sheet.cell(row = i, column = j).value) for j in range(*indicies)]
    ret = [types[i] for i, present in enumerate(ret) if present]
    if all(val is None for val in ret):
        return None
    else:
        return ret

In [50]:
# turn the string representation of a list into an actual list
# "['Geoff Podger', ' Robert Power', ' Shane Seaton', ' Ang Yang']" -> ['Geoff Podger', ' Robert Power', ' Shane Seaton', ' Ang Yang']
def createAuthorList(row):
    authors_string = sheet.cell(row = row, column = AUTH_COL).value
    if not authors_string:
        return None
    authors_string = authors_string.replace("'", "").strip('"[] ')
    if len(authors_string) > 0:
        return authors_string.split(',')
    else:
        return 0

In [51]:
publications = []
total_none_values = [0 for i in range(0, len(STRING_COLS) + len(BOOL_COLS) + 4)] # 4 = authors + doc + plat + sponsor
for i in range(2, nrow + 1):
    values = [sheet.cell(row = i, column = j).value for j in STRING_COLS]
    values.append([bool(sheet.cell(row = i, column = j).value) for j in BOOL_COLS])
    authors = createAuthorList(i)
    d = createTypeListFromBinaryEntries(DOC_COLS, document_types)
    p = createTypeListFromBinaryEntries(PLAT_COLS, platform_types)
    s = createTypeListFromBinaryEntries(SPON_COLS, sponsor_types)
    total_none_values = [total_none_values[j] if param else total_none_values[j] + 1 for j, param in enumerate([*values, authors, d, p, s])]
    if values[0]:
        publications.append(Publication(*values, authors, d, p, s))

In [52]:
total_none_values
# id, doi, code_avail, authors, documentation, platform, sponsor

[0, 504, 0, 24, 0, 4284, 3084]

In [53]:
nrow

7501

In [54]:
# use scopus to get citation data
from os.path import isfile

key_present = isfile("SCOPUS_API_KEY")

if key_present: 
    print("Scopus API key found...")
    with open("SCOPUS_API_KEY", "r+") as file:
        api_key = file.readline()

Scopus API key found...


In [74]:
import requests

num_found_on_scopus = 0
for publication in publications:
    if publication.doi:
        r = requests.get('https://api.elsevier.com/content/abstract/doi/' + publication.doi ,params={'apiKey': api_key, 'httpAccept': 'application/json'} )
        if r.status_code == 200:
            num_found_on_scopus += 1
            # add print for every 50 or so additions
        
num_found_on_scopus

6840

In [None]:
import pickle

with open('publication_nodes.pickle', 'wb') as f:
    pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)