In [29]:
import os, sys, json
from pathlib import Path
import requests

In [30]:
import pandas as pd
import numpy as np

In [2]:
# Add the parent directory ('..', resolved against the current working
# directory) to the import search path so that local modules there can be
# imported — presumably where `markdown_edits` lives; confirm.
sys.path.append('..')

In [3]:
from markdown_edits import MarkdownDoc, MarkdownSection

In [4]:
# Entity types whose field lists are requested from the OpenAlex API
# (each name is used as the first path segment of the request URL).
entity_names = [
    "works", "authors", "sources", "institutions",
    "publishers", "funders", "concepts",
]

In [5]:
# Root directory of the markdown docs to scrape (relative to the working
# directory).
dirpath = Path('../../the-data/')
# rglob() yields paths in no guaranteed order. Sort them so that the docs are
# processed in the same order on every machine and run; otherwise, when two
# docs share a title, which description "wins" downstream would depend on
# filesystem enumeration order.
markdown_files = sorted(dirpath.rglob('*.md'))

In [6]:
# Parse every markdown file into a MarkdownDoc.
# Decode explicitly as UTF-8: without an encoding argument read_text() falls
# back to the platform locale, which can mis-decode or fail on non-ASCII
# characters in the docs (e.g. on Windows).
markdown_docs = [MarkdownDoc(fp.read_text(encoding="utf-8")) for fp in markdown_files]

In [16]:
# Section titles of docs that are excluded from the scrape: a doc whose
# (ASCII-only, stripped) second-section title appears here is reported as
# "ignoring" and contributes no descriptions.
ignore_docs = [
    "Geo",
    "Get N-grams",
    "Works with more than 100 authors are truncated",
    "Location attributes",
    "Authorship attributes",
]

In [23]:
def yield_attrs(doc, prefix):
    """Yield ``(name, description)`` pairs for the attributes documented in ``doc``.

    Every section of ``doc`` whose title contains "attributes"
    (case-insensitive) is re-parsed as its own document and split into
    level-3 subsections, one per attribute.

    Args:
        doc: Object exposing ``sections``; each section provides ``title``
            and ``content``.
        prefix: String prepended (followed by a dot) to every attribute name.

    Yields:
        Tuples ``(f"{prefix}.{attribute}", description)``, where ``attribute``
        is the subsection title with backslashes removed and surrounding
        backticks stripped.

    Raises:
        IndexError: If ``lines[2]`` does not exist for a titled subsection.
    """
    for section in doc.sections:
        if 'attributes' not in section.title.lower():
            continue

        subdoc = MarkdownDoc(section.content)
        attr_sections = [
            MarkdownSection(body, heading, parent=subdoc, level=3)
            for heading, body in subdoc.split_into_sections(subdoc.txt, level=3)
        ]

        for attr_section in attr_sections:
            # Subsections without a title carry no attribute name.
            if not attr_section.title:
                continue
            attr_name = attr_section.title.replace('\\', '').strip('`')
            # The description is taken from the third line of the subsection —
            # presumably the first text line after the heading and a blank
            # line; confirm against MarkdownSection.lines.
            yield f"{prefix}.{attr_name}", attr_section.lines[2]

In [26]:
# Map each doc's title, and each "<title>.<attribute>" name, to a one-line
# description scraped from the markdown docs.
descriptions = {}
for doc in markdown_docs:
    try:
        # The second section is the one used as the doc's title/description;
        # docs with fewer than two sections raise IndexError and are skipped.
        sec = doc.sections[1]
        content_lines = list(filter(None, sec.content.split("\n")))
        # First non-empty content line serves as the description.
        desc = content_lines[0]
        # Drop non-ASCII characters from the title before comparing/storing.
        title = ''.join(filter(str.isascii, sec.title)).strip()
        if title in ignore_docs:
            print(f"ignoring {doc}")
        else:
            descriptions[title] = desc
            for attr_name, attr_desc in yield_attrs(doc, prefix=title.lower()):
                descriptions[attr_name] = attr_desc
    except IndexError:
        # NOTE(review): this also catches an IndexError raised inside
        # yield_attrs, in which case the doc is reported as skipped even
        # though some of its entries were already stored.
        print(f"skipping {doc}")

skipping <class 'markdown_edits.MarkdownDoc'>(# Data overview\\The OpenAlex dataset describes sc...)
ignoring <class 'markdown_edits.MarkdownDoc'>(---\description: Where things are in the world\---...)
skipping <class 'markdown_edits.MarkdownDoc'>(# Continents\\Countries are mapped to continents u...)
skipping <class 'markdown_edits.MarkdownDoc'>(# Regions\\### **Global South**\\The Global South ...)
ignoring <class 'markdown_edits.MarkdownDoc'>(# Locations\\The locations online where a work liv...)
ignoring <class 'markdown_edits.MarkdownDoc'>(# Authorships\\An authorship is a single author an...)
ignoring <class 'markdown_edits.MarkdownDoc'>(---\description: N-grams are groups of sequential ...)
ignoring <class 'markdown_edits.MarkdownDoc'>(# Limitations\\## Works with more than 100 authors...)


In [27]:
# Print every scraped name with its description, one "name: description" per line.
for k, v in descriptions.items():
    print(f"{k}: {v}")

Sources: Sources are where works are hosted.
sources.abbreviated_title: An abbreviated title for this source.
sources.alternate_titles: Alternate titles for this source, commonly abbreviations or translations of the source's canonical name.
sources.apc_prices: Article processing charge (APC) information
sources.apc_usd: The source's article processing charge in US Dollars
sources.cited_by_count: The total number of works that cite a work hosted in this source.
sources.country_code: The country that this source is associated with.
sources.counts_by_year: The works count and cited-by count of this source for the last ten years, binned by year.
sources.created_date: The date this source was created in the OpenAlex dataset.
sources.display_name: The name of the source.
sources.homepage_url: The starting page for navigating the contents of this source; the homepage for this source's website.
sources.host_organization: The host organization for this source—either a publisher or an institutio

In [31]:
# For every entity type, fetch the list of valid filter fields and the
# flattened schema field names from the OpenAlex API.
fields_from_api = {}
# One session reuses the connection across the requests to the same host.
with requests.Session() as session:
    for entity_name in entity_names:
        # A timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status() stops an HTTP error body from being parsed as if
        # it were a list of field names.
        filter_response = session.get(
            f"https://api.openalex.org/{entity_name}/valid_fields", timeout=30
        )
        filter_response.raise_for_status()
        filter_names = filter_response.json()

        schema_response = session.get(
            f"https://api.openalex.org/{entity_name}/flattened_schema", timeout=30
        )
        schema_response.raise_for_status()
        schema_names = schema_response.json()

        # Key order matters downstream: it determines the order in which the
        # "in_<key>" columns first appear.
        fields_from_api[entity_name] = {
            'filter_names': filter_names,
            'schema_names': schema_names,
        }

In [60]:
def get_description_or_parent(name, lookup=None):
    """Find the description for ``name`` or for its nearest documented parent.

    ``name`` is a dotted path such as ``"works.apc_list.value"``. If it has no
    description of its own, trailing components are dropped one at a time
    (``"works.apc_list"``, then ``"works"``) until a documented name is found.

    Args:
        name: Dotted field name to look up.
        lookup: Mapping of name -> description. Defaults to the notebook-level
            ``descriptions`` dict when omitted.

    Returns:
        ``(matched_name, description)`` for the first match, or
        ``(None, None)`` if neither ``name`` nor any parent is documented.
    """
    if lookup is None:
        lookup = descriptions
    if name in lookup:
        return name, lookup[name]

    parts = name.split('.')
    # Walk from the longest proper prefix down to the first component only.
    # Stopping at 1 avoids probing the empty string, which would otherwise
    # match an empty-title entry and attach its description to every
    # undocumented field.
    for end in range(len(parts) - 1, 0, -1):
        candidate = ".".join(parts[:end])
        if candidate in lookup:
            return candidate, lookup[candidate]
    return None, None

In [62]:
# Build one record per "<entity>.<field>" name, flagging which API listing(s)
# it appeared in and attaching the scraped docs description when one exists.
data = {}
for entity_name, v in fields_from_api.items():
    for k, field_names in v.items():
        flag = f'in_{k}'
        for field_name in field_names:
            this_name = f"{entity_name}.{field_name}"
            record = data.get(this_name)
            if record is not None:
                # Already seen via another listing: just add this flag.
                record[flag] = True
                continue

            # First sighting: create the record, then look up its description
            # (or that of its nearest documented parent) once.
            record = data[this_name] = {
                'name': this_name,
                'entity_name': entity_name,
                flag: True,
            }
            desc_key, desc = get_description_or_parent(this_name)
            if desc:
                record['in_docs_scrape'] = True
                record['docs_description'] = desc
                record['docs_desc_key'] = desc_key
# One row per record; keys missing from a record become NaN in its row.
df = pd.DataFrame(data.values())

In [64]:
# Save the field table for manual inspection; index=False omits the row index column.
df.to_csv('examine_docs_scrape.csv', index=False)

In [63]:
# Display the assembled field table as this cell's output.
df

Unnamed: 0,name,entity_name,in_filter_names,in_docs_scrape,docs_description,docs_desc_key,in_schema_names
0,works.abstract.search,works,True,,,,
1,works.apc_list.currency,works,True,True,The list-price APC (article processing charge)...,works.apc_list,True
2,works.apc_list.provenance,works,True,True,The list-price APC (article processing charge)...,works.apc_list,True
3,works.apc_list.value,works,True,True,The list-price APC (article processing charge)...,works.apc_list,True
4,works.apc_list.value_usd,works,True,True,The list-price APC (article processing charge)...,works.apc_list,True
...,...,...,...,...,...,...,...
423,concepts.related_concepts,concepts,,True,Concepts that are similar to this one.,concepts.related_concepts,True
424,concepts.counts_by_year,concepts,,True,The works count and cited-by count of this con...,concepts.counts_by_year,True
425,concepts.works_api_url,concepts,,True,A URL that will get you a list of all the work...,concepts.works_api_url,True
426,concepts.updated_date,concepts,,True,The last time anything in this concept's data ...,concepts.updated_date,True


In [68]:
# Render one markdown section per entity: a level-2 heading for the entity
# followed by a level-3 heading plus description for each documented key.
glossary_sections = []
# groupby() already iterates groups in sorted key order, so no prior sort is needed.
for entity_name, gdf in df.groupby("entity_name"):
    subset = (
        gdf[['docs_desc_key', 'docs_description']]
        .dropna()
        .drop_duplicates()
        .sort_values('docs_desc_key')
    )
    prefix = f"{entity_name}."
    section_parts = [f'## {entity_name.title()}\n\n']
    for _, row in subset.iterrows():
        desc_key = row["docs_desc_key"]
        # Strip only the leading "<entity>." prefix. str.replace() would also
        # delete later occurrences inside the key (e.g. "works." within
        # "works.referenced_works.x") and corrupt the heading.
        heading = desc_key[len(prefix):] if desc_key.startswith(prefix) else desc_key
        section_parts.append(f'### {heading}\n\n{row["docs_description"]}\n\n')
    this_section = ''.join(section_parts)
    glossary_sections.append(this_section)

In [69]:
# Write the glossary sections to a markdown file, separated by a blank line.
# The encoding is set explicitly because the scraped descriptions contain
# non-ASCII characters (e.g. em dashes); the platform default encoding may
# not be able to represent them.
with Path("tmpglossary.md").open("w", encoding="utf-8") as outf:
    for glossary_section in glossary_sections:
        outf.write(glossary_section)
        outf.write("\n")