# Routing

In [1]:
import os
%load_ext autoreload
%autoreload 1

In [2]:
import polars as pl
from genie.config import Locations
from pathlib import Path
from prepare.sqlite import get_query_df
import requests

In [3]:
# Parent of the current working directory; handed to Locations as the base folder.
base = Path("..")
# Print the absolute form so it is visible which checkout the notebook runs against.
print(base.absolute())

/home/antonkulaga/sources/longevity-genie/notebooks/..


In [4]:
# Request the latest release metadata without an API key header.
# The JSON response is displayed as-is and includes the full README text, so the output is long.
requests.get('https://api.semanticscholar.org/datasets/v1/release/latest').json()

{'release_id': '2023-07-11',
 'README': 'Semantic Scholar Academic Graph Datasets\n\nThese datasets provide a variety of information about research papers taken from a snapshot in time of the Semantic Scholar corpus.\n\nThis site is provided by The Allen Institute for Artificial Intelligence (“AI2”) as a service to the\nresearch community. The site is covered by AI2 Terms of Use and Privacy Policy. AI2 does not claim\nownership of any materials on this site unless specifically identified. AI2 does not exercise editorial\ncontrol over the contents of this site. AI2 respects the intellectual property rights of others. If\nyou believe your copyright or trademark is being infringed by something on this site, please follow\nthe "DMCA Notice" process set out in the Terms of Use (https://allenai.org/terms).\n\nSAMPLE DATA ACCESS\nSample data files can be downloaded with the following UNIX command:\n\nfor f in $(curl https://s3-us-west-2.amazonaws.com/ai2-s2ag/samples/MANIFEST.txt)\n  do curl 

In [5]:
# Locations (from genie.config) is built from the base folder; its `index` and
# `data` attributes are the paths used in the cells that follow.
locations = Locations(base)
# Bare expression so the notebook displays the object.
locations

<genie.config.Locations at 0x7f885ac258d0>

## Getting indexes

In [6]:
# NOTE(review): this star import is where `tprint` comes from, and presumably
# also `traverse`, `files` and `seq` used in later cells — confirm, and consider
# importing those names explicitly so their origin is visible.
from pycomfort.files import *
# Print the folder tree under the index location.
tprint(locations.index)

index
	openai_6000_chunk
		modules
			chroma-collections.parquet
			chroma-embeddings.parquet
			index
				index_metadata_aafbe923-7c3b-46a8-b35b-e4b9dd6b2074.pkl
				id_to_uuid_aafbe923-7c3b-46a8-b35b-e4b9dd6b2074.pkl
				index_aafbe923-7c3b-46a8-b35b-e4b9dd6b2074.bin
				uuid_to_id_aafbe923-7c3b-46a8-b35b-e4b9dd6b2074.pkl


In [7]:
def traverse_indexes(folder: Path):
    """Find the directories under ``folder`` that hold a Chroma collection file.

    A directory qualifies when it directly contains a regular file named
    ``chroma-collections.parquet``.

    The membership check is done with ``pathlib`` instead of a global ``files``
    helper: this notebook later rebinds the name ``files`` to a list of URLs,
    and a function that looks that name up at call time would then fail with
    ``TypeError`` when the cell is re-run.

    :param folder: root directory handed to ``traverse``.
    :return: whatever ``traverse`` returns for the matching directories
        (a list of paths in the recorded output).
    """
    def holds_chroma_collection(p: Path) -> bool:
        # is_file() is False for a missing entry, so no existence pre-check is needed.
        return p.is_dir() and (p / "chroma-collections.parquet").is_file()

    return traverse(folder, holds_chroma_collection)
# Display the index folders found under the configured index location.
traverse_indexes(locations.index)

[PosixPath('/home/antonkulaga/sources/longevity-genie/data/index/openai_6000_chunk/modules')]

In [8]:
from getpaper.config import load_environment_keys
# Reports the environment file it found (see the cell output); presumably it
# also loads that file's variables into the process environment — confirm.
load_environment_keys()
import os
# os.getenv returns None when the variable is not set.
semantic_scholar_api_key = os.getenv("SEMANTIC_SCHOLAR_API_KEY")
# Header dict carrying the key under 'x-api-key'.
auth_header = {'x-api-key': semantic_scholar_api_key}

environment found at /home/antonkulaga/sources/longevity-genie/.env


In [9]:
from typing import List

def get_samantic_headers():
    """Build the request headers that carry the Semantic Scholar API key.

    ``load_environment_keys`` is invoked on every call, then the key is read
    from the ``SEMANTIC_SCHOLAR_API_KEY`` environment variable.

    :return: dict with a single ``'x-api-key'`` entry; the value is ``None``
        when the variable is not set.
    """
    load_environment_keys()
    return {'x-api-key': os.getenv("SEMANTIC_SCHOLAR_API_KEY")}

def get_sch_release() -> List:
    """Return the ``"datasets"`` entry of the latest release metadata.

    The request is sent with the API key headers from ``get_samantic_headers``.

    :return: the value stored under ``"datasets"`` in the JSON response.
    :raises requests.HTTPError: when the API answers with a 4xx/5xx status.
    :raises requests.Timeout: when the server does not answer in time.
    """
    response = requests.get(
        'https://api.semanticscholar.org/datasets/v1/release/latest',
        headers=get_samantic_headers(),
        timeout=60,  # requests waits indefinitely when no timeout is given
    )
    # Surface HTTP errors directly instead of as a KeyError on "datasets".
    response.raise_for_status()
    return response.json()["datasets"]

def get_files():
    """Fetch the metadata of the ``s2orc`` dataset in the latest release.

    The original version performed the request but discarded the result, so
    the function always returned ``None``; the parsed JSON is now returned.

    :return: the decoded JSON response.
    :raises requests.HTTPError: when the API answers with a 4xx/5xx status.
    :raises requests.Timeout: when the server does not answer in time.
    """
    response = requests.get(
        'https://api.semanticscholar.org/datasets/v1/release/latest/dataset/s2orc',
        headers=get_samantic_headers(),
        timeout=60,  # requests waits indefinitely when no timeout is given
    )
    # Surface HTTP errors directly instead of handing back an error payload.
    response.raise_for_status()
    return response.json()

def get_s2orc() -> dict:
    """Pick the ``s2orc`` entry out of the latest release's dataset list.

    :return: the first dataset whose ``"name"`` equals ``"s2orc"``.
        NOTE(review): ``find`` presumably yields ``None`` when nothing
        matches — confirm against the ``seq`` documentation.
    """
    return seq(get_sch_release()).find(lambda entry: entry["name"] == "s2orc")

def get_release(dataset: str = "s2orc") -> dict:
    """Fetch the metadata of one dataset in the latest release.

    :param dataset: dataset name inserted into the request URL.
    :return: the decoded JSON response; the recorded output shows ``name``,
        ``description`` and ``README`` keys, and ``get_release_files`` reads
        ``files`` from it.
    :raises requests.HTTPError: when the API answers with a 4xx/5xx status.
    :raises requests.Timeout: when the server does not answer in time.
    """
    url = f'https://api.semanticscholar.org/datasets/v1/release/latest/dataset/{dataset}'
    # requests waits indefinitely when no timeout is given.
    response = requests.get(url, headers=get_samantic_headers(), timeout=60)
    # Surface HTTP errors directly instead of as a later KeyError in callers.
    response.raise_for_status()
    return response.json()

def get_release_files(dataset: str = "s2orc"):
    """Return the ``"files"`` entry of a dataset's release metadata.

    :param dataset: dataset name forwarded to ``get_release``.
    :return: the value stored under ``"files"``; the recorded output shows
        its first element to be a download URL string.
    """
    release = get_release(dataset)
    return release["files"]

# Try the helper with its default dataset ("s2orc"); the full response is displayed.
get_release()

environment found at /home/antonkulaga/sources/longevity-genie/.env


{'name': 's2orc',
 'description': 'Full-body paper text parsed from open-access PDFs. Identifies structural elements such as paragraphs, sections, and bibliography entries.\n5M records in 30 4GB files.',
 'README': 'Semantic Scholar Academic Graph Datasets\n\nThe "s2orc" dataset contains parsed full-body text from selected papers.\n\nA subset of this data was previously released (in a different format) as S2ORC https://github.com/allenai/s2orc\n\nThe body text is parsed from PDF documents using Grobid, documented at https://grobid.readthedocs.io.\nIts output is converted from XML into a single string with a set of annotation spans.\n\nSCHEMA\n - externalIds: IDs of this paper in different catalogs\n - content:\n   - source:\n\t   - pdfUrls: URLs to the PDF\n\t   - oaInfo: license/url/status information from Unpaywall\n   - text: Full body text as a single string\n   - annotations: Annotated spans of the full body text\n\n\nLICENSE\nThis collection is licensed under ODC-BY. (https://ope

In [10]:
# NOTE(review): this rebinds the global name `files`. An earlier cell calls
# `files(...)` as a function (presumably provided by the pycomfort star import),
# so that helper is no longer reachable under this name after this cell runs.
# Consider a more specific name here.
# The entries are pre-signed URLs (the output shows AWSAccessKeyId/Signature
# query parameters), so review the rendered output before sharing the notebook.
files = get_release_files()
files[0]

environment found at /home/antonkulaga/sources/longevity-genie/.env


'https://ai2-s2ag.s3.amazonaws.com/staging/2023-07-11/s2orc/20230714_111942_00012_e64uq_061ba37d-7776-4179-ae0f-a97563a170e4.gz?AWSAccessKeyId=ASIA5BJLZJPWQ2YO4EPH&Signature=VDQZQKJlp4ATm94JKfZ%2Fet8VQAI%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEP3%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQDb5gJN8A9B2QCqqX8yQIHbyo3TD%2BnfquZZjrVjABtHmAIhALMtB6V9hAKAJDmlkZFAF%2FQAiz2R3YptFSQBQ35xF5wAKogECIb%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEQABoMODk2MTI5Mzg3NTAxIgyrAG6ZlxPhO6o6oRkq3AP2VwIBqPbw0%2FRlbOOVJM1uhucvdu8%2BrVA8hgGefru0iF9gesdrdQznGm2gHAFXDSKo5V7tPMhlPUAZTUevdQVYnwDRTNfplDp4wqeIEfC0bmdowUzPx%2FpmiNw3RVNlcVafXZHA0J9Uedfh8fArbCSMOdcBle%2FhkcJzy4Uz%2F3jH9YF1hy3zzudkDM6fmaiyfSI6uajfEjIfIVra2q2G4vBoQD%2BRsz4RoZ0iYJ3plWuD3SEEeALJ2wR4JdJ73iOmP3wxEtrrTL9p2jWHTEH2WTQns3iCH6G7D%2BEL6wtycrYFg4PuPHdpYFrtwo2oGDuWaqWntIF84sRwlv8uG5sWmisiBzPI9U0PFuvWGrx00ofCw2uIDgth1eIFe%2F42QV9eMUNa7CXp%2BOgtpjwphjIBccxIdvuRYxKOGJsxK4uAFlr88Jy0gqOUWDaqoV5c%2FnFA%2F8Y0QxYPhp8lPHutPAb8G4EfRbt166hsE%2FetY5zu3dBm9uRhZIUN

In [11]:
# Save the URLs one per line; the number displayed is write_text's return
# value, i.e. the count of characters written.
(locations.data / "files.txt").write_text("\n".join(files))

42689

In [12]:
# Display the path of the URL list written in the previous cell.
locations.data / "files.txt"


PosixPath('/home/antonkulaga/sources/longevity-genie/data/files.txt')