# Converting a CC atlas into a Sciterra Atlas
N.B.: this notebook requires the `cc-sciterra` environment, i.e. one in which both the `cc` and `sciterra` packages can be imported.

In [3]:
import numpy as np
import pandas as pd
import plotnine as pn

from collections import Counter

import sciterra
import cc

ModuleNotFoundError: No module named 'cc'

In [33]:
# Directory the converted sciterra atlas is saved to at the end of the notebook.
# NOTE(review): this names region_7, while cc_atlas_dir (next cell) points at
# region_8 -- confirm which region is intended so the output is not mislabeled.
# sciterra_atlas_dir = "outputs/atlas_from_cc_region_0/"
sciterra_atlas_dir = "outputs/atlas_from_cc_region_7/"

In [34]:
# Directory of the CC atlas to convert; passed to cc.atlas.Atlas below.
# NOTE(review): absolute, user-specific path -- will not resolve on another machine.
# NOTE(review): this names region_8, while sciterra_atlas_dir names region_7 -- confirm.
# cc_atlas_dir = "/Users/nathanielimel/data/literature_topography/regions/region_0/"
cc_atlas_dir = "/Users/nathanielimel/data/literature_topography/regions/region_8/"

In [35]:
# Load the CC atlas stored under cc_atlas_dir.
cc_atlas = cc.atlas.Atlas(cc_atlas_dir)

Loading bibliography entries.
Storing bibliography entries.


100%|█████████████████████████████████| 34407/34407 [00:00<00:00, 70841.36it/s]


Loading saved atlas data.


100%|████████████████████████████████| 34401/34401 [00:00<00:00, 282886.52it/s]
100%|███████████████████████████████| 34401/34401 [00:00<00:00, 1363602.66it/s]


In [None]:
# Number of entries held in the CC atlas.
len(cc_atlas.data)

In [36]:
# Collect the atlas keys (bibcodes, as the preview shows) and display the first five.
bibcodes = list(cc_atlas.data.keys())
bibcodes[:5]

['2002OptEn..41.1158D',
 '2019A&A...621A..31K',
 '2007ITAS...17.3789Y',
 '2007ITAS...17..289B',
 '2005ITAS...15..571K']

In [37]:
# List the attributes stored on a CC publication object (first entry as an example).
cc_atlas[bibcodes[0]].__dict__.keys()


dict_keys(['citation_key', 'notes_categories', 'ignore_failed', 'stored_parameters', 'notes', 'cached_bibtex_fp', 'citation', 'abstract', 'citations', 'references', 'bibcode', 'entry_date', 'stemmed_content_words'])

In [38]:
# Show the raw entry_date string of the first publication to see its format.
cc_atlas[bibcodes[0]].entry_date

'2002-07-16T00:00:00Z'

In [39]:
# Show the abstract of the first publication; the text sits under its "str" key.
cc_atlas[bibcodes[0]].abstract

Dict, {
'str' : "Currently operating optical superconducting tunnel junction (STJ) detectors, developed at the European Space Agency (ESA), can simultaneously measure the wavelength ((Delta) (gamma) equals 50 nm at 500 nm) and arrival time (to within approximately 5 microsecond(s) ) of individual photons in the range 310 to 720 nm with an efficiency of approximately 70%, and with count rates of the order of 5000 photons s<SUP>-1</SUP> per junction. A number of STJs placed in an array format generates 4-D data: photon arrival time, energy, and array element (X,Y). Such STJ cameras are ideally suited for, e.g., high-time-resolution spectrally resolved monitoring of variable sources or low- resolution spectroscopy of faint extragalactic objects. The reduction of STJ data involves detector efficiency correction, atmospheric extinction correction, sky background subtraction, and, unlike that of data from CCD-based systems, a more complex energy calibration, barycentric arrival time correcti

In [40]:
# define how to convert a cc publication to a sciterra publication.
# copy paste and edit the ads librarian convert function

from datetime import date, datetime

def convert_cc_publication(cc_publication, *args, **kwargs) -> "sciterra.mapping.publication.Publication":
    """Convert a CC atlas publication into a sciterra Publication.

    Args:
        cc_publication: a publication object taken from a CC atlas, or None.
            It is read for `bibcode`, `abstract` (indexed with "str"),
            `citations` and `references`, plus the first non-None of
            `entry_date`, `pubdate` or `year` for the publication date.
        *args, **kwargs: accepted for call compatibility and ignored.

    Returns:
        A `sciterra.mapping.publication.Publication` built from the fields
        that are not None, or None when `cc_publication` is None.

    Raises:
        ValueError: if the chosen date string is not parseable as yyyy-mm-dd
            after cleaning.
    """
    if cc_publication is None:
        return None

    # To be consistent with identifiers (e.g., to avoid storing the same
    # publication twice), we always use the bibcode.
    identifier = cc_publication.bibcode

    def process_date(date_str: str) -> datetime:
        """Parse a 'yyyy-mm-dd...' string, mapping a '00' month/day to '01'."""
        # Sometimes there is extra data after the date (e.g. 'T00:00:00Z').
        date_str = date_str[:10]  # e.g. yyyy-mm-dd
        # Sometimes the day or month is given as 00. Strings are immutable,
        # so rebuild the string instead of assigning to a slice.
        if date_str[-2:] == "00":
            date_str = date_str[:-2] + "01"
        if date_str[-5:-3] == "00":
            date_str = date_str[:-5] + "01" + date_str[-3:]
        return datetime.strptime(date_str, "%Y-%m-%d")

    # Take the date from the first attribute that is present and not None,
    # in order of preference: entry_date, pubdate, then a bare year.
    publication_date = None
    date_sources = (
        ("entry_date", process_date),
        ("pubdate", process_date),
        ("year", lambda year: date(year, 1, 1)),
    )
    for attribute, parse in date_sources:
        raw_value = getattr(cc_publication, attribute, None)
        if raw_value is not None:
            publication_date = parse(raw_value)
            break

    # Process citation data
    citations = cc_publication.citations
    references = cc_publication.references

    citation_count = len(citations) if citations is not None else None

    data = {
        # primary fields
        "identifier": identifier,
        "abstract": cc_publication.abstract["str"],
        "publication_date": publication_date,
        "citations": citations,
        "references": references,
        "citation_count": citation_count,
    }
    # Drop fields that are missing so they are not passed on as explicit None.
    data = {k: v for k, v in data.items() if v is not None}

    return sciterra.mapping.publication.Publication(data)

# Try the converter on the first publication as a quick check.
convert_cc_publication(cc_atlas[bibcodes[0]])

sciterra.publication.Publication:2002OptEn..41.1158D

In [41]:
# Convert every CC publication in the atlas, keeping only those for which the
# converter produced a result (it returns None for None input).
sciterra_publications = [
    converted
    for converted in map(
        convert_cc_publication,
        (cc_atlas[key] for key in cc_atlas.data),
    )
    if converted is not None
]
len(sciterra_publications)

34401

In [42]:
# Build a sciterra Atlas from the converted publications and check its size.
sciterra_atlas = sciterra.Atlas(publications=sciterra_publications)
len(sciterra_atlas)

34401

## Project publications using SciBERT

In [43]:
# Build a Cartographer that uses SciBERT as its vectorizer.
# NOTE(review): device="mps" is hard-coded -- presumably Apple's Metal backend;
# confirm the behaviour before running on a machine without it.
from sciterra.vectorization import SciBERTVectorizer

sciterra_crt = sciterra.Cartographer(vectorizer=SciBERTVectorizer(device="mps"))

Using device: mps.


In [44]:
# Project the atlas with the SciBERT cartographer. This is slow:
# n.b.: for 24,448 pubs, takes 2h 28 mins; the run recorded below embedded
# roughly 34k documents in about 2h 21 mins.
# The atlas name is rebound to the projected result.
sciterra_atlas = sciterra_crt.project(sciterra_atlas)

embedding documents: 34432it [2:21:22,  4.06it/s]                             


### Save

In [46]:
# Save the projected atlas under sciterra_atlas_dir.
sciterra_atlas.save(sciterra_atlas_dir)