In [None]:
from sciterra import Atlas
from sciterra.vectorization import vectorizers

import sys
sys.path.append("/Users/nathanielimel/uci/projects/citesim/src")

from analysis.plot import atlas_to_measurements
from analysis.plot import search_converged_ids

In [None]:
# Source atlas directory for each field of study. All atlases live under the
# same librarian=S2 / vectorizer=SciBERT output tree and differ only in the
# name of their center publication.
atlas_dirs = {
    field_name: f"/Users/nathanielimel/uci/projects/citesim/outputs/librarian=S2/vectorizer=SciBERT/center={center_name}"
    for field_name, center_name in (
        ("Physics", "hafenLowredshiftLymanLimit2017"),
        ("Linguistics", "Imeletal2022"),
        ("Medicine", "Torres2013"),
        ("Education", "Ololube2012"),
        ("Philosophy", "Bacon2019"),
        ("Economics", "West2003"),
        ("Materials Science", "Miele2022"),
        ("Geology", "ForeroOrtega2021"),
        ("Mathematics", "Andre2018"),
    )
}

Set the new vectorizer used to re-project the atlas (must be a key of `sciterra.vectorization.vectorizers`).

In [None]:
# Name of the vectorizer to re-project with. It is used to index `vectorizers`,
# to build the target directory name (vectorizer=<name>), and to decide whether
# a text corpus has to be written ("Word2Vec" and "BOW" only).
new_vectorizer = "BOW"

In [None]:
# Load the source atlas for the field.
# `field` must be one of the keys of `atlas_dirs`; it is reused later as the
# single entry of `fields_of_study` when measuring the re-projected atlas.
field = "Physics"
atl = Atlas.load(atlas_dirs[field])
# Display the size of the loaded atlas (presumably its number of publications).
len(atl)

In [None]:
# Set the target atlas dir by swapping the vectorizer component of the source path.
# NOTE: str.replace is a silent no-op when "vectorizer=SciBERT" is absent, in
# which case save_dir would equal the source directory.
save_dir = atlas_dirs[field].replace("vectorizer=SciBERT", f"vectorizer={new_vectorizer}")
save_dir

In [None]:

if new_vectorizer in ("Word2Vec", "BOW"):
    # These vectorizers are configured with a text corpus and a model path,
    # both stored inside the target atlas directory.
    import os

    corpus_path = f"{save_dir}/corpus.txt"
    model_path = f"{save_dir}/w2v.model"
    overwrite = False  # set to True to regenerate an existing corpus file

    if not os.path.exists(corpus_path) or overwrite:
        # save_dir is a freshly derived path that may not exist yet; without
        # this, open() below fails with FileNotFoundError.
        os.makedirs(save_dir, exist_ok=True)

        # Abstracts joined by newlines. Publications without an abstract are
        # skipped, since str.join raises TypeError on None.
        abstracts = "\n".join(
            pub.abstract
            for pub in atl.publications.values()
            if pub.abstract is not None
        )
        with open(corpus_path, "w") as f:
            # `abstracts` is a single string, so write() is the right call
            # (writelines() would iterate over it character by character).
            f.write(abstracts)

    vectorizer_kwargs = dict(
        corpus_path=corpus_path,
        model_path=model_path,
    )
else:
    # All other vectorizers are only told which device to use.
    vectorizer_kwargs = dict(device="mps")

In [None]:
# Look up the vectorizer constructor by name and call it with the prepared kwargs
vectorizer = vectorizers[new_vectorizer](**vectorizer_kwargs)
vectorizer

In [None]:
from sciterra import Cartographer
from sciterra.vectorization.vectorizer import Vectorizer
def copy_atlas(atl: Atlas, vectorizer: Vectorizer, **kwargs) -> Atlas:
    """Re-project the publications of an existing atlas with a different vectorizer.

    Only the publications of `atl` are handed to the new atlas; any other
    attribute of the source atlas has to be carried over by the caller.

    Args:
        atl: the source atlas whose publications are reused.
        vectorizer: the vectorizer given to the `Cartographer` that computes the new projection.
        **kwargs: forwarded unchanged to `Cartographer.project`.

    Returns:
        The atlas returned by `Cartographer.project` for a fresh `Atlas`
        built from the publications of `atl`.
    """
    source_pubs = list(atl.publications.values())
    unprojected = Atlas(publications=source_pubs)
    cartographer = Cartographer(vectorizer=vectorizer)
    return cartographer.project(unprojected, **kwargs)

In [None]:
# Re-project the source atlas with the new vectorizer.
# Runtime note from an earlier run: not bad, gpt2 took < 2 hours.
atl_new: Atlas = copy_atlas(atl, vectorizer, batch_size=4) # batch_size is forwarded to Cartographer.project; 4 is the fallback for when 8 leads to memory issues
atl_new.projection

In [None]:
# Size of the new projection; presumably smaller than len(atl) when some
# publications could not be embedded -- TODO confirm.
len(atl_new.projection)

In [None]:
# save the atlas
# copy_atlas only passes the publications to the new atlas, so the center and
# history of the source atlas are carried over by hand before saving.
# TODO: create an atlas.copy or from_atlas classmethod
atl_new.center = atl.center
atl_new.history = atl.history
atl_new.save(save_dir)


In [None]:
# This step is necessary for word2vec which has failed embeddings

# Take converged pubs (found on the source atlas) that are also in the new atlas
converged_pub_ids = search_converged_ids(
    atl,
    num_pubs_added=1000,
)
# Evaluate `atl_new.ids` once and test membership against a set, instead of
# re-reading the attribute and scanning it for every candidate id.
new_atlas_ids = set(atl_new.ids)
converged_pub_ids = [pub_id for pub_id in converged_pub_ids if pub_id in new_atlas_ids]

In [None]:
# Measure with the different vectorizer: the re-projected atlas is measured
# using the new vectorizer, for the converged publications that are present in
# the new atlas, with the current field as the only entry of `fields_of_study`.
# The result is presumably a pandas DataFrame (it is written out with .to_csv).
df = atlas_to_measurements(
    atl_new,
    vectorizer=vectorizer,
    converged_pub_ids=converged_pub_ids,
    fields_of_study=[field],
)
df


In [None]:
import os

# Write the measurements into the target atlas directory, without the index column
save_fn = os.path.join(save_dir, "all_data.csv")
df.to_csv(save_fn, index=False)