In [1]:
from sciterra import Atlas
from sciterra.vectorization import vectorizers

import sys
sys.path.append("/Users/nathanielimel/uci/projects/citesim/src")

from analysis.plot import atlas_to_measurements
from analysis.plot import search_converged_ids

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


In [2]:
# Directory that contains every SciBERT-projected source atlas. Kept in one
# place so the notebook can be pointed at another checkout by editing a
# single line instead of nine.
_SCIBERT_ATLAS_ROOT = "/Users/nathanielimel/uci/projects/citesim/outputs/librarian=S2/vectorizer=SciBERT"

# Field of study -> value of the `center=` path component of its atlas directory.
_FIELD_CENTERS = {
    "Physics": "hafenLowredshiftLymanLimit2017",
    "Linguistics": "Imeletal2022",
    "Medicine": "Torres2013",
    "Education": "Ololube2012",
    "Philosophy": "Bacon2019",
    "Economics": "West2003",
    "Materials Science": "Miele2022",
    "Geology": "ForeroOrtega2021",
    "Mathematics": "Andre2018",
}

# Field of study -> directory of the source atlas for that field.
atlas_dirs = {
    field_name: f"{_SCIBERT_ATLAS_ROOT}/center={center_name}"
    for field_name, center_name in _FIELD_CENTERS.items()
}

Set the new vectorizer that will be used to re-project the atlas.

In [3]:
# Name of the vectorizer to re-project the atlas with. It is used as a key
# into `sciterra.vectorization.vectorizers` and as the `vectorizer=` component
# of the target directory.
new_vectorizer = "BOW"

In [4]:
# Load the source atlas for the field.
# `field` must be a key of `atlas_dirs`; later cells reuse it to derive the
# target directory and as the `fields_of_study` argument of the measurements.
field = "Geology"
atl = Atlas.load(atlas_dirs[field])
len(atl)  # size of the source atlas (presumably its number of publications)

30223

In [5]:
# Set the target atlas dir by swapping the `vectorizer=` component of the
# source atlas path for the new vectorizer's name.
source_dir = atlas_dirs[field]
save_dir = source_dir.replace("vectorizer=SciBERT", f"vectorizer={new_vectorizer}")

# `str.replace` returns the input unchanged when the pattern is missing, and
# the substitution is also a no-op when the new vectorizer is SciBERT itself.
# In both cases later cells would write into the source atlas directory, so
# fail early instead of silently overwriting it.
if save_dir == source_dir:
    raise ValueError(
        f"Target atlas dir equals the source atlas dir ({source_dir!r}); "
        "refusing to overwrite the source atlas."
    )
save_dir

'/Users/nathanielimel/uci/projects/citesim/outputs/librarian=S2/vectorizer=BOW/center=ForeroOrtega2021'

In [6]:

import os

# Build the keyword arguments for the chosen vectorizer.
# "Word2Vec" and "BOW" both take a plain-text corpus of the atlas's abstracts
# plus a model path (in the recorded run, BOW also trains and saves a Word2Vec
# model at `model_path`). Every other vectorizer only receives a device.
if new_vectorizer in ("Word2Vec", "BOW"):
    corpus_path = f"{save_dir}/corpus.txt"
    model_path = f"{save_dir}/w2v.model"
    overwrite = False  # set to True to regenerate an existing corpus file

    if overwrite or not os.path.exists(corpus_path):
        # The target directory is specific to the new vectorizer and may not
        # exist yet; without this, open(..., "w") raises FileNotFoundError.
        os.makedirs(save_dir, exist_ok=True)

        # Skip publications without an abstract: str.join raises TypeError
        # on None.
        # NOTE(review): abstracts are written verbatim, so the corpus can have
        # more lines than publications (the recorded run tokenized 50203
        # lines for 30223 publications).
        abstracts = "\n".join(
            pub.abstract
            for pub in atl.publications.values()
            if pub.abstract is not None
        )
        with open(corpus_path, "w") as f:
            # `abstracts` is one string; write() emits it in a single call,
            # whereas writelines() would iterate it character by character.
            f.write(abstracts)

    vectorizer_kwargs = dict(
        corpus_path=corpus_path,
        model_path=model_path,
        min_count=5,  # presumably the minimum token frequency for the vocabulary — confirm
    )
else:
    # NOTE(review): "mps" looks like the Apple-silicon GPU device name — confirm
    # it matches the machine this notebook runs on.
    vectorizer_kwargs = dict(device="mps")

In [7]:
# Initialize the vectorizer: look up the callable registered under the chosen
# name, then construct it with the keyword arguments prepared for it.
vectorizer_factory = vectorizers[new_vectorizer]
vectorizer = vectorizer_factory(**vectorizer_kwargs)
vectorizer

Loading and tokenizing data from /Users/nathanielimel/uci/projects/citesim/outputs/librarian=S2/vectorizer=BOW/center=ForeroOrtega2021/corpus.txt for vocabulary and training...


tokenizing lines: 50203it [40:03, 20.89it/s] 


Training Word2Vec model...
Loaded corpus and trained model in 2533.67 seconds.
Saving Word2Vec model at /Users/nathanielimel/uci/projects/citesim/outputs/librarian=S2/vectorizer=BOW/center=ForeroOrtega2021/w2v.model.


<sciterra.vectorization.bow.BOWVectorizer at 0x29e673f70>

In [8]:
from sciterra import Cartographer
from sciterra.vectorization.vectorizer import Vectorizer
def copy_atlas(atl: Atlas, vectorizer: Vectorizer, **kwargs) -> Atlas:
    """Re-project the publications of an existing atlas with another vectorizer.

    Args:
        atl: the atlas whose publications are reused.
        vectorizer: the vectorizer given to the `Cartographer` that computes the new projection.
        **kwargs: forwarded unchanged to `Cartographer.project`.

    Returns:
        The atlas returned by `Cartographer.project` for a fresh `Atlas` built
        from the publications of `atl`. Only the publications are carried
        over here; no other attribute of `atl` is copied.
    """
    fresh_atlas = Atlas(publications=[*atl.publications.values()])
    cartographer = Cartographer(vectorizer=vectorizer)
    return cartographer.project(fresh_atlas, **kwargs)

In [9]:
# Re-project every publication of the source atlas with the new vectorizer.
# Author's timing note from an earlier run with another vectorizer: gpt2 took
# < 2 hours. `batch_size` is forwarded to `Cartographer.project` by `copy_atlas`.
atl_new: Atlas = copy_atlas(atl, vectorizer, batch_size=4) # try 4 if 8 leads to memory issues
atl_new.projection

embedding documents: 100%|██████████| 30223/30223 [00:07<00:00, 4168.91it/s]


<sciterra.vectorization.projection.Projection at 0x2b46aa140>

In [10]:
# Size of the new projection. In the recorded run it is 30144, fewer than the
# 30223 entries of the source atlas, so presumably some publications could
# not be embedded by the new vectorizer.
len(atl_new.projection)

30144

In [11]:
# Save the re-projected atlas to the target directory.
# `copy_atlas` only transfers the publications, so the source atlas's
# `center` and `history` are carried over by hand before saving.
# TODO: create an atlas.copy or from_atlas classmethod
# NOTE(review): `history` is assigned without copying, so both atlases
# presumably share the same object — confirm that is intended.
atl_new.center = atl.center
atl_new.history = atl.history
atl_new.save(save_dir)




In [12]:
# This step is necessary whenever the new vectorizer fails to embed some
# publications (originally noted for word2vec; in the recorded BOW run the new
# projection is also smaller than the source atlas): ids that converged in
# the source atlas may be missing from the re-projected one.

# Take converged pubs from the source atlas ...
converged_pub_ids = search_converged_ids(
    atl,
    num_pubs_added=1000,
)

# ... and keep only those that are in the new atlas. The ids are collected
# into a set once, so `atl_new.ids` is not re-read and scanned for every
# candidate, and the loop variable no longer shadows the builtin `id`.
embedded_ids = set(atl_new.ids)
converged_pub_ids = [pub_id for pub_id in converged_pub_ids if pub_id in embedded_ids]

Between update 132 and the final update (136) there were 1231 publications added to the Atlas.
Convergence criterion dict_items([('num_pubs_added', 1000), ('kernel_size', 16)]) (=> index 132 out of 137 total updates) yields 14616 ids out of 30223 total ids.


In [13]:
# Measure with the different vectorizer: compute measurements for the
# converged publications of the re-projected atlas. The recorded output shows
# 'density' and 'edginess' being computed, and fewer observations after
# filtering than converged ids.
# NOTE(review): `fields_of_study` presumably restricts the rows to
# publications tagged with this field — confirm against
# `analysis.plot.atlas_to_measurements`.

df = atlas_to_measurements(
    atl_new,
    vectorizer=vectorizer,
    converged_pub_ids=converged_pub_ids,
    fields_of_study=[field],
)
df


computing cosine similarity for 14581 embeddings with batch size 1000.


100%|██████████| 15/15 [08:43<00:00, 34.88s/it]


Computing ['density', 'edginess'] for 14581 publications.


100%|██████████| 14581/14581 [00:40<00:00, 358.28it/s]


There are 8288 total observations after filtering.


Unnamed: 0,density,edginess,references,citations_per_year,is_center,identifier,year
0,12.816424,0.657398,35,1.400000,False,57d0d51c3a15e7f229a5b3a83b8a18babe116da0,2015
1,14.605489,0.574547,50,5.333333,False,04fafa3eb4b7ffb2f68b2615ddf4aff2b02348fe,2014
3,13.477564,0.656371,69,3.625000,False,b212fdfa258b30082525e57e980d041592bf47be,2012
4,14.143324,0.664589,32,1.400000,False,59729f67c63dfd3e5d3d535bda173c3474eb0d64,2015
7,14.335489,0.763988,13,0.714286,False,df0b98de865966ed596fa236c9b882ce84968edc,2013
...,...,...,...,...,...,...,...
14573,14.456651,0.645458,16,0.500000,False,d7132b0b4e90701427d654b375b75a3aa17e01ee,2018
14574,12.925983,0.638020,182,1.882353,False,2e5b74e04ccca8306d7fbf5834786a340769a8cb,2003
14575,13.725363,0.709486,70,1.736842,False,6a609348b16f68036c3cf56d8f43f34f8d2ac4db,2001
14577,12.790771,0.789096,47,0.529412,False,0f9ab27d0f3aee27c9a4ec52c23b55ddcd0940ed,2003


In [14]:
import os

# Write the measurements table into the target atlas directory, leaving the
# DataFrame index out of the CSV.
save_fn = os.path.join(save_dir, "all_data.csv")
df.to_csv(save_fn, index=False)