In [6]:
import os
import plotnine as pn
import pandas as pd
import numpy as np

from sciterra import Atlas

With default Word2Vec parameters, we get a vocabulary of roughly 17,000 words. This is intuitively too large for our dataset of 30k abstracts (roughly 3M words), so we should explore how to make this vocabulary smaller. However, the original Word2Vec paper evaluated on datasets orders of magnitude larger than ours, so perhaps this is not a problem for Word2Vec, but just for BOW.

Specifically, they report results using only the most frequent 30k words, so their ratio of vocabulary size to corpus size is far smaller than ours.

In [2]:
# Root directory that holds one Word2Vec atlas directory per field. Repoint
# this single constant to run the notebook against outputs stored elsewhere.
W2V_OUTPUTS_DIR = "/Users/nathanielimel/uci/projects/citesim/outputs/librarian=S2/vectorizer=Word2Vec"

# Field name -> value of the `center=` component of that field's atlas
# directory (presumably the identifier of the atlas's center publication).
CENTER_BY_FIELD = {
    "Physics": "hafenLowredshiftLymanLimit2017",
    "Linguistics": "Imeletal2022",
    "Medicine": "Torres2013",
    "Education": "Ololube2012",
    "Philosophy": "Bacon2019",
    "Economics": "West2003",
    "Materials Science": "Miele2022",
    "Geology": "ForeroOrtega2021",
    "Mathematics": "Andre2018",
}

# Field name -> atlas directory, built as `<root>/center=<center>`.
# Plain string formatting (not os.path.join) keeps the paths byte-identical
# to the previously hard-coded ones on every platform.
atlas_dirs_w2v = {
    field_name: f"{W2V_OUTPUTS_DIR}/center={center}"
    for field_name, center in CENTER_BY_FIELD.items()
}

In [7]:
# Load the atlas for every field up front, keyed by field name.
# All atlases are kept in memory at once, so this might be memory intensive.
atlases_by_field: dict[str, Atlas] = {
    field_name: Atlas.load(atlas_dir)
    for field_name, atlas_dir in atlas_dirs_w2v.items()
}

In [8]:

from sciterra.vectorization import Word2VecVectorizer

In [9]:
# Choose the field whose trained Word2Vec model the cells below inspect.
field = "Physics"
model_path = os.path.join(atlas_dirs_w2v[field], "w2v.model")

# No corpus is supplied (corpus_path=None); only the path of the saved model
# inside the field's atlas directory is passed.
vectorizer = Word2VecVectorizer(model_path=model_path, corpus_path=None)

Loading saved Word2Vec model from /Users/nathanielimel/uci/projects/citesim/outputs/librarian=S2/vectorizer=Word2Vec/center=hafenLowredshiftLymanLimit2017/w2v.model.
Saving Word2Vec model at /Users/nathanielimel/uci/projects/citesim/outputs/librarian=S2/vectorizer=Word2Vec/center=hafenLowredshiftLymanLimit2017/w2v.model.


In [11]:
# Inspect the `corpus_count` recorded on the loaded model
# (presumably the number of documents it was trained on — TODO confirm).
vectorizer.model.corpus_count

39742

In [12]:
# Vocabulary cap configured on the model. No value is displayed for this cell,
# which presumably means the attribute is None (no cap) — TODO confirm.
vectorizer.model.max_vocab_size

In [15]:
# Number of entries in the model's word vectors, i.e. the vocabulary size.
len(vectorizer.model.wv)

16975

In [16]:
# Total word count recorded for the corpus
# (presumably raw tokens rather than unique words — TODO confirm).
vectorizer.model.corpus_total_words

3626685

In [17]:
# Ratio of vocabulary size to the total number of words in the corpus.
(
    len(vectorizer.model.wv)  # vocabulary size
    / vectorizer.model.corpus_total_words  # total corpus words
)

0.004680582956611892

In [18]:
# How do we get the frequencies?
# First check the minimum count in effect on the model (presumably words seen
# fewer times than this are excluded from the vocabulary — TODO confirm).
vectorizer.model.effective_min_count

2

In [27]:
# NOTE(review): an earlier draft read counts via `w2v.vocab["word"].count`,
# which looks like the pre-4.0 gensim API — confirm. Here the per-word counts
# are read from `wv.expandos['count']`. `wv.key_to_index` is a different
# thing: it maps each word to its INDEX, not to its count.
counts_before_sort = vectorizer.model.wv.expandos['count']

# sort_by_descending_frequency() reorders the word vectors in place. Only call
# it when the counts are not already in descending order, so that re-running
# this cell does not permute the loaded model again. Later cells slice the
# count array by position and therefore still rely on descending order.
if not np.all(counts_before_sort[:-1] >= counts_before_sort[1:]):
    vectorizer.model.wv.sort_by_descending_frequency()

# Re-read the attribute before displaying it, so the output reflects any
# reordering done above.
vectorizer.model.wv.expandos['count']

array([113302,  45570,  38499, ...,      2,      2,      2])

Can we reduce vocab using min count 5?

In [30]:
# List which per-word attribute arrays are stored in `expandos`.
vectorizer.model.wv.expandos.keys()

dict_keys(['count', 'sample_int'])

In [32]:
# The first 10,000 include words of freq 4
# (this positional slice assumes the counts are in descending order, as the
# output of the earlier count cell shows).
vectorizer.model.wv.expandos['count'][:10000]

array([113302,  45570,  38499, ...,      4,      4,      4])

In [33]:
# The first 1000 include words of freq 549
# NOTE(review): a 1000-element slice is presumably at NumPy's default print
# threshold, so it is displayed in full rather than abbreviated with `...`.
vectorizer.model.wv.expandos['count'][:1000]

array([113302,  45570,  38499,  29350,  29328,  28699,  28348,  28313,
        28241,  28071,  27628,  27062,  25481,  23306,  23041,  20018,
        19986,  18933,  18766,  18242,  17959,  17426,  17332,  16945,
        16403,  16248,  15165,  14817,  14673,  14607,  14601,  14239,
        14219,  13417,  12551,  12501,  12371,  12341,  12104,  11901,
        11620,  11510,  11474,  11294,  11119,  11040,  10986,  10839,
        10824,  10750,  10534,  10418,  10396,  10307,  10187,  10094,
        10016,   9988,   9942,   9926,   9637,   9345,   9177,   9097,
         9070,   8688,   8616,   8610,   8393,   8305,   8182,   8135,
         8020,   7944,   7932,   7882,   7821,   7807,   7795,   7641,
         7540,   7411,   7381,   7362,   7357,   7310,   7309,   7278,
         7253,   7245,   7192,   7189,   7143,   7065,   7064,   6907,
         6904,   6894,   6862,   6834,   6819,   6714,   6712,   6687,
         6517,   6475,   6474,   6435,   6307,   6298,   6285,   6262,
      

In [34]:
# Perhaps, at least for Word2Vec, we should aim for a max vocab size of 10,000.