In [2]:
import os

import sys
sys.path.append(os.path.join('..', 'src'))

from utils import pdfs_to_df, tokenize_df_of_texts
from gensim.models import Word2Vec
from transformers import BertTokenizer
bert_base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
from nltk.tokenize import sent_tokenize, word_tokenize

  from .autonotebook import tqdm as notebook_tqdm


#### Get text from test corpus
- Specify tokenizer, keep consistent with downstream Q&A model

In [3]:
# From the test pdf dir, extract the text and tokenize it. Store in pandas dataframe

directory = os.path.join("..", "data", "test_pdfs")
df = pdfs_to_df(directory)
df = tokenize_df_of_texts(df, bert_base_tokenizer)

drop_cols = [col for col in df.columns if col not in ['Document', 'Text', 'Original_Text', 'Path', 'tokens']]
print(drop_cols)

df = df.drop(columns=drop_cols)

../data/test_pdfs/2101.00031.pdf
../data/test_pdfs/2101.01089.pdf
../data/test_pdfs/2101.00182.pdf
../data/test_pdfs/2101.00525.pdf
../data/test_pdfs/2101.01017.pdf
../data/test_pdfs/2101.00005.pdf
../data/test_pdfs/2101.00763.pdf
../data/test_pdfs/2101.01291.pdf
../data/test_pdfs/2101.00831.pdf
../data/test_pdfs/2101.01094.pdf
../data/test_pdfs/2101.00572.pdf
processing text...
making lower-case...
Removing non-text elements (extra whitespaces)...
Removing unnecessary whitespace and special characters...
Removing line breaks...
Removing gibberish...
Removing unicode...
remove single letters or super large words (so big they don't make sense)...
done cleaning.

tokenize the processed text...
['Abstract', 'Abstract_Original', 'sha_256', 'language', 'language_probability', 'Authors', 'Title', 'url', 'date', 'token_embeddings']


In [4]:
df

Unnamed: 0,Document,Text,Original_Text,Path,tokens
0,2101.00031.pdf,lagrangian cobordisms between legendrian knots...,. Lagrangian cobordisms between Legendrian kno...,../data/test_pdfs/2101.00031.pdf,"[la, ##gra, ##ng, ##ian, co, ##bor, ##dis, ##m..."
1,2101.01089.pdf,cotangent sums play signiﬁcant role in the nym...,. Cotangent sums play a signiﬁcant role in the...,../data/test_pdfs/2101.01089.pdf,"[cot, ##ange, ##nt, sums, play, sign, ##i, ##ﬁ..."
2,2101.00182.pdf,"let be an open subset of rn, and let p, [1, ∞]...",". Let Ω be an open subset of RN, and let p, q ...",../data/test_pdfs/2101.00182.pdf,"[let, be, an, open, subset, of, rn, ,, and, le..."
3,2101.00525.pdf,the multivariable autoregressive ﬁlter problem...,\nThe multivariable autoregressive ﬁlter probl...,../data/test_pdfs/2101.00525.pdf,"[the, multi, ##var, ##iable, auto, ##re, ##gre..."
4,2101.01017.pdf,we derive an upper bound for the assouad dimen...,. We derive an upper bound for the Assouad dim...,../data/test_pdfs/2101.01017.pdf,"[we, derive, an, upper, bound, for, the, ass, ..."
5,2101.00005.pdf,"while teaching course on integral equations, n...",\nWhile teaching a course on integral equation...,../data/test_pdfs/2101.00005.pdf,"[while, teaching, course, on, integral, equati..."
6,2101.00763.pdf,let is certain tensor product of simple dyadic...,. Let T is a certain tensor product of simple ...,../data/test_pdfs/2101.00763.pdf,"[let, is, certain, tensor, product, of, simple..."
7,2101.01291.pdf,we generalize our previous new deﬁnition of eu...,. We generalize our previous new deﬁnition of ...,../data/test_pdfs/2101.01291.pdf,"[we, general, ##ize, our, previous, new, de, #..."
8,2101.00831.pdf,"given constants x, and the space of entire fun...",". Given constants x, ν ∈ C and the space H0 of...",../data/test_pdfs/2101.00831.pdf,"[given, constant, ##s, x, ,, and, the, space, ..."
9,2101.01094.pdf,logarithmic potentials and many other potentia...,. Logarithmic potentials and many other potent...,../data/test_pdfs/2101.01094.pdf,"[log, ##ari, ##th, ##mic, potential, ##s, and,..."


#### Train model on tokenized text
- Set:
    - Vector Size: length of word embeddings
    - Window Size: span of sorrounding words to train model
    - Min Count: minimum number of occurances of word to be be viable

In [5]:
# Load your DataFrame with tokenized texts
# Train Word2Vec model
model = Word2Vec(sentences=df['tokens'].to_list(), vector_size=100, window=5, min_count=1, sg=0)

# Save the trained model
model.save(os.path.join("..", "models", "word_embeddings", "word2vec_model.bin"))

#### Examine Model

In [6]:
from collections import Counter

# Count token frequencies
token_frequencies = Counter(df['tokens'].to_list())

# Print the frequency of "number"
print("Frequency of 'number':", token_frequencies["number"])

TypeError: unhashable type: 'list'

In [16]:
# Load the trained Word2Vec model
model = Word2Vec.load("word2vec_model.bin")

# Access the embedding of a word
embedding = model.wv["manifold"]
print(embedding)
# Find similar words based on embedding similarity
similar_words = model.wv.most_similar("manifold")
print(similar_words)

# You can also perform vector arithmetic operations
# result = model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=1)

[ 6.50522113e-02  7.75543526e-02 -8.95991474e-02 -1.86093464e-01
  8.59568715e-02 -1.37554884e-01 -4.07235883e-02  1.45292446e-01
 -7.10864216e-02 -6.32060394e-02  2.07130127e-02 -1.79543078e-01
 -2.39383861e-01 -3.73561010e-02  4.36756946e-02  5.16275242e-02
 -1.59504488e-01 -1.03847787e-01  6.49311543e-02 -4.41587061e-01
 -3.52784097e-02  6.61083311e-02  2.50223696e-01 -5.47317648e-03
 -1.99339658e-01 -8.27215165e-02  2.91567110e-02 -1.79247096e-01
  1.18086323e-01  9.77727864e-03  1.70967519e-01  1.92541592e-02
  2.49241024e-01  7.86685124e-02  1.71444342e-02  8.05471931e-03
 -5.90218492e-02  5.57746142e-02  1.29359856e-01 -1.01037314e-02
  1.69563130e-01 -2.50092298e-01  1.74485728e-01  7.36795142e-02
  9.38774720e-02  3.18660997e-02 -2.03889050e-02 -1.47375315e-01
 -2.41411166e-04  1.45220965e-01 -5.30875847e-02  2.34161764e-01
  1.24412812e-01  5.06878309e-02 -6.93341121e-02  1.41992196e-01
  1.47458076e-01 -3.54603343e-02  1.14661060e-01 -4.90062125e-02
  1.29437506e-01  1.65118

In [13]:
vocabulary = model.wv.index_to_key
print("Number of words in vocabulary:", len(vocabulary))
print("Is 'number' in vocabulary?", 'number' in vocabulary)

Number of words in vocabulary: 3207
Is 'number' in vocabulary? True


In [14]:
token = '[PAD]'

print(model.wv['[PAD]'])

KeyError: "Key '[PAD]' not present"