### Notebook: Set the embedding layer, update Mongo database
1. From Mongodb
2. From Directory

In [1]:
import os
import json
import csv
import sys
# Get the current working directory (notebooks directory)
current_dir = os.getcwd()

# Go up one level to the project directory
project_dir = os.path.dirname(current_dir)

# Assuming your project structure is as described before
src_path = os.path.abspath(os.path.join(project_dir, 'src'))
hyperparam_path = os.path.abspath(os.path.join(project_dir, 'vars'))

# Add the 'src' directory to the Python path
sys.path.append(src_path)

from question_answer_site.question_answer.parse_document import pdfs_to_df, tokenize_df_of_texts
from question_answer_site.question_answer.embedding_layer import load_custom_vectors, update_mongo_document
from question_answer_site.question_answer.mongodb import MongoDb
from question_answer_site.question_answer.utils import tokens_to_embeddings
from question_answer_site.question_answer.config import TOKENIZER, TOKENS_TYPE, EMBEDDING_MODEL_TYPE, EMBEDDING_MODEL_FNAME,\
VECTOR_SIZE, WINDOW, MIN_COUNT, SG, DOCUMENT_EMBEDDING, INPUT_FOLDER, special_characters


from gensim.models import Word2Vec
from transformers import BertTokenizer, RobertaTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize

import spacy
from spacy.vocab import Vocab
from spacy.language import Language
from spacy.tokens import Doc

import numpy as np
from urllib.parse import quote_plus
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


#### Set embedding layer from mongodb
- Specify tokenizer, keep consistent with downstream Q&A model (TOKENIZER)
- Secify the data filepath (directory)

In [2]:
username = "new_user_1"
password = "password33566"
# Escape the username and password
escaped_username = quote_plus(username)
escaped_password = quote_plus(password)

cluster_url = "cluster0"
database_name = "question_answer"

In [3]:
# Get tokens and counter values from all documents in the mongodb
collection_name = "parsed_documents"

# Create a MongoClient and connect to the server
mongodb = MongoDb(escaped_username, escaped_password, cluster_url, database_name, collection_name)
if mongodb.connect():
    cursor = mongodb.get_collection().find({}, {TOKENS_TYPE:1, 'counter':1, '_id':0})

# Assuming 'data' is your list of dictionaries
df = pd.DataFrame(list(cursor))

In [4]:
df.head()

Unnamed: 0,tokens_less_sw,counter
0,"[Ġcommercial, Ġaviation, Ġcustomers, Ġmarch, Ġ...",115
1,"[Ġoriginal, mber, Ġretrieved, Ġse, pt, ember, ...",284
2,"[Ġj, ames, Ġweb, b, Ġspace, Ġtelescope, Ġwik, ...",142
3,"[ember, )., activ, ating, Ġstar, link, ..."", w...",292
4,"[priority, Ġscience, Ġgoal, Ġbeyond, Ġh, st, '...",154


In [5]:
# Train Word2Vec model
if EMBEDDING_MODEL_TYPE == 'Word2Vec':
    kwargs = {
     'sentences':df[TOKENS_TYPE].to_list(),
     'vector_size':VECTOR_SIZE,
     'window':WINDOW,
     'min_count':MIN_COUNT,
     'sg':SG
    }
    
    # Train the Word2Vec model
    model = Word2Vec(**kwargs)
    
    # Save the model
    model.save(os.path.join("..", "models", "word_embeddings", EMBEDDING_MODEL_FNAME))
    
elif EMBEDDING_MODEL_TYPE == 'glove':
    # Specify the file path for the output text file
    output_file = os.path.join("..", "models", "word_embeddings", "glove", 'training_data.txt')

    # Write the "tokens" column to a text file with each row on a separate line
    if os.getcwd().endswith('glove'):
        os.chdir(os.path.join("..", "..", "..", "notebooks"))
    df[TOKENS_TYPE].apply(lambda x: ' '.join(x)).to_csv(output_file, header=False, index=False, sep='\n', quoting=csv.QUOTE_NONE)

    os.environ["VECTOR_SIZE"] = str(VECTOR_SIZE)
    os.environ["WINDOW_SIZE"] = str(WINDOW)
    os.environ["VOCAB_MIN_COUNT"] = str(MIN_COUNT)
    sys.path.append(os.path.join("..", "models", "word_embeddings", "glove"))
    
    # Train the model
    os.chdir(os.path.join("..", "models", "word_embeddings", "glove"))
    !./demo.sh
    if os.getcwd().endswith('glove'):
        os.chdir(os.path.join("..", "..", "..", "notebooks"))
    
    # Path to your GloVe vectors file
    vectors_file = os.path.join("..", "models", "word_embeddings", "glove", "vectors.txt")

    # Load the custom spaCy model with GloVe vectors
    custom_nlp = load_custom_vectors(vectors_file)

    # Save the custom spaCy model to a directory
    custom_nlp.to_disk(os.path.join("..", "models", "word_embeddings", EMBEDDING_MODEL_FNAME.split(".bin")[0]))
    

mkdir -p build
BUILDDIR path: /Users/peterargo/Documents/projects/question_and_answer/models/word_embeddings/glove/build

$ /Users/peterargo/Documents/projects/question_and_answer/models/word_embeddings/glove/build/vocab_count -min-count 3 -verbose 2 < training_data.txt > vocab.txt
BUILDING VOCABULARY
Processed 0 tokens.[0GProcessed 83741 tokens.
Counted 8056 unique words.
Truncating vocabulary at min count 3.
Using vocabulary of size 3672.

$ /Users/peterargo/Documents/projects/question_and_answer/models/word_embeddings/glove/build/cooccur -memory 4.0 -vocab-file vocab.txt -verbose 2 -window-size 3 < training_data.txt > cooccurrence.bin
COUNTING COOCCURRENCES
window size: 3
context: symmetric
max product: 13752509
overflow length: 38028356
Reading vocab from file "vocab.txt"...loaded 3672 words.
Building lookup table...table contains 13483585 elements.
Processing token: 0[0GProcessed 83741 tokens.
Writing cooccurrences to disk.......2 files in total.
Merging cooccurrence files: proc

##### Add the embeddings model to to the dataframe

In [6]:
# Load your trained Word2Vec model
if EMBEDDING_MODEL_TYPE == 'Word2Vec':
    model = Word2Vec.load(os.path.join("..", "models", "word_embeddings", EMBEDDING_MODEL_FNAME))

elif EMBEDDING_MODEL_TYPE.lower() == 'glove':
    # Load the custom spaCy model
    model = spacy.load(os.path.join("..", "models", "word_embeddings", EMBEDDING_MODEL_FNAME.split(".bin")[0]))

# Update dataframe with token embeddings
df[DOCUMENT_EMBEDDING] = df[TOKENS_TYPE].apply(tokens_to_embeddings, args=(model,))

In [7]:
# Apply the function to update MongoDB for each row in the DataFrame
df.apply(update_mongo_document, args=(mongodb,), axis=1)

0      None
1      None
2      None
3      None
4      None
       ... 
364    None
365    None
366    None
367    None
368    None
Length: 369, dtype: object

#### Set the embedding layer from directory

In [None]:
# Set the directory 
directory = os.path.join("..", "data", INPUT_FOLDER)

In [17]:
# From the test pdf dir, extract the text and tokenize it. Store in pandas dataframe
df = pdfs_to_df(directory)
df = tokenize_df_of_texts(df, tokenizers[TOKENIZER], REMOVE_SW_COL=True, additional_stopwords=special_characters)

drop_cols = [col for col in df.columns if col not in ['Document', 'Text', 'Original_Text', 'Path', 'tokens', 'tokens_less_sw']]
print(drop_cols)

df = df.drop(columns=drop_cols)

../data/space_based_pdfs/Galaxy 15 - Wikipedia.pdf
../data/space_based_pdfs/Swarm Technologies - Wikipedia.pdf
../data/space_based_pdfs/Fengyun - Wikipedia.pdf
../data/space_based_pdfs/Falcon 9 - Wikipedia.pdf
../data/space_based_pdfs/Cygnus NG-19 - Wikipedia.pdf
../data/space_based_pdfs/Atlas V - Wikipedia.pdf
../data/space_based_pdfs/Inmarsat - Wikipedia.pdf
../data/space_based_pdfs/Kepler-11 - Wikipedia.pdf
../data/space_based_pdfs/James Webb Space Telescope - Wikipedia.pdf
../data/space_based_pdfs/Space-Based Infrared System - Wikipedia.pdf
../data/space_based_pdfs/Yaogan - Wikipedia.pdf
../data/space_based_pdfs/Starlink - Wikipedia.pdf
../data/space_based_pdfs/Atlas (rocket family) - Wikipedia.pdf
processing text...
making lower-case...
Removing non-text elements (extra whitespaces)...
Removing unnecessary whitespace and special characters...
Removing line breaks...
Removing gibberish...
Removing unicode...
remove single letters or super large words (so big they don't make sense).

In [18]:
df

Unnamed: 0,Document,Path,Text,Original_Text,tokens,tokens_less_sw
0,Galaxy 15 - Wikipedia.pdf,../data/space_based_pdfs/Galaxy 15 - Wikipedia...,"8/27/23, 9:28 galaxy 15 wikipedia 1/8 galaxy 1...","8/27/23, 9:28 PM\nGalaxy 15 - Wikipedia\nhttps...","[Ġ8, /, 27, /, 23, ,, Ġ9, :, 28, Ġgalaxy, Ġ15,...","[Ġgalaxy, Ġwik, ipedia, Ġgalaxy, Ġanimation, Ġ..."
1,Swarm Technologies - Wikipedia.pdf,../data/space_based_pdfs/Swarm Technologies - ...,"8/27/23, 9:31 swarm technologies wikipedia 1/5...","8/27/23, 9:31 PM\nSwarm Technologies - Wikiped...","[Ġ8, /, 27, /, 23, ,, Ġ9, :, 31, Ġswarm, Ġtech...","[Ġswarm, Ġtechnologies, Ġwik, ipedia, Ġswarm, ..."
2,Fengyun - Wikipedia.pdf,../data/space_based_pdfs/Fengyun - Wikipedia.pdf,"8/27/23, 9:29 fengyun wikipedia 1/4 fengyun ⻛云...","8/27/23, 9:29 PM\nFengyun - Wikipedia\nhttps:/...","[Ġ8, /, 27, /, 23, ,, Ġ9, :, 29, Ġf, en, gy, u...","[Ġf, en, gy, un, Ġwik, ipedia, Ġf, en, gy, un,..."
3,Falcon 9 - Wikipedia.pdf,../data/space_based_pdfs/Falcon 9 - Wikipedia.pdf,"8/27/23, 9:33 falcon wikipedia 1/24 falcon fal...","8/27/23, 9:33 PM\nFalcon 9 - Wikipedia\nhttps:...","[Ġ8, /, 27, /, 23, ,, Ġ9, :, 33, Ġfal, con, Ġw...","[Ġfal, con, Ġwik, ipedia, Ġfal, con, Ġfal, con..."
4,Cygnus NG-19 - Wikipedia.pdf,../data/space_based_pdfs/Cygnus NG-19 - Wikipe...,"8/27/23, 9:29 cygnus ng-19 wikipedia 1/4 ng-19...","8/27/23, 9:29 PM\nCygnus NG-19 - Wikipedia\nht...","[Ġ8, /, 27, /, 23, ,, Ġ9, :, 29, Ġcy, gn, us, ...","[Ġcy, gn, us, Ġng, Ġwik, ipedia, Ġng, Ġartists..."
5,Atlas V - Wikipedia.pdf,../data/space_based_pdfs/Atlas V - Wikipedia.pdf,"8/27/23, 9:36 atlas wikipedia 1/23 atlas launc...","8/27/23, 9:36 PM\nAtlas V - Wikipedia\nhttps:/...","[Ġ8, /, 27, /, 23, ,, Ġ9, :, 36, Ġat, las, Ġwi...","[las, Ġwik, ipedia, las, Ġlaunch, las, Ġcarryi..."
6,Inmarsat - Wikipedia.pdf,../data/space_based_pdfs/Inmarsat - Wikipedia.pdf,"8/27/23, 9:36 inmarsat wikipedia 1/20 inmarsat...","8/27/23, 9:36 PM\nInmarsat - Wikipedia\nhttps:...","[Ġ8, /, 27, /, 23, ,, Ġ9, :, 36, Ġin, m, ars, ...","[ars, Ġwik, ipedia, ars, Ġglobal, Ġtype, Ġsubs..."
7,Kepler-11 - Wikipedia.pdf,../data/space_based_pdfs/Kepler-11 - Wikipedia...,"8/27/23, 9:34 kepler-11 wikipedia 1/3 kepler-1...","8/27/23, 9:34 PM\nKepler-11 - Wikipedia\nhttps...","[Ġ8, /, 27, /, 23, ,, Ġ9, :, 34, Ġke, pler, -,...","[Ġke, pler, Ġwik, ipedia, Ġke, pler, Ġartist, ..."
8,James Webb Space Telescope - Wikipedia.pdf,../data/space_based_pdfs/James Webb Space Tele...,"8/27/23, 9:40 james webb space telescope wikip...","8/27/23, 9:40 PM\nJames Webb Space Telescope -...","[Ġ8, /, 27, /, 23, ,, Ġ9, :, 40, Ġj, ames, Ġwe...","[Ġj, ames, Ġweb, b, Ġspace, Ġtelescope, Ġwik, ..."
9,Space-Based Infrared System - Wikipedia.pdf,../data/space_based_pdfs/Space-Based Infrared ...,"8/27/23, 8:44 space-based infrared system wiki...","8/27/23, 8:44 PM\nSpace-Based Infrared System ...","[Ġ8, /, 27, /, 23, ,, Ġ8, :, 44, Ġspace, -, ba...","[Ġspace, based, Ġinfrared, Ġsystem, Ġwik, iped..."


#### Train model on tokenized text
- Set:
    - Input data: Either "tokens" or "tokens_less_sw" (TOKENS_TYPE)
    - Vector Size: length of word embeddings
    - Window Size: span of sorrounding words to train model
    - Min Count: minimum number of occurances of word to be be viable
    - Ouput model file name: (model_fname)

In [18]:
# Train Word2Vec model
embedding_model_type = hyperparams['embedding_model_type']
if embedding_model_type == 'Word2Vec':
    kwargs = {
     'sentences':df[TOKENS_TYPE].to_list(),
     'vector_size':VECTOR_SIZE,
     'window':WINDOW,
     'min_count':MIN_COUNT,
     'sg':hyperparams["sg"]
    }
    
    # Train the Word2Vec model
    model = Word2Vec(**kwargs)
    
    # Save the model
    model.save(os.path.join("..", "models", "word_embeddings", EMBEDDING_MODEL_FNAME))
    
elif embedding_model_type == 'glove':
    # Specify the file path for the output text file
    output_file = os.path.join("..", "models", "word_embeddings", "glove", 'training_data.txt')

    # Write the "tokens" column to a text file with each row on a separate line
    if os.getcwd().endswith('glove'):
        os.chdir(os.path.join("..", "..", "..", "notebooks"))
    df[TOKENS_TYPE].apply(lambda x: ' '.join(x)).to_csv(output_file, header=False, index=False, sep='\n', quoting=csv.QUOTE_NONE)

    os.environ["VECTOR_SIZE"] = str(VECTOR_SIZE)
    os.environ["WINDOW_SIZE"] = str(WINDOW)
    os.environ["VOCAB_MIN_COUNT"] = str(MIN_COUNT)
    sys.path.append(os.path.join("..", "models", "word_embeddings", "glove"))
    
    # Train the model
    os.chdir(os.path.join("..", "models", "word_embeddings", "glove"))
    !./demo.sh
    if os.getcwd().endswith('glove'):
        os.chdir(os.path.join("..", "..", "..", "notebooks"))
    
    # Path to your GloVe vectors file
    vectors_file = os.path.join("..", "models", "word_embeddings", "glove", "vectors.txt")

    # Load the custom spaCy model with GloVe vectors
    custom_nlp = load_custom_vectors(vectors_file)

    # Save the custom spaCy model to a directory
    custom_nlp.to_disk(os.path.join("..", "models", "word_embeddings", EMBEDDING_MODEL_FNAME.split(".bin")[0]))
    

mkdir -p build
BUILDDIR path: /Users/peterargo/Documents/projects/question_and_answer/models/word_embeddings/glove/build

$ /Users/peterargo/Documents/projects/question_and_answer/models/word_embeddings/glove/build/vocab_count -min-count 3 -verbose 2 < training_data.txt > vocab.txt
BUILDING VOCABULARY
Processed 0 tokens.[0GProcessed 72779 tokens.
Counted 7751 unique words.
Truncating vocabulary at min count 3.
Using vocabulary of size 3410.

$ /Users/peterargo/Documents/projects/question_and_answer/models/word_embeddings/glove/build/cooccur -memory 4.0 -vocab-file vocab.txt -verbose 2 -window-size 3 < training_data.txt > cooccurrence.bin
COUNTING COOCCURRENCES
window size: 3
context: symmetric
max product: 13752509
overflow length: 38028356
Reading vocab from file "vocab.txt"...loaded 3410 words.
Building lookup table...table contains 11628101 elements.
Processing token: 0[0GProcessed 72779 tokens.
Writing cooccurrences to disk.......2 files in total.
Merging cooccurrence files: proc

In [15]:
embedding_model_fname

'roberta_space_based_pdfs_glove_model.bin'

#### Examine Model

In [6]:
from collections import Counter

# Count token frequencies
token_frequencies = Counter(df['tokens'].to_list())

# Print the frequency of "number"
print("Frequency of 'number':", token_frequencies["number"])

TypeError: unhashable type: 'list'

In [15]:
# Load the trained Word2Vec model
model = Word2Vec.load(os.path.join("..", "models", "word_embeddings", EMBEDDING_MODEL_FNAME))

word = "revenue"

# Add the special preface character if the tokenizer for roberta was used
word = f"Ġ{word}" if TOKENIZER == 'roberta' else word

# Access the embedding of a word
embedding = model.wv[word]
print(embedding)
# Find similar words based on embedding similarity
similar_words = model.wv.most_similar(word)
print(similar_words)

# You can also perform vector arithmetic operations
# result = model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=1)

KeyError: "Key 'Ġrevenue' not present"

In [16]:
vocabulary = model.wv.index_to_key
print("Number of words in vocabulary:", len(vocabulary))
print("Is 'number' in vocabulary?", 'number' in vocabulary)

Number of words in vocabulary: 3410
Is 'number' in vocabulary? True


In [14]:
token = '[PAD]'

print(model.wv['[PAD]'])

KeyError: "Key '[PAD]' not present"