In [1]:
import os
import json

import sys
sys.path.append(os.path.join('..', 'src'))

from utils import pdfs_to_df, tokenize_df_of_texts
from gensim.models import Word2Vec
from transformers import BertTokenizer, RobertaTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize

  from .autonotebook import tqdm as notebook_tqdm


#### Get the Hyperparameters

In [9]:
with open(os.path.join("..", "vars", "hyperparameters1.json")) as json_file:
    hyperparams = json.load(json_file)
    print(json.dumps(hyperparams, indent=4))

{
    "TOKENIZER": "roberta",
    "input_folder": "space_based_pdfs",
    "embedding_model_type": "Word2Vec",
    "embedding_model_fname": "roberta_space_based_pdfs_Word2Vec_model.bin",
    "vector_size": 50,
    "window": 3,
    "min_count": 3,
    "sg": 0,
    "TOKENS_TPYE": "tokens_less_sw",
    "chunk_size": 350,
    "chunk_overlap": 0,
    "max_query_length": 20,
    "top_N": 20,
    "TOKENS_EMBEDDINGS": "query_search_less_sw",
    "DOCUMENT_EMBEDDING": "token_embeddings_less_sw",
    "DOCUMENT_TOKENS": "tokens_less_sw",
    "METHOD": "MEAN_MAX",
    "transformer_model_name": "deepset/roberta-base-squad2",
    "context_size": 350
}


#### Get text from test corpus
- Specify tokenizer, keep consistent with downstream Q&A model (TOKENIZER)
- Secify the data filepath (directory)

In [10]:
# Set the Tokenizer
TOKENIZER = hyperparams['TOKENIZER']

bert_base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
roberta_tokenizer = RobertaTokenizer.from_pretrained("deepset/roberta-base-squad2", add_prefix_space = True)

tokenizers = {'bert': bert_base_tokenizer, 'roberta': roberta_tokenizer}

# Set the directory 
input_folder = hyperparams['input_folder']
directory = os.path.join("..", "data", input_folder)

In [11]:
# Specify additional stopwords to remove from the chunk cleaned for the candidate document search
special_characters = [
    "!", "\"", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/", "://", "https",'"', '"...', "/)","www",
    ":", ";", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`", "{", "|", "}", "~", ".[", ",[", "-,", "][", "com",
    "),", ',"'
]

special_characters += list(map(lambda x: "Ġ" + x, special_characters))
print(special_characters)

# Add numbers to remove
special_characters += list(map(lambda x: str(x), range(100000)))
special_characters += list(map(lambda x: "Ġ" + str(x), range(100000)))

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '://', 'https', '"', '"...', '/)', 'www', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '.[', ',[', '-,', '][', 'com', '),', ',"', 'Ġ!', 'Ġ"', 'Ġ#', 'Ġ$', 'Ġ%', 'Ġ&', "Ġ'", 'Ġ(', 'Ġ)', 'Ġ*', 'Ġ+', 'Ġ,', 'Ġ-', 'Ġ.', 'Ġ/', 'Ġ://', 'Ġhttps', 'Ġ"', 'Ġ"...', 'Ġ/)', 'Ġwww', 'Ġ:', 'Ġ;', 'Ġ<', 'Ġ=', 'Ġ>', 'Ġ?', 'Ġ@', 'Ġ[', 'Ġ\\', 'Ġ]', 'Ġ^', 'Ġ_', 'Ġ`', 'Ġ{', 'Ġ|', 'Ġ}', 'Ġ~', 'Ġ.[', 'Ġ,[', 'Ġ-,', 'Ġ][', 'Ġcom', 'Ġ),', 'Ġ,"']


In [12]:
# From the test pdf dir, extract the text and tokenize it. Store in pandas dataframe
df = pdfs_to_df(directory)
df = tokenize_df_of_texts(df, tokenizers[TOKENIZER], REMOVE_SW_COL=True, additional_stopwords=special_characters)

drop_cols = [col for col in df.columns if col not in ['Document', 'Text', 'Original_Text', 'Path', 'tokens', 'tokens_less_sw']]
print(drop_cols)

df = df.drop(columns=drop_cols)

../data/space_based_pdfs/Galaxy 15 - Wikipedia.pdf
../data/space_based_pdfs/Swarm Technologies - Wikipedia.pdf
../data/space_based_pdfs/Fengyun - Wikipedia.pdf
../data/space_based_pdfs/Falcon 9 - Wikipedia.pdf
../data/space_based_pdfs/Cygnus NG-19 - Wikipedia.pdf
../data/space_based_pdfs/Atlas V - Wikipedia.pdf
../data/space_based_pdfs/Inmarsat - Wikipedia.pdf
../data/space_based_pdfs/Kepler-11 - Wikipedia.pdf
../data/space_based_pdfs/James Webb Space Telescope - Wikipedia.pdf
../data/space_based_pdfs/Space-Based Infrared System - Wikipedia.pdf
../data/space_based_pdfs/Yaogan - Wikipedia.pdf
../data/space_based_pdfs/Starlink - Wikipedia.pdf
../data/space_based_pdfs/Atlas (rocket family) - Wikipedia.pdf
processing text...
making lower-case...
Removing non-text elements (extra whitespaces)...
Removing unnecessary whitespace and special characters...
Removing line breaks...
Removing gibberish...
Removing unicode...
remove single letters or super large words (so big they don't make sense).

In [13]:
df.head(2)

Unnamed: 0,Document,Path,Text,Original_Text,tokens,tokens_less_sw
0,Galaxy 15 - Wikipedia.pdf,../data/space_based_pdfs/Galaxy 15 - Wikipedia...,"8/27/23, 9:28 galaxy 15 wikipedia 1/8 galaxy 1...","8/27/23, 9:28 PM\nGalaxy 15 - Wikipedia\nhttps...","[Ġ8, /, 27, /, 23, ,, Ġ9, :, 28, Ġgalaxy, Ġ15,...","[Ġgalaxy, Ġwik, ipedia, Ġgalaxy, Ġanimation, Ġ..."
1,Swarm Technologies - Wikipedia.pdf,../data/space_based_pdfs/Swarm Technologies - ...,"8/27/23, 9:31 swarm technologies wikipedia 1/5...","8/27/23, 9:31 PM\nSwarm Technologies - Wikiped...","[Ġ8, /, 27, /, 23, ,, Ġ9, :, 31, Ġswarm, Ġtech...","[Ġswarm, Ġtechnologies, Ġwik, ipedia, Ġswarm, ..."


#### Train model on tokenized text
- Set:
    - Input data: Either "tokens" or "tokens_less_sw" (TOKENS_TYPE)
    - Vector Size: length of word embeddings
    - Window Size: span of sorrounding words to train model
    - Min Count: minimum number of occurances of word to be be viable
    - Ouput model file name: (model_fname)

In [14]:
# TOKENS_TPYE = "tokens"
TOKENS_TPYE = hyperparams["TOKENS_TPYE"]

kwargs = {
     'sentences':df[TOKENS_TPYE].to_list(),
     'vector_size':hyperparams["vector_size"],
     'window':hyperparams["window"],
     'min_count':hyperparams["min_count"],
     'sg':hyperparams["sg"]
    }

# Train Word2Vec model
embedding_model_type = hyperparams['embedding_model_type']
if embedding_model_type == 'Word2Vec':
    model = Word2Vec(**kwargs)

embedding_model_fname = hyperparams["embedding_model_fname"]
print(embedding_model_fname)

# Save the trained model
model.save(os.path.join("..", "models", "word_embeddings", embedding_model_fname))

roberta_space_based_pdfs_Word2Vec_model.bin


#### Examine Model

In [6]:
from collections import Counter

# Count token frequencies
token_frequencies = Counter(df['tokens'].to_list())

# Print the frequency of "number"
print("Frequency of 'number':", token_frequencies["number"])

TypeError: unhashable type: 'list'

In [15]:
# Load the trained Word2Vec model
model = Word2Vec.load(os.path.join("..", "models", "word_embeddings", embedding_model_fname))

word = "revenue"

# Add the special preface character if the tokenizer for roberta was used
word = f"Ġ{word}" if TOKENIZER == 'roberta' else word

# Access the embedding of a word
embedding = model.wv[word]
print(embedding)
# Find similar words based on embedding similarity
similar_words = model.wv.most_similar(word)
print(similar_words)

# You can also perform vector arithmetic operations
# result = model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=1)

KeyError: "Key 'Ġrevenue' not present"

In [16]:
vocabulary = model.wv.index_to_key
print("Number of words in vocabulary:", len(vocabulary))
print("Is 'number' in vocabulary?", 'number' in vocabulary)

Number of words in vocabulary: 3410
Is 'number' in vocabulary? True


In [14]:
token = '[PAD]'

print(model.wv['[PAD]'])

KeyError: "Key '[PAD]' not present"