In [44]:
from transformers import BertTokenizer
import os
import json

#### Set hyperparameters for the data processing
1. Set embedding layer
    - TOKENIZER: Type of tokenizer used 
    - input_folder: input data to create the embedding layer
    - embedding_model_type: Wordembedding algorithm
    - embedding_model_fname: filname of output embedding layer model
    - vector_size: length of the word embeddings
    - window: Size of wondow to creat 
    - min_count: minimum frequency of word occuring in training data to exist in word embedding
    - sg
    - TOKENS_TPYE: Tokens to use for building the embedding layer (with or without stopwords)
2. PDF PARSER
    - TOKENIZER: Tokenizer used to split the text
    - input_folder: input data to parse
    - embedding_model_fname: filname of input embedding layer model
    - chunk_size: Size of candidate documents
    - chunk_overlap: by how much the chunks overlap
3. Build Query
    - TOKENIZER: Tokenizer used to split the query
    - embedding_model_fname: filename of input embedding layer to get embeddings of tokenized query
    - max_query_length: Size to pad of truncate query
4. Get Candidate Documents
    - top_N: Number of candidate documents to save
    - TOKENS_EMBEDDINGS: Which tokens/embeddings of the query to use
    - DOCUMENT_EMBEDDING: Which embeddings of the chunks to use
    - DOCUMENT_TOKENS: Which tokens of the chunks to use (same as DOCUMENT_EMBEDDING)
    - METHOD: method of cosine similarity
5. Question and Answers
    - transformer_model_name: What model used to fine the answer
    - TOKENIZER: Tokenizer used to create the model inputs and the decode the message
    - context_size: Size of the context vector to fit the candidate documents into

In [45]:
# Set the global vars
TOKENIZER = 'roberta' # 'bert' or 'roberta'
input_folder = "space_based_pdfs" # 'space_based_pdfs' or 'math_based_pdfs'
embedding_model_type = 'Word2Vec' # Word2Vec, Fastrack, GLove
embedding_model_fname = f"{TOKENIZER}_{input_folder}_{embedding_model_type}_model.bin"
vector_size = 50
window = 3
min_count = 3
sg = 0
TOKENS_TPYE = "tokens_less_sw" # "tokens_less_sw", "tokens"
chunk_size = 350
chunk_overlap = 0
max_query_length = 20
top_N = 20
TOKENS_EMBEDDINGS = "query_search_less_sw" # "query_search_less_sw", "query_search", "query"
DOCUMENT_EMBEDDING = "token_embeddings_less_sw" # "token_embeddings_less_sw", "token_embeddings"
DOCUMENT_TOKENS = "tokens_less_sw" # "tokens_less_sw", "tokens"
METHOD = "MEAN_MAX" # 'MEAN_MAX', 'MEAN_MEAN', 'COMBINE_MEAN'
transformer_model_name = "deepset/roberta-base-squad2" # "deepset/roberta-base-squad2", "bert-base-uncased"
context_size = 350 

hyperparameters = {
    'TOKENIZER': TOKENIZER,
    'input_folder': input_folder,
    'embedding_model_type': embedding_model_type,
    'embedding_model_fname': embedding_model_fname,
    'vector_size': vector_size,
    'window': window,
    'min_count': min_count,
    'sg': sg,
    'TOKENS_TPYE': TOKENS_TPYE,
    'chunk_size': chunk_size,
    'chunk_overlap': chunk_overlap,
    'max_query_length': max_query_length,
    'top_N': top_N,
    'TOKENS_EMBEDDINGS': TOKENS_EMBEDDINGS,
    'DOCUMENT_EMBEDDING': DOCUMENT_EMBEDDING,
    'DOCUMENT_TOKENS': DOCUMENT_TOKENS,
    'METHOD': METHOD,
    'transformer_model_name': transformer_model_name,
    'context_size': context_size
    }

In [46]:
# File path for the JSON file
json_file_path = os.path.join("..", "vars", "hyperparameters1.json")

# Check if the JSON file exists
if not os.path.exists(json_file_path):
    # If the file doesn't exist, create and write to it
    with open(json_file_path, "w") as json_file:
        json.dump(hyperparameters, json_file, indent=4)
    print(f"JSON file '{json_file_path}' created and data written.")
else:
    # If the file exists, update its contents
    with open(json_file_path, "w") as json_file:
        json.dump(hyperparameters, json_file, indent=4)
    print(f"JSON file '{json_file_path}' updated with new data.")


JSON file '../vars/hyperparameters1.json' updated with new data.


In [47]:
# Set the Tokenizer for your specific BERT model variant
TOKENIZER = hyperparams["TOKENIZER"]

bert_base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
roberta_tokenizer = RobertaTokenizer.from_pretrained("deepset/roberta-base-squad2", add_prefix_space = True)

tokenizers = {'bert': bert_base_tokenizer, 'roberta': roberta_tokenizer}

tokenizer = tokenizers[TOKENIZER]

NameError: name 'hyperparams' is not defined

In [38]:
fname = os.path.join("..", "data", "space_based_pdfs_roberta_parsed", "1431.json")
with open(fname) as json_file:
    data = json.load(json_file)
    
current_chunk_less_sw = data['tokens_less_sw']
print(current_chunk_less_sw)

['Ġbeginnings', 'Ġameric', "'s", 'Ġfirst', 'Ġinter', 'continental', 'Ġreading', 'las', 'rocket', 'Ġfamily', 'Ġwik', 'ipedia', 'Ġballistic', 'Ġmissile', ',"', 'Ġtechnology', 'Ġculture', 'ap', 'ril', '),', 'âĢĵ', 'las', 'Ġencyclopedia', 'Ġastronaut', 'ica', 'htm', 'Ġretrieved', 'Ġexternal', 'Ġlinks']


In [39]:
tokenizer.decode(tokenizer.convert_tokens_to_ids(current_chunk_less_sw))

' beginnings americ\'s first intercontinental readinglasrocket family wikipedia ballistic missile," technology cultureapril),–las encyclopedia astronauticahtm retrieved external links'