In [1]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import h5py
import json
import pandas as pd
import arxiv
import nltk

Init Plugin
Init Graph Optimizer
Init Kernel


In [2]:
# Record the installed NLTK version in the notebook output for reproducibility.
print(nltk.__version__)

3.8.1


In [10]:
# IPython help lookup (trailing `?`) for arxiv.Search -- an interactive development aid;
# it is not valid plain Python and can be removed from the finished notebook.
arxiv.Search?

In [18]:
# Use the arXiv API to get abstracts from hep-ph papers.
#
# The download is slow (up to MAX_ABSTRACTS results, fetched page by page), so the
# abstracts are cached on disk: a re-run of the notebook loads the JSON file
# instead of querying the API again. Delete the cache file to force a fresh download.

ABSTRACT_CACHE = "hep_ph_abstracts.json"
MAX_ABSTRACTS = 30000

try:
    with open(ABSTRACT_CACHE, encoding="utf-8") as cache_file:
        abs_list = json.load(cache_file)
except FileNotFoundError:
    # NOTE(review): no sort_by is given, so the library's default sort criterion
    # applies -- confirm that this is the intended ordering.
    search = arxiv.Search(query="cat:hep-ph",
                          max_results=MAX_ABSTRACTS,
                          sort_order=arxiv.SortOrder.Descending)

    # Newer releases of the arxiv package deprecate Search.results() in favour of
    # Client.results(search); fall back to the old call when no such client exists.
    client_cls = getattr(arxiv, "Client", None)
    if client_cls is not None and hasattr(client_cls, "results"):
        results = client_cls().results(search)
    else:
        results = search.results()

    abs_list = [str(result.summary) for result in results]

    # Written only after the download finished, so an interrupted fetch never
    # leaves a truncated cache behind.
    with open(ABSTRACT_CACHE, "w", encoding="utf-8") as cache_file:
        json.dump(abs_list, cache_file)

print(len(abs_list))

KeyboardInterrupt: 

In [None]:
# Let's do some preprocessing on this data (character-level tokenization with the Keras Tokenizer; NLTK is imported but not used below)

In [None]:
# Start by tokenizing the text: a character-level, lower-cased vocabulary is
# fitted on every abstract in abs_list.
tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=True, lower=True)
tokenizer.fit_on_texts(abs_list)

# How many abstracts (taken from the front of abs_list) are turned into training
# windows. The original cell hard-coded the first abstract only; 1 keeps that
# behaviour. Use None for all abstracts -- the number of windows grows with the
# total character count, so watch memory and training time.
n_train_abstracts = 1

# Length (in tokens) of each input window; the token right after it is the target.
seq_length = 100

if not abs_list:
    raise ValueError("abs_list is empty -- run the cell that fetches the abstracts first")

# Convert text to sequences: one list of integer token ids per abstract.
abstract_sequences = tokenizer.texts_to_sequences(abs_list[:n_train_abstracts])

# Prepare input and target sequences. Windows are built per abstract so that no
# window straddles the boundary between two unrelated abstracts.
input_seq = []
output_seq = []
for sequences in abstract_sequences:
    # range() is empty when an abstract has <= seq_length tokens, so abstracts
    # that are too short contribute no windows.
    for start in range(len(sequences) - seq_length):
        input_seq.append(sequences[start:start + seq_length])
        output_seq.append(sequences[start + seq_length])

if not input_seq:
    raise ValueError(
        "no training windows were produced: every selected abstract has at most "
        f"{seq_length} tokens"
    )

input_seq = np.array(input_seq)
output_seq = np.array(output_seq)

In [None]:
# Hyperparameters, callbacks and architecture for the next-character model.

# One output class per tokenizer index, plus one extra slot
# (presumably index 0, which the Keras Tokenizer does not assign to any token).
vocab_size = 1 + len(tokenizer.word_index)

epochs = 150
batch_size = 16

# Both callbacks watch the training loss: the learning rate is cut by 10x after
# 5 epochs without improvement, and training stops after 10.
es = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=10,
    verbose=1,
    min_delta=1e-4,
)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='loss',
    patience=5,
    factor=0.1,
    verbose=1,
    min_delta=1e-3,
)

# Architecture: embedding -> dense -> two stacked LSTMs -> dense -> softmax over
# the vocabulary. (Earlier experiments with extra Dense/LSTM layers were dropped.)
mini_llm = tf.keras.models.Sequential()
mini_llm.add(tf.keras.layers.Embedding(vocab_size, 50, input_length=seq_length))
mini_llm.add(tf.keras.layers.Dense(500, activation='elu'))
# The first LSTM returns the full sequence so the second LSTM can consume it.
mini_llm.add(tf.keras.layers.LSTM(2048, return_sequences=True, dropout=0.15, recurrent_dropout=0))
mini_llm.add(tf.keras.layers.LSTM(2048, dropout=0.15, recurrent_dropout=0))
mini_llm.add(tf.keras.layers.Dense(500, activation='elu'))
mini_llm.add(tf.keras.layers.Dense(vocab_size, activation='softmax'))

# Targets are integer token ids, hence the sparse cross-entropy loss.
mini_llm.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(1e-3),
    metrics=['accuracy'],
)
mini_llm.summary()

In [None]:
# Fit the model on the (window, next-token) pairs; the callbacks lower the
# learning rate and stop early once the training loss plateaus.
mini_llm.fit(
    x=input_seq,
    y=output_seq,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[reduce_lr, es],
    verbose=1,
)

In [None]:
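# Uncomment to persist the trained model; the load_model cell at the end of the notebook expects this file to exist.
#mini_llm.save('dm_abstract_mini_llm_2.h5')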

In [None]:
# Let's evaluate the model and generate some new text!

def generate_text(seed, model, tokenizer, seq_length, num_char_to_gen=300):
    """Greedily extend ``seed`` one token at a time using ``model``.

    On every step the text generated so far is tokenized, left-padded (or
    truncated) to ``seq_length`` tokens, fed to ``model.predict`` and the
    highest-probability token is appended to the text.

    Args:
        seed: Initial text the generation starts from.
        model: Object with a ``predict(batch, verbose=0)`` method returning
            per-token probabilities for each row of the batch.
        tokenizer: Object exposing ``texts_to_sequences`` and ``word_index``
            (token -> integer index).
        seq_length: Number of tokens in the model's input window.
        num_char_to_gen: Maximum number of tokens to append.

    Returns:
        ``seed`` followed by the generated tokens. Generation stops early if
        the model predicts an index that has no entry in ``word_index`` (for
        example the padding index 0), because no token can be appended for it.
    """
    # Invert the vocabulary once instead of scanning word_index on every step.
    index_to_token = {index: token for token, index in tokenizer.word_index.items()}

    generated_text = seed

    for _ in range(num_char_to_gen):
        token_list = tokenizer.texts_to_sequences([generated_text])
        # padding='pre' puts the zeros in front, so the most recent tokens sit
        # at the end of the window.
        token_list = tf.keras.preprocessing.sequence.pad_sequences(token_list, maxlen=seq_length, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)
        predicted_token = int(np.argmax(predicted_probs, axis=-1)[0])

        output_word = index_to_token.get(predicted_token)
        if output_word is None:
            # Unmapped index: the input would not change, so the (greedy)
            # prediction would repeat forever -- stop rather than append a
            # stale or undefined token.
            break

        generated_text += output_word

    return generated_text

In [None]:
# Generate from a seed with the freshly trained model; the seed counts towards
# the 500-character budget.
seed_text = 'Dark matter '

gen_text = generate_text(
    seed=seed_text,
    model=mini_llm,
    tokenizer=tokenizer,
    seq_length=seq_length,
    num_char_to_gen=500 - len(seed_text),
)

print(gen_text)

We roure mixing parameter
scales linearly with the dark photon mass
of order $10^{-20}$ ev. furthermore, the constraint on the mixing parameter
scales linearly with the dark photon mass and so new significant constraints
can be placed on the dark matter mass all the way up to $10^{-14}$ ev. future
experiments measuring $g-2$ will probe even smaller gauge mixing parameters.ithe cork matter is a dark photon,
the correction to the anomalous magnetic moment is larger than experimental
uncertainties 

Dark matter is a dark photon,
the correction to the anomalous magnetic moment is larger than experimental
uncertainties for a mixing parameter of order $10^{-16}$ and a dark photon mass
of order $10^{-20}$ ev. furthermore, the constraint on the mixing parameter
scales linearly with the dark photon mass and so new significant constraints
can be placed on the dark matter mass all the way up to $10^{-14}$ ev. future
experiments measuring $g-2$ will probe even smaller gauge mixing parameters.

In [None]:
# Reload the model saved to disk and generate a short continuation from it.
# The tokenizer and seq_length defined earlier in the notebook are reused.
mini_llm_2 = tf.keras.models.load_model('dm_abstract_mini_llm_2.h5')

seed_text = 'Dark '

print(
    generate_text(
        seed=seed_text,
        model=mini_llm_2,
        tokenizer=tokenizer,
        seq_length=seq_length,
        num_char_to_gen=16,
    )
)