##importing the necessary labraries

In [4]:
!pip install pypdf



In [5]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Embedding, LSTM, Dense

##Read the pdf text file

In [6]:
# importing required classes
from pypdf import PdfReader

# creating a pdf reader object
reader = PdfReader('/content/2407.12220v1.pdf')

In [7]:
# printing number of pages in pdf file
print(len(reader.pages))

48


In [8]:
# creating a page object
page = reader.pages[0]

In [9]:
# extracting text from page
# print(page.extract_text())
reader.pages[12].extract_text()

'text fragments, with extreme label noise [Zellers et al., 2019b]) is often rounded off to “a test of (general)\ncommonsense reasoning” [Gemini et al., 2023, Edwards, 2023, Wang and Zhao, 2023].\nMore subjective but less artificial and gameable tests are being explored. The LMSYS Arena [Chiang et al.,\n2024] (which crowdsources binary human preferences between models on arbitrary prompts) solves many\nproblems – though we note that it too is susceptible to hacking (for instance by improving the style of a model\nwithout improving accuracy or reasoning, or even by paying raters to score your model preferentially using\ntell-tale tokens).\n3.2.5 Subset hacking: picking the easy part of a hard task\nA more subtle degree of freedom than simply the choice of training dataset arises because model evaluation\nhas become very costly due to the number of benchmarks available, dataset sizes, and the associated inference\ncosts (see Liang et al. [2022] for a detailed discussion). A common solutio

In [10]:
text = ""
for page_num in range(len(reader.pages)):
  # print(reader.pages[page_num].extract_text())
  text += reader.pages[page_num].extract_text()

##preprocess the data

In [11]:
tokenizer = Tokenizer()

In [12]:
tokenizer.fit_on_texts([text])

In [13]:
num_classes = len(tokenizer.word_index)+1
# tokenizer.word_index


In [14]:
input_sequences = []
for sentence in text.split('\n'):
  tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]

  for i in range(1,len(tokenized_sentence)):
    input_sequences.append(tokenized_sentence[:i+1])

In [15]:
tokenizer.sequences_to_texts(input_sequences[80:81])

['language models llms on public']

In [16]:
max_len = max([len(x) for x in input_sequences])

In [17]:

padded_input_sequences = pad_sequences(input_sequences, maxlen = max_len, padding='pre')

In [18]:
padded_input_sequences

array([[  0,   0,   0, ...,   0, 274, 151],
       [  0,   0,   0, ..., 274, 151,   6],
       [  0,   0,   0, ..., 151,   6,  75],
       ...,
       [  0,   0,   0, ..., 492,  31, 307],
       [  0,   0,   0, ...,  31, 307, 223],
       [  0,   0,   0, ..., 307, 223,  60]], dtype=int32)

In [19]:
X = padded_input_sequences[:,:-1]

y = padded_input_sequences[:,-1]

In [28]:
X.shape

(23309, 31)

In [21]:

y = to_categorical(y,num_classes=num_classes)

#MODEL TRAINING

In [24]:
model = Sequential()
model.add(Embedding(num_classes, 100,input_length = max_len-1))
model.add(LSTM(150))
model.add(Dense(num_classes,activation= 'softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 31, 100)           581000    
                                                                 
 lstm (LSTM)                 (None, 150)               150600    
                                                                 
 dense (Dense)               (None, 5810)              877310    
                                                                 
Total params: 1608910 (6.14 MB)
Trainable params: 1608910 (6.14 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [26]:
model.fit(X,y,epochs=40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.src.callbacks.History at 0x7c76c426cf70>

##PREDICT THE NEXT WORDS

In [31]:
import time
text = " While the most contamination is likely to happen at pre/post training"
import numpy as np
for i in range(10):
  # tokenize
  token_text = tokenizer.texts_to_sequences([text])[0]
  # padding
  padded_token_text = pad_sequences([token_text], maxlen=max_len-1, padding='pre')
  # predict
  pos = np.argmax(model.predict(padded_token_text))

  for word,index in tokenizer.word_index.items():
    if index == pos:
      text = text + " " + word
      print(text)
      time.sleep(2)

 While the most contamination is likely to happen at pre/post training it
 While the most contamination is likely to happen at pre/post training it is
 While the most contamination is likely to happen at pre/post training it is now
 While the most contamination is likely to happen at pre/post training it is now common
 While the most contamination is likely to happen at pre/post training it is now common to
 While the most contamination is likely to happen at pre/post training it is now common to use
 While the most contamination is likely to happen at pre/post training it is now common to use multiple
 While the most contamination is likely to happen at pre/post training it is now common to use multiple relevant
 While the most contamination is likely to happen at pre/post training it is now common to use multiple relevant for
 While the most contamination is likely to happen at pre/post training it is now common to use multiple relevant for liao


In [29]:
max_len-1

32