In [1]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [2]:
!pip install wikipedia-api
!pip install wikipedia

Collecting wikipedia-api
  Downloading wikipedia_api-0.8.1.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia-api
  Building wheel for wikipedia-api (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia-api: filename=Wikipedia_API-0.8.1-py3-none-any.whl size=15384 sha256=ab576f3ddf3824dd0398c60e50338077ea0bc74b90e9ccb618ab1bd94c55c85a
  Stored in directory: /root/.cache/pip/wheels/1d/f8/07/0508c38722dcd82ee355e9d85e33c9e9471d4bec0f8ae72de0
Successfully built wikipedia-api
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.8.1
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11679 sha256=8e058aef819a01e9374603ccdc43d9c0b1754

In [3]:
import wikipedia

# Fetch the article by its exact title. auto_suggest=False stops the library
# from swapping the requested title for a search suggestion, which can
# silently resolve to a different page than the one asked for.
page = wikipedia.page("Natural language processing", auto_suggest=False)
text = page.content
# Preview only the first 1000 characters of the article body.
print(text[:1000])


Natural language processing (NLP) is a subfield of computer science and especially artificial intelligence. It is primarily concerned with providing computers with the ability to process data encoded in natural language and is thus closely related to information retrieval, knowledge representation and computational linguistics, a subfield of linguistics.
Major tasks in natural language processing are speech recognition, text classification, natural-language understanding, and natural-language generation.


== History ==

Natural language processing has its roots in the 1950s. Already in 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, though at the time that was not articulated as a problem separate from artificial intelligence. The proposed test includes a task that involves the automated interpretation and generation of natural language.


=== Symbolic NLP (1950s – ea

In [4]:
# Learn the vocabulary from the whole article in a single pass.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Vocabulary size plus one extra slot (index 0 is what padding uses later on).
total_words = 1 + len(tokenizer.word_index)
total_words

1361

In [5]:
# Expand every line of the article into its growing token prefixes:
# a line encoded as [a, b, c] contributes [a, b] and [a, b, c].
# Lines with fewer than two tokens contribute nothing.
input_sequences = [
    encoded[:end]
    for encoded in (
        tokenizer.texts_to_sequences([row])[0] for row in text.split('\n')
    )
    for end in range(2, len(encoded) + 1)
]

In [6]:
# Length of the longest n-gram; used as the padded width of every sequence.
max_sequence_len = max(map(len, input_sequences))
max_sequence_len

158

In [7]:
# Left-pad every n-gram to the same width so the final column always holds
# the word to be predicted.
input_sequences = np.array(
    pad_sequences(
        input_sequences,
        maxlen=max_sequence_len,
        padding='pre',
    )
)
input_sequences

array([[   0,    0,    0, ...,    0,   16,    8],
       [   0,    0,    0, ...,   16,    8,   46],
       [   0,    0,    0, ...,    8,   46,   18],
       ...,
       [   0,    0,    0, ...,    8,   46,   74],
       [   0,    0,    0, ...,   46,   74, 1359],
       [   0,    0,    0, ...,   74, 1359, 1360]], dtype=int32)

In [8]:
import tensorflow as tf

# Features are every token except the last one; the label is the last token.
x = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [9]:
# One-hot encode the target word over the vocabulary. The labels are re-derived
# from input_sequences rather than from y itself, so re-running this cell
# cannot one-hot encode an already encoded matrix.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

In [10]:
# Hold out 20% of the samples for validation. A fixed random_state keeps the
# split identical across kernel restarts, so reported metrics are comparable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout

In [16]:
# Next-word classifier: embedding -> LSTM -> softmax over the vocabulary.
model = Sequential()
# Declare the input width with an explicit Input layer. Embedding's
# `input_length` argument is deprecated in Keras 3 and would leave the model
# unbuilt (no input_shape, empty summary) until the first fit() call.
model.add(tf.keras.Input(shape=(max_sequence_len - 1,)))
model.add(Embedding(total_words, 100))
model.add(LSTM(150))
# Disabled experiment: Dropout followed by a second LSTM. Re-enabling it also
# requires return_sequences=True on the LSTM above so that the second LSTM
# receives a sequence rather than a single vector.
#model.add(Dropout(0.2))
#model.add(LSTM(100))
model.add(Dense(total_words, activation="softmax"))

# Labels are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])

In [17]:
# Train for 50 epochs, scoring the held-out split after each epoch.
history = model.fit(
    x_train,
    y_train,
    epochs=50,
    validation_data=(x_test, y_test),
    verbose=1,
)
# Show the layer / parameter table once training has finished.
model.summary()

Epoch 1/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.0394 - loss: 6.7994 - val_accuracy: 0.0483 - val_loss: 6.4781
Epoch 2/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.0479 - loss: 6.1320 - val_accuracy: 0.0483 - val_loss: 6.5401
Epoch 3/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.0595 - loss: 5.9431 - val_accuracy: 0.0718 - val_loss: 6.5722
Epoch 4/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.0772 - loss: 5.8367 - val_accuracy: 0.0707 - val_loss: 6.6358
Epoch 5/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.0823 - loss: 5.6691 - val_accuracy: 0.0909 - val_loss: 6.6635
Epoch 6/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.0862 - loss: 5.5240 - val_accuracy: 0.1021 - val_loss: 6.7245
Epoch 7/50
[1m112/112[0m 

In [18]:
# Function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_len):
    """Return the word the model scores highest as the continuation of `text`.

    Args:
        model: Trained model; ``model.predict`` is called on a single padded
            sequence and must return one row of per-word scores.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index`` (and, when available, ``index_word``).
        text: Seed text to continue.
        max_sequence_len: Training sequence length including the target word;
            the model input is ``max_sequence_len - 1`` tokens wide.

    Returns:
        The predicted word, or ``None`` when the predicted index maps to no
        word in the tokenizer's vocabulary.
    """
    token_list = tokenizer.texts_to_sequences([text])[0]
    if len(token_list) >= max_sequence_len:
        # Keep only the most recent tokens so the input fits the model width.
        token_list = token_list[-(max_sequence_len-1):]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)

    # argmax over axis=1 yields a length-1 array for the single input row;
    # reduce it to a plain int so it can be used as a dictionary key.
    predicted_word_index = int(np.argmax(predicted, axis=1)[0])

    # Reverse lookup (index -> word) instead of scanning the whole vocabulary.
    index_to_word = getattr(tokenizer, "index_word", None)
    if not index_to_word:
        index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    return index_to_word.get(predicted_word_index)

In [19]:
# Seed sentence taken from the History section of the fetched article.
input_text = "The proposed test includes a task that involves the"
print(f"Input text:{input_text}")

# Recover the training sequence length from the model: its input is one token
# narrower than a full training sequence because the last token was the label.
max_sequence_len = 1 + model.input_shape[1]
next_word = predict_next_word(model, tokenizer, input_text, max_sequence_len)
print(f"Next Word PRediction:{next_word}")

Input text:The proposed test includes a task that involves the
Next Word PRediction:description


In [20]:
import pickle

## Persist the trained network
model.save("next_word_wiki.h5")

## Persist the fitted tokenizer, which is needed to encode text at inference
with open('tokenizer_wiki.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)