In [82]:
from google.colab import drive

# Mount Google Drive at /content/drive/ so the dataset file stored under MyDrive can be opened below.
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [83]:
# Read the whole corpus into memory as one UTF-8 string.
corpus_path = '/content/drive/MyDrive/Dataset/Diabetes.txt'
with open(corpus_path, mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [84]:
print("Size of dataset in characters: ", len(text))

Size of dataset in characters:  67053


In [85]:
print(text[:1000])

The History of Diabetes Mellitus
Robert B. Tattersall
 University of Nottingham, Nottingham, UK
Textbook of Diabetes, 4th edition. Edited by R. Holt, C. Cockram,
A. Flyvbjerg and B. Goldstein. © 2010 Blackwell Publishing.
Keypoints
• Polyuric diseases have been described for over 3500 years. The name
 “ diabetes ” comes from the Greek word for a syphon; the sweet taste
of diabetic urine was recognized at the beginning of the fi rst
millennium, but the adjective “ mellitus ” (honeyed) was only added by
Rollo in the late 18th century.
• The sugar in diabetic urine was identifi ed as glucose by Chevreul in
1815. In the 1840s, Bernard showed that glucose was normally present
in blood, and showed that it was stored in the liver (as glycogen) for
secretion into the bloodstream during fasting.
• In 1889, Minkowski and von Mering reported that pancreatectomy
caused severe diabetes in the dog. In 1893, Laguesse suggested that
the pancreatic “ islets ” described by Langerhans in 1869 produced an

In [86]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [87]:
# Word-level Keras tokenizer created with default settings; its vocabulary is learned later via fit_on_texts.
tokenizer = Tokenizer()

In [88]:
# Learn the word -> integer-id vocabulary from the corpus, passed as a single document.
tokenizer.fit_on_texts([text])

In [89]:
tokenizer.word_index

{'the': 1,
 'of': 2,
 'and': 3,
 'in': 4,
 'diabetes': 5,
 'to': 6,
 'a': 7,
 'by': 8,
 'that': 9,
 'was': 10,
 'with': 11,
 'as': 12,
 'insulin': 13,
 '–': 14,
 'for': 15,
 'is': 16,
 'are': 17,
 'from': 18,
 'costs': 19,
 'fi': 20,
 'or': 21,
 '“': 22,
 '”': 23,
 '1': 24,
 'it': 25,
 'were': 26,
 'this': 27,
 'be': 28,
 'health': 29,
 'disease': 30,
 'have': 31,
 'these': 32,
 'but': 33,
 'on': 34,
 'glucose': 35,
 'care': 36,
 'mortality': 37,
 'had': 38,
 'people': 39,
 'complications': 40,
 'an': 41,
 'may': 42,
 'patients': 43,
 's': 44,
 'rst': 45,
 'not': 46,
 'more': 47,
 'such': 48,
 '5': 49,
 'blood': 50,
 'risk': 51,
 'c': 52,
 'only': 53,
 'type': 54,
 '2': 55,
 '’': 56,
 'which': 57,
 'us': 58,
 'because': 59,
 'associated': 60,
 'diabetic': 61,
 'also': 62,
 'cells': 63,
 'been': 64,
 'described': 65,
 'who': 66,
 'they': 67,
 'related': 68,
 'greater': 69,
 'into': 70,
 'developed': 71,
 'all': 72,
 'at': 73,
 'islet': 74,
 'one': 75,
 '4': 76,
 'years': 77,
 'figure': 

In [90]:
len(tokenizer.word_index)

2777

In [91]:
# Build next-word training examples: each line of the corpus is converted to
# token ids, and every prefix of that line containing at least two tokens
# becomes one sequence (the final token of the prefix is the prediction target).
line_token_ids = tokenizer.texts_to_sequences(text.split('\n'))
input_sequences = [
  token_ids[:prefix_end]
  for token_ids in line_token_ids
  for prefix_end in range(2, len(token_ids) + 1)
]

In [92]:
input_sequences

[[1, 286],
 [1, 286, 2],
 [1, 286, 2, 5],
 [1, 286, 2, 5, 349],
 [690, 102],
 [690, 102, 469],
 [691, 2],
 [691, 2, 692],
 [691, 2, 692, 692],
 [691, 2, 692, 692, 470],
 [1193, 2],
 [1193, 2, 5],
 [1193, 2, 5, 693],
 [1193, 2, 5, 693, 694],
 [1193, 2, 5, 693, 694, 695],
 [1193, 2, 5, 693, 694, 695, 8],
 [1193, 2, 5, 693, 694, 695, 8, 350],
 [1193, 2, 5, 693, 694, 695, 8, 350, 696],
 [1193, 2, 5, 693, 694, 695, 8, 350, 696, 52],
 [1193, 2, 5, 693, 694, 695, 8, 350, 696, 52, 697],
 [7, 698],
 [7, 698, 3],
 [7, 698, 3, 102],
 [7, 698, 3, 102, 699],
 [7, 698, 3, 102, 699, 700],
 [7, 698, 3, 102, 699, 700, 471],
 [7, 698, 3, 102, 699, 700, 471, 701],
 [7, 698, 3, 102, 699, 700, 471, 701, 702],
 [87, 703],
 [87, 703, 197],
 [87, 703, 197, 31],
 [87, 703, 197, 31, 64],
 [87, 703, 197, 31, 64, 65],
 [87, 703, 197, 31, 64, 65, 15],
 [87, 703, 197, 31, 64, 65, 15, 160],
 [87, 703, 197, 31, 64, 65, 15, 160, 1195],
 [87, 703, 197, 31, 64, 65, 15, 160, 1195, 77],
 [87, 703, 197, 31, 64, 65, 15, 160

In [93]:
# Length of the longest prefix sequence; every sequence is padded to this length.
max_len = max(len(sequence) for sequence in input_sequences)

In [94]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Left-pad every prefix with zeros so all rows share the same length (max_len)
# and the most recent tokens sit at the end of each row.
padded_input_sequences = pad_sequences(
    sequences=input_sequences,
    maxlen=max_len,
    padding='pre',
)

In [95]:
padded_input_sequences

array([[  0,   0,   0, ...,   0,   1, 286],
       [  0,   0,   0, ...,   1, 286,   2],
       [  0,   0,   0, ..., 286,   2,   5],
       ...,
       [  0,   0,   0, ..., 699, 700, 471],
       [  0,   0,   0, ..., 700, 471, 701],
       [  0,   0,   0, ..., 471, 701, 702]], dtype=int32)

In [96]:
# Model inputs: every column of each padded sequence except the last one.
X = padded_input_sequences[:,:-1]

In [97]:
# Targets: the last token id of each padded sequence, i.e. the word to predict.
y = padded_input_sequences[:,-1]

In [98]:
X.shape

(9591, 34)

In [99]:
X

array([[  0,   0,   0, ...,   0,   0,   1],
       [  0,   0,   0, ...,   0,   1, 286],
       [  0,   0,   0, ...,   1, 286,   2],
       ...,
       [  0,   0,   0, ..., 102, 699, 700],
       [  0,   0,   0, ..., 699, 700, 471],
       [  0,   0,   0, ..., 700, 471, 701]], dtype=int32)

In [100]:
y.shape

(9591,)

In [101]:
y

array([286,   2,   5, ..., 471, 701, 702], dtype=int32)

In [102]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the integer next-word targets for use with a softmax output layer.
# This rebinds `y`, so re-running the cell would encode the already one-hot array again.
# NOTE(review): num_classes=9592 equals the number of training rows + 1 (X has 9591 rows),
# while the tokenizer vocabulary has 2777 entries; len(tokenizer.word_index) + 1 looks like
# the intended value — confirm. Any model layer sized to match must stay in sync with it.
y = to_categorical(y,num_classes=9592)

In [103]:
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [104]:
y.shape

(9591, 9592)

## **Using LSTM**

In [105]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [106]:
# Next-word LSTM model: embedding -> single LSTM layer -> softmax over token ids.
# Layer sizes are read from the training arrays instead of being hard-coded, so
# the model always matches X (padded context length) and the one-hot y (number
# of output classes, which also bounds the token ids fed to the embedding).
vocab_size = y.shape[1]
seq_len = X.shape[1]

model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=seq_len))
model.add(LSTM(150))
model.add(Dense(vocab_size, activation='softmax'))

In [107]:
# Multi-class next-word prediction over one-hot targets: categorical
# cross-entropy with Adam, tracking accuracy during training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [108]:
# Train the LSTM model on the full set of prefix examples for 200 epochs.
model.fit(x=X, y=y, epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.src.callbacks.History at 0x7e69cc0b41c0>

In [109]:
import numpy as np

In [117]:
import time

# Greedy next-word generation: repeatedly feed the text generated so far to the
# model and append the most probable next word.
# The running text is kept in its own variable so the corpus held in `text`
# is not overwritten by this cell.
# NOTE(review): this uses whatever `model` is currently bound; the cell's execution
# count (117) is later than the GRU cells (111-115), so the saved output may have
# been produced by the GRU model rather than the LSTM — confirm.
seed_text = "β - cell failure"
num_words_to_generate = 20
context_len = X.shape[1]  # model input length, taken from the training matrix

generated_text = seed_text
for _ in range(num_words_to_generate):
  # tokenize
  token_ids = tokenizer.texts_to_sequences([generated_text])[0]
  # padding (left-pad to the length the model was trained on)
  padded_ids = pad_sequences([token_ids], maxlen=context_len, padding='pre')
  # predict: index of the highest-probability class for the single input row
  next_id = int(np.argmax(model.predict(padded_ids, verbose=0)[0]))

  # Direct id -> word lookup instead of scanning the whole vocabulary.
  next_word = tokenizer.index_word.get(next_id)
  if next_word is None:
    # The predicted id has no word (e.g. 0, the padding value). The input would
    # not change on the next step, so further iterations cannot make progress.
    break

  generated_text = generated_text + " " + next_word
  print(generated_text)
  time.sleep(2)  # pause between words so the text appears gradually

β - cell failure the
β - cell failure the same
β - cell failure the same with
β - cell failure the same with collip
β - cell failure the same with collip and
β - cell failure the same with collip and the
β - cell failure the same with collip and the diagnosis
β - cell failure the same with collip and the diagnosis was
β - cell failure the same with collip and the diagnosis was superseded
β - cell failure the same with collip and the diagnosis was superseded by
β - cell failure the same with collip and the diagnosis was superseded by chemical
β - cell failure the same with collip and the diagnosis was superseded by chemical tests
β - cell failure the same with collip and the diagnosis was superseded by chemical tests for
β - cell failure the same with collip and the diagnosis was superseded by chemical tests for reducing
β - cell failure the same with collip and the diagnosis was superseded by chemical tests for reducing agents
β - cell failure the same with collip and the diagnosis was

## **Bidirectional GRU**

In [118]:
from keras.models import Sequential
from keras.layers import Bidirectional, GRU, Dense, Embedding

In [111]:
# Next-word model built from two stacked bidirectional GRU layers.
# Layer sizes are read from the training arrays instead of being hard-coded, so
# the model always matches X (padded context length) and the one-hot y (number
# of output classes, which also bounds the token ids fed to the embedding).
vocab_size = y.shape[1]
seq_len = X.shape[1]

model = Sequential()

# Add an embedding layer
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=seq_len))

# Add first bidirectional GRU layer (returns the full sequence for the next recurrent layer)
model.add(Bidirectional(GRU(units=150, return_sequences=True)))

# Add second bidirectional GRU layer (returns only the final state)
model.add(Bidirectional(GRU(units=150)))

# Add a dense layer with softmax activation for output
model.add(Dense(vocab_size, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()


Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 34, 100)           959200    
                                                                 
 bidirectional_2 (Bidirecti  (None, 34, 300)           226800    
 onal)                                                           
                                                                 
 bidirectional_3 (Bidirecti  (None, 300)               406800    
 onal)                                                           
                                                                 
 dense_4 (Dense)             (None, 9592)              2887192   
                                                                 
Total params: 4479992 (17.09 MB)
Trainable params: 4479992 (17.09 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [114]:
# Train the bidirectional GRU model on the same prefix examples for 150 epochs.
model.fit(x=X, y=y, epochs=150)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.src.callbacks.History at 0x7e69ccf157b0>

In [115]:
import time

# Greedy next-word generation: repeatedly feed the text generated so far to the
# model and append the most probable next word.
# The running text is kept in its own variable so the corpus held in `text`
# is not overwritten by this cell.
seed_text = "Polyuric diseases have been described"
num_words_to_generate = 15
context_len = X.shape[1]  # model input length, taken from the training matrix

generated_text = seed_text
for _ in range(num_words_to_generate):
  # tokenize
  token_ids = tokenizer.texts_to_sequences([generated_text])[0]
  # padding (left-pad to the length the model was trained on)
  padded_ids = pad_sequences([token_ids], maxlen=context_len, padding='pre')
  # predict: index of the highest-probability class for the single input row
  next_id = int(np.argmax(model.predict(padded_ids, verbose=0)[0]))

  # Direct id -> word lookup instead of scanning the whole vocabulary.
  next_word = tokenizer.index_word.get(next_id)
  if next_word is None:
    # The predicted id has no word (e.g. 0, the padding value). The input would
    # not change on the next step, so further iterations cannot make progress.
    break

  generated_text = generated_text + " " + next_word
  print(generated_text)
  time.sleep(2)  # pause between words so the text appears gradually

Polyuric diseases have been described for
Polyuric diseases have been described for over
Polyuric diseases have been described for over 3500
Polyuric diseases have been described for over 3500 years
Polyuric diseases have been described for over 3500 years the
Polyuric diseases have been described for over 3500 years the name
Polyuric diseases have been described for over 3500 years the name patients
Polyuric diseases have been described for over 3500 years the name patients with
Polyuric diseases have been described for over 3500 years the name patients with diabetes
Polyuric diseases have been described for over 3500 years the name patients with diabetes medications
Polyuric diseases have been described for over 3500 years the name patients with diabetes medications and
Polyuric diseases have been described for over 3500 years the name patients with diabetes medications and europe
Polyuric diseases have been described for over 3500 years the name patients with diabetes medications an