In [56]:
#importing necessary libraries
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras.layers import Dense, LSTM ,Embedding
from keras.models import Sequential

In [57]:
#source text
data='''Data science is an interdisciplinary field that uses scientific methods, processes, algorithms and systems 
to extract knowledge and insights from noisy, structured and unstructured data, and apply knowledge from data across
a broad range of application domains. Data science is related to data mining, machine learning and big data.
Data science is a concept to unify statistics, data analysis, informatics, and their related methods in order to 
understand and analyse actual phenomena with data.It uses techniques and theories drawn from many fields within 
the context of mathematics, statistics, computer science, information science, and domain knowledge.'''

In [60]:
#integer encode text
tokenizer=Tokenizer()
tokenizer.fit_on_texts([data])
encoded_data= tokenizer.texts_to_sequences([data])[0]
encoded_data

[1,
 3,
 5,
 14,
 15,
 16,
 17,
 8,
 18,
 9,
 19,
 20,
 2,
 21,
 4,
 22,
 6,
 2,
 23,
 7,
 24,
 25,
 2,
 26,
 1,
 2,
 27,
 6,
 7,
 1,
 28,
 10,
 29,
 30,
 11,
 31,
 32,
 1,
 3,
 5,
 12,
 4,
 1,
 33,
 34,
 35,
 2,
 36,
 1,
 1,
 3,
 5,
 10,
 37,
 4,
 38,
 13,
 1,
 39,
 40,
 2,
 41,
 12,
 9,
 42,
 43,
 4,
 44,
 2,
 45,
 46,
 47,
 48,
 1,
 49,
 8,
 50,
 2,
 51,
 52,
 7,
 53,
 54,
 55,
 56,
 57,
 11,
 58,
 13,
 59,
 3,
 60,
 3,
 2,
 61,
 6]

In [61]:
#determining the vocabulary size
vocab_size=len(tokenizer.word_index)+1
print("Vocabulary Size is {}".format(vocab_size))

Vocabulary Size is 62


In [62]:
#creating a sequence of words to fitthe model wth one word as input and one word as output
#create word- word sequences
sequences=list()
for i in range(1,len(encoded_data)):
    sequence=encoded_data[i-1:i+1]
    sequences.append(sequence)
    
print('Total Sequences: {}' .format(len(sequences))) 

Total Sequences: 95


In [63]:
#input output pairs
sequences

[[1, 3],
 [3, 5],
 [5, 14],
 [14, 15],
 [15, 16],
 [16, 17],
 [17, 8],
 [8, 18],
 [18, 9],
 [9, 19],
 [19, 20],
 [20, 2],
 [2, 21],
 [21, 4],
 [4, 22],
 [22, 6],
 [6, 2],
 [2, 23],
 [23, 7],
 [7, 24],
 [24, 25],
 [25, 2],
 [2, 26],
 [26, 1],
 [1, 2],
 [2, 27],
 [27, 6],
 [6, 7],
 [7, 1],
 [1, 28],
 [28, 10],
 [10, 29],
 [29, 30],
 [30, 11],
 [11, 31],
 [31, 32],
 [32, 1],
 [1, 3],
 [3, 5],
 [5, 12],
 [12, 4],
 [4, 1],
 [1, 33],
 [33, 34],
 [34, 35],
 [35, 2],
 [2, 36],
 [36, 1],
 [1, 1],
 [1, 3],
 [3, 5],
 [5, 10],
 [10, 37],
 [37, 4],
 [4, 38],
 [38, 13],
 [13, 1],
 [1, 39],
 [39, 40],
 [40, 2],
 [2, 41],
 [41, 12],
 [12, 9],
 [9, 42],
 [42, 43],
 [43, 4],
 [4, 44],
 [44, 2],
 [2, 45],
 [45, 46],
 [46, 47],
 [47, 48],
 [48, 1],
 [1, 49],
 [49, 8],
 [8, 50],
 [50, 2],
 [2, 51],
 [51, 52],
 [52, 7],
 [7, 53],
 [53, 54],
 [54, 55],
 [55, 56],
 [56, 57],
 [57, 11],
 [11, 58],
 [58, 13],
 [13, 59],
 [59, 3],
 [3, 60],
 [60, 3],
 [3, 2],
 [2, 61],
 [61, 6]]

In [64]:
#split the sequences into input element X and output elememnt Y
sequences=np.asarray(sequences)
X,y=sequences[:,0],sequences[:,1]

In [65]:
X[:5]

array([ 1,  3,  5, 14, 15])

In [66]:
y[:5]

array([ 3,  5, 14, 15, 16])

In [67]:
# one hot encode outputs
y = np_utils.to_categorical(y, num_classes=vocab_size)
# define model
y[:5]

array([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0

In [69]:
#Model Buildng
model=Sequential()
model.add(Embedding(vocab_size,10,input_length=1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 1, 10)             620       
                                                                 
 lstm_2 (LSTM)               (None, 50)                12200     
                                                                 
 dense_2 (Dense)             (None, 62)                3162      
                                                                 
Total params: 15,982
Trainable params: 15,982
Non-trainable params: 0
_________________________________________________________________
None


In [77]:
#compiling the network
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [78]:
model.fit(X,y,epochs=100)

Epoch 1/100


ValueError: ignored