##### Import the libraries

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Toy corpus: four short sentences fed to the tokenizer in the cells below.
# NOTE: the spelling 'beatiful' is kept as written; the recorded outputs depend on it.
lines = [
    'It was a nice rainy day.',
    'The things are so beatiful in his point.',
    'When your focus is clear, you won.',
    'Many many happy returns of the day.',
]

In [4]:
# Create a tokenizer with default settings and build its vocabulary from the corpus.
# The recorded outputs below show the learned words lowercased and without punctuation.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)

In [5]:
# Per-word document counts: in how many of the input sentences each word appears.
# 'many' occurs twice in one sentence but is counted once; 'day' and 'the' appear in two sentences.
tokenizer.word_docs

defaultdict(int,
            {'it': 1,
             'a': 1,
             'day': 2,
             'rainy': 1,
             'was': 1,
             'nice': 1,
             'beatiful': 1,
             'point': 1,
             'are': 1,
             'his': 1,
             'so': 1,
             'in': 1,
             'things': 1,
             'the': 2,
             'when': 1,
             'is': 1,
             'your': 1,
             'won': 1,
             'you': 1,
             'clear': 1,
             'focus': 1,
             'returns': 1,
             'of': 1,
             'many': 1,
             'happy': 1})

In [7]:
# Integer index -> word lookup; indices start at 1 (index 0 is not assigned to any word).
tokenizer.index_word

{1: 'day',
 2: 'the',
 3: 'many',
 4: 'it',
 5: 'was',
 6: 'a',
 7: 'nice',
 8: 'rainy',
 9: 'things',
 10: 'are',
 11: 'so',
 12: 'beatiful',
 13: 'in',
 14: 'his',
 15: 'point',
 16: 'when',
 17: 'your',
 18: 'focus',
 19: 'is',
 20: 'clear',
 21: 'you',
 22: 'won',
 23: 'happy',
 24: 'returns',
 25: 'of'}

In [10]:
# Same per-sentence counts as word_docs, but keyed by the word's integer index instead of the word.
tokenizer.index_docs

defaultdict(int,
            {4: 1,
             6: 1,
             1: 2,
             8: 1,
             5: 1,
             7: 1,
             12: 1,
             15: 1,
             10: 1,
             14: 1,
             11: 1,
             13: 1,
             9: 1,
             2: 2,
             16: 1,
             19: 1,
             17: 1,
             22: 1,
             21: 1,
             20: 1,
             18: 1,
             24: 1,
             25: 1,
             3: 1,
             23: 1})

In [11]:
# Word -> integer index lookup (the inverse of index_word).
tokenizer.word_index

{'day': 1,
 'the': 2,
 'many': 3,
 'it': 4,
 'was': 5,
 'a': 6,
 'nice': 7,
 'rainy': 8,
 'things': 9,
 'are': 10,
 'so': 11,
 'beatiful': 12,
 'in': 13,
 'his': 14,
 'point': 15,
 'when': 16,
 'your': 17,
 'focus': 18,
 'is': 19,
 'clear': 20,
 'you': 21,
 'won': 22,
 'happy': 23,
 'returns': 24,
 'of': 25}

In [12]:
# Encode every sentence as one row of a presence/absence matrix:
# column j is 1.0 when the word with index j occurs in that sentence, else 0.0.
# "binary" is the default mode, spelled out here so the encoding is explicit.
# Repeated words (e.g. 'many' in the last sentence) still yield a single 1.
mat = tokenizer.texts_to_matrix(lines, mode="binary")
mat

array([[0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 1., 1., 1., 1., 1., 1., 0., 0., 0.],
       [0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 1., 1.]])

In [13]:
# One row per sentence (4) and one column per index 0..25: the 25 vocabulary words plus the unused column 0.
mat.shape

(4, 26)

In [15]:
# Replace each sentence with the list of its words' integer indices, in sentence order.
seq = tokenizer.texts_to_sequences(lines)

In [16]:
# Show the encoded sentences; the lists have different lengths (6, 8, 7 and 7 indices).
seq

[[4, 5, 6, 7, 8, 1],
 [2, 9, 10, 11, 12, 13, 14, 15],
 [16, 17, 18, 19, 20, 21, 22],
 [3, 3, 23, 24, 25, 2, 1]]

In [17]:
# Bring every sequence to length 10; padding='post' appends the zeros after the word indices.
padded = pad_sequences(seq, maxlen=10, padding= 'post' )

In [18]:
# Show the post-padded array: zeros fill the tail of each row.
padded

array([[ 4,  5,  6,  7,  8,  1,  0,  0,  0,  0],
       [ 2,  9, 10, 11, 12, 13, 14, 15,  0,  0],
       [16, 17, 18, 19, 20, 21, 22,  0,  0,  0],
       [ 3,  3, 23, 24, 25,  2,  1,  0,  0,  0]], dtype=int32)

In [19]:
# Same padding to length 10, but padding='pre' puts the zeros in front of the word indices.
# NOTE: this rebinds `padded`, replacing the post-padded array created in the earlier cell.
padded = pad_sequences(seq, maxlen=10, padding= 'pre' )
padded

array([[ 0,  0,  0,  0,  4,  5,  6,  7,  8,  1],
       [ 0,  0,  2,  9, 10, 11, 12, 13, 14, 15],
       [ 0,  0,  0, 16, 17, 18, 19, 20, 21, 22],
       [ 0,  0,  0,  3,  3, 23, 24, 25,  2,  1]], dtype=int32)