In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from tensorflow.keras import models, layers

In [None]:
# Toy corpus: every sentence is kept next to its class so the two cannot drift apart.
# Class 0 = cricket sentence, class 1 = chess sentence.
labelled_texts = [
    ("I am playing good cricket", 0),
    ("He is playing chess", 1),
    ("I like to watch cricket", 0),
    ("Chess is a mind game", 1),
    ("Cricket is played outdoors", 0),
    ("Chess pieces are interesting", 1),
    ("We played cricket yesterday", 0),
    ("He won the chess match", 1),
]

# Features (raw sentences) and targets (class ids), in matching order.
texts = [sentence for sentence, _ in labelled_texts]
labels = [topic for _, topic in labelled_texts]

In [None]:
# Tokenize
# Fit a word -> integer-id vocabulary on the toy corpus. No `oov_token` is passed,
# so words that were never seen here have no id of their own.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
# Last expression of the cell: display the learned mapping (ids start at 1).
tokenizer.word_index

{'cricket': 1,
 'chess': 2,
 'is': 3,
 'i': 4,
 'playing': 5,
 'he': 6,
 'played': 7,
 'am': 8,
 'good': 9,
 'like': 10,
 'to': 11,
 'watch': 12,
 'a': 13,
 'mind': 14,
 'game': 15,
 'outdoors': 16,
 'pieces': 17,
 'are': 18,
 'interesting': 19,
 'we': 20,
 'yesterday': 21,
 'won': 22,
 'the': 23,
 'match': 24}

In [None]:
# Replace every word of every sentence with its id from `tokenizer.word_index`.
sequences = tokenizer.texts_to_sequences(texts)
# Display the encoded sentences (lists of differing length).
sequences

[[4, 8, 5, 9, 1],
 [6, 3, 5, 2],
 [4, 10, 11, 12, 1],
 [2, 3, 13, 14, 15],
 [1, 3, 7, 16],
 [2, 17, 18, 19],
 [20, 7, 1, 21],
 [6, 22, 23, 2, 24]]

In [None]:
# Number of tokens in each encoded sentence.
sequence_length = [len(encoded) for encoded in sequences]

# Show the individual lengths and the longest one.
print(sequence_length)
print(max(sequence_length))

[5, 4, 5, 5, 4, 4, 4, 5]
5


In [None]:
# Length of the longest encoded sentence; every sequence is padded to this size.
maxlen = max(map(len, sequences))

# Left-pad ('pre') the shorter sequences with zeros so all rows have `maxlen` columns.
padded_sequences = pad_sequences(sequences, padding='pre', maxlen=maxlen)

# Display the resulting 2-D integer array.
padded_sequences

array([[ 4,  8,  5,  9,  1],
       [ 0,  6,  3,  5,  2],
       [ 4, 10, 11, 12,  1],
       [ 2,  3, 13, 14, 15],
       [ 0,  1,  3,  7, 16],
       [ 0,  2, 17, 18, 19],
       [ 0, 20,  7,  1, 21],
       [ 6, 22, 23,  2, 24]], dtype=int32)

In [None]:
# Convert the Python list of targets into a NumPy array before passing it to fit().
# Note: this rebinds `labels` from a list to an ndarray.
labels = np.array(labels)
print(labels)

[0 1 0 1 0 1 0 1]


In [None]:
# Word ids start at 1 and id 0 is used for padding, so the embedding table needs
# one more row than there are words.
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

25


In [None]:
# Model: embedding -> flatten -> small dense classifier with a sigmoid output.
embedding_dim = 8  # each word id is mapped to a trainable vector of size 8
model = models.Sequential()
# Declare the input shape with an explicit Input layer. The `input_length` argument
# of Embedding is deprecated in Keras 3, and without a declared input shape the
# model is not built when summary() is called.
model.add(layers.Input(shape=(maxlen,)))
model.add(layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim))
# Flatten the (maxlen, embedding_dim) matrix into one vector per sentence.
model.add(layers.Flatten())
model.add(layers.Dense(8, activation='relu'))
# Single sigmoid unit: output is the predicted probability of class 1 (chess).
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()



In [None]:
# Train
# Fit on the whole toy corpus for 30 epochs; no validation data is supplied, so
# the reported accuracy is training accuracy only.
model.fit(padded_sequences, labels, epochs=30, verbose=1)

Epoch 1/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.3750 - loss: 0.7003
Epoch 2/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 609ms/step - accuracy: 0.3750 - loss: 0.6979
Epoch 3/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.3750 - loss: 0.6955
Epoch 4/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.5000 - loss: 0.6933
Epoch 5/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.5000 - loss: 0.6911
Epoch 6/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.7500 - loss: 0.6891
Epoch 7/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.7500 - loss: 0.6872
Epoch 8/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.8750 - loss: 0.6856
Epoch 9/30
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

<keras.src.callbacks.history.History at 0x7935800c0e10>

In [None]:
new_text = "The batsman scored a century"

# Encode with the tokenizer fitted on the training corpus, then pad exactly as
# the training data was padded (zeros on the left, up to `maxlen`).
seq = tokenizer.texts_to_sequences([new_text])
pad_seq = pad_sequences(seq, maxlen=maxlen, padding='pre')

# predict() returns a (1, 1) array. Extract the scalar before thresholding:
# calling int() on a non-0-d array is deprecated in NumPy and emits a warning.
probability = model.predict(pad_seq)[0][0]
print(int(probability > 0.5))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 312ms/step
1


  print(int(model.predict(pad_seq) > 0.5))


In [None]:
# Predict function
def predict_topic(text):
    """Classify a sentence as "chess" or "cricket" with the trained toy model.

    Args:
        text: Raw sentence to classify.

    Returns:
        "chess" if the model's sigmoid output is above 0.5, otherwise "cricket".
    """
    seq = tokenizer.texts_to_sequences([text])
    # Pad on the left ('pre'), matching how the training sequences were padded.
    # The Flatten + Dense head is position-sensitive, so padding on the other
    # side would feed tokens into positions the model never saw them in.
    pad_seq = pad_sequences(seq, maxlen=maxlen, padding='pre')
    pred = model.predict(pad_seq)[0][0]
    return "chess" if pred > 0.5 else "cricket"

# The trailing comments give the intended topic, not a guaranteed output: most words
# in these sentences (batsman, scored, century, moved, his, knight) are absent from
# the training vocabulary, and the recorded run printed "chess" for both.
print(predict_topic("The batsman scored a century"))  # intended topic: cricket
print(predict_topic("He moved his knight"))           # intended topic: chess

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
chess
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
chess


**IMDB sentiment predictions**

In [None]:
# Small 3x5 integer matrix (scratch example; not referenced by the visible cells).
n1 = np.array(
    [
        [1, 2, 4, 7, 2],
        [4, 8, 9, 3, 1],
        [9, 1, 2, 3, 5],
    ]
)

In [None]:
import numpy as np
from tensorflow.keras.datasets import imdb
import keras

In [None]:
# Load the IMDB reviews with default arguments (no vocabulary cap): each review is a
# variable-length list of integer token ids, each label is 0 or 1.
(xtrain,ytrain),(xtest,ytest) = imdb.load_data()

In [None]:
# Token count of the first training review.
print(len(xtrain[0]))

218


In [None]:
# Token count of the second training review (reviews differ in length).
print(len(xtrain[1]))

189


In [None]:
# Token count of the third training review.
print(len(xtrain[2]))

141


In [None]:
# 1-D array of reviews: one (variable-length) token list per element.
print(xtrain.shape)

(25000,)


In [None]:
# Number of reviews in the test split.
print(xtest.shape)

(25000,)


In [None]:
# One label per training review.
print(ytrain.shape)

(25000,)


In [None]:
# Peek at the training labels (binary 0/1 values).
print(ytrain)

[1 0 0 ... 0 1 0]


In [None]:
# One label per test review.
print(ytest.shape)

(25000,)


In [None]:
# The full word -> index mapping has tens of thousands of entries; show its size and
# a small sample instead of dumping the whole dictionary into the notebook output.
word_index = imdb.get_word_index()
print(len(word_index))
dict(list(word_index.items())[:10])

{'fawn': 34701,
 'tsukino': 52006,
 'nunnery': 52007,
 'sonja': 16816,
 'vani': 63951,
 'woods': 1408,
 'spiders': 16115,
 'hanging': 2345,
 'woody': 2289,
 'trawling': 52008,
 "hold's": 52009,
 'comically': 11307,
 'localized': 40830,
 'disobeying': 30568,
 "'royale": 52010,
 "harpo's": 40831,
 'canet': 52011,
 'aileen': 19313,
 'acurately': 52012,
 "diplomat's": 52013,
 'rickman': 25242,
 'arranged': 6746,
 'rumbustious': 52014,
 'familiarness': 52015,
 "spider'": 52016,
 'hahahah': 68804,
 "wood'": 52017,
 'transvestism': 40833,
 "hangin'": 34702,
 'bringing': 2338,
 'seamier': 40834,
 'wooded': 34703,
 'bravora': 52018,
 'grueling': 16817,
 'wooden': 1636,
 'wednesday': 16818,
 "'prix": 52019,
 'altagracia': 34704,
 'circuitry': 52020,
 'crotch': 11585,
 'busybody': 57766,
 "tart'n'tangy": 52021,
 'burgade': 14129,
 'thrace': 52023,
 "tom's": 11038,
 'snuggles': 52025,
 'francesco': 29114,
 'complainers': 52027,
 'templarios': 52125,
 '272': 40835,
 '273': 52028,
 'zaniacs': 52130,

In [None]:
# Show the first training review as stored: a list of integer token ids.
print(xtrain[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]


In [None]:
# Decode the first training review back into words, in their original order.
#
# imdb.load_data() shifts every word index by `index_from` (default 3) to make room
# for the reserved ids 0 (padding), 1 (start of sequence), 2 (out-of-vocabulary) and
# 3 (unused). The ids stored in xtrain are therefore word_index[word] + 3; looking
# them up without the shift returns the wrong words.
INDEX_FROM = 3
index_to_word = {0: "<PAD>", 1: "<START>", 2: "<UNK>", 3: "<UNUSED>"}
index_to_word.update(
    (word_id + INDEX_FROM, word) for word, word_id in imdb.get_word_index().items()
)

# A dict lookup per token keeps word order and repeated words, and avoids
# scanning the review once for every vocabulary entry.
review_words = [index_to_word.get(token_id, "<UNK>") for token_id in xtrain[0]]

print(review_words)

['want', 'had', 'with', 'tricky', 'more', "show's", "isn't", 'room', 'one', 'get', 'heartfelt', 'watch', 'have', 'story', "gilmore's", 'frog', 'why', 'he', 'current', 'an', 'as', 'moments', 'it', 'half', 'shows', 'real', 'very', 'she', 'never', 'is', 'in', 'but', 'every', 'several', 'except', 'of', 'or', 'whether', 'camp', 'her', 'musicians', 'should', 'after', 'loves', 'history', 'help', 'mine', 'character', 'going', 'name', 'unfortunately', 'reaching', 'here', 'any', 'boat', 'odd', 'scary', 'atmosphere', 'not', 'two', 'you', 'while', 'does', 'becomes', 'chest', 'potentially', 'critics', 'acting', 'at', 'thought', 'journalist', 'over', 'will', 'years', 'sometimes', 'barrel', 'titillate', 'now', "'n", 'powerful', 'so', 'enough', 'when', 'lets', 'out', 'for', 'visual', 'br', 'then', 'they', 'nobody', 'the', 'seeing', "wasn't", 'armed', 'from', 'themselves', 'pratfalls', 'was', 'other', 'itself', 'to', 'heart', 'lot', 'wonderful', 'villaronga', 'seen', 'anyone', 'film', 'most', '70s', 's

In [None]:
# Longest review, in tokens, of the training split and of the test split.
print(max(map(len, xtrain)))
print(max(map(len, xtest)))

2494
2315


In [None]:
#Convert the unstructured xtrain and xtest into structured data => padding

from keras.utils import pad_sequences

# Left-pad every training review with zeros to a fixed width of 2494 tokens
# (2494 is hard-coded here; it must be at least the longest training review,
# otherwise pad_sequences truncates).
xtrainP = pad_sequences(xtrain , padding='pre' , maxlen = 2494)
# Display the shape of the padded matrix: (number of reviews, maxlen).
xtrainP.shape

(25000, 2494)

In [None]:
# Pad the test reviews to the same width as the training matrix so both can be fed
# to the same model.
xtestP = pad_sequences(xtest , padding='pre' , maxlen = 2494)
# Display the shape of the padded matrix: (number of reviews, maxlen).
xtestP.shape

(25000, 2494)

In [None]:
from keras import models,layers

In [None]:
# List the distinct token ids in the padded training matrix (0 is the padding id).
# The largest value shown here is the largest id an Embedding layer must accept.
print("Unique number in the matrix representing the collections of words-")
print(np.unique(xtrainP))

Unique number in the matrix representing the collections of words-
[    0     1     2 ... 88584 88585 88586]


In [None]:
# Count the distinct token ids. NOTE(review): this count is not necessarily
# "largest id + 1" -- if any id never occurs, the count is smaller, so it should
# not be used as the Embedding `input_dim`.
print("Length Unique numbers in the matrix representing the collections of words-")
print(len(np.unique(xtrainP)))

Length Unique numbers in the matrix representing the collections of words-
88586


In [None]:
#vocab_size = 58306

In [None]:
# Re-check the padded training shape; its second dimension is the model's input length.
xtrainP.shape

(25000, 2494)

In [None]:
# Sequential model: embedding -> flatten -> small dense head -> sigmoid sentiment score.
imdb_model = models.Sequential()

# Embedding `input_dim` must be (largest token id + 1). The count of distinct ids is
# smaller than that whenever an id is unused, which makes the largest id fall
# outside the table, so derive the size from the data instead of hard-coding it.
imdb_vocab_size = int(max(xtrainP.max(), xtestP.max())) + 1
imdb_seq_len = xtrainP.shape[1]

# Declare the input shape explicitly; Embedding's `input_length` argument is
# deprecated in Keras 3.
imdb_model.add(keras.Input(shape=(imdb_seq_len,)))
imdb_model.add(keras.layers.Embedding(input_dim=imdb_vocab_size, output_dim=15))

# Classifier head. The hidden layer gets a ReLU (as in the toy model): without a
# non-linearity the two Dense layers collapse into a single linear map.
imdb_model.add(keras.layers.Flatten())
imdb_model.add(layers.Dense(8, activation='relu'))
imdb_model.add(layers.Dense(1, activation = 'sigmoid'))

# Binary cross-entropy matches the single sigmoid output and 0/1 labels.
imdb_model.compile(optimizer='adam' , loss = 'binary_crossentropy' , metrics = ['accuracy'])

# NOTE(review): the test split is used as validation data here and evaluated again
# later, so it is not an untouched hold-out; consider `validation_split` on the
# training data instead.
imdb_model.fit(xtrainP,ytrain,epochs=10, validation_data=(xtestP,ytest))

Epoch 1/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 10ms/step - accuracy: 0.6995 - loss: 0.5533 - val_accuracy: 0.8811 - val_loss: 0.2828
Epoch 2/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.9499 - loss: 0.1420 - val_accuracy: 0.8592 - val_loss: 0.3402
Epoch 3/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.9912 - loss: 0.0356 - val_accuracy: 0.8671 - val_loss: 0.3976
Epoch 4/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.9995 - loss: 0.0053 - val_accuracy: 0.8663 - val_loss: 0.4720
Epoch 5/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 7ms/step - accuracy: 1.0000 - loss: 0.0018 - val_accuracy: 0.8683 - val_loss: 0.4990
Epoch 6/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 5ms/step - accuracy: 1.0000 - loss: 4.5114e-04 - val_accuracy: 0.8670 - val_loss: 0.5454
Epoch 7/10
[1m782/78

<keras.src.callbacks.history.History at 0x7935800f8e10>

In [None]:
# Loss and accuracy on the test split; returns [loss, accuracy] per the compiled metrics.
imdb_model.evaluate(xtestP,ytest)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8663 - loss: 0.6306


[0.6297377943992615, 0.8685200214385986]

In [None]:
# Shape of the padded training matrix: (reviews, tokens per review).
xtrainP.shape

(25000, 2494)

In [None]:
# A single padded review is 1-D; predict() needs a leading batch dimension.
xtestP[56].shape

(2494,)

In [None]:
# Slice (rather than index + reshape) to keep the batch axis: the result is a batch of one review.
print(imdb_model.predict(xtestP[56:57]))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[[0.9537959]]


In [None]:
# predict() returns a (1, 1) array. Extract the scalar probability before
# thresholding: calling int() on a non-0-d array is deprecated in NumPy and emits
# a warning.
review_probability = imdb_model.predict(xtestP[56].reshape(1,2494))[0, 0]
print(int(review_probability > 0.5))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
1


  print(int(imdb_model.predict(xtestP[56].reshape(1,2494)) > 0.5))


In [None]:
# Ground-truth label of the same test review, to compare with the prediction.
ytest[56]

np.int64(1)