# Neural Language Model

* `imports`

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, LSTM, GRU, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow_datasets as tfds

In [3]:
# Consants

VOCAB_SIZE = 20000
SEQ_LEN = 10
EMBED_DIM = 128
max_length = 300
EPOCHS = 3
BATCH_SIZE = 256


* `Load dataset`

In [4]:
!mkdir -p data
!wget -O data/wiki.train.raw https://raw.githubusercontent.com/pytorch/examples/main/word_language_model/data/wikitext-2/train.txt
!wget -O data/wiki.valid.raw https://raw.githubusercontent.com/pytorch/examples/main/word_language_model/data/wikitext-2/valid.txt
!wget -O data/wiki.test.raw https://raw.githubusercontent.com/pytorch/examples/main/word_language_model/data/wikitext-2/test.txt


--2026-01-29 11:08:58--  https://raw.githubusercontent.com/pytorch/examples/main/word_language_model/data/wikitext-2/train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10797148 (10M) [text/plain]
Saving to: ‘data/wiki.train.raw’


2026-01-29 11:08:59 (71.7 MB/s) - ‘data/wiki.train.raw’ saved [10797148/10797148]

--2026-01-29 11:08:59--  https://raw.githubusercontent.com/pytorch/examples/main/word_language_model/data/wikitext-2/valid.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1121681 (1.1M) [text/plain]
Saving to: ‘

In [5]:
!ls data


wiki.test.raw  wiki.train.raw  wiki.valid.raw


In [6]:
# train data
with open("data/wiki.train.raw", "r", encoding="utf-8") as f:
    train_texts = f.read().split("\n")

train_texts = [t for t in train_texts if len(t.strip()) > 0]



# test data
with open("data/wiki.test.raw", "r", encoding="utf-8") as f:
    test_texts = f.read().split("\n")

test_texts = [t for t in test_texts if len(t.strip()) > 0]



# valid data
with open("data/wiki.valid.raw", "r", encoding="utf-8") as f:
    valid_texts = f.read().split("\n")

valid_texts = [t for t in valid_texts if len(t.strip()) > 0]

# show samples
print(len(train_texts))
print(train_texts[:5])
print("==================================================")
print(len(test_texts))
print(test_texts[:5])
print("==================================================")
print(len(valid_texts))
print(valid_texts[:5])



23767
[' = Valkyria Chronicles III = ', ' Senjō no Valkyria 3 : <unk> Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . <unk> the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk> Raven " . ', " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more <unk> for series newcom

In [7]:
# text cleaning
def clean_text(text):
    text = text.lower()

    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)

    text = re.sub(r"\s+", " ", text).strip()

    return text

train_texts = [clean_text(t) for t in train_texts]
test_texts = [clean_text(t) for t in test_texts]
valid_texts = [clean_text(t) for t in valid_texts]

print(train_texts[:5])
print(test_texts[:5])
print(valid_texts[:5])

['valkyria chronicles iii', 'senj no valkyria 3 unk chronicles japanese 3 lit valkyria of the battlefield 3 commonly referred to as valkyria chronicles iii outside japan is a tactical role playing video game developed by sega and mediavision for the playstation portable released in january 2011 in japan it is the third game in the valkyria series unk the same fusion of tactical and real time gameplay as its predecessors the story runs parallel to the first game and follows the nameless a penal military unit serving the nation of gallia during the second europan war who perform secret black operations and are pitted against the imperial unit unk raven', 'the game began development in 2010 carrying over a large portion of the work done on valkyria chronicles ii while it retained the standard features of the series it also underwent multiple adjustments such as making the game more unk for series newcomers character designer unk honjou and composer hitoshi sakimoto both returned from prev

In [8]:
max_length = max(len(seq) for seq in train_texts)
max_length
# this is too many so we will use 300 as a max_length

3657

* `Tokeization`

In [9]:
# construct tokenizer
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE, oov_token="<UNK>")

# fit the tokenizer on training data
tokenizer.fit_on_texts(train_texts)

# tokenize train,test,and valid texts
train_tokenized = tokenizer.texts_to_sequences(train_texts)
test_tokenized = tokenizer.texts_to_sequences(test_texts)
valid_tokenized = tokenizer.texts_to_sequences(valid_texts)

#show a sample
print(train_tokenized[:5])


[[3768, 3831, 857], [18088, 76, 3768, 81, 4, 3831, 757, 81, 6066, 3768, 3, 2, 4989, 81, 1810, 986, 7, 11, 3768, 3831, 857, 611, 952, 17, 8, 5636, 285, 556, 219, 60, 433, 16, 12881, 5, 1, 14, 2, 1736, 5531, 147, 6, 233, 341, 6, 952, 18, 17, 2, 224, 60, 6, 2, 3768, 86, 4, 2, 148, 4362, 3, 5636, 5, 708, 51, 2064, 11, 36, 6902, 2, 319, 1057, 3168, 7, 2, 31, 60, 5, 1665, 2, 10766, 8, 18089, 299, 1037, 2042, 2, 1666, 3, 18090, 49, 2, 88, 1, 100, 45, 1906, 1617, 275, 580, 5, 28, 12882, 113, 2, 2280, 1037, 4, 13867], [2, 60, 127, 354, 6, 284, 3210, 58, 8, 175, 1710, 3, 2, 130, 1155, 10, 3768, 3831, 289, 59, 18, 3211, 2, 1148, 562, 3, 2, 86, 18, 37, 4281, 1811, 18091, 82, 11, 390, 2, 60, 54, 4, 14, 86, 18092, 265, 3707, 4, 1, 5, 2998, 1, 18093, 92, 417, 19, 463, 11411, 155, 15, 3768, 3831, 289, 520, 16354, 1, 8, 175, 149, 3, 1114, 3832, 2, 1589, 2, 60, 12, 639, 1041, 9, 3543, 16, 68, 1626], [18, 767, 15, 907, 1418, 6, 952, 5, 9, 714, 16, 92, 757, 5, 405, 502, 38, 306, 18, 194, 5905, 1812, 155, 

* `Input/Target Sequence`

In [10]:
# a function to make input & target sequence
def create_input_target(sequences, seq_len=SEQ_LEN):
    inputs = []
    targets = []

    for seq in sequences:
        if len(seq) <= seq_len:
            continue

        for i in range(len(seq) - seq_len):
            inputs.append(seq[i:i+seq_len])
            targets.append(seq[i+seq_len])

    return np.array(inputs), np.array(targets)

x_train, y_train = create_input_target(train_tokenized, SEQ_LEN)
x_valid, y_valid = create_input_target(valid_tokenized, SEQ_LEN)
x_test, y_test   = create_input_target(test_tokenized, SEQ_LEN)
print("Train inputs shape:", x_train.shape)
print("Train targets shape:", y_train.shape)


Train inputs shape: (1568571, 10)
Train targets shape: (1568571,)


### RNN, GRU & LSTM phase (text generation)

* `simple RNN`

In [11]:
# Building the simple RNN model
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_DIM, input_shape=(max_length,)),
    SimpleRNN(64 ,return_sequences=True),
    SimpleRNN(32),
    Dense(VOCAB_SIZE, activation='softmax')
])

model.summary()

  super().__init__(**kwargs)


In [None]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), metrics=['accuracy'])

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

history = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stopping]
)



Epoch 1/3
[1m6128/6128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1108s[0m 180ms/step - accuracy: 0.0948 - loss: 7.1443 - val_accuracy: 0.1309 - val_loss: 6.4236
Epoch 2/3
[1m6128/6128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1111s[0m 181ms/step - accuracy: 0.1262 - loss: 6.5852 - val_accuracy: 0.1372 - val_loss: 6.3755
Epoch 3/3
[1m3313/6128[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m7:48[0m 166ms/step - accuracy: 0.1292 - loss: 6.4749

In [12]:
# function to test the model
def generate_text1(seed_text, num_words=3):
    text = seed_text

    for _ in range(num_words):
        seq = tokenizer.texts_to_sequences([text])[0]
        seq = pad_sequences([seq], maxlen=max_length)

        preds = model.predict(seq, verbose=0)
        next_word_id = preds.argmax()

        next_word = tokenizer.index_word.get(next_word_id, "")
        text += " " + next_word

    return text


In [None]:
# trying the model
print(generate_text1("deep learnin went"))


deep learnin went to the unk


* `LSTM & GRU (Text generation)`

In [13]:
# Building the LSTM & GRU model
model_v2 = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_DIM, input_shape=(max_length,)),
    LSTM(64 ,dropout=0.2 ,return_sequences=True),
    GRU(32, dropout=0.2),
    Dense(VOCAB_SIZE, activation='softmax')
])

model.summary()

In [None]:
model_v2.compile(loss='sparse_categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), metrics=['accuracy'])

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

history = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stopping]
)



Epoch 1/3
[1m6128/6128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1389s[0m 227ms/step - accuracy: 0.0744 - loss: 7.5057 - val_accuracy: 0.0776 - val_loss: 7.1922
Epoch 2/3
[1m6128/6128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1317s[0m 215ms/step - accuracy: 0.0829 - loss: 7.2840 - val_accuracy: 0.1340 - val_loss: 6.3774
Epoch 3/3
[1m6128/6128[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1339s[0m 214ms/step - accuracy: 0.1200 - loss: 6.5774 - val_accuracy: 0.1421 - val_loss: 6.1957


In [14]:
# function to test the model
def generate_text2(seed_text, num_words=3):
    text = seed_text

    for _ in range(num_words):
        seq = tokenizer.texts_to_sequences([text])[0]
        seq = pad_sequences([seq], maxlen=max_length)

        preds = model_v2.predict(seq, verbose=0)
        next_word_id = preds.argmax()

        next_word = tokenizer.index_word.get(next_word_id, "")
        text += " " + next_word

    return text


In [None]:
# trying the model
print(generate_text2("obama is man"))


obama is man routes routes politicians


End of phase 1

-------------------------