<a href="https://colab.research.google.com/github/oayodeji/Predict-Yoruba-Hymn/blob/main/Yoruba_hymn_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Requirements
### The dataset contains 10 popular hymns written in the Yoruba language, with their proper tone marks

In [None]:
# Install the pinned cuDNN build 8.1.0.77 for CUDA 11.2. libcudnn8 is a held package,
# so apt needs --allow-change-held-packages to change its version.
# NOTE(review): presumably pinned to match the TensorFlow build in use — confirm.
!apt install --allow-change-held-packages libcudnn8=8.1.0.77-1+cuda11.2

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following packages will be REMOVED:
  libcudnn8-dev
The following held packages will be changed:
  libcudnn8
The following packages will be upgraded:
  libcudnn8
1 upgraded, 0 newly installed, 1 to remove and 18 not upgraded.
Need to get 430 MB of archives.
After this operation, 3,139 MB disk space will be freed.
Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  libcudnn8 8.1.0.77-1+cuda11.2 [430 MB]
Fetched 430 MB in 9s (47.6 MB/s)
(Reading database ... 155685 files and directories currently installed.)
Removing libcudnn8-dev (8.0.5.39-1+cuda11.1) ...
(Reading database ... 155663 files and directories currently installed.)
Preparing to unpack .../libcudnn8_8.1.0.77-1+cuda11.2_amd64.deb ...
Unpacking libcudnn8 (8.1.0.77-1+c

In [None]:
import tensorflow as tf
# Report how many GPUs TensorFlow can see before any training starts.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive.
drive.mount('/content/gdrive')
# Change the working directory to the project folder so the relative file name
# used when loading the corpus resolves there.
%cd /content/gdrive/MyDrive/Text Generation

Mounted at /content/gdrive
/content/gdrive/MyDrive/Text Generation


## Load the text file

In [None]:
# Read the hymn corpus. The encoding is spelled out because the lyrics carry Yoruba
# tone marks and under-dots, and the platform default encoding is not UTF-8 everywhere.
with open('Ten_Yoruba_Hymns.txt', encoding='utf-8') as f:
    raw_text = f.read()

# Lower-case the text and split it on newlines, giving one list entry per line.
# Splitting the raw text (rather than readlines() + ' '.join(...)) avoids the stray
# leading space that every line after the first used to pick up; strip() also drops
# trailing whitespace. Blank lines are kept, so the number of entries is unchanged.
data = [line.strip() for line in raw_text.lower().split('\n')]

In [None]:
# Number of entries in the corpus, followed by a peek at the first six.
print(len(data))
data[:6]

261


['ìsun kan wa tó kún fẹ́jẹ̀',
 ' mo ti ní jésù lọ́rẹ̀',
 ' enìkan nbẹ tó fẹ́ràn wa',
 ' gba ayé mi, olúwa',
 ' olùgbàlà gbóhùn mi',
 ' árẹ̀ mú ọ, ọkàn re pòrurù']

## Import Libraries

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Data Preprocessing

In [None]:
# Shared tokenizer: fitted inside get_sequence_of_tokens and reused at generation time.
tokenizer = Tokenizer()


def get_sequence_of_tokens(corpus):
    """Fit the module-level tokenizer on ``corpus`` and build n-gram prefixes.

    Each line is converted to its list of word ids, and every prefix of that list
    with at least two ids becomes one sequence.

    Args:
        corpus: list of text lines.

    Returns:
        (input_sequences, total_words): the list of n-gram id lists, and the size
        of the tokenizer's word index plus one.
    """
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    input_sequences = []
    for token_list in tokenizer.texts_to_sequences(corpus):
        # Prefixes of length 2 .. len(token_list).
        input_sequences.extend(
            token_list[:end] for end in range(2, len(token_list) + 1)
        )
    return input_sequences, total_words


inp_sequences, total_words = get_sequence_of_tokens(data)

In [None]:
def generate_padded_sequences(input_sequences):
    """Left-pad the n-gram sequences and split them into inputs and one-hot labels.

    Args:
        input_sequences: list of token-id lists.

    Returns:
        (train_data, label, max_sequence_len): every column but the last of the
        padded array, the last column one-hot encoded over ``total_words`` classes
        (``total_words`` is read from module scope), and the longest sequence length.
    """
    max_sequence_len = max(len(seq) for seq in input_sequences)
    padded = np.array(
        pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    )

    # The last token of each padded row is the target; everything before it is the input.
    train_data = padded[:, :-1]
    label = tf.keras.utils.to_categorical(padded[:, -1], num_classes=total_words)
    return train_data, label, max_sequence_len


train_data, labels, max_sequence_len = generate_padded_sequences(inp_sequences)

In [None]:
# shuffle() draws from a buffer of `buffer_size` elements, so a fully random order
# needs a buffer at least as large as the dataset. The dataset is built from
# `train_data`, so its length is used here rather than the number of raw corpus lines.
buffer_size = len(train_data)
batch_size = 32
auto = tf.data.AUTOTUNE

# cache -> shuffle -> batch -> prefetch: the slices are cached once and reshuffled each epoch.
train_dataset = tf.data.Dataset.from_tensor_slices((train_data, labels))
train_dataset = train_dataset.cache().shuffle(buffer_size).batch(batch_size).prefetch(auto)

# Transformer

In [None]:
class TransformerBlock(layers.Layer):
    """Self-attention followed by a two-layer feed-forward network.

    Each sub-layer is followed by dropout, a residual connection and layer
    normalisation.

    Args:
        embed_dim: width of the final feed-forward Dense layer; also passed to
            MultiHeadAttention as ``key_dim``.
        num_heads: number of attention heads.
        ff_dim: width of the hidden Dense layer in the feed-forward network.
        rate: dropout rate applied after attention and after the feed-forward network.
        **kwargs: forwarded to ``layers.Layer`` (e.g. ``name``), which is what lets
            the layer be rebuilt from its config.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Constructor arguments are kept so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        ``training`` defaults to None so the layer can also be called directly
        without the flag; it is forwarded to both dropout layers.
        """
        # Self-attention: query and value are both `inputs`.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so a saved model can recreate the layer."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: number of rows in the position-embedding table.
        vocab_size: number of rows in the token-embedding table.
        embed_dim: embedding width shared by both tables.
        **kwargs: forwarded to ``layers.Layer`` (e.g. ``name``), which is what lets
            the layer be rebuilt from its config.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Constructor arguments are kept so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in ``x`` and add the embedding of each position."""
        # Positions run 0 .. (size of the last axis of x) - 1.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments so a saved model can recreate the layer."""
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

In [None]:
# Hyper-parameters of the transformer classifier.
embed_dim = 64  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

# Inputs are the padded n-grams without their last token, hence max_sequence_len - 1.
inputs = layers.Input(shape=(max_sequence_len - 1,))

# The position table is sized with max_sequence_len, one row more than the input length.
embedding_layer = TokenAndPositionEmbedding(
    maxlen=max_sequence_len, vocab_size=total_words, embed_dim=embed_dim
)
transformer_block = TransformerBlock(
    embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim
)

# embed -> transformer block -> average over positions -> dense head -> softmax over the vocabulary
x = embedding_layer(inputs)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dense(400, activation="relu")(x)
outputs = layers.Dense(total_words, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

model.summary()

Model: "model_20"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_18 (InputLayer)       [(None, 7)]               0         
                                                                 
 token_and_position_embeddin  (None, 7, 64)            29888     
 g_17 (TokenAndPositionEmbed                                     
 ding)                                                           
                                                                 
 transformer_block_17 (Trans  (None, 7, 64)            37664     
 formerBlock)                                                    
                                                                 
 global_average_pooling1d_14  (None, 64)               0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_76 (Dense)            (None, 400)               260

In [None]:
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated.
# NOTE(review): `adam` is built here but is NOT passed to compile(). Training below
# uses RMSprop with its default settings. Pass `optimizer=adam` to compile() if Adam
# at 0.01 was the intent.
adam = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer='rmsprop', loss="categorical_crossentropy", metrics=["accuracy"])
history = model.fit(train_dataset, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

# Wuraola's Model

In [None]:
# Baseline: embedding -> bidirectional LSTM -> softmax over the vocabulary.
model_wura = tf.keras.Sequential()
for wura_layer in (
    tf.keras.layers.Embedding(total_words, 64, input_length=max_sequence_len - 1),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(200)),
    tf.keras.layers.Dense(total_words, activation='softmax'),
):
    model_wura.add(wura_layer)
model_wura.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 7, 64)             29376     
                                                                 
 bidirectional (Bidirectiona  (None, 400)              424000    
 l)                                                              
                                                                 
 dense_4 (Dense)             (None, 459)               184059    
                                                                 
Total params: 637,435
Trainable params: 637,435
Non-trainable params: 0
_________________________________________________________________


In [None]:
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated.
# NOTE(review): `adam` is built here but is NOT passed to compile(). The string 'adam'
# makes Keras create its own Adam optimizer with default settings. Pass `optimizer=adam`
# to compile() if a learning rate of 0.01 was the intent.
adam = tf.keras.optimizers.Adam(learning_rate=0.01)
model_wura.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model_wura.fit(train_dataset, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

# Generate Yoruba hymn lyrics

In [None]:
def generate_hymn(model, seed_text, next_words):
    """Extend ``seed_text`` with greedily predicted words.

    At each step the current text is tokenised with the module-level ``tokenizer``,
    left-padded to ``max_sequence_len - 1`` ids, passed to ``model``, and the word
    with the highest predicted score is appended.

    Args:
        model: trained Keras model to predict with. If the model declares more than
            one input, the same padded sequence is fed to each of them.
        seed_text: text that prompts the first prediction.
        next_words: number of words to predict and append.

    Returns:
        ``seed_text`` followed by the predicted words, separated by single spaces.
    """
    # The model is inspected directly instead of being compared with a global named
    # `transformer`: that name is never defined in this notebook, so the comparison
    # raised NameError on a fresh kernel.
    n_inputs = len(getattr(model, "inputs", None) or [None])

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        model_input = token_list if n_inputs == 1 else [token_list] * n_inputs
        predicted = int(np.argmax(model.predict(model_input, verbose=0)[0]))
        # index_word is the tokenizer's id -> word mapping. Ids with no word (such as
        # the padding id 0) fall back to a single space, as before.
        output_word = tokenizer.index_word.get(predicted, " ")
        seed_text += " " + output_word
    return seed_text

In [None]:
# One-off sample from the BiLSTM model: four predicted words after the seed.
generate_hymn(model_wura, seed_text='olúwa olúwa gbà', next_words=4)

"olúwa olúwa gbà sínú omi omi nín'ẹ̀ṣẹ̀"

In [None]:
# General seeds, completed with five words each by the BiLSTM model.
seed_text_list = [
    'olúwa gbà',
    'olùgbàlà',
    'Ọlọ́run',
    'ìṣẹ́gun ni',
    'ìyanu mi',
    'gbórí',
    'ayọ̀ ńbọ̀',
    'ìfẹ́',
    'ìfẹ́ ọkàn',
    'olúwa mi',
    'ọ̀rẹ́',
    'ọ̀rẹ́ òtítọ́',
]
for word in seed_text_list:
    hymn_line = generate_hymn(model_wura, word, 5)
    print(hymn_line)

olúwa gbà mú mí níhìn kúrọ̀ nín'ẹ̀ṣẹ̀
olùgbàlà gbóhùn mi ko ṣì gbọ́ràn
Ọlọ́run la ó rójú 'bùkún rẹ̀
ìṣẹ́gun ni dorin mi d'òpin ìrìn mi
ìyanu mi wón ma sáre fún ọ
gbórí la ó rójú 'bùkún rẹ̀
ayọ̀ ńbọ̀ ọ fún jésù nìkan mi
ìfẹ́ rẹ̀ ju t'ìyekan lọ s'
ìfẹ́ ọkàn tí ó fẹ́ wa ṣègbé
olúwa mi mọ́ mímọ́ jùlọ bí bí
ọ̀rẹ́ òdodo ni jésù lọ́rẹ̀ jésù
ọ̀rẹ́ òtítọ́ òdodo ni jésù lọ́rẹ̀ jésù


In [None]:
# The same general seeds, completed with five words each by the transformer model.
seed_text_list = [
    'olúwa gbà',
    'olùgbàlà',
    'Ọlọ́run',
    'ìṣẹ́gun ni',
    'ìyanu mi',
    'gbórí',
    'ayọ̀ ńbọ̀',
    'ìfẹ́',
    'ìfẹ́ ọkàn',
    'olúwa mi',
    'ọ̀rẹ́',
    'ọ̀rẹ́ òtítọ́',
]
for word in seed_text_list:
    hymn_line = generate_hymn(model, word, 5)
    print(hymn_line)

olúwa gbà mí ègbè ègbè ègbè ègbè
olùgbàlà gbóhùn mi gbà mí ègbè
Ọlọ́run dípò ọmọ ná gbà mí
ìṣẹ́gun ni fún jésù sọ ọ fún
ìyanu mi kọ́kàn gbogbo fò fún ayọ̀
gbórí dípò ọmọ ná gbà mí
ayọ̀ ńbọ̀ fún mi rù le gbogbo
ìfẹ́ rẹ̀ ju t'ìyekan lọ lọ
ìfẹ́ ọkàn re rẹ̀ k'ọ́mọdé wá nín'ẹ̀ṣẹ̀
olúwa mi wá lọ́wọ́ látóní n'ínú mọ́
ọ̀rẹ́ òdodo ni jésù bí n'ínú
ọ̀rẹ́ òtítọ́ òdodo ni jésù bí n'ínú


In [None]:
# Seeds taken from the opening words of corpus lines, completed by the BiLSTM model.
seed_text_list = [
    'ìsun kan',
    'mo ti',
    'enìkan',
    'gba ayé mi',
]
for word in seed_text_list:
    hymn_line = generate_hymn(model_wura, word, 5)
    print(hymn_line)

ìsun kan wa tó kún fẹ́jẹ̀ fẹ́jẹ̀
mo ti ní jésù lọ́rẹ̀ lọ́rẹ̀ wàhálà
enìkan nbẹ tó fẹ́ràn wa lóní
gba ayé mi olúwa won mọ́ ọ bí


In [None]:
# The same corpus-opening seeds, completed by the transformer model.
seed_text_list = [
    'ìsun kan',
    'mo ti',
    'enìkan',
    'gba ayé mi',
]
for word in seed_text_list:
    hymn_line = generate_hymn(model, word, 5)
    print(hymn_line)

ìsun kan wa tó kún fẹ́jẹ̀ fẹ́jẹ̀
mo ti ní jésù lọ́rẹ̀ lọ́rẹ̀ lọ́rẹ̀
enìkan nbẹ tó fẹ́ràn wa wá
gba ayé mi olúwa gbà mí ègbè nín'ẹ̀ṣẹ̀


In [None]:
# First five corpus entries, for comparison with the generated lines above.
data[0:5]

['ìsun kan wa tó kún fẹ́jẹ̀',
 ' mo ti ní jésù lọ́rẹ̀',
 ' enìkan nbẹ tó fẹ́ràn wa',
 ' gba ayé mi, olúwa',
 ' olùgbàlà gbóhùn mi']