In [5]:
import pandas as pd
import string
import numpy as np
import json
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
#from keras.preprocessing import Tokenizer
keras.preprocessing.text.Tokenizer as Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku

tf.random.set_seed(2)
from numpy.random import seed
seed(1)

#loading the datasets
df1 = pd.read_csv('/kaggle/input/title-generator-datasets/USvideos.csv')
df2 = pd.read_csv('/kaggle/input/title-generator-datasets/CAvideos.csv')
df3 = pd.read_csv('/kaggle/input/title-generator-datasets/GBvideos.csv')

#Now loading the datasets containing the category names
data1 = json.load(open('/kaggle/input/title-generator-datasets/US_category_id.json'))
data2 = json.load(open('/kaggle/input/title-generator-datasets/CA_category_id.json'))
data3 = json.load(open('/kaggle/input/title-generator-datasets/GB_category_id.json'))


df1.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13T11:00:04.000Z,"rhett and link|""gmm""|""good mythical morning""|""...",343168,10172,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12T18:01:41.000Z,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,132235,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...


Now we need to process our data so that we can use this data to train our machine learning model for the task of title generator. 


Here are all the data cleaning and processing steps that we need to follow:

In [7]:
def category_extractor(data):
    i_d = [data['items'][i]['id'] for i in range(len(data['items']))]
    title = [data['items'][i]['snippet']['title'] for i in range(len(data['items']))]
    i_d = list(map(int,i_d))
    category = zip(i_d,title)
    category = dict(category)
    return category


#Create a new category column by mapping the category names to their id
df1['category_title'] = df1['category_id'].map(category_extractor(data1))
df2['category_title'] = df2['category_id'].map(category_extractor(data2))
df3['category_title'] = df3['category_id'].map(category_extractor(data3))

#NOw join the dataframes
df = pd.concat([df1,df2,df3],ignore_index=True)

#Now drop duplicates
df = df.drop_duplicates('video_id')

#NOW collect the titles of entertainment videos
entertainment = df[df['category_title'] == 'Entertainment']['title']
entertainment = entertainment.tolist()

#remove punctuations and convert text to lowercase
def clean_text(text):
    text = ''.join(e for e in text if e not in string.punctuation).lower()
    
    text = text.encode('utf8').decode('ascii','ignore')
    return text

corpus = [clean_text(e) for e in entertainment]

Generating Sequences
Natural language processing tasks require input data in the form of a sequence of tokens. The first step after data cleansing is to generate a sequence of n-gram tokens.

An n-gram is an adjacent sequence of n elements of a given sample of text or vocal corpus. Elements can be words, syllables, phonemes, letters, or base pairs. In this case, the n-grams are a sequence of words in a corpus of titles. Tokenization is the process of extracting tokens from the corpus

In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer


tokenizer = Tokenizer()
def get_sequences_of_tokens(corpus):
    #Get tokens
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    #convert to sequence of tokens
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1,len(token_list)):
        n_gram_sequence = token_list[:i+1]
    input_sequences.append(n_gram_sequence)
    
    return input_sequences, total_words

inp_sequences, total_words=get_sequences_of_tokens(corpus)

Padding the sequences

Since the sequences can be of variable length, the sequence lengths must be equal. When using neural networks, we usually feed an input into the network while waiting for output. In practice, it is more efficient to process data in batches rather than one at a time.


This is done by using matrices [batch size x sequence length], where the length of the sequence corresponds to the longest sequence. 

In this case, we fill the sequences with a token (usually 0) to fit the size of the matrix. This process of filling sequences with tokens is called filling. To enter the data into a training model, I need to create predictors and labels.

I will create sequences of n-gram as predictors and the following word of n-gram as label:

In [19]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len,padding='pre'))
    predictors,label = input_sequences[:,:-1],input_sequences[:, -1]
    label = ku.to_categorical(label,num_classes=total_words)
    return predictors,label,max_sequence_len

predictors,label,max_sequence_len = generate_padded_sequences(inp_sequences)

Title Generator with LSTM Model

The LSTM model contains an additional state (the state of the cell) which essentially allows the network to learn what to store in 
the long term state, what to delete and what to read.

The LSTM of this model contains three layers:

Input layer: takes the sequence of words as input

LSTM Layer: Calculates the output using LSTM units.

Dropout layer: a regularization layer to avoid overfitting

Output layer: calculates the probability of the next possible word on output

Now I will use the LSTM Model to build a model for the task of Title Generator with Machine Learning:

In [34]:
def create_model(max_sequence_len,total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    #add input embedded layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    #add hidden layer 1 --LSTM layer
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    #ADd output layer
    model.add(Dense(total_words,activation='softmax'))
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
    
    return model

model = create_model(max_sequence_len,total_words)
model.fit(predictors,label,epochs=50,verbose=5)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.history.History at 0x785340fbcb80>

In [36]:
# Assume X_test and y_test are your test data and labels
loss, accuracy = model.evaluate(predictors, label)

print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 337ms/step - accuracy: 1.0000 - loss: 0.4205
Test Loss: 0.4204827845096588
Test Accuracy: 1.0
