## Import libraries

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
import html
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Masking, Bidirectional
from keras.models import load_model
import pickle

## Load and Process the data

In [None]:
medium_df = pd.read_csv('medium_data.csv')
news_df = pd.read_csv('ArticlesMarch2018.csv')
large_df = pd.read_csv('train.csv')

In [None]:
print(len(large_df))
large_df = large_df.head(20000)
print(len(large_df))

120000
20000


In [None]:
print(medium_df.shape)
print(news_df.shape)

(6508, 10)
(1385, 15)


In [None]:
# Creating New DataFrame
text = pd.concat([medium_df['title'], news_df['headline'],large_df['Title']], ignore_index=True)
data = pd.DataFrame({
    'text' : text
})

In [None]:
print(data.shape)
data.drop_duplicates(inplace=True)
print(data.shape)

(27893, 1)
(26770, 1)


## Data cleaning

In [None]:
data['text'] = data['text'].apply(lambda x: x.replace(u'\xa0',u' ')) # remove unwanted HTML character (non-breaking space)
data['text'] = data['text'].apply(lambda x: x.replace('\u200a', ' ')) # remove narrow space

def clean_text(text):
    # Unescape HTML escape characters
    cleaned_text = html.unescape(text)

    # Remove HTML tags
    cleaned_text = re.sub(r'<[^>]+>', '', cleaned_text)

    # Remove punctuation
    cleaned_text = re.sub(f'[{re.escape(string.punctuation)}]', '', cleaned_text)

    return cleaned_text

data['text'] = data['text'].apply(clean_text)
len(data)

26770

## Tokenization

In [None]:
tokenizer = Tokenizer(oov_token='<oov>', lower=True)
tokenizer.fit_on_texts(data['text'])
total_words = len(tokenizer.word_index) + 1
print('Total number of words: ', total_words)

Total number of words:  22101


In [None]:
# Tokenize all text at once
tokenized_texts = tokenizer.texts_to_sequences(data['text'])
# Generate n-gram sequences
input_sequences = [
    token_list[:i+1]
    for token_list in tokenized_texts  # Loop over each tokenized line
    for i in range(1, len(token_list)) # Generate n-grams for each line
]

In [None]:
max_sequence_len = max(len(x) for x in input_sequences)
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre', value = -1))

## Prepare Train and Test data

In [None]:
# Create features (xs) and labels
xs, labels = input_sequences[:, :-1], input_sequences[:, -1]

# Split the dataset into training and test sets (80% train, 20% test)
x_train, x_test, y_train, y_test = train_test_split(xs, labels, test_size=0.2, random_state=42)

## Creating the model

In [None]:
model = Sequential()

# Input and Embedding Layer
model.add(Embedding(input_dim=total_words, output_dim=200, input_length=max_sequence_len-1))

# Masking Layer (to ignore padding)
model.add(Masking(mask_value=-1))

# First LSTM Layer
model.add(Bidirectional(LSTM(200, return_sequences=True)))

# Dropout Layer (to prevent overfitting)
model.add(Dropout(0.3))

# Second LSTM Layer
model.add(Bidirectional(LSTM(150)))

# Dense Layer (hidden layer)
model.add(Dense(256, activation='relu'))

# Dropout Layer (to prevent overfitting)
model.add(Dropout(0.3))

# Output Layer (softmax for predicting next word)
model.add(Dense(total_words, activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.build(input_shape=(None, max_sequence_len))  # None is for batch size
model.summary()

In [None]:
history = model.fit(
    x_train,
    y_train,
    epochs=100,
    batch_size=128,
    validation_data=(x_test, y_test),
    verbose=1,
)

Epoch 1/100
[1m1013/1013[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 21ms/step - accuracy: 0.0321 - loss: 8.3310 - val_accuracy: 0.0411 - val_loss: 7.9388
Epoch 2/100
[1m1013/1013[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 20ms/step - accuracy: 0.0441 - loss: 7.7404 - val_accuracy: 0.0497 - val_loss: 7.8781
Epoch 3/100
[1m1013/1013[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 20ms/step - accuracy: 0.0513 - loss: 7.5298 - val_accuracy: 0.0563 - val_loss: 7.8542
Epoch 4/100
[1m1013/1013[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 20ms/step - accuracy: 0.0613 - loss: 7.3277 - val_accuracy: 0.0621 - val_loss: 7.8275
Epoch 5/100
[1m1013/1013[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 20ms/step - accuracy: 0.0715 - loss: 7.1280 - val_accuracy: 0.0657 - val_loss: 7.8417
Epoch 6/100
[1m1013/1013[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 20ms/step - accuracy: 0.0830 - loss: 6.9135 - val_accuracy: 0.0714 - val_loss: 7.908

In [18]:
  model.save('final_model.h5')

