In [1]:
# Step 1: Importing necessary libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pickle

In [2]:
# Step 2: Preparing the dataset
data = [
   ("<em>This is emphasized text</em>", "<i>This is emphasized text</i>"),
("<i>This is italicized text</i>", "<em>This is italicized text</em>"),
("<strong>Bold and strong</strong>", "<b>Bold and strong</b>"),
("<u>This is underlined</u>", "<span style='text-decoration: underline;'>This is underlined</span>"),
("<s>This is strikethrough</s>", "<del>This is strikethrough</del>"),
("<sup>Superscript</sup>", "<sup>Superscript</sup>"),
("<sub>Subscript</sub>", "<sub>Subscript</sub>"),
("<mark>Highlighted text</mark>", "<span style='background-color: yellow;'>Highlighted text</span>"),
("<abbr title='Hypertext Markup Language'>HTML</abbr>", "<abbr title='Hypertext Markup Language'>HTML</abbr>"),
("<cite>Book title</cite>", "<cite>Book title</cite>"),
("<ul><li>Item 1</li><li>Item 2</li></ul>", "<ol><li>Item 1</li><li>Item 2</li></ol>"),
("<ol><li>Item 1</li><li>Item 2</li></ol>", "<ul><li>Item 1</li><li>Item 2</li></ul>"),
("<dl><dt>Term 1</dt><dd>Definition 1</dd><dt>Term 2</dt><dd>Definition 2</dd></dl>", "<table><tr><td>Term 1</td><td>Definition 1</td></tr><tr><td>Term 2</td><td>Definition 2</td></tr></table>"),
("<li>List item</li>", "<li style='list-style-type: none;'>List item</li>"),
("<blockquote>Blockquote text</blockquote>", "<q>Blockquote text</q>"),
("<hr>", "<hr style='border: 2px solid black;'>"),
("<pre>Preformatted text</pre>", "<code>Preformatted text</code>"),
("<address>Contact us at: <a href='mailto:info@example.com'>info@example.com</a></address>", "<p>Contact us at: <a href='mailto:info@example.com'>info@example.com</a></p>"),
("<button>Click me</button>", "<button disabled>Click me</button>"),
("<input type='text' placeholder='Enter text'>", "<textarea placeholder='Enter text'></textarea>"),
("<a href='https://www.example.com'>Visit our website</a>", "<a href='https://www.example.com' target='_blank'>Visit our website</a>"),
("<a href='#section1'>Link to Section 1</a>", "<a href='#section1' id='section1-link'>Link to Section 1</a>"),
("<img src='image.jpg' alt='Description of image'>", "<figure><img src='image.jpg' alt='Description of image'><figcaption>Caption for image</figcaption></figure>"),
("<audio controls><source src='audio.mp3' type='audio/mp3'></audio>", "<audio controls><source src='audio.mp3' type='audio/mp3'>Your browser does not support the audio element.</audio>"),
("<video controls><source src='video.mp4' type='video/mp4'></video>", "<video controls><source src='video.mp4' type='video/mp4'>Your browser does not support the video element.</video>"),
("<iframe src='https://www.youtube.com' width='560' height='315'></iframe>", "<iframe src='https://www.youtube.com/embed/VIDEO_ID' width='560' height='315'></iframe>"),
("<progress value='50' max='100'></progress>", "<progress value='50' max='100'></progress>"),
("<details><summary>Show details</summary>Details content</details>", "<details open><summary>Show details</summary>Details content</details>"),
("<nav><a href='#'>Home</a><a href='#about'>About</a></nav>", "<ul class='navigation'><li><a href='#'>Home</a></li><li><a href='#about'>About</a></li></ul>"),
("<time datetime='2023-01-01'>January 1, 2023</time>", "<time datetime='2023-01-01'>January 1, 2023</time>"),
("<h2>This is a subheading</h2>", "<h2 class='sub-heading'>This is a subheading</h2> In Style.css .sub-heading{color:Red;}"),
("<h3>Another subheading</h3>", "<h3 class='sub-heading'>Another subheading</h3> In Style.css .sub-heading{color:Blue;}"),
("<h4>Yet another subheading</h4>", "<h4 class='sub-heading'>Yet another subheading</h4> In Style.css .sub-heading{color:Green;}"),
("<h5>Subheading five</h5>", "<h5 class='sub-heading'>Subheading five</h5> In Style.css .sub-heading{color:Purple;}"),
("<h6>The smallest subheading</h6>", "<h6 class='sub-heading'>The smallest subheading</h6> In Style.css .sub-heading{color:Orange;}"),
("<header>Header content</header>", "<header><h1>Header content</h1></header>"),
("<footer>Footer content</footer>", "<footer><p>Footer content</p></footer>"),
("<main>Main content</main>", "<main><article>Main content</article></main>"),
("<section>Section content</section>", "<section id='section1'>Section content</section>"),
("<aside>Additional content</aside>", "<aside>Additional content</aside>"),
("<font size='3'>This is some text</font>", "<p class='text-size-3'>This is some text</p>"),
("<span style='color: red;'>Red text</span>", "<p style='color: red;'>Red text</p>"),
("<span style='background-color: yellow;'>Yellow background</span>", "<p style='background-color: yellow;'>Yellow background</p>"),
("<span style='font-family: Arial;'>Arial font</span>", "<p style='font-family: Arial;'>Arial font</p>"),
("<span style='text-transform: uppercase;'>Uppercase text</span>", "<p style='text-transform: uppercase;'>Uppercase text</p>"),
("<span style='text-align: center;'>Center-aligned text</span>", "<p style='text-align: center;'>Center-aligned text</p>"),
("<div style='border: 1px solid black;'>Div with border</div>", "<div style='border: 1px solid black; padding: 10px;'>Div with border and padding</div>"),
("<div style='margin: 20px;'>Div with margin</div>", "<div style='margin: 20px; background-color: #eee;'>Div with margin and background</div>"),
("<div style='width: 200px; height: 100px;'>Div with fixed size</div>", "<div style='width: 50%; height: 50px;'>Div with percentage width and fixed height</div>"),
("<div style='position: absolute; top: 10px; left: 20px;'>Absolute positioning</div>", "<div style='position: relative; top: 10px; left: 20px;'>Relative positioning</div>"),
("<div id='header'>Header content</div>", "<header><h1>Header content</h1></header>"),
("<div class='container'>Page content</div>", "<div class='wrapper'>Page content</div>"),
("<div id='sidebar'>Sidebar content</div>", "<aside id='sidebar'>Sidebar content</aside>"),
("<div class='clearfix'>Clearing floats</div>", "<div style='clear: both;'>Clearing floats alternative</div>"),
("<div class='centered'>Centered content</div>", "<div style='margin: 0 auto; width: 80%;'>Centered content alternative</div>"),
("<div class='hidden'>Hidden content</div>", "<div style='display: none;'>Hidden content alternative</div>"),
("<div class='visible'>Visible content</div>", "<div style='display: block;'>Visible content alternative</div>"),
("<div class='hover-effect'>Hover effect</div>", "<div class='hover-effect' onmouseover='this.style.color=\"red\"' onmouseout='this.style.color=\"black\"'>Hover effect alternative</div>"),
("<div class='rotate'>Rotated content</div>", "<div class='rotate' style='transform: rotate(45deg);'>Rotated content alternative</div>"),
("<div class='gradient-bg'>Gradient background</div>", "<div class='gradient-bg' style='background: linear-gradient(to right, #ffcc00, #ff6600);'>Gradient background alternative</div>"),
("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">", "<!DOCTYPE html>"),
("<!DOCTYPE html>", "<!DOCTYPE html>"),
("<!DOCTYPE html>", "<!DOCTYPE html>\n<html lang='en'>"),
("<!DOCTYPE html>", "<!DOCTYPE html>\n<html lang='en'>\n<head>\n<meta charset='UTF-8'>\n<meta name='viewport' content='width=device-width, initial-scale=1.0'>\n<title>My Web Page</title>\n</head>\n<body>\n<h1>Hello, World!</h1>\n</body>\n</html>"),
("<html>", "<html lang='en'>"),
("<html lang='en'>", "<html lang='en' dir='ltr'>"),
("<head>", "<head>\n<meta name='description' content='Description of your web page'>"),
("<meta charset='UTF-8'>", "<meta charset='UTF-8'>\n<meta name='author' content='Your Name'>"),
("<title>My Web Page</title>", "<title>My Awesome Web Page</title>"),
("<link rel='stylesheet' href='styles.css'>", "<link rel='stylesheet' href='styles.css'>"),
("<script src='script.js'></script>", "<script defer src='script.js'></script>"),
]

# Spliting the data into input (X) and target (y)
X, y = zip(*data)

In [3]:
# Step 3: Tokenizing the text data
tokenizer_X = Tokenizer(filters='')
tokenizer_X.fit_on_texts(X)
X_seqs = tokenizer_X.texts_to_sequences(X)

tokenizer_y = Tokenizer(filters='')
tokenizer_y.fit_on_texts(y)
y_seqs = tokenizer_y.texts_to_sequences(y)

In [4]:
# Step 4: Padding the sequences to have the same length
X_pad = pad_sequences(X_seqs)
y_pad = pad_sequences(y_seqs, maxlen=X_pad.shape[1])

In [5]:
# Step 5: `Spliting` the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pad, y_pad, test_size=0.2, random_state=42)

In [36]:
# Step 6: Building the sequence-to-sequence model
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer_X.word_index) + 1, output_dim=256, input_length=X_pad.shape[1]),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(512, return_sequences=True)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(512, return_sequences=True)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(len(tokenizer_y.word_index) + 1, activation='softmax'))
])






model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [38]:
# Step 7: Training the model
model.fit(X_train, np.expand_dims(y_train, -1), epochs=10, batch_size=16, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1dfc2284c70>

In [40]:
# Step 9: Saving the tokenizer configuration
with open('tokenizer_X_config.pickle', 'wb') as handle:
    pickle.dump(tokenizer_X.get_config(), handle, protocol=pickle.HIGHEST_PROTOCOL)

In [41]:
with open('tokenizer_y_config.pickle', 'wb') as handle:
    pickle.dump(tokenizer_y.get_config(), handle, protocol=pickle.HIGHEST_PROTOCOL)

In [42]:
# Saving the model
model.save('html_correction_model.h5')

  saving_api.save_model(


In [50]:
loaded_model = tf.keras.models.load_model('html_correction_model.h5')


In [69]:
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle

# Loading Tokenizer Configurations
with open('tokenizer_X_config.pickle', 'rb') as handle:
    tokenizer_X_config = pickle.load(handle)

loaded_tokenizer_X = Tokenizer()
loaded_tokenizer_X.word_index = tokenizer_X_config['word_index']
loaded_tokenizer_X.document_count = tokenizer_X_config['document_count']
loaded_tokenizer_X.char_level = tokenizer_X_config['char_level']  # Add this line
loaded_tokenizer_X.oov_token = tokenizer_X_config['oov_token'] 


# Repeating the same process for the 'tokenizer_y' as well
with open('tokenizer_Y_config.pickle', 'rb') as handle:
    tokenizer_Y_config = pickle.load(handle)

loaded_tokenizer_Y = Tokenizer()
loaded_tokenizer_Y.word_index = tokenizer_Y_config['word_index']
loaded_tokenizer_Y.document_count = tokenizer_Y_config['document_count']
loaded_tokenizer_Y.char_level = tokenizer_Y_config['char_level']
loaded_tokenizer_Y.oov_token = tokenizer_Y_config['oov_token']



In [81]:
# Assuming I have a new input text
input_text = "<em>This is emphasized text</em>"

# Tokenizing the input text
input_seq = loaded_tokenizer_X.texts_to_sequences([input_text])

# Padding the sequence to match the model input length
input_pad = pad_sequences(input_seq, maxlen=X_pad.shape[1])

# Making predictions using the loaded model
predictions = loaded_model.predict(input_pad)

# Converting predictions to sequences
predicted_seq = np.argmax(predictions, axis=-1)[0]

# Converting the predicted sequence back to text using the loaded tokenizer
predicted_text = loaded_tokenizer_Y.sequences_to_texts([predicted_seq])[0]

# Printing the original input and the predicted output
print("Original Input Text:", input_text)
print("Predicted Output Text:", predicted_text)


AttributeError: 'str' object has no attribute 'get'

In [None]:
predictions = loaded_model.predict(input_pad)

In [None]:
predicted_seq = np.argmax(predictions, axis=-1)[0]
predicted_text = loaded_tokenizer_y.sequences_to_texts([predicted_seq])[0]