In [None]:
import nltk
import textblob
import spacy

In [None]:
document = """

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Sample Web Page</title>
    <meta name="description" content="This is a sample HTML document for learning purposes.">
    <link rel="stylesheet" href="styles.css">
</head>
<body>

    <header>
        <h1>Welcome to <span style="color:blue;">My Web Page</span></h1>
        <nav>
            <ul>
                <li><a href="#about">About Me</a></li>
                <li><a href="#blog">Blog</a></li>
                <li><a href="#contact">Contact</a></li>
            </ul>
        </nav>
    </header>

    <section id="about">
        <h2>About Me</h2>
        <p>Hello! I'm <strong>Jane Doe</strong>, a web developer and tech enthusiast. I love <em>HTML</em>, <em>CSS</em>, and <em>JavaScript</em>.</p>
        <p>I work at <a href="https://example.com">Example Corp</a>.</p>
    </section>

    <section id="blog">
        <h2>Latest Blog Posts</h2>
        <article>
            <h3>Understanding HTML Semantics</h3>
            <p>HTML semantics helps convey the <mark>meaning</mark> of web content. Learn how to use tags like <code>&lt;article&gt;</code>, <code>&lt;section&gt;</code>, and <code>&lt;aside&gt;</code>.</p>
            <footer><small>Posted on <time datetime="2024-12-01">December 1, 2024</time></small></footer>
        </article>

        <article>
            <h3>JavaScript Basics</h3>
            <p>Learn about <abbr title="Document Object Model">DOM</abbr>, functions, variables, and more!</p>
        </article>
    </section>

    <aside>
        <h2>Quick Tips</h2>
        <ul>
            <li>Always close your tags!</li>
            <li>Use semantic tags.</li>
            <li>Validate your HTML.</li>
        </ul>
    </aside>

    <section id="contact">
        <h2>Contact Me</h2>
        <form action="/submit" method="post">
            <label for="name">Name:</label>
            <input type="text" id="name" name="user_name" required>

            <label for="email">Email:</label>
            <input type="email" id="email" name="user_email" required>

            <label for="msg">Message:</label>
            <textarea id="msg" name="user_message"></textarea>

            <button type="submit">Send</button>
        </form>
    </section>

    <footer>
        <p>&copy; 2025 Jane Doe. All rights reserved.</p>
    </footer>

</body>
</html>



"""

In [None]:
import requests
from bs4 import BeautifulSoup
import re

In [None]:
# Fetch the W3Schools home page. The timeout (seconds) keeps the cell from
# hanging indefinitely if the server never answers.
data = requests.get('https://www.w3schools.com/', timeout=10)

In [None]:
data.status_code

200

In [None]:
w3_text = data.text

In [None]:
w3_text

'\n<!DOCTYPE html>\n<html lang="en-US">\n<head>\n<title>W3Schools Online Web Tutorials</title>\n<meta charset="utf-8">\n<meta name="viewport" content="width=device-width, initial-scale=1">\n<meta name="title" property="og:title" content="W3Schools.com">\n<meta name="Keywords" content="HTML, Python, CSS, SQL, JavaScript, How to, PHP, Java, C, C++, C#, jQuery, Bootstrap, Colors, W3.CSS, XML, MySQL, Icons, NodeJS, React, Graphics, Angular, R, AI, Git, Data Science, Code Game, Tutorials, Programming, Web Development, Training, Learning, Quiz, Exercises, Courses, Lessons, References, Examples, Learn to code, Source code, Demos, Tips, Website">\n<meta name="Description" content="Well organized and easy to understand Web building tutorials with lots of examples of how to use HTML, CSS, JavaScript, SQL, Python, PHP, Bootstrap, Java, XML and more.">\n<meta property="og:image" content="https://www.w3schools.com/images/w3schools_logo_436_2.png">\n<meta property="og:image:type" content="image/png"

In [None]:
#https://www.crummy.com/software/BeautifulSoup/bs4/doc/
soup = BeautifulSoup(w3_text, 'html.parser')

In [None]:
print(soup.prettify())

<!DOCTYPE html>
<html lang="en-US">
 <head>
  <title>
   W3Schools Online Web Tutorials
  </title>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="W3Schools.com" name="title" property="og:title"/>
  <meta content="HTML, Python, CSS, SQL, JavaScript, How to, PHP, Java, C, C++, C#, jQuery, Bootstrap, Colors, W3.CSS, XML, MySQL, Icons, NodeJS, React, Graphics, Angular, R, AI, Git, Data Science, Code Game, Tutorials, Programming, Web Development, Training, Learning, Quiz, Exercises, Courses, Lessons, References, Examples, Learn to code, Source code, Demos, Tips, Website" name="Keywords"/>
  <meta content="Well organized and easy to understand Web building tutorials with lots of examples of how to use HTML, CSS, JavaScript, SQL, Python, PHP, Bootstrap, Java, XML and more." name="Description"/>
  <meta content="https://www.w3schools.com/images/w3schools_logo_436_2.png" property="og:image"/>
  <meta content="image/png" propert

In [None]:
# Gather the href value of every <a> element that actually carries one
anchors_with_href = soup.find_all('a', href=True)
links = [anchor['href'] for anchor in anchors_with_href]

print(links)

['https://www.w3schools.com', 'javascript:void(0)', 'javascript:void(0)', 'javascript:void(0)', 'javascript:void(0)', 'javascript:void(0)', 'javascript:void(0);', 'https://profile.w3schools.com/log-in', 'https://campus.w3schools.com/collections/course-catalog', '/academy/index.php', '/spaces/index.php', '/plus/index.php', 'https://campus.w3schools.com/collections/course-catalog', '/academy/index.php', 'https://spaces.w3schools.com/space/', '/plus/index.php', 'https://pathfinder.w3schools.com', '/spaces/index.php', 'https://campus.w3schools.com/collections/course-catalog', '/plus/index.php', '/academy/index.php', 'https://profile.w3schools.com/logout', 'https://www.youtube.com/@w3schools', 'https://www.linkedin.com/company/w3schools.com/', 'https://discord.com/invite/w3schools', 'https://www.facebook.com/w3schoolscom/', 'https://www.instagram.com/w3schools.com_official/', '/html/default.asp', '/html/default.asp', '/tags/default.asp', '/css/default.asp', '/css/default.asp', '/cssref/defa

In [None]:
text = soup.text

In [None]:
text

'\n\n\n\nW3Schools Online Web Tutorials\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n        Tutorials\n        \n\n\n\n        Exercises\n        \n\n\n\n        Certificates\n        \n\n\n\n        Services\n        \n\n\n\n\n      Menu\n      \n\n\n\n\n\n          Search field\n        \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n×\n\n\n\n\n\n\ue80b\n\n\n\n\n\n\n\n\n\nSign In\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n★\n+1\n\n\n\n\n\n\n\n\n\n        Get Certified\n      \n\n\n\n\n        For Teachers\n      \n\n\n\n\n        Spaces\n      \n\n\n\n\n\n\n        Plus\n      \n\n\n\n\n\n\n        Get Certified\n      \n\n\n\n\n        For Teachers\n      \n\n\n\n\n        Spaces\n      \n\n\n\n\n\n\n        Plus\n      \n\n\n\n\n\n\nMy W3Schools\n\n\n\nTutorials\n\n\n\n          \xa0\n        \n\n\n\nExercises\n\n\n\n          \xa0\n        \n\n\n\nCertificates\n\n\n\n          \xa0\n        \n\n\n\nServices\n\n\n\n          \xa0\n        \n\n\nSpaces

In [None]:
#text = re.sub(r'\n+', '', text)

In [None]:
text

'\n\n\n\nW3Schools Online Web Tutorials\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n        Tutorials\n        \n\n\n\n        Exercises\n        \n\n\n\n        Certificates\n        \n\n\n\n        Services\n        \n\n\n\n\n      Menu\n      \n\n\n\n\n\n          Search field\n        \n\n\n\n\n\n\n\n\n\n\n\n\n\n\n×\n\n\n\n\n\n\ue80b\n\n\n\n\n\n\n\n\n\nSign In\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n★\n+1\n\n\n\n\n\n\n\n\n\n        Get Certified\n      \n\n\n\n\n        For Teachers\n      \n\n\n\n\n        Spaces\n      \n\n\n\n\n\n\n        Plus\n      \n\n\n\n\n\n\n        Get Certified\n      \n\n\n\n\n        For Teachers\n      \n\n\n\n\n        Spaces\n      \n\n\n\n\n\n\n        Plus\n      \n\n\n\n\n\n\nMy W3Schools\n\n\n\nTutorials\n\n\n\n          \xa0\n        \n\n\n\nExercises\n\n\n\n          \xa0\n        \n\n\n\nCertificates\n\n\n\n          \xa0\n        \n\n\n\nServices\n\n\n\n          \xa0\n        \n\n\nSpaces

In [None]:
# Turn the non-breaking space (\xa0) and the "×" character into plain spaces
# so that the words on either side stay separated.
text = text.replace('\xa0', ' ').replace('×', ' ')

# Drop characters from the Unicode private-use area (U+E000-U+F8FF),
# such as the \ue80b that shows up in the scraped text.
text = re.sub(r'[\uE000-\uF8FF]', '', text)

In [None]:
# Break the text into lines, trim each one, and keep only the non-empty results
lines = text.split('\n')
cleaned_lines = [trimmed for trimmed in map(str.strip, lines) if trimmed != '']

In [None]:
cleaned_lines

['W3Schools Online Web Tutorials',
 'Tutorials',
 'Exercises',
 'Certificates',
 'Services',
 'Menu',
 'Search field',
 'Sign In',
 '★',
 '+1',
 'Get Certified',
 'For Teachers',
 'Spaces',
 'Plus',
 'Get Certified',
 'For Teachers',
 'Spaces',
 'Plus',
 'My W3Schools',
 'Tutorials',
 'Exercises',
 'Certificates',
 'Services',
 'Spaces',
 'Get Certified',
 'Plus',
 'Academy',
 'Logout',
 'Tutorials',
 'Tutorials filter input',
 'HTML and CSS',
 'Learn',
 'HTML',
 'Tutorial',
 'Reference',
 'Learn',
 'CSS',
 'Tutorial',
 'Reference',
 'Learn',
 'RWD',
 'Tutorial',
 'Learn',
 'Bootstrap',
 'Overview',
 'Learn',
 'W3.CSS',
 'Tutorial',
 'Reference',
 'Learn',
 'Sass',
 'Tutorial',
 'Reference',
 'Learn',
 'Colors',
 'Tutorial',
 'Reference',
 'Learn',
 'Icons',
 'Tutorial',
 'Reference',
 'Learn',
 'SVG',
 'Tutorial',
 'Reference',
 'Learn',
 'Canvas',
 'Tutorial',
 'Reference',
 'Learn',
 'Graphics',
 'Tutorial',
 'Learn',
 'Character Sets',
 'Reference',
 'Learn',
 'How To',
 'Tutorial'

In [None]:
clean_text = ' '.join(cleaned_lines)

In [None]:
clean_text = clean_text.lower()

In [None]:
!pip install contractions


Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (118 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.3/118.3 kB[0m 

In [None]:
import contractions
# Small demo sentence. Note that this rebinds `text`, which previously held
# the text scraped from the web page.
text = "I can't do this. They're coming. She's been here."

# Expand contractions
# NOTE(review): in the printed result "She's been" becomes "She is been"
# (not "She has been"), so the expansion of 's is not context-aware.
expanded_text = contractions.fix(text)

print(expanded_text)

I cannot do this. They are coming. She is been here.


In [None]:
clean_text = contractions.fix(clean_text)

In [None]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


In [None]:
import emoji

text = """I'm happy 😊 but sometimes sad 😢.😀
😃
😄
😁
😆
😅
😂
🤣
🥲
🥹
☺️
😊
😇
🙂
🙃
😉
😌
😍
🥰
😘
😗
😙
😱
"""
converted = emoji.demojize(text, language='en')

print(converted)

I'm happy :smiling_face_with_smiling_eyes: but sometimes sad :crying_face:.:grinning_face:
:grinning_face_with_big_eyes:
:grinning_face_with_smiling_eyes:
:beaming_face_with_smiling_eyes:
:grinning_squinting_face:
:grinning_face_with_sweat:
:face_with_tears_of_joy:
:rolling_on_the_floor_laughing:
:smiling_face_with_tear:
:face_holding_back_tears:
:smiling_face:
:smiling_face_with_smiling_eyes:
:smiling_face_with_halo:
:slightly_smiling_face:
:upside-down_face:
:winking_face:
:relieved_face:
:smiling_face_with_heart-eyes:
:smiling_face_with_hearts:
:face_blowing_a_kiss:
:kissing_face:
:kissing_face_with_smiling_eyes: 
:face_screaming_in_fear:



In [None]:
clean_text = emoji.demojize(clean_text, language='en')

In [None]:
import nltk
nltk.download('stopwords')

In [None]:
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
stopwords.remove('not')

In [None]:
# NOTE(review): 'no' appears to be part of NLTK's English stopword list already
# (the preprocessing() output further down drops "no" without any append), so
# this only adds a duplicate entry. If the goal was to keep the negation, as is
# done for 'not', this should probably be a removal instead - confirm intent.
stopwords.append('no')

In [None]:
# Build a set once so every membership test is O(1) instead of a scan of the list
stopword_set = set(stopwords)
list_words_without_stopword = [word for word in clean_text.split(' ') if word not in stopword_set]

In [None]:
clean_text = ' '.join(list_words_without_stopword)

In [None]:
clean_text

'w3schools online web tutorials tutorials exercises certificates services menu search field sign ★ +1 get certified teachers spaces plus get certified teachers spaces plus w3schools tutorials exercises certificates services spaces get certified plus academy logout tutorials tutorials filter input html css learn html tutorial reference learn css tutorial reference learn rwd tutorial learn bootstrap overview learn w3.css tutorial reference learn sass tutorial reference learn colors tutorial reference learn icons tutorial reference learn svg tutorial reference learn canvas tutorial reference learn graphics tutorial learn character sets reference learn tutorial data analytics learn ai tutorial learn generative ai tutorial learn chatgpt-3.5 tutorial learn chatgpt-4 tutorial learn google bard tutorial learn machine learning tutorial learn dsa tutorial learn data science tutorial learn numpy tutorial learn pandas tutorial learn scipy tutorial learn matplotlib tutorial learn statistics tutoria

In [None]:
%time
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer
ps = PorterStemmer()

clean_text_words = []
for w in clean_text.split(' '):
  clean_text_words.append(ps.stem(w))

clean_text_words = ' '.join(clean_text_words)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.68 µs


In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
%time
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

clean_text_words = []
for w in clean_text.split(' '):
  clean_text_words.append(lemmatizer.lemmatize(w))

clean_text_words = ' '.join(clean_text_words)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.68 µs


In [None]:
clean_text_words

'w3schools online web tutorial tutorial exercise certificate service menu search field sign ★ +1 get certified teacher space plus get certified teacher space plus w3schools tutorial exercise certificate service space get certified plus academy logout tutorial tutorial filter input html cs learn html tutorial reference learn cs tutorial reference learn rwd tutorial learn bootstrap overview learn w3.css tutorial reference learn sas tutorial reference learn color tutorial reference learn icon tutorial reference learn svg tutorial reference learn canvas tutorial reference learn graphic tutorial learn character set reference learn tutorial data analytics learn ai tutorial learn generative ai tutorial learn chatgpt-3.5 tutorial learn chatgpt-4 tutorial learn google bard tutorial learn machine learning tutorial learn dsa tutorial learn data science tutorial learn numpy tutorial learn panda tutorial learn scipy tutorial learn matplotlib tutorial learn statistic tutorial learn excel tutorial le

In [None]:
!pip install unidecode
import nltk
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import pandas as pd
import unidecode
from nltk.tokenize import word_tokenize
def preprocessing(text: str, pos: str = "a") -> str:
    """Normalise one raw document into a space-separated string of lemmas.

    Steps, in order: strip HTML markup, turn emoji into their English names,
    replace accented characters via unidecode, expand contractions, replace
    every run of non-letters with a single space, lowercase, drop English
    stopwords (keeping "not"), tokenize, and lemmatize.

    Args:
        text: Raw document; may contain HTML tags, emoji, accents, digits
            and punctuation.
        pos: WordNet part-of-speech tag passed to the lemmatizer for every
            token. The default "a" (adjective) keeps the original behaviour;
            with it, plural nouns such as "scientists" are left unchanged.
            Pass "n" or "v" to lemmatize tokens as nouns or verbs instead.

    Returns:
        The cleaned, lowercased tokens joined by single spaces.
    """
    # Text parsing from HTML: keep only the text content, discard the tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Emoji -> ":english_name:" so the words survive the letters-only filter below
    text = emoji.demojize(text, language='en')

    # Accented character removal
    text = unidecode.unidecode(text)

    # Contraction fix (done while apostrophes are still present)
    text = contractions.fix(text)

    # Special character removal: each run of non-letters becomes one space
    text = re.sub('[^A-Za-z]+', ' ', text)

    # Lower case conversion
    text = text.lower()

    # Remove stopwords but keep "not" so negations are preserved. A set gives
    # O(1) lookups, and split() without an argument avoids empty tokens from
    # leading/trailing spaces. The local name avoids shadowing the
    # notebook-level `stopwords` list.
    stopword_set = set(nltk.corpus.stopwords.words('english'))
    stopword_set.discard('not')
    text = ' '.join(word for word in text.split() if word not in stopword_set)

    # Tokenize, then lemmatize every token with the requested part of speech
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token, pos=pos) for token in word_tokenize(text)]

    return ' '.join(lemmas)

In [None]:
corpus = """I can't believe it's already 9pm! 😱 Let's grab dinner at <b>Joe's Diner</b> before it closes.
Visit our website at https://www.example.com for more details. 😊
They've been working hard — you should've seen their faces when they won!
I LOVE this movie!!! <3 It's sooo good, better than "Titanic", don't you think?"""


In [None]:
corpus = [
    "MACHINE learning's amazing!! It's a subset of <b>AI</b> and trains modelsss 😎. ai",
    "deep learning is kinda like-- the 'cool kid' in machine Learning, ya know??",
    "Data scientists looove big data!! Check this 👉 https://datascience.com 💡💡",
    "AI, Machine-learning & Data--they're changing EVRYthing!!! <div>Truly.</div>",
    "To build a ML model, you're gonna need: clean data, good features & <script>code</script>!",

    # Restaurant domain (Noisy examples)
    "I can't belieeeeve they charged $25 for a burger!!! 😤 Totally not worth it!!",
    "The service was sooo slowww... waited 40 mins for a salad <br> and no apology 😒",
    "Uhh... the pasta's 'ok', but like, not 🔥🔥 ya feel me? Also, what's up with the music?!?"
]

In [None]:
# Run every raw document in the corpus through the cleaning pipeline
preprocessed = [preprocessing(doc) for doc in corpus]

In [None]:
preprocessed

['machine learning amazing subset ai trains modelsss smiling face sunglasses ai',
 'deep learning kind like cool kid machine learning ya know',
 'data scientists looove big data check backhand index pointing right https datascience com light bulb light bulb',
 'ai machine learning data changing evrything truly',
 'build ml model going need clean data good features',
 'can not belieeeeve charged burger face steam nose totally not worth',
 'service sooo slowww waited mins salad apology unamused face',
 'uhh pasta ok like not fire fire ya feel also music']

In [None]:
# One Hot Encoding

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# binary=True records only presence/absence (1/0) of each term in a document,
# not how many times it occurs
vectorize = CountVectorizer(binary = True)

In [None]:
X = vectorize.fit_transform(preprocessed)


In [None]:
df_onehot = pd.DataFrame(X.toarray(), columns = vectorize.get_feature_names_out())

In [None]:
df_onehot

Unnamed: 0,ai,also,amazing,apology,backhand,belieeeeve,big,build,bulb,burger,...,subset,sunglasses,totally,trains,truly,uhh,unamused,waited,worth,ya
0,1,0,1,0,0,0,0,0,0,0,...,1,1,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,1,...,0,0,1,0,0,0,0,0,1,0
6,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
7,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [None]:
Text to Number:

One hot Encoding:
Each word is encoded as a binary flag: 1 if it appears in the document, 0 otherwise.

Drawback:
1. Frequency of a word in document is not preserved
2. Frequency of a word in a corpus is not preserved
3. Order of words is not preserved
4. Context of a document is not preserved


In [None]:
# Bag of Words

In [None]:
# binary=False keeps the raw term counts per document (a term used twice gets 2).
# Note: this rebinds `vectorize` and `X` from the one-hot encoding cells.
vectorize = CountVectorizer(binary = False)
# Learn the vocabulary from the cleaned documents and build the document-term matrix
X = vectorize.fit_transform(preprocessed)
# Dense DataFrame with one column per vocabulary term, for display
df_bow = pd.DataFrame(X.toarray(), columns = vectorize.get_feature_names_out())
df_bow

Unnamed: 0,ai,also,amazing,apology,backhand,belieeeeve,big,build,bulb,burger,...,subset,sunglasses,totally,trains,truly,uhh,unamused,waited,worth,ya
0,2,0,1,0,0,0,0,0,0,0,...,1,1,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,1,0,1,0,2,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,1,...,0,0,1,0,0,0,0,0,1,0
6,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
7,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [None]:
Bag of Words:
Drawback:
1. Frequency of a word in a corpus is not preserved
2. Order of words is not preserved
3. Context of a document is not preserved

SyntaxError: invalid syntax (<ipython-input-209-4dc8cf3a4ec5>, line 1)

In [None]:
TF-IDF
Word Embeddings