#Lowercasing

In [19]:
# Simple example of lowercasing text in Python

text = "General Motors is BIGGER than general motors."

print("Original:", text)
print("Lower:", text.lower())       # basic lowercase
print("Casefold:", text.casefold()) # more aggressive: Unicode case folding for caseless matching
# Tip: .casefold() is better for multilingual text
# because it handles more Unicode cases (e.g., German “ß” → “ss”).
# Note: it does NOT strip accents (“é” stays “é”); accent removal needs
# unicodedata-based normalization instead.


Original: General Motors is BIGGER than general motors.
Lower: general motors is bigger than general motors.
Casefold: general motors is bigger than general motors.


#HTML Strip

In [20]:
import re, html

In [21]:
# Pure stdlib (regex + unescape) — tiny, but brittle on messy HTML
# (a '>' inside an attribute value cuts a tag short, and <script>/<style> text is kept).
s = '<p>Hello <b>world</b> &amp; <a href="#">friends</a>!</p>'
# Strip tags BEFORE decoding entities, so escaped markup such as &lt;b&gt;
# is kept as literal text instead of being removed as a tag.
text = re.sub(r'<[^>]+>', '', s)        # strip tags
text = html.unescape(text).strip()      # decode entities like &amp; -> &
print(text)  # Hello world & friends!

Hello world & friends!


In [22]:
from bs4 import BeautifulSoup

In [23]:
s = '<p>Hello <b>world</b> &amp; <a href="#">friends</a>! <script>alert(1)</script></p>'
soup = BeautifulSoup(s, 'html.parser')
# Drop non-visible content explicitly: older Beautiful Soup releases include
# <script>/<style> text in get_text(), so do not rely on the installed version.
for hidden in soup(['script', 'style']):
    hidden.decompose()
# Each text node is stripped and the pieces are joined with " ",
# which is why a space appears before the "!" that follows </a>.
text = soup.get_text(" ", strip=True)
print(text)  # Hello world & friends !

Hello world & friends !


#Strip Punctuation

In [24]:
import string

In [25]:
# Build a deletion table for every ASCII punctuation character, then apply it.
text = "Hello, world! NLP is fun... right?"
punct_table = str.maketrans('', '', string.punctuation)
clean = text.translate(punct_table)
print(clean)  # Hello world NLP is fun right

Hello world NLP is fun right


#Stemming

In [26]:
from nltk.stem import PorterStemmer

In [27]:
# Porter stemming strips suffixes by rule, so the stems need not be
# dictionary words (see 'easili' and 'fairli' in the output).
words = ["running", "runner", "runs", "easily", "fairly"]
stemmer = PorterStemmer()

stems = list(map(stemmer.stem, words))
print(stems)  # ['run', 'runner', 'run', 'easili', 'fairli']

['run', 'runner', 'run', 'easili', 'fairli']


#Lemmatizing

In [28]:
from nltk.stem import WordNetLemmatizer

In [29]:
import nltk
# Fetch the WordNet corpus used by the lemmatizer; when the package is already
# present NLTK only reports that it is up to date and returns True.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [31]:
# lemmatize() is called without a part-of-speech tag, so every word is looked
# up as a noun (NLTK's default, pos='n'). That is why the verb form "running"
# and the adjective "better" come back unchanged while "geese" becomes "goose".
words = ["running", "runs", "better", "geese"]
lemmatizer = WordNetLemmatizer()

lemmas = list(map(lemmatizer.lemmatize, words))
print(lemmas)  # ['running', 'run', 'better', 'goose']

['running', 'run', 'better', 'goose']


#Stopwords

#

In [39]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# NLTK ships its models and word lists as separate data packages; fetch them once.
nltk.download('punkt_tab')  # tokenizer data (presumably what word_tokenize relies on — confirm for your NLTK version)
nltk.download('stopwords')  # stop-word lists read by stopwords.words(...)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [40]:
text = "This is a simple example showing off stop word filtration."
tokens = word_tokenize(text)

# Load the stop-word list once and keep it in a set: calling
# stopwords.words("english") inside the comprehension re-reads the list for
# every token and scans it linearly on each membership test.
stop_words = set(stopwords.words("english"))

# Compare the lowercased token so capitalized stop words ("This") are removed
# too; punctuation tokens such as "." are not stop words and are kept.
filtered = [w for w in tokens if w.lower() not in stop_words]
print(filtered)
# ['simple', 'example', 'showing', 'stop', 'word', 'filtration', '.']

['simple', 'example', 'showing', 'stop', 'word', 'filtration', '.']


#Sentence Segmentation

In [53]:
from nltk.tokenize import sent_tokenize

# Three short sentences ending in '.' and '!' to be split apart.
text = "The cat sat on the mat. The dog barked loudly. And then it ran away!"
sentences = sent_tokenize(text)  # list with one string per sentence, terminator kept

print(sentences)

['The cat sat on the mat.', 'The dog barked loudly.', 'And then it ran away!']


#Tokenization

**Simple whitespace tokenizer**

In [54]:
# 1. Whitespace tokenization
# Example sentence
text = "The runners were running quickly in the U.S.A."
# split() with no arguments cuts on whitespace only, so punctuation stays
# attached to its word (the last token is "U.S.A." including the final period).
tokens_ws = text.split()
print("Whitespace:", tokens_ws)

Whitespace: ['The', 'runners', 'were', 'running', 'quickly', 'in', 'the', 'U.S.A.']


**word_tokenize in NLTK** first splits the text into sentences (Punkt) and then tokenizes each sentence with an improved TreebankWordTokenizer, which follows the Penn Treebank conventions.

It applies a series of regular-expression rules to split text into tokens.

What it does step by step

* Whitespace splitting → starts by splitting on spaces.

* Punctuation separation → separates most punctuation from words. "Let's go!" → ["Let", "'s", "go", "!"]

*  Contractions & clitics → splits common English contractions. "can't" → ["ca", "n't"], "he'll" → ["he", "'ll"]

* Special cases → keeps some tokens intact. Abbreviations such as "U.S.A." stay in one piece, although a sentence-final period is split off ("U.S.A", "."). Numbers with decimals "3.14" stay whole.

* Quotes handling → normalizes quotation marks to opening/closing forms.

In [45]:
# 2. NLTK word_tokenize
from nltk.tokenize import word_tokenize

# Same example sentence as the whitespace tokenizer, for a side-by-side comparison.
text = "The runners were running quickly in the U.S.A."
tokens_nltk = word_tokenize(text)
print("NLTK:", tokens_nltk)

NLTK: ['The', 'runners', 'were', 'running', 'quickly', 'in', 'the', 'U.S.A', '.']


**BPE (Byte Pair Encoding) tokenization**

* Start with characters as the smallest units.

* Find the most frequent pair of symbols (letters, or subwords) in the text.

* Merge that pair into a new token.

* Repeat until you reach the desired vocabulary size.

In [51]:
from transformers import GPT2Tokenizer

# Load the pretrained GPT-2 tokenizer (fetched on first use, then cached locally).
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Same example sentence as the other tokenizers; tokenize() returns the
# subword strings rather than integer ids.
text = "The runners were running quickly in the U.S.A."
bpe_tokens = tokenizer.tokenize(text)
print(bpe_tokens)

['The', 'Ġrunners', 'Ġwere', 'Ġrunning', 'Ġquickly', 'Ġin', 'Ġthe', 'ĠU', '.', 'S', '.', 'A', '.']


**What does Ġ mean in BPE?**

* Ġ = marker for a space before a word

* Example: "Hello world" → ['Hello', 'Ġworld']

* "world" at start of sentence → ['world']

* Keeps spaces explicit so the model can:

  * Reconstruct the original text

  * Distinguish "world" vs. "Ġworld"

* (Other tokenizers use different markers — e.g. SentencePiece uses ▁ for space.)

#Bag of Words

In [52]:
from sklearn.feature_extraction.text import CountVectorizer

# Two tiny documents that share most of their words.
docs = ["The cat sat on the mat.", "The dog sat on the log."]

# Learn the vocabulary and count term occurrences in a single pass.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)

# One row per document, one column per vocabulary term (same order as printed).
vocabulary = vectorizer.get_feature_names_out()
bow_matrix = X.toarray()
print("Vocabulary:", vocabulary)
print("BoW Matrix:\n", bow_matrix)

Vocabulary: ['cat' 'dog' 'log' 'mat' 'on' 'sat' 'the']
BoW Matrix:
 [[1 0 0 1 1 1 2]
 [0 1 1 0 1 1 2]]
