#### Installing NLTK and Downloading Corpora

In [2]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sanjaymahto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Tokenization: Breaking Text into Words and Sentences
- Sentence tokenization cuts the text into sentences, like slicing bread.

- Word tokenization cuts each sentence into words, like dicing vegetables.

In [3]:
from nltk.tokenize import word_tokenize
text = "I am learning Natural Language Processing!"
word_tokenize(text)


['I', 'am', 'learning', 'Natural', 'Language', 'Processing', '!']

#### Stopwords Removal
Analogy: Think of stopwords as background noise in a conversation: “the,” “is,” “in.” They’re useful in grammar but don’t usually add deep meaning 

in analysis. Removing them is like tuning out small talk to focus on the key ideas.

In [4]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
filtered_words = [w for w in word_tokenize(text) if w.lower() not in stop_words]


#### Stemming and Lemmatization
- Analogy: If words were clothes, stemming is like chopping off sleeves to make everything short-sleeved (aggressive but quick), while lemmatization is like ironing and folding each shirt carefully (precise and context-aware).

- Stemming: running, runs, ran → run (by brute force)

- Lemmatization: Uses grammar and vocabulary rules to return the base or dictionary form of a word.

In [7]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
stemmer.stem('running')  # returns 'run'



'run'

In [6]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('running', pos='v')  # returns 'run'

'run'

#### Mini Project Idea: Text Cleaner


In [8]:
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

# Setup
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def clean_text(text):
    # Lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize
    words = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word, pos='v') for word in words]
    
    return words

# Test
clean_text("The striped bats are hanging on their feet for best")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sanjaymahto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sanjaymahto/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/sanjaymahto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['strip', 'bat', 'hang', 'feet', 'best']