# NLP Preprocessing Basics

This notebook demonstrates fundamental NLP preprocessing techniques using NLTK and spaCy.

## Install Dependencies

In [None]:
!pip install nltk spacy
!python -m spacy download en_core_web_sm

## 1. Tokenization

Tokenization is the process of breaking text into individual words or tokens.

In [None]:
import nltk
import spacy

# Download the NLTK tokenizer models.
# Older NLTK releases load the 'punkt' resource in word_tokenize, while
# NLTK 3.9 and later load 'punkt_tab' instead. Requesting both keeps this cell
# working regardless of which NLTK version the install step pulled in.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

# Sample text
text = "Hello world! This is an example of tokenization in NLP."

# NLTK tokenization: returns a plain list of strings.
nltk_tokens = nltk.word_tokenize(text)
print("NLTK Tokens:", nltk_tokens)

# spaCy tokenization: run the pipeline once, then read the text of each token.
# `nlp` is reused by the later cells, so the model is loaded only here.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
spacy_tokens = [token.text for token in doc]
print("spaCy Tokens:", spacy_tokens)

## 2. Stemming & Lemmatization

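Stemming and lemmatization both reduce words to a base form, but in different ways: stemming strips suffixes with heuristic rules and may produce non-words (e.g., "easily" → "easili"), while lemmatization uses a vocabulary and the word's part of speech to return its dictionary form, the lemma (e.g., "ran" → "run").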

In [None]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download WordNet for lemmatization
nltk.download('wordnet')

# Initialize stemmer and lemmatizer (both are reused by later cells)
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Example words
words = ["running", "runs", "ran", "easily", "fairly", "computers", "better", "was", "is"]

# Stemming: rule-based suffix stripping, one word at a time.
stemmed_words = [stemmer.stem(word) for word in words]
print("Original words:", words)
print("Stemmed words:", stemmed_words)

# Lemmatization with the default part of speech.
# WordNetLemmatizer.lemmatize() treats every word as a noun unless `pos` is
# given, so verb forms such as "running" or "ran" are not reduced to "run".
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
print("Lemmatized words:", lemmatized_words)

# Same words lemmatized as verbs, to show how much the result depends on the
# part of speech supplied to the lemmatizer.
lemmatized_as_verbs = [lemmatizer.lemmatize(word, pos="v") for word in words]
print("Lemmatized words (pos='v'):", lemmatized_as_verbs)

# spaCy lemmatization: the pipeline assigns a part of speech to each token
# itself, so no explicit POS argument is needed.
doc = nlp(" ".join(words))
spacy_lemmas = [token.lemma_ for token in doc]
print("spaCy lemmas:", spacy_lemmas)

## 3. Stopword Removal

Stopwords are common words that often don't contribute much meaning (e.g., "the", "is", "at").

In [None]:
from nltk.corpus import stopwords

# Fetch the stopword lists, then build a set for fast membership tests.
nltk.download('stopwords')
nltk_stopwords = set(stopwords.words('english'))

# Sentence used by both libraries below.
sentence = "This is an example sentence showing stopword removal in action."

# --- NLTK ---
# Tokenize the lower-cased sentence and keep only tokens absent from the
# stopword set.
nltk_tokens = nltk.word_tokenize(sentence.lower())
nltk_filtered = list(filter(lambda tok: tok not in nltk_stopwords, nltk_tokens))
print("Original sentence:", sentence)
print("After NLTK stopword removal:", nltk_filtered)

# --- spaCy ---
# Each token carries an `is_stop` flag; keep the text of tokens where it is
# false.
doc = nlp(sentence)
spacy_filtered = [tok.text for tok in doc if not tok.is_stop]
print("After spaCy stopword removal:", spacy_filtered)

## 4. Part-of-Speech (POS) Tagging

POS tagging identifies the grammatical parts of speech in text.

In [None]:
# Download NLTK resources for POS tagging.
# Older NLTK releases load 'averaged_perceptron_tagger' in pos_tag, while
# NLTK 3.9 and later load 'averaged_perceptron_tagger_eng'. Requesting both
# keeps this cell working with either version.
for resource in ("averaged_perceptron_tagger", "averaged_perceptron_tagger_eng"):
    nltk.download(resource)

# Sample sentence
sentence = "The quick brown fox jumps over the lazy dog."

# NLTK POS tagging: tokenize first, then tag the token list.
# The result is a list of (token, tag) tuples.
tokens = nltk.word_tokenize(sentence)
nltk_pos_tags = nltk.pos_tag(tokens)
print("NLTK POS Tags:", nltk_pos_tags)

# spaCy POS tagging: the pipeline tags tokens as part of nlp(); `pos_` is the
# string form of each token's part-of-speech attribute.
doc = nlp(sentence)
spacy_pos_tags = [(token.text, token.pos_) for token in doc]
print("spaCy POS Tags:", spacy_pos_tags)

## 5. Named Entity Recognition (NER)

NER identifies entities like people, organizations, locations, etc. in text.

In [None]:
from spacy import displacy

# Text containing an organization, a place, a person and two dates.
text = "Apple Inc. is planning to open a new store in New York City next month. CEO Tim Cook made the announcement yesterday."

# Run the spaCy pipeline; recognized entity spans are exposed on `doc.ents`.
doc = nlp(text)

# List every entity with its label and spaCy's description of that label.
print("Named Entities:")
entities = doc.ents
for ent in entities:
    print(f"- {ent.text} ({ent.label_}): {spacy.explain(ent.label_)}")

# Render the entities inline with displaCy (requires a Jupyter front end).
displacy.render(doc, style="ent", jupyter=True)

## 🚀 Challenge

Take a paragraph from Wikipedia, preprocess it, and compare results between NLTK and spaCy.

In [None]:
# Sample Wikipedia paragraph
wiki_text = """
Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence 
concerned with the interactions between computers and human language, in particular how to program computers 
to process and analyze large amounts of natural language data. The goal is a computer capable of understanding 
the contents of documents, including the contextual nuances of the language within them.
"""

# Your preprocessing code here
# 1. Tokenize
# 2. Remove stopwords
# 3. Lemmatize
# 4. Compare results

# NLTK processing: tokenize, lower-case each token once, drop stopwords and
# non-alphabetic tokens, then lemmatize.
# The alphabetic-only test (str.isalpha) mirrors spaCy's `is_alpha` below so
# both pipelines filter tokens by the same criterion and the set comparison is
# like-for-like.
# Relies on `nltk_stopwords` and `lemmatizer` created in the earlier cells.
nltk_tokens = nltk.word_tokenize(wiki_text)
nltk_filtered = [
    word
    for word in (token.lower() for token in nltk_tokens)
    if word.isalpha() and word not in nltk_stopwords
]
nltk_lemmatized = [lemmatizer.lemmatize(word) for word in nltk_filtered]

# spaCy processing: one pipeline pass provides tokens, stopword flags and
# lemmas; keep the lower-cased lemma of alphabetic, non-stopword tokens.
doc = nlp(wiki_text)
spacy_filtered = [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha]

print("NLTK processed tokens (first 20):", nltk_lemmatized[:20])
print("spaCy processed tokens (first 20):", spacy_filtered[:20])

# Compare unique tokens produced by the two pipelines.
nltk_set = set(nltk_lemmatized)
spacy_set = set(spacy_filtered)

print(f"\nUnique tokens in NLTK: {len(nltk_set)}")
print(f"Unique tokens in spaCy: {len(spacy_set)}")
print(f"Tokens in NLTK but not in spaCy: {len(nltk_set - spacy_set)}")
print(f"Tokens in spaCy but not in NLTK: {len(spacy_set - nltk_set)}")