# Preprocessing (Stopwords, Stemming & Lemmatization)

## Objective
The goal is to extend the preprocessing pipeline by:
1. Removing stopwords
2. Applying stemming
3. Applying lemmatization

In [None]:
# Step 1: Import libraries

import os
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download resources (run once)
# NOTE: recent NLTK releases (3.9+) make word_tokenize load the 'punkt_tab'
# resource instead of the older pickled 'punkt' models, so both are fetched
# here to keep the notebook working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
# Step 2: Load sample review (same as before)

base_dir = "/Users/mimi/aclImdb"
train_pos_dir = os.path.join(base_dir, "train/pos")
train_neg_dir = os.path.join(base_dir, "train/neg")


def _read_first_review(review_dir):
    """Return the text of the first file that os.listdir reports in review_dir.

    The file is opened as UTF-8 inside a ``with`` block so the handle is
    closed as soon as the text has been read.

    Raises:
        IndexError: if ``review_dir`` contains no entries.
        OSError: if the directory or the file cannot be read.
    """
    # NOTE: os.listdir returns entries in arbitrary order, so "first" is
    # whatever the filesystem reports first, not the lowest-numbered review.
    first_name = os.listdir(review_dir)[0]
    with open(os.path.join(review_dir, first_name), encoding="utf-8") as fh:
        return fh.read()


sample_pos = _read_first_review(train_pos_dir)
sample_neg = _read_first_review(train_neg_dir)

# Show only the first 300 characters of each raw review as a preview.
print(" Raw Positive Review:\n", sample_pos[:300])
print("\n Raw Negative Review:\n", sample_neg[:300])


In [None]:
# Step 3: Define preprocessing functions

# Shared module-level resources used by the preprocessing functions.
# The stopword list is unpacked into a set so membership tests are hash lookups.
stop_words = {*stopwords.words('english')}
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Matches HTML tags (e.g. "<br />" line breaks) in already-lowercased text.
# A letter is required right after "<" or "</" so plain text such as "<3"
# is not mistaken for a tag.
_HTML_TAG_RE = re.compile(r"</?[a-z][^<>]*>")

# Translation table mapping every ASCII punctuation character to a space.
# Replacing (rather than deleting) punctuation keeps neighbouring words apart:
# "end.the" -> "end the" instead of "endthe".
_PUNCT_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def preprocess_with_stopwords(text):
    """Lowercase, clean and tokenize ``text``, then drop English stopwords.

    Steps, in order:
      1. lowercase the text;
      2. replace HTML tags with a space;
      3. replace ASCII punctuation with spaces;
      4. tokenize with ``word_tokenize``;
      5. remove tokens found in the module-level ``stop_words`` set.

    Punctuation is turned into spaces instead of being deleted so that words
    separated only by punctuation or markup are not glued together, and so
    that contractions split into pieces ("don't" -> "don", "t") which the
    stopword filter can then match, rather than surviving as "dont".

    Args:
        text: Raw document string.

    Returns:
        List of lowercase tokens with stopwords removed. Non-ASCII punctuation
        is not stripped and may still appear as tokens.
    """
    # Lowercase
    text = text.lower()
    # Remove HTML markup before punctuation handling, otherwise "<br />"
    # would leave a stray "br" token behind.
    text = _HTML_TAG_RE.sub(" ", text)
    # Replace punctuation with spaces
    text = text.translate(_PUNCT_TO_SPACE)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    return [t for t in tokens if t not in stop_words]

def apply_stemming(tokens):
    """Return a new list holding the stem of every token, in input order.

    Each token is passed to the ``stem`` method of the module-level
    ``stemmer`` object.
    """
    stem = stemmer.stem
    return list(map(stem, tokens))

def apply_lemmatization(tokens, pos="n"):
    """Lemmatize every token with the module-level ``lemmatizer``.

    Args:
        tokens: Iterable of token strings.
        pos: Part-of-speech tag forwarded to every ``lemmatize`` call.
            Defaults to "n" (noun), which is what ``lemmatize`` uses when no
            tag is given, so existing callers get the same results. Pass "v"
            to reduce verb forms (e.g. "running" -> "run"), "a" for
            adjectives, or "r" for adverbs.

    Returns:
        A new list of lemmas, in input order.
    """
    return [lemmatizer.lemmatize(t, pos=pos) for t in tokens]

In [None]:
# Step 4: Apply preprocessing to positive review

# Clean the positive sample once, then derive the stemmed and lemmatized views
# from the same stopword-filtered token list.
tokens_pos = preprocess_with_stopwords(sample_pos)
tokens_pos_stem, tokens_pos_lemma = (
    apply_stemming(tokens_pos),
    apply_lemmatization(tokens_pos),
)

# Print the first 30 tokens of each variant for a side-by-side comparison.
for _label, _tokens in (
    (" Tokens (no stopwords):", tokens_pos),
    (" Stemmed tokens:", tokens_pos_stem),
    (" Lemmatized tokens:", tokens_pos_lemma),
):
    print(_label, _tokens[:30])

In [None]:
# Step 5: Apply preprocessing to negative review

# Clean the negative sample once, then derive the stemmed and lemmatized views
# from the same stopword-filtered token list.
tokens_neg = preprocess_with_stopwords(sample_neg)
tokens_neg_stem, tokens_neg_lemma = (
    apply_stemming(tokens_neg),
    apply_lemmatization(tokens_neg),
)

# Print the first 30 tokens of each variant for a side-by-side comparison.
for _label, _tokens in (
    (" Tokens (no stopwords):", tokens_neg),
    (" Stemmed tokens:", tokens_neg_stem),
    (" Lemmatized tokens:", tokens_neg_lemma),
):
    print(_label, _tokens[:30])