# Final Pipeline for Text Data

## Objective:
The goal is to:
1. Combine all preprocessing steps into one pipeline
2. Apply it to the IMDb dataset
3. Save cleaned dataset for modeling

In [None]:
# Step 1: Import libraries

import os
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# NLTK data packages to fetch (needed on the first run only)
for resource in ("punkt", "stopwords", "wordnet", "omw-1.4", "punkt_tab"):
    nltk.download(resource)

In [None]:
# Step 2: Define preprocessing pipeline

# English stopword list held as a set, so the membership test in
# preprocess_text is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))
# Shared Porter stemmer, used when preprocess_text is called with use_stemming=True.
stemmer = PorterStemmer()
# Shared WordNet lemmatizer, used by preprocess_text otherwise (the default).
lemmatizer = WordNetLemmatizer()

def preprocess_text(text, use_stemming=False):
    """Normalize one raw review into a space-separated string of cleaned tokens.

    Steps, in order: lowercase, replace HTML line-break tags with a space,
    delete ASCII punctuation, split on whitespace, drop English stopwords,
    then stem or lemmatize every remaining token.

    Args:
        text: Raw review text (a ``str``).
        use_stemming: If True, reduce tokens with the module-level
            ``stemmer``; otherwise (default) use the module-level
            ``lemmatizer``.

    Returns:
        The processed tokens joined by single spaces; an empty string when
        no token survives the filtering.
    """
    import re  # local import so the function does not rely on the import cell

    # Lowercase
    text = text.lower()

    # Replace <br>, <br/> and <br /> with a space. Without this, deleting
    # "<", "/" and ">" below leaves a stray "br" token fused onto the
    # preceding word (e.g. "great.<br />" would become "greatbr").
    text = re.sub(r"<br\s*/?>", " ", text)

    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Tokenization: punctuation is already gone, so a plain whitespace
    # split is used instead of nltk's word_tokenize.
    tokens = text.split()

    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]

    # Stemming or Lemmatization
    if use_stemming:
        tokens = [stemmer.stem(t) for t in tokens]
    else:
        tokens = [lemmatizer.lemmatize(t) for t in tokens]

    return " ".join(tokens)  # join back into text

In [None]:
# Step 3: Load dataset (subset for demo, can increase later)

# Root of the extracted aclImdb dataset. Set the IMDB_DATA_DIR environment
# variable to point somewhere else; the original location is the fallback.
base_dir = os.environ.get("IMDB_DATA_DIR", "/Users/mimi/aclImdb")
# Join each path component separately so the separator suits the host OS.
train_pos_dir = os.path.join(base_dir, "train", "pos")
train_neg_dir = os.path.join(base_dir, "train", "neg")

def load_reviews(directory, label, limit=1000):
    """Read review files from ``directory`` and pair each text with ``label``.

    Entries are visited in sorted filename order so that the subset selected
    by ``limit`` is the same on every run (``os.listdir`` returns entries in
    arbitrary order). Anything that is not a regular file is skipped and does
    not count towards ``limit``.

    Args:
        directory: Path of the folder holding one review per file.
        label: Value attached to every review read from this folder.
        limit: Maximum number of reviews to read; ``None`` reads them all.

    Returns:
        A list of ``(review_text, label)`` tuples.
    """
    data = []
    for fname in sorted(os.listdir(directory)):
        if limit is not None and len(data) >= limit:
            break
        path = os.path.join(directory, fname)
        # Sub-directories and other non-files cannot be opened as text.
        if not os.path.isfile(path):
            continue
        with open(path, encoding="utf-8") as f:
            data.append((f.read(), label))
    return data

# Read up to 1000 reviews per class; label 1 marks positive, 0 negative.
pos_reviews = load_reviews(directory=train_pos_dir, label=1, limit=1000)
neg_reviews = load_reviews(directory=train_neg_dir, label=0, limit=1000)

# Positives first, then negatives, as (text, label) tuples.
all_data = [*pos_reviews, *neg_reviews]
df = pd.DataFrame(data=all_data, columns=["review", "label"])

print("Dataset shape:", df.shape)
df.head()


In [None]:
# Step 4: Apply preprocessing

# Clean every review with the lemmatizing variant of the pipeline; keyword
# arguments given to Series.apply are forwarded to preprocess_text.
df["cleaned_review"] = df["review"].apply(preprocess_text, use_stemming=False)

df.head(10)


In [None]:
# Step 5: Save cleaned dataset

# Write the original text, label and cleaned text to disk; the row index
# is left out of the file.
df.to_csv(path_or_buf="cleaned_imdb_reviews.csv", index=False)
print(" Cleaned dataset saved as cleaned_imdb_reviews.csv")


# After running this, you should have a CSV file with:
- ``review`` (original text)
- ``label`` (0=negative, 1=positive)
- ``cleaned_review`` (fully preprocessed text)