# Design and Apply a Custom Text Cleaning Pipeline for Two Contrasting Categories and Analyze Them

## Data Acquisition (load data)

In [8]:
# import main libraries and classes 

import re
import nltk

import numpy as np
import pandas as pd

from sklearn.datasets import fetch_20newsgroups

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns

# Download the NLTK resources this notebook relies on.
# 'punkt_tab' is requested in addition to 'punkt' because newer NLTK releases
# load the tokenizer models used by word_tokenize from 'punkt_tab', while
# older releases still read 'punkt'. Without it, word_tokenize raises a
# LookupError on a fresh installation of a recent NLTK.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\myacoubalex\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\myacoubalex\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\myacoubalex\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\myacoubalex\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [19]:
# Choose the two newsgroups to contrast and fetch their training posts.
# Headers, footers and quoted replies are stripped by the loader; random_state
# is fixed so the returned document order is the same on every run.
categories = ['comp.graphics', 'sci.space']
data = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
    random_state=2025,
)

# Quick exploration, one item per output line:
#   container type, number of documents, category names,
#   first 5 labels, first 500 characters of the first document.
print(
    type(data),
    len(data.data),
    data.target_names,
    data.target[:5],
    data.data[0][:500],
    sep='\n',
)

<class 'sklearn.utils._bunch.Bunch'>
1177
['comp.graphics', 'sci.space']
[0 1 0 1 1]
Have you considered the Apple Laserwriter IIg. We use it for all our B&W
image printing.



In [26]:
# Filter and collect up to N_DOCS_PER_CATEGORY documents per category.
# The numeric label of each category is looked up from data.target_names
# instead of being hard-coded as 0/1, so the selection stays correct if the
# `categories` list is edited or extended.
N_DOCS_PER_CATEGORY = 100
graphics_label = data.target_names.index('comp.graphics')
space_label = data.target_names.index('sci.space')

graphics_docs = [
    doc for doc, label in zip(data.data, data.target) if label == graphics_label
][:N_DOCS_PER_CATEGORY]
space_docs = [
    doc for doc, label in zip(data.data, data.target) if label == space_label
][:N_DOCS_PER_CATEGORY]

print(len(graphics_docs))
print(len(space_docs))
print(type(graphics_docs))
print(graphics_docs[:3])

100
100
<class 'list'>
['Have you considered the Apple Laserwriter IIg. We use it for all our B&W\nimage printing.\n', "\nI've got the 6.0 spec (obviously since I quoted it in my last posting). \nMy gripe about TIFF is that it's far too complicated and nearly\ninfinitely easier to write than to read, which I think hurts your\nacceptance by anything that will need to read those images (e.g.,\npaint programs).\n\nIn a nutshell, I don't think TIFF is salvageable unless the fat is\ntrimmed significantly- and then it wouldn't be TIFF anymore.  They\nkeep trying to cut it back, but it's late now.  Maybe they >will< fix it,\nand change that magic number to signify the lack of compatibility. \nThat would probably make me happy.", '\n\nFirst, thanks to all who replied to my original question.']


In [28]:
# data cleaning by Regex
def clean_text(text):
    """Return `text` with every character that is not an ASCII letter or whitespace deleted.

    Characters are removed outright rather than replaced with a space, so
    letters separated only by punctuation or digits end up joined
    (for example 'B&W' becomes 'BW').
    """
    non_letter_or_space = re.compile(r'[^a-zA-Z\s]')
    return non_letter_or_space.sub('', text)

# Run the cleaner over every document of both categories.
cleaned_space = list(map(clean_text, space_docs))
cleaned_graphics = list(map(clean_text, graphics_docs))

In [35]:
# Check the cleaning: raw document, a separator line, then its cleaned version.
print(space_docs[2], "-"*30, cleaned_space[2], sep="\n")

+++
++Once inflated the substance was no longer
++needed since there is nothing to cause the balloon to collapse.
++This inflatable structure could suffer multiple holes with no 
++disastrous deflation.
+
+preasure (and the internal preasure that was needed to maintain
+a spherical shape against this resistance) caused them to
+catastrophically deflated.  The large silvered shards
+
+The billboard should pop like a dime store balloon.

No, you're wrong about this. Give me some time to get my references.


------------------------------

Once inflated the substance was no longer
needed since there is nothing to cause the balloon to collapse
This inflatable structure could suffer multiple holes with no 
disastrous deflation

preasure and the internal preasure that was needed to maintain
a spherical shape against this resistance caused them to
catastrophically deflated  The large silvered shards

The billboard should pop like a dime store balloon

No youre wrong about this Give me some ti

## Preprocessing

### Tokenization

In [57]:
from nltk.tokenize import word_tokenize

def tokenize_docs(doc_list):
    """
    Split every document into word tokens with nltk's word_tokenize.

    Parameters:
    - doc_list: List of text documents (strings)

    Returns:
    - List with one token list per document, in the same order as the input
    """
    tokenized = []
    for doc in doc_list:
        tokenized.append(word_tokenize(doc))
    return tokenized

# Tokenize both cleaned collections with the same helper.
space_tokens, graphics_tokens = (
    tokenize_docs(docs) for docs in (cleaned_space, cleaned_graphics)
)

# First sci.space document tokenized
print(space_tokens[0])

['The', 'European', 'Space', 'Agency', 'has', 'involvement', 'with', 'remote', 'earth', 'observation', 'and', 'I', 'presume', 'this', 'includes', 'surveillance', 'optical', 'etc', 'So', 'its', 'not', 'just', 'the', 'USUSSRex', 'who', 'are', 'in', 'the', 'game', 'But', 'what', 'is', 'the', 'game', 'What', 'can', 'be', 'done', 'with', 'space', 'observation', 'The', 'military', 'functions', 'of', 'missile', 'spotting', 'troop', 'spotting', 'etc', 'are', 'well', 'documented', 'but', 'what', 'about', 'anything', 'else', 'The', 'biggest', 'eg', 'I', 'can', 'think', 'of', 'is', 'to', 'get', 'a', 'metal', 'sensing', 'sat', 'over', 'a', 'paying', 'country', 'and', 'scan', 'their', 'territory', 'for', 'precious', 'metals', 'More', 'importantly', 'if', 'radar', 'can', 'spot', 'water', 'vapour', 'clouds', 'presumably', 'a', 'radar', 'based', 'sat', 'will', 'be', 'capable', 'of', 'spotting', 'riversopen', 'water', 'and', 'underground', 'water', 'from', 'space', 'This', 'would', 'be', 'a', 'positive

###  Case Folding (to lower)

In [41]:
def lowercase_tokens(docs):
    """
    Return a lowercased copy of every token in every document.

    Parameters:
    - docs: List of token lists (e.g., [['NASA', 'launched'], ['Space', 'Station']])

    Returns:
    - New list of token lists with each token converted via str.lower();
      the input lists are not modified
    """
    lowered_docs = []
    for doc in docs:
        lowered_docs.append([token.lower() for token in doc])
    return lowered_docs

# Case-fold both tokenized collections.
lower_space_tokens, lower_graphics_tokens = (
    lowercase_tokens(docs) for docs in (space_tokens, graphics_tokens)
)

# Same document before and after lowercasing, one per line.
print(space_tokens[3], lower_space_tokens[3], sep="\n")

['How', 'ironic']
['how', 'ironic']


###  Remove StopWords

In [45]:
# English stopword list from NLTK, stored as a set; remove_stopwords tests membership against it.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """
    Drop every token that appears in the module-level `stop_words` set.

    Parameters:
    - text: a list of words (tokens) for a single document

    Returns:
    - A new list with the remaining tokens in their original order
    """
    return list(filter(lambda token: token not in stop_words, text))

# Filter stopwords document by document for both categories.
filtered_space_tokens = list(map(remove_stopwords, lower_space_tokens))
filtered_graphics_tokens = list(map(remove_stopwords, lower_graphics_tokens))

# Same document before and after stopword removal, one per line.
print(lower_space_tokens[4], filtered_space_tokens[4], sep="\n")

['pat', 'sez', 'yeah', 'but', 'a', 'windscreen', 'cut', 'down', 'most', 'of', 'it', 'canopies', 'ended', 'it', 'completely', 'of', 'course', 'the', 'environment', 'in', 'space', 'continues', 'to', 'suck', 'tommy', 'mac', 'tom', 'mcwilliams', 'wk', 'as', 'the', 'radius', 'of', 'vision', 'increases', 'tmibmclmsuedu', 'hm', 'the', 'circumference', 'of', 'mystery', 'grows']
['pat', 'sez', 'yeah', 'windscreen', 'cut', 'canopies', 'ended', 'completely', 'course', 'environment', 'space', 'continues', 'suck', 'tommy', 'mac', 'tom', 'mcwilliams', 'wk', 'radius', 'vision', 'increases', 'tmibmclmsuedu', 'hm', 'circumference', 'mystery', 'grows']


### Lemmatization (reducing words to their base forms)

In [46]:
# Single WordNet lemmatizer instance, used by lemmatize_tokens for every token.
lemmatizer = WordNetLemmatizer()
def lemmatize_tokens(tokens_list):
    """
    Lemmatize every token of every document with the module-level `lemmatizer`.

    Each token is passed to lemmatizer.lemmatize() on its own, without a
    part-of-speech argument.

    Parameters:
    - tokens_list: list of token lists (one list per document)

    Returns:
    - New list of token lists holding the lemmatized tokens, in input order
    """
    lemmatized_docs = []
    for doc in tokens_list:
        lemmas = []
        for word in doc:
            lemmas.append(lemmatizer.lemmatize(word))
        lemmatized_docs.append(lemmas)
    return lemmatized_docs

# Lemmatize the stopword-filtered tokens of both categories.
lemm_space_tokens, lemm_graphics_tokens = (
    lemmatize_tokens(docs)
    for docs in (filtered_space_tokens, filtered_graphics_tokens)
)


### Most Frequent Words in Each Category

In [55]:
def most_frequent_words(docs, top_n=10):
    """
    Counts most frequent words in a list of tokenized documents.

    Parameters:
    - docs: List of token lists (e.g. [['nasa', 'space'], ['space', 'rocket']])
    - top_n: Number of top frequent words to return

    Returns:
    - Pandas Series (index: word, value: count, dtype int64) holding the
      top N words in descending order of count. Words with equal counts
      keep the order in which they were first encountered. An empty input
      yields an empty int64 Series.
    """
    word_counts = Counter()
    for doc in docs:
        word_counts.update(doc)

    # Counter.most_common orders ties by first occurrence, so the result does
    # not depend on the tie-breaking of an unstable sort.
    top_pairs = word_counts.most_common(top_n)

    # dict() keeps the ranked order; the explicit dtype keeps the result
    # integer-typed even when there are no words at all.
    return pd.Series(dict(top_pairs), dtype='int64')


# Compute both rankings first, then print each under its heading.
space_top_words = most_frequent_words(lemm_space_tokens)
graphics_top_words = most_frequent_words(lemm_graphics_tokens)

print("Top words in sci.space:", space_top_words, sep="\n")
print("\nTop words in comp.graphics:", graphics_top_words, sep="\n")

Top words in sci.space:
space      160
would       79
nasa        77
data        65
system      64
lunar       55
also        54
one         51
shuttle     51
mission     50
dtype: int64

Top words in comp.graphics:
image      48
would      40
compass    35
point      31
system     29
file       28
opcols     28
graphic    27
thanks     27
color      26
dtype: int64


---

### Do they use different vocabularies? 
- If we look at the most frequent words in each category, we can conclude that they use different vocabularies focused on different domains.

### Are there words that clearly define one category more than the other?

In [56]:
# Turn each category's top-word ranking into a set of words.
space_top = set(most_frequent_words(lemm_space_tokens).index)
graphics_top = set(most_frequent_words(lemm_graphics_tokens).index)

# Words shared by both rankings, and words that appear in only one of them.
common_words = space_top.intersection(graphics_top)
unique_space = space_top.difference(graphics_top)
unique_graphics = graphics_top.difference(space_top)

print("Common words:", common_words)
print("\nUnique sci.space words:", unique_space)
print("\nUnique comp.graphics words:", unique_graphics)

Common words: {'would', 'system'}

Unique sci.space words: {'space', 'shuttle', 'one', 'also', 'nasa', 'mission', 'lunar', 'data'}

Unique comp.graphics words: {'thanks', 'image', 'color', 'graphic', 'compass', 'file', 'point', 'opcols'}


**Critical Reflection**

- How does preprocessing help reveal the “essence” of each topic?

Preprocessing plays a vital role in revealing the “essence” of each topic by cleaning and simplifying raw text into meaningful representations. By removing noise such as punctuation, stopwords, and casing inconsistencies, and applying techniques like lemmatization, we reduce redundancy and highlight the true semantic content. This makes topic-specific patterns and keywords more visible and interpretable.
    
    
- What might go wrong if we skipped any of the steps?

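If any step is skipped, the analysis can become inaccurate or misleading. For example, without lowercasing, words like “NASA” and “nasa” would be treated as different, distorting frequency counts. Without lemmatization, inflected forms like “images” and “image” wouldn't be grouped, weakening the signal. (Note that the lemmatizer here is called without part-of-speech tags, so it treats every token as a noun: verb forms such as “considered” or “printing” are left unchanged, as the sample output shows. Grouping “running” and “ran” would require passing POS tags.) If stopwords are not removed, common words like “the” or “is” may overwhelm more meaningful ones. Therefore, each step in the preprocessing pipeline is essential for enhancing clarity, consistency, and relevance in the data.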

---

###  Modular Preprocessing Pipeline

In [None]:
# These are all the dependencies and helper functions used by the pipeline (they were imported and defined earlier; they are repeated here for clarity).

import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download required NLTK resources.
# 'omw-1.4' is included to match the resource list used at the top of the
# notebook; 'punkt_tab' is included because newer NLTK releases load the
# word_tokenize models from it (older releases still read 'punkt').
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Initialize stopword set and lemmatizer.
# stop_words is read by remove_stopwords and lemmatizer by lemmatize_tokens;
# both are module-level names looked up when those helpers run.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Delete every character that is not an ASCII letter or whitespace.

    Removed characters are not replaced by a space, so letters on either
    side of punctuation or digits are joined (for example 'B&W' -> 'BW').
    """
    disallowed = r'[^a-zA-Z\s]'
    stripped = re.sub(disallowed, '', text)
    return stripped

def lowercase_tokens(tokenized_list):
    """Lowercase every token of every document (takes and returns a list of token lists)."""
    result = []
    for tokens in tokenized_list:
        result.append([token.lower() for token in tokens])
    return result

def remove_stopwords(tokens):
    """Return the tokens that are not in the module-level `stop_words` set, in their original order."""
    kept = []
    for token in tokens:
        if token not in stop_words:
            kept.append(token)
    return kept

def lemmatize_tokens(tokenized_list):
    """Lemmatize each token with the module-level `lemmatizer` (takes and returns a list of token lists)."""
    output = []
    for tokens in tokenized_list:
        output.append([lemmatizer.lemmatize(token) for token in tokens])
    return output

In [58]:
# Full pipeline for a single document
def preprocess_document(doc):
    """Run one raw document through the whole pipeline and return its token list.

    Steps, in order: clean_text -> word_tokenize -> lowercase_tokens ->
    remove_stopwords -> lemmatize_tokens.
    """
    word_list = word_tokenize(clean_text(doc))
    # lowercase_tokens and lemmatize_tokens work on a list of documents, so the
    # single token list is wrapped in a list and the sole result taken back out.
    lowered = lowercase_tokens([word_list])[0]
    content_words = remove_stopwords(lowered)
    lemmas = lemmatize_tokens([content_words])[0]
    return lemmas

# Apply pipeline to a list of documents
def preprocess_documents(docs):
    """Apply preprocess_document to each document and return the token lists in input order."""
    processed = []
    for doc in docs:
        processed.append(preprocess_document(doc))
    return processed

# Run the full pipeline on the raw documents of both categories.
graphics_preprocessed, space_preprocessed = (
    preprocess_documents(docs) for docs in (graphics_docs, space_docs)
)

In [60]:
# Check the length of both preprocessed collections.
print(len(graphics_preprocessed))
print(len(space_preprocessed))

# Compare a sample document: pipeline result vs. the step-by-step
# preprocessing done cell by cell above. The labels say which is which.
print("comp.graphics sample (pipeline):    ", graphics_preprocessed[0])
print("comp.graphics sample (step-by-step):", lemm_graphics_tokens[0])

# The pipeline applies the same steps in the same order, so it must reproduce
# the step-by-step results for every document, not only the printed sample.
assert graphics_preprocessed == lemm_graphics_tokens
assert space_preprocessed == lemm_space_tokens

100
100
comp.graphics sample: ['considered', 'apple', 'laserwriter', 'iig', 'use', 'bw', 'image', 'printing']
comp.graphics sample: ['considered', 'apple', 'laserwriter', 'iig', 'use', 'bw', 'image', 'printing']


---