# Investigate How Preprocessing Steps Change the Vocabulary and Semantics

In [8]:
import re
import nltk
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Download required NLTK data:
#   - 'punkt': tokenizer models used by word_tokenize
#   - 'stopwords': word lists behind stopwords.words('english')
# NOTE(review): recent NLTK releases look up 'punkt_tab' rather than 'punkt'
# for word_tokenize — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\myacoubalex\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\myacoubalex\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load data

In [10]:
# Fetch the sci.space newsgroup posts with headers, footers and quoted
# replies stripped, then keep only the first 100 posts.
data = fetch_20newsgroups(
    categories=['sci.space'],
    remove=('headers', 'footers', 'quotes'),
)
docs = data.data[:100]

# Shared resources used by the preprocessing functions
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

## preprocessing functions

In [11]:
def version_a(doc):
    """Version A (baseline): hand the raw document back untouched."""
    return doc

def version_b(doc):
    """Version B: strip punctuation, then tokenize.

    Each run of non-word characters (anything other than letters, digits
    and the underscore) is collapsed into a single space before the text
    is passed to ``word_tokenize``. Token casing is left as-is.
    """
    cleaned = re.sub(r'\W+', ' ', doc)
    return word_tokenize(cleaned)

def version_c(doc):
    """Version C: Version B tokens, lowercased, with stopwords dropped.

    A token is kept only if its lowercase form is absent from
    ``stop_words``; the lowercase form is what gets returned.
    """
    lowered = (word.lower() for word in version_b(doc))
    return [word for word in lowered if word not in stop_words]

def version_d(doc):
    """Version D: Version C tokens, each reduced to its stem via ``stemmer``."""
    return list(map(stemmer.stem, version_c(doc)))

## analysis function

In [12]:
def analyze(docs, version_func):
    """Summarize the vocabulary produced by one preprocessing version.

    Args:
        docs: Iterable of raw document strings.
        version_func: Callable applied to each document. It may return a
            string (which is tokenized here with ``word_tokenize``) or an
            already-tokenized sequence of tokens.

    Returns:
        dict with three entries:
            "vocab_size": number of distinct tokens across all documents.
            "top_words": the 10 most frequent ``(token, count)`` pairs.
            "avg_len": mean number of tokens per document
                (0.0 when ``docs`` is empty).
    """
    tokenized = []
    for doc in docs:
        processed = version_func(doc)
        # Raw-text output (Version A) still has to be split into tokens.
        # Checking every document, not just the first, avoids indexing into
        # an empty list and copes with mixed outputs.
        if isinstance(processed, str):
            processed = word_tokenize(processed)
        tokenized.append(processed)

    # An empty corpus has no statistics; report zeros instead of dividing
    # by zero below.
    if not tokenized:
        return {"vocab_size": 0, "top_words": [], "avg_len": 0.0}

    # One Counter gives both the vocabulary (its keys) and the frequencies.
    counts = Counter(token for doc in tokenized for token in doc)
    total_tokens = sum(len(doc) for doc in tokenized)

    return {
        "vocab_size": len(counts),
        "top_words": counts.most_common(10),
        "avg_len": total_tokens / len(tokenized),
    }

## Analyzing our versions

In [13]:
# Map each report label to the preprocessing function it evaluates
version_funcs = {
    "Version A": version_a,
    "Version B": version_b,
    "Version C": version_c,
    "Version D": version_d,
}

# Collect statistics for every version, in the order listed above
results = {label: analyze(docs, func) for label, func in version_funcs.items()}

# Report each version's statistics
for version, stats in results.items():
    avg_words = round(stats['avg_len'], 2)
    print(f"\n{version}")
    print("Vocabulary Size:", stats['vocab_size'])
    print("Top 10 Words:", stats['top_words'])
    print("Average Words per Document:", avg_words)


Version A
Vocabulary Size: 5554
Top 10 Words: [(',', 1054), ('.', 963), ('the', 864), ('of', 455), ('to', 446), ('and', 434), ('a', 397), ('--', 316), (')', 315), ('(', 300)]
Average Words per Document: 232.28

Version B
Vocabulary Size: 5386
Top 10 Words: [('the', 866), ('of', 456), ('to', 449), ('and', 435), ('a', 397), ('in', 282), ('is', 226), ('for', 220), ('that', 164), ('I', 160)]
Average Words per Document: 201.34

Version C
Vocabulary Size: 4103
Top 10 Words: [('space', 144), ('nasa', 79), ('would', 66), ('one', 54), ('earth', 53), ('also', 43), ('shuttle', 39), ('spacecraft', 37), ('program', 37), ('time', 36)]
Average Words per Document: 111.76

Version D
Vocabulary Size: 3137
Top 10 Words: [('space', 144), ('nasa', 79), ('orbit', 70), ('would', 66), ('use', 61), ('one', 56), ('launch', 55), ('earth', 53), ('mission', 51), ('program', 44)]
Average Words per Document: 111.76


---

### 1. Which steps had the biggest effect on the vocabulary?
The biggest effect on vocabulary size happened between Version B and Version C, where I applied lowercasing and removed stopwords. This makes sense because a lot of common but less meaningful words like "the", "and", "to", etc., were removed.

### 2. Did some important terms get removed?
Yes, it's possible. For example, the word "would" is still frequent in Versions C and D, but some other useful contextual words might have been removed when I eliminated stopwords. Also, stemming in Version D replaced different forms of words with their root form.

### 3. Can too much preprocessing harm the meaning?
In Version D, although stemming helped reduce the vocabulary to 3137, it may have oversimplified some words and removed useful variations. While cleaning the data is helpful for analysis, doing too much can reduce the richness and make it harder to understand the original context. So, preprocessing should be balanced based on the task.


---