### Feature Engineering Best Practices: Handling Text Data
**Question**: Load a dataset with text data (e.g., SMS Spam Collection), perform text
preprocessing, and extract numerical features using TF-IDF.

In [1]:
# write your code from here  

import pandas as pd
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import urllib.request
import os

def download_sms_spam_dataset(url="https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip", filename="smsspamcollection.zip", extract_to="sms_spam_data"):
    import zipfile

    if not os.path.exists(extract_to):
        os.makedirs(extract_to)
    zip_path = os.path.join(extract_to, filename)

    if not os.path.exists(zip_path):
        print("Downloading dataset...")
        urllib.request.urlretrieve(url, zip_path)
    else:
        print("Dataset zip already downloaded.")

    # Extract if not done
    extracted_file = os.path.join(extract_to, "SMSSpamCollection")
    if not os.path.exists(extracted_file):
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
        print("Dataset extracted.")
    else:
        print("Dataset already extracted.")

    return extracted_file

def load_sms_spam_data(filepath):
    try:
        df = pd.read_csv(filepath, sep='\t', header=None, names=['label', 'message'])
        # Map labels to binary
        df['label'] = df['label'].map({'ham': 0, 'spam': 1})
        if df.isnull().any().any():
            df.dropna(inplace=True)
        return df.reset_index(drop=True)
    except Exception as e:
        raise RuntimeError(f"Failed to load SMS Spam data: {e}")

def text_preprocessor(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove digits (optional)
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def build_pipeline():
    vectorizer = TfidfVectorizer(preprocessor=text_preprocessor, stop_words='english', max_features=5000)
    clf = LogisticRegression(max_iter=200, random_state=42)

    pipeline = Pipeline([
        ('tfidf', vectorizer),
        ('clf', clf)
    ])
    return pipeline

def main():
    # Download and load dataset
    data_path = download_sms_spam_dataset()
    df = load_sms_spam_data(data_path)

    # Split train/inference
    X_train, X_infer, y_train, y_infer = train_test_split(df['message'], df['label'], test_size=0.2, random_state=42)

    # Build and train pipeline
    pipeline = build_pipeline()
    pipeline.fit(X_train, y_train)

    # Predict and evaluate on inference set
    y_pred = pipeline.predict(X_infer)
    print("Classification report on inference set:")
    print(classification_report(y_infer, y_pred, digits=4))

    # Show sample transformation (TF-IDF vector shape)
    sample_vec = pipeline.named_steps['tfidf'].transform(X_infer[:3])
    print(f"\nTF-IDF feature matrix shape for sample inference data: {sample_vec.shape}")

if __name__ == "__main__":
    main()

Downloading dataset...
Dataset extracted.
Classification report on inference set:
              precision    recall  f1-score   support

           0     0.9622    1.0000    0.9807       966
           1     1.0000    0.7450    0.8538       149

    accuracy                         0.9659      1115
   macro avg     0.9811    0.8725    0.9173      1115
weighted avg     0.9672    0.9659    0.9638      1115


TF-IDF feature matrix shape for sample inference data: (3, 5000)
