## One Hot Encoding and Label Encoding Exercise

This notebook demonstrates one-hot encoding and label encoding with scikit-learn on three short custom text documents.

In [None]:
# Import required libraries
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
# Raw example sentences that make up the toy corpus
S1 = 'Often, machine learning tutorials will recommend.'
S2 = 'Getting started in applied machine learning.'
S3 = 'One good example is to use.'

# Normalise each sentence: lowercase it, then delete commas and periods
# (a translation table with only a "delete" set removes those characters).
strip_punct = str.maketrans('', '', ',.')
processed_docs = []
for sentence in (S1, S2, S3):
    processed_docs.append(sentence.lower().translate(strip_punct))

print("Processed documents:")
for number, text in enumerate(processed_docs, start=1):
    print(f"S{number}: {text}")

## Label Encoding

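Label Encoding converts each word in the corpus into an integer between 0 and n-1 (where n is the number of unique words). Every occurrence of the same word receives the same label; scikit-learn's `LabelEncoder` assigns labels following the sorted order of the unique words.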

In [None]:
# Prepare data for encoding
# Tokenise every processed document on whitespace: one list of words per document.
data = [doc.split() for doc in processed_docs]
# Flatten all token lists into one word sequence. Iterating over `data` instead
# of indexing data[0], data[1], data[2] keeps this correct for any number of
# documents (no silently dropped documents, no IndexError on a shorter corpus).
values = [word for tokens in data for word in tokens]
print("All words:", values)
print("\nTotal words:", len(values))
print("Unique words:", len(set(values)))

In [None]:
# Label Encoding
# Fit the encoder on the flattened word list and encode every occurrence in one step.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)

print("Label Encoded:", integer_encoded)
print("\nWord to Label mapping:")
# Pair each word occurrence with its encoded label, then print one pair per line.
mapping_lines = [
    f"  {token:15s} -> {code}"
    for token, code in zip(values, integer_encoded)
]
for line in mapping_lines:
    print(line)

In [None]:
# Show unique label encodings
# Each distinct word is listed once, in sorted order, next to the label the
# already-fitted encoder returns for it.
print("\nUnique word to label mapping:")
unique_words = list(set(values))
unique_words.sort()
unique_labels = label_encoder.transform(unique_words)
for slot, term in enumerate(unique_words):
    print(f"  {term:15s} -> {unique_labels[slot]}")

## One-Hot Encoding

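One-Hot Encoding represents each word as a binary vector where only one element is 1 (hot) and all others are 0. In the cell below, `OneHotEncoder` is fitted on the tokenised documents, so each word position is treated as a separate categorical feature and each document's row is the concatenation of one one-hot vector per position.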

In [None]:
# One-Hot Encoding
# `data` holds one token list per document. The encoder is fitted on it and the
# result is converted with .toarray() so the full matrix can be printed.
# NOTE(review): passing `data` directly appears to rely on every document having
# the same number of tokens - confirm before changing the corpus.
onehot_encoder = OneHotEncoder()
onehot_sparse = onehot_encoder.fit_transform(data)
onehot_encoded = onehot_sparse.toarray()

print("Onehot Encoded Matrix:", onehot_encoded, sep="\n")
print()
print("Shape:", onehot_encoded.shape)
print("(rows = number of documents, columns = total unique words across all positions)")

In [None]:
# Display each document's encoding
# Walk the documents and the matrix rows in step; numbering starts at 1 to
# match the S1..S3 naming used for the sentences.
paired_rows = zip(processed_docs, onehot_encoded)
for doc_number, (text, row) in enumerate(paired_rows, start=1):
    print(f"\nDocument {doc_number}: '{text}'\nOne-hot encoding: {row}")

## Vocabulary Analysis

In [None]:
# Build vocabulary
# Give each distinct word a 1-based id in order of first appearance:
# setdefault only stores a new id when the word has not been seen before.
vocab = {}
for sentence in processed_docs:
    for token in sentence.split():
        vocab.setdefault(token, len(vocab) + 1)
# Number of distinct words seen (equals the highest id handed out).
count = len(vocab)

print("Vocabulary:")
# Ids are unique, so ordering the words by their id reproduces assignment order.
for token in sorted(vocab, key=vocab.get):
    print(f"  {token:15s} -> {vocab[token]}")