In [1]:
"""
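One Hot Encoding converts text into binary vectors where each unique word 
gets its own dimension. For a single word, only one element is '1' (hot) while
all others are '0' (cold). The sentence vectors built below combine the vectors
of their words, so they contain a '1' for every distinct word in the sentence.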

Advantages:
- Simple to understand and implement
- No assumptions about word relationships
- Works well for small vocabularies

Disadvantages:
- Very sparse vectors (mostly zeros)
- High dimensionality with large vocabularies
- No semantic meaning captured
"""

import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

# Sample corpus: four short sentences that share part of their vocabulary
sentences = [
    "I love machine learning",
    "Machine learning is amazing",
    "I love programming",
    "Programming is fun",
]

# Echo the corpus as a 1-based numbered list
print("=== One Hot Encoding Example ===")
print("Original sentences:")
for number, sentence in enumerate(sentences, start=1):
    print(f"{number}. {sentence}")

# Method 1: Manual One Hot Encoding
def manual_one_hot_encoding(sentences):
    """Build a vocabulary and one binary presence vector per sentence.

    Each sentence is lower-cased and split on whitespace. The vocabulary is
    the sorted list of all distinct resulting words, and every sentence is
    encoded as a list of 0/1 flags (one per vocabulary entry) marking which
    words occur in it. A sentence with several distinct words therefore has
    several 1s, and a repeated word still yields a single 1.

    The vocabulary and its size are printed as a side effect.

    Args:
        sentences: Iterable of strings. It is consumed exactly once, so
            generators and other one-shot iterators are supported.

    Returns:
        A ``(vocabulary, one_hot_vectors)`` tuple: the sorted list of unique
        words, and a list of 0/1 lists in the same order as the input.
    """
    # Tokenise once up front. Iterating `sentences` a second time would
    # silently produce no vectors when the caller passes a generator.
    tokenized = [sentence.lower().split() for sentence in sentences]

    # Sorting gives every word a stable, reproducible dimension
    vocabulary = sorted({word for words in tokenized for word in words})
    print(f"\nVocabulary: {vocabulary}")
    print(f"Vocabulary size: {len(vocabulary)}")

    # Map each word to its position (dimension) in the vector
    word_to_index = {word: i for i, word in enumerate(vocabulary)}

    one_hot_vectors = []
    for words in tokenized:
        # Start from all zeros, then switch on the dimension of each word
        vector = [0] * len(vocabulary)
        for word in words:
            vector[word_to_index[word]] = 1
        one_hot_vectors.append(vector)

    return vocabulary, one_hot_vectors

# Encode the sample sentences (the call also prints the vocabulary)
vocabulary, vectors = manual_one_hot_encoding(sentences)

print(f"\n=== One Hot Vectors ===")
print(f"Vector dimensions: {len(vocabulary)}")
print(f"Vocabulary: {vocabulary}")
print()

# Show each sentence next to its vector and the words whose flag is set
for number, (sentence, vector) in enumerate(zip(sentences, vectors), start=1):
    # Pair every vocabulary word with its flag and keep the ones switched on
    active_words = [word for word, flag in zip(vocabulary, vector) if flag == 1]
    print(f"Sentence {number}: '{sentence}'")
    print(f"Vector: {vector}")
    print(f"Active words: {active_words}")
    print()

=== One Hot Encoding Example ===
Original sentences:
1. I love machine learning
2. Machine learning is amazing
3. I love programming
4. Programming is fun

Vocabulary: ['amazing', 'fun', 'i', 'is', 'learning', 'love', 'machine', 'programming']
Vocabulary size: 8

=== One Hot Vectors ===
Vector dimensions: 8
Vocabulary: ['amazing', 'fun', 'i', 'is', 'learning', 'love', 'machine', 'programming']

Sentence 1: 'I love machine learning'
Vector: [0, 0, 1, 0, 1, 1, 1, 0]
Active words: ['i', 'learning', 'love', 'machine']

Sentence 2: 'Machine learning is amazing'
Vector: [1, 0, 0, 1, 1, 0, 1, 0]
Active words: ['amazing', 'is', 'learning', 'machine']

Sentence 3: 'I love programming'
Vector: [0, 0, 1, 0, 0, 1, 0, 1]
Active words: ['i', 'love', 'programming']

Sentence 4: 'Programming is fun'
Vector: [0, 1, 0, 1, 0, 0, 0, 1]
Active words: ['fun', 'is', 'programming']

