# One hot encodding Implementations

This notebook is done based on a challenge to implement one hot encoding without using pre-build solutions such as [pd.get_dummies](https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html).

First I tried to use basic implementation using bare python. 
But the challenge was how to deploy it in a different way, for example, using torch.

In [3]:
! pip install torch

Collecting torch
  Downloading torch-2.9.1-cp312-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Downloading torch-2.9.1-cp312-none-macosx_11_0_arm64.whl (74.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.5/74.5 MB[0m [31m8.3 MB/s[0m  [33m0:00:09[0mm0:00:01[0m00:01[0m
[?25hUsing cached sympy-1.14.0-py3-none-any.whl (6.3 MB)
Installing collected packages: sympy, torch
[2K  Attempting uninstall: sympy
[2K    Found existing installation: sympy 1.13.2
[2K    Uninstalling sympy-1.13.2:━━━━━━━━━━━━━━━━━━[0m [32m0/2[0m [sympy]
[2K      Successfully uninstalled sympy-1.13.2━━━━━[0m [32m0/2[0m [sympy]
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [torch]32m1/2[0m [torch]
[1A[2KSuccessfully installed sympy-1.14.0 torch-2.9.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[3

## Task Implement onehot encoder in python

### Implementation 1: Basic

In [1]:
import pandas as pd

class OneHotEncoder:
    def __init__(self):
        self.categories_ = []
        self.cat_index_ = {}
 
    def fit(self, values: list[str]):
        seen = set()
        unique_values = []
        for val in values:
            if val not in seen:
                unique_values.append(val)
                seen.add(val)
                
        self.categories_ = unique_values
        self.cat_index_ = {cat: idx for idx, cat in enumerate(unique_values)}
        #print(self.cat_index_)
        return self
  
    def transform(self, values: list[str]) -> list[list[int]]:
        result = []
        for val in values:
            one_hot = [0] * len(self.categories_)
            if val in self.cat_index_:
                one_hot[self.cat_index_[val]] = 1
            result.append(one_hot)
        return result


colors: list[str] = [
    "red",
    "green",
    "blue",
    "green",
    "green",
    "red",
    "red",
    "orange"
]

encoder = OneHotEncoder().fit(colors)
encoded = encoder.transform(colors)
print(encoded)

[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 1, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1]]


### Implementation 2: Using pytorch

In [12]:
# Convert list of colors to tokens
colors: list[str] = [
    "red", "green", "blue", "green",
    "green", "red", "red", "orange"
]

# Build vocabulary: unique colors → indices
vocab = {color: idx for idx, color in enumerate(set(colors))}
# e.g., {'red': 0, 'green': 1, 'blue': 2, 'orange': 3}
print(vocab)

{'blue': 0, 'green': 1, 'red': 2, 'orange': 3}


In [13]:
# Tokenize: convert strings to integers
tokens = [vocab[color] for color in colors]
print(tokens)  # [0, 1, 2, 1, 1, 0, 0, 3]

[2, 1, 0, 1, 1, 2, 2, 3]


In [14]:
import torch

input_ids = torch.tensor(tokens)
print(input_ids)


tensor([2, 1, 0, 1, 1, 2, 2, 3])


In [15]:
vocab_size = len(vocab)
output_dim = 3

torch.manual_seed(123)
embedding = torch.nn.Embedding(vocab_size, output_dim)
print(embedding.weight)

Parameter containing:
tensor([[-0.1115,  0.1204, -0.3696],
        [-0.2404, -1.1969,  0.2093],
        [-0.9724, -0.7550,  0.3239],
        [-0.1085,  0.2103, -0.3908]], requires_grad=True)


In [16]:
# Or I can replace the weights with indentity matrix
embedding.weight.data = torch.eye(vocab_size, output_dim)
print(embedding.weight)

Parameter containing:
tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [0., 0., 0.]], requires_grad=True)


In [20]:
print(embedding(torch.tensor([vocab["red"]])))

tensor([[0., 0., 1.]], grad_fn=<EmbeddingBackward0>)


In [21]:
print(embedding(torch.tensor([vocab["green"]])))

tensor([[0., 1., 0.]], grad_fn=<EmbeddingBackward0>)


### Now lets implement the whole solution 

In [59]:
import pandas as pd

class OneHotEncoder:
    def __init__(self):
        torch.manual_seed(123)
        vocab_size = len(vocab)
        output_dim = 3
 
    def fit(self, values: list[str]):
        # Build vocabulary: unique colors → indices
        vocab = {color: idx for idx, color in enumerate(set(values))}
        # e.g., {'red': 0, 'green': 1, 'blue': 2, 'orange': 3}
        print(vocab)
        
        embedding.weight.data = torch.eye(vocab_size, output_dim)

        print(embedding.weight)
        return self
  
    def transform(self, values: list[str]) -> list[list[int]]:
        result = []
        for val in values:
            result.append(embedding(torch.tensor([vocab[val]])))
            #print(embedding(torch.tensor([vocab["red"]])))
        return result
    
    def get_embedding(self, value: str) -> list[int]:
        return embedding(torch.tensor([vocab[value]]))


colors: list[str] = [
    "red",
    "green",
    "blue",
    "green",
    "green",
    "red",
    "red",
    "orange"
]

encoder = OneHotEncoder().fit(colors)
encoded = encoder.transform(colors)
print('My encoded values are: {}'.format(encoded))

# Checking my embeddings
for val in colors:
    emb = encoder.get_embedding(val)
    print('Color: {} -> Embedding: {}'.format(val, emb.tolist()))

{'blue': 0, 'green': 1, 'red': 2, 'orange': 3}
Parameter containing:
tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [0., 0., 0.]], requires_grad=True)
My encoded values are: [tensor([[0., 0., 1.]], grad_fn=<EmbeddingBackward0>), tensor([[0., 1., 0.]], grad_fn=<EmbeddingBackward0>), tensor([[1., 0., 0.]], grad_fn=<EmbeddingBackward0>), tensor([[0., 1., 0.]], grad_fn=<EmbeddingBackward0>), tensor([[0., 1., 0.]], grad_fn=<EmbeddingBackward0>), tensor([[0., 0., 1.]], grad_fn=<EmbeddingBackward0>), tensor([[0., 0., 1.]], grad_fn=<EmbeddingBackward0>), tensor([[0., 0., 0.]], grad_fn=<EmbeddingBackward0>)]
Color: red -> Embedding: [[0.0, 0.0, 1.0]]
Color: green -> Embedding: [[0.0, 1.0, 0.0]]
Color: blue -> Embedding: [[1.0, 0.0, 0.0]]
Color: green -> Embedding: [[0.0, 1.0, 0.0]]
Color: green -> Embedding: [[0.0, 1.0, 0.0]]
Color: red -> Embedding: [[0.0, 0.0, 1.0]]
Color: red -> Embedding: [[0.0, 0.0, 1.0]]
Color: orange -> Embedding: [[0.0, 0.0, 0.0]]
