## One Hot Encoding of Text

In [1]:
# Toy corpus: four short, three-word sentences.
documents = [
    "Dog bites man.",
    "Man bites dog.",
    "Dog eats meat.",
    "Man eats food.",
]

# Normalise each sentence: lowercase it and drop the full stops.
processed_docs = list(
    map(lambda sentence: sentence.lower().replace(".", ""), documents)
)
processed_docs

['dog bites man', 'man bites dog', 'dog eats meat', 'man eats food']

In [2]:
# Map every distinct word to a 1-based integer id, in order of first appearance.
vocabulary = {}

for document in processed_docs:
    # split() with no separator collapses runs of whitespace, so repeated,
    # leading or trailing spaces never register the empty string "" as a word
    # (split(" ") would).
    for word in document.split():
        if word not in vocabulary:
            vocabulary[word] = len(vocabulary) + 1

# Vocabulary size; equals the largest id because ids are assigned 1..N without gaps.
count = len(vocabulary)

In [3]:
# Show the word -> integer id mapping.
print(vocabulary)

{'dog': 1, 'bites': 2, 'man': 3, 'eats': 4, 'meat': 5, 'food': 6}


In [4]:
def one_hot_encode(document, vocab=None):
    """One-hot encode every word of a whitespace-separated document.

    Parameters
    ----------
    document : str
        Text to encode. Tokens come from ``str.split()``, so any run of
        whitespace separates words and an empty or blank document yields no
        tokens (and therefore an empty result).
    vocab : dict[str, int], optional
        Mapping from word to a 1-based integer id. Defaults to the
        notebook-level ``vocabulary``.

    Returns
    -------
    list[list[int]]
        One vector per token, each as wide as the largest id in the mapping.
        A known word has a single 1 at position ``id - 1``; a word missing
        from the mapping is encoded as an all-zero vector.
    """
    if vocab is None:
        vocab = vocabulary

    # Size the vectors from the mapping itself rather than from a separate
    # global counter, which silently goes stale if either name is rebound.
    width = max(vocab.values(), default=0)

    one_hot_encoded = []
    for word in document.split():
        word_one_hot = [0] * width
        word_id = vocab.get(word)
        if word_id is not None:
            word_one_hot[word_id - 1] = 1
        one_hot_encoded.append(word_one_hot)

    return one_hot_encoded

In [7]:
# Encode one of the corpus sentences, so every word is in the vocabulary.
one_hot_encoded = one_hot_encode("dog bites man")

In [8]:
# Show the encoding: one vector per word of the sentence.
print(one_hot_encoded)

[[1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0]]


In [16]:
# Encode a sentence that mixes vocabulary words with words the vocabulary
# has never seen.
raw_text = "Man and dog are good friends."
text = raw_text.lower().replace(".", "")
text_array = text.split(" ")

one_hot_encoded = one_hot_encode(text)

print(f"Text: {text}\n")
print("One hot Encoded:\n")
# Pair each token with its vector and print them side by side.
for token, vector in zip(text_array, one_hot_encoded):
    print(f"{token}: {vector}")

Text: man and dog are good friends

One hot Encoded:

man: [0, 0, 1, 0, 0, 0]
and: [0, 0, 0, 0, 0, 0]
dog: [1, 0, 0, 0, 0, 0]
are: [0, 0, 0, 0, 0, 0]
good: [0, 0, 0, 0, 0, 0]
friends: [0, 0, 0, 0, 0, 0]


In [17]:
# The four example sentences, one variable per sentence.
S1, S2, S3, S4 = (
    'dog bites man',
    'man bites dog',
    'dog eats meat',
    'man eats food',
)

In [18]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Tokenise each sentence into a list of words (one inner list per sentence).
data = [sentence.split() for sentence in (S1, S2, S3, S4)]
# Flatten the per-sentence token lists into one sequence, keeping order.
values = [token for tokens in data for token in tokens]

values

['dog',
 'bites',
 'man',
 'man',
 'bites',
 'dog',
 'dog',
 'eats',
 'meat',
 'man',
 'eats',
 'food']

In [27]:
# Tokenised sentences: one list of words per sentence.
data

[['dog', 'bites', 'man'],
 ['man', 'bites', 'dog'],
 ['dog', 'eats', 'meat'],
 ['man', 'eats', 'food']]

In [19]:
# Learn the set of distinct tokens, then map every token in `values` to its
# integer class label.
label_encoder = LabelEncoder().fit(values)
integer_encoded = label_encoder.transform(values)
integer_encoded

array([1, 0, 4, 4, 0, 1, 1, 2, 5, 4, 2, 3], dtype=int64)

In [26]:
# Look up the integer labels the fitted encoder assigned to individual words.
label_encoder.transform(["dog", "man"])

array([1, 4])

In [28]:
# `data` has three columns (one per word position), and OneHotEncoder treats
# every column as its own categorical feature. The encoded row is therefore
# the concatenation of one one-hot block per position, not one vector per word.
one_hot_encoder = OneHotEncoder()
one_hot_encoder.fit(data)
# transform() returns a sparse matrix; densify it for display.
one_hot_encoded = one_hot_encoder.transform(data).toarray()

one_hot_encoded

array([[1., 0., 1., 0., 0., 0., 1., 0.],
       [0., 1., 1., 0., 1., 0., 0., 0.],
       [1., 0., 0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0., 1., 0., 0.]])

In [35]:
# Encode a single three-word row with the already-fitted encoder.
ohe_encoded = one_hot_encoder.transform([["dog","bites", "man"]])
# The result is sparse; convert to a dense array so the values are visible.
ohe_encoded.toarray()

array([[1., 0., 1., 0., 0., 0., 1., 0.]])

In [37]:
# Names of the output columns; the x0/x1/x2 prefix is the input column index.
one_hot_encoder.get_feature_names_out()

array(['x0_dog', 'x0_man', 'x1_bites', 'x1_eats', 'x2_dog', 'x2_food',
       'x2_man', 'x2_meat'], dtype=object)

In [38]:
# One word per row gives the encoder a single categorical feature, so each
# distinct word gets exactly one output column. The words are capitalised
# here, so the learned categories are capitalised too.
data = [[word] for word in ("Dog", "Bites", "Man", "Eats", "Food", "Meat")]

one_hot_encoder = OneHotEncoder().fit(data)

# transform() returns a sparse matrix; densify it for display.
one_hot_encoded = one_hot_encoder.transform(data).toarray()

one_hot_encoded

array([[0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1.]])

In [39]:
# Names of the output columns for the encoder fitted on one word per row.
one_hot_encoder.get_feature_names_out()

array(['x0_Bites', 'x0_Dog', 'x0_Eats', 'x0_Food', 'x0_Man', 'x0_Meat'],
      dtype=object)

In [42]:
# Encode three words, one per row, with the fitted encoder and densify the result.
one_hot_encoder.transform([["Dog"], ["Bites"], ["Man"]]).toarray()

array([[0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.]])