# One-Hot Encoding

In [1]:
from nltk.tokenize import word_tokenize, sent_tokenize
from string import punctuation

# Toy corpus used throughout the notebook: four three-word sentences.
# The continuation lines keep their leading spaces inside the string literal.
mytext = '''Dog bites man.
            Man bites dog.
            Dog eats meat.
            Man eats food.'''

In [2]:
def pre_process(sentence):
    """Lower-case a sentence and drop its punctuation tokens.

    The sentence is split with ``word_tokenize``. Every token that consists
    solely of characters from ``string.punctuation`` is discarded; the
    remaining tokens are lower-cased and joined with single spaces.

    Args:
        sentence: Raw sentence text.

    Returns:
        The cleaned sentence as one space-separated string.
    """
    # A plain `word not in punctuation` is a substring test against the
    # punctuation string, so multi-character tokens such as "..." or "--"
    # would slip through. Checking every character removes those as well.
    return " ".join(
        word.lower()
        for word in word_tokenize(sentence)
        if not all(char in punctuation for char in word)
    )

In [3]:
# Sanity check: clean a sample sentence, then tokenize the cleaned string again.
word_tokenize(pre_process("Learning NLP is really fun."))

['learning', 'nlp', 'is', 'really', 'fun']

In [6]:
# build a word -> index vocabulary from the input text
def get_vocab(text):
    """Map every distinct pre-processed token in ``text`` to an integer id.

    The text is split into sentences, each sentence is cleaned with
    ``pre_process`` and tokenized, and ids are handed out in order of first
    appearance, starting at 0.

    Args:
        text: Raw input text containing one or more sentences.

    Returns:
        A dict mapping each token to its integer id.
    """
    word_to_index = {}
    for raw_sentence in sent_tokenize(text):
        for word in word_tokenize(pre_process(raw_sentence)):
            # setdefault leaves an existing id untouched; an unseen word
            # receives the current vocabulary size as its id.
            word_to_index.setdefault(word, len(word_to_index))
    return word_to_index

In [9]:
# Build the word -> index vocabulary from the toy corpus and show it.
myVocab = get_vocab(mytext)
print(myVocab)

{'dog': 0, 'bites': 1, 'man': 2, 'eats': 3, 'meat': 4, 'food': 5}


In [10]:
# All-zero template vector with one slot per vocabulary entry.
temp = [0 for _word in myVocab]
print(temp)

[0, 0, 0, 0, 0, 0]


In [11]:
# create one-hot encoding
def get_one_hot_encoding(vocab, sentence):
    """Print and return the one-hot vectors for every token of ``sentence``.

    The sentence is cleaned with ``pre_process`` and tokenized. Each token
    yields one vector of length ``len(vocab)`` with a 1 at the token's
    vocabulary index. Tokens that are not in ``vocab`` yield an all-zero
    vector.

    Args:
        vocab: Dict mapping each known token to its integer index.
        sentence: Raw sentence text to encode.

    Returns:
        A list with one one-hot vector (list of ints) per token.
    """
    one_hot_encoding = []
    for word in word_tokenize(pre_process(sentence)):
        vector = [0] * len(vocab)
        if word in vocab:
            vector[vocab[word]] = 1
        one_hot_encoding.append(vector)
    print(sentence, "==> One-hot encoding: ", one_hot_encoding)
    # Hand the result back so callers can use it instead of only reading the printout.
    return one_hot_encoding

In [12]:
# A single in-vocabulary word produces exactly one vector.
get_one_hot_encoding(myVocab, "dog")

dog ==> One-hot encoding:  [[1, 0, 0, 0, 0, 0]]


In [13]:
# A full sentence made only of vocabulary words: one vector per word.
get_one_hot_encoding(myVocab, "Dog bites man.")

Dog bites man. ==> One-hot encoding:  [[1, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0]]


In [14]:
# "runs", "after" and "the" are not in the vocabulary, so their vectors stay all zeros.
get_one_hot_encoding(myVocab, "Man runs after the dog.")

Man runs after the dog. ==> One-hot encoding:  [[0, 0, 1, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0]]


# One-Hot Encoding using scikit-learn

In [17]:
# The four toy sentences again, already lower-cased and without punctuation.
S1, S2, S3, S4 = (
    "dog bites man",
    "man bites dog",
    "dog eats meat",
    "man eats food",
)

In [20]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Tokenize each sentence on whitespace, then flatten into one list of words.
data = [S1.split(), S2.split(), S3.split(), S4.split()]
values = [word for sentence in data for word in sentence]
print("The data: ", values)

# Label encoding: every distinct word gets an integer id.
label_encoder = LabelEncoder()
lable_encoder = label_encoder  # original (misspelled) name, kept for any later cell
integer_encoded = label_encoder.fit_transform(values)
print("Label encoded:", integer_encoded)

# One-hot encoding.
# NOTE(review): `data` is a 4 x 3 matrix, so OneHotEncoder treats each word
# *position* as its own categorical feature. The result has 2 + 2 + 4 = 8
# columns rather than one column per vocabulary word, and the same word maps
# to different columns depending on where it appears. To encode against one
# shared vocabulary, fit on `values` reshaped to a single column instead --
# left unchanged here because that would alter the shape of `onehot_encoded`.
onehot_encoder = OneHotEncoder()
onehot_encoded = onehot_encoder.fit_transform(data).toarray()
print("One hot encoded:", onehot_encoded)


The data:  ['dog', 'bites', 'man', 'man', 'bites', 'dog', 'dog', 'eats', 'meat', 'man', 'eats', 'food']
Lable encoded: [1 0 4 4 0 1 1 2 5 4 2 3]
One hot encoded: [[1. 0. 1. 0. 0. 0. 1. 0.]
 [0. 1. 1. 0. 1. 0. 0. 0.]
 [1. 0. 0. 1. 0. 0. 0. 1.]
 [0. 1. 0. 1. 0. 1. 0. 0.]]
