<a href="https://colab.research.google.com/github/parsa-abbasi/intro-to-nlp/blob/main/NLP_text_representation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## One-hot Encoding

One-hot encoding represents each word as a vector of 0s and 1s whose length equals the size of the vocabulary. The vector has a 1 at the position corresponding to the word's index in the vocabulary and 0s in all other positions.

In [None]:
# Download NLTK's "punkt" tokenizer package.
import nltk

nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
from sklearn.preprocessing import OneHotEncoder
from nltk.tokenize import word_tokenize

# Example sentence; "great" occurs twice, so two rows of the result are equal.
text = "He who thinks great thoughts often makes great errors"

# Split the sentence into word tokens.
tokens = word_tokenize(text)

# The encoder takes a 2-D input (samples x features), so every token is
# wrapped in its own single-element row.
token_rows = [[word] for word in tokens]

# sparse_output=False yields a dense array instead of a sparse matrix.
one_hot_encoder = OneHotEncoder(sparse_output=False)
one_hot_encoder.fit(token_rows)
one_hot_encoded = one_hot_encoder.transform(token_rows)

# One row per token, one column per distinct token.
print(tokens)
print(one_hot_encoded)

['He', 'who', 'thinks', 'great', 'thoughts', 'often', 'makes', 'great', 'errors']
[[1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]]


## Bag-of-Words (BoW)

Bag-of-Words (BoW) represents each document as a vector of numbers whose length equals the size of the vocabulary. Each entry of the vector is the number of times the corresponding vocabulary word occurs in the document.

It is called a “bag” of words, because any information about the order or structure of words in the document is discarded.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize

# Two example documents; each one becomes a row of the bag-of-words matrix.
docs = ["He who thinks great thoughts often makes great errors",
        "The most thought-provoking thing in our thought-provoking time is that we are still not thinking"]

# Tokenize with NLTK rather than scikit-learn's default regular expression.
# token_pattern=None makes it explicit that the regex is unused, which avoids
# the UserWarning scikit-learn raises when a custom tokenizer is combined with
# the default token_pattern. Text is lowercased and "the" is removed as a
# stop word.
vectorizer = CountVectorizer(
    lowercase=True,
    tokenizer=word_tokenize,
    token_pattern=None,
    stop_words=['the'],
)

# Learn the vocabulary from docs and count each vocabulary word per document.
bow = vectorizer.fit_transform(docs)

# Vocabulary (column order), then the dense document-term count matrix.
print(vectorizer.get_feature_names_out())
print(bow.toarray())

['are' 'errors' 'great' 'he' 'in' 'is' 'makes' 'most' 'not' 'often' 'our'
 'still' 'that' 'thing' 'thinking' 'thinks' 'thought-provoking' 'thoughts'
 'time' 'we' 'who']
[[0 1 2 1 0 0 1 0 0 1 0 0 0 0 0 1 0 1 0 0 1]
 [1 0 0 0 1 1 0 1 1 0 1 1 1 1 1 0 2 0 1 1 0]]




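## Tf-idf

Tf-idf (term frequency–inverse document frequency) weights each word's count in a document by how rare the word is across all documents, so words that appear in many documents receive lower weights than words that are specific to a few documents.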

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Imported here as well so this cell runs without relying on an earlier cell.
from nltk.tokenize import word_tokenize

# Two example documents; each one becomes a row of the tf-idf matrix.
docs = ["He who thinks great thoughts often makes great errors",
        "The most thought-provoking thing in our thought-provoking time is that we are still not thinking"]

# Tokenize with NLTK rather than scikit-learn's default regular expression.
# token_pattern=None makes it explicit that the regex is unused, which avoids
# the UserWarning scikit-learn raises when a custom tokenizer is combined with
# the default token_pattern. No stop words are removed here.
vectorizer = TfidfVectorizer(
    lowercase=True,
    tokenizer=word_tokenize,
    token_pattern=None,
)

# Learn the vocabulary and idf weights from docs, then compute tf-idf vectors.
tf_idf = vectorizer.fit_transform(docs)

# Vocabulary (column order), matrix shape (documents, vocabulary size),
# then the dense tf-idf matrix.
print(vectorizer.get_feature_names_out())
print(tf_idf.shape)
print(tf_idf.toarray())

['are' 'errors' 'great' 'he' 'in' 'is' 'makes' 'most' 'not' 'often' 'our'
 'still' 'that' 'the' 'thing' 'thinking' 'thinks' 'thought-provoking'
 'thoughts' 'time' 'we' 'who']
(2, 22)
[[0.         0.30151134 0.60302269 0.30151134 0.         0.
  0.30151134 0.         0.         0.30151134 0.         0.
  0.         0.         0.         0.         0.30151134 0.
  0.30151134 0.         0.         0.30151134]
 [0.24253563 0.         0.         0.         0.24253563 0.24253563
  0.         0.24253563 0.24253563 0.         0.24253563 0.24253563
  0.24253563 0.24253563 0.24253563 0.24253563 0.         0.48507125
  0.         0.24253563 0.24253563 0.        ]]


