# Text Representation using Sklearn #

## Installation ##

In [1]:
! pip3 install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://packagecloud.io/github/git-lfs/pypi/simple

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [2]:
# Toy corpus of three short sentences used by every section of the notebook.
docs = [
    "The coffee smells delicious",
    "I need my coffee",
    "My cat loves coffee",
]

# data  -> one whitespace-split token list per document
# words -> every token of the corpus, in reading order, as a single flat list
data = [sentence.split() for sentence in docs]
words = [token for sentence in docs for token in sentence.split()]

print("Data: ", data)
print("Words: ", words)

Data:  [['The', 'coffee', 'smells', 'delicious'], ['I', 'need', 'my', 'coffee'], ['My', 'cat', 'loves', 'coffee']]
Words:  ['The', 'coffee', 'smells', 'delicious', 'I', 'need', 'my', 'coffee', 'My', 'cat', 'loves', 'coffee']


## Label Encoding ##

In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
# Learn the distinct tokens of the corpus; each one becomes an integer class.
# The tokens are passed as-is, so differently-cased forms such as "My" and "my"
# end up as separate classes.
label_encoder = LabelEncoder()
label_encoder.fit(words)

In [5]:
# Round trip: encode four known tokens to their class indices, then decode
# three indices back into the tokens they stand for.
encoded_labels = label_encoder.transform(["my", "cat", "need", "coffee"])
decoded_labels = label_encoder.inverse_transform([1, 4, 3])

print("Classes: ", label_encoder.classes_)
print("Transform labels to normalized encoding: ", encoded_labels)
print("Transform labels back to original encoding: ", decoded_labels)

Classes:  ['I' 'My' 'The' 'cat' 'coffee' 'delicious' 'loves' 'my' 'need' 'smells']
Transform labels to normalized encoding:  [7 3 8 4]
Transform labels back to original encoding:  ['My' 'coffee' 'cat']


## One Hot Encoding ##

In [6]:
from sklearn.preprocessing import OneHotEncoder

In [7]:
# Fit on the tokenised documents. Each token *position* is treated as its own
# categorical feature, so a separate category list is learned per position
# instead of one vocabulary shared by the whole corpus.
# NOTE(review): this presumably relies on every document having exactly four
# tokens — confirm before reusing with documents of differing lengths.
onehot_encoder = OneHotEncoder()
onehot_encoder.fit(data)

In [8]:
# Densify the encoded corpus so the 0/1 indicator rows can be printed directly;
# there is one row per document.
encoded_docs = onehot_encoder.transform(data).toarray()

print("Categories: ", onehot_encoder.categories_)
print("Encoded data:\n", encoded_docs)

Categories:  [array(['I', 'My', 'The'], dtype=object), array(['cat', 'coffee', 'need'], dtype=object), array(['loves', 'my', 'smells'], dtype=object), array(['coffee', 'delicious'], dtype=object)]
Encoded data:
 [[0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1.]
 [1. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0.]
 [0. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0.]]


## Bag of Words ##

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# Build a bag-of-words vocabulary from the raw (untokenised) documents using the
# vectorizer's default settings.
count_vect = CountVectorizer()
count_vect.fit(docs)

In [11]:
# vocabulary_ maps every learned term to its column index in the count matrix.
learned_vocabulary = count_vect.vocabulary_
print("Vocabulary: ", learned_vocabulary)

Vocabulary:  {'the': 7, 'coffee': 1, 'smells': 6, 'delicious': 2, 'need': 5, 'my': 4, 'cat': 0, 'loves': 3}


In [12]:
# Probe sentence: it repeats one word and also contains words that the fitted
# vocabulary has never seen.
sent = "I need Coffee, Coffee fuels my day"
bow_counts = count_vect.transform([sent]).toarray()
print(f"Bow representation for `{sent}`:", bow_counts)

Bow representation for `I need Coffee, Coffee fuels my day`: [[0 2 0 0 1 1 0 0]]


In [13]:
# Rebind count_vect to a binary bag-of-words model: a term is recorded as
# present (1) or absent (0) instead of by how many times it occurs.
# Later cells that use count_vect see this vectorizer, not the earlier one.
count_vect = CountVectorizer(binary=True)
count_vect.fit(docs)

In [14]:
# Re-encode the probe sentence with whichever vectorizer count_vect currently
# refers to, for comparison with the earlier count-based vector.
bow_flags = count_vect.transform([sent]).toarray()
print(f"Bow representation for `{sent}`:", bow_flags)

Bow representation for `I need Coffee, Coffee fuels my day`: [[0 1 0 0 1 1 0 0]]


## Bag of N-Grams ##

In [15]:
# Rebind count_vect to a model whose vocabulary holds both single words and
# two-word sequences: ngram_range=(1, 2) means n-grams of length 1 up to 2.
count_vect = CountVectorizer(ngram_range=(1, 2))
count_vect.fit(docs)

In [16]:
# Inspect the term -> column-index mapping of the unigram + bigram model.
ngram_vocabulary = count_vect.vocabulary_
print("Vocabulary: ", ngram_vocabulary)

Vocabulary:  {'the': 14, 'coffee': 2, 'smells': 12, 'delicious': 4, 'the coffee': 15, 'coffee smells': 3, 'smells delicious': 13, 'need': 10, 'my': 7, 'need my': 11, 'my coffee': 9, 'cat': 0, 'loves': 5, 'my cat': 8, 'cat loves': 1, 'loves coffee': 6}


In [17]:
# Encode a sentence that only partly overlaps the fitted vocabulary: just the
# n-grams learned from the corpus ("my", "cat", "my cat") get a non-zero count.
# The sentence is held in one variable so the printed label and the encoded
# input cannot drift apart (the f-string previously had no placeholder at all).
query = "my cat is lovely"
print(f"Bow representation for `{query}`:", count_vect.transform([query]).toarray())

Bow representation for `my cat is lovely`: [[1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0]]


In [18]:
# Rebind count_vect to a bigram-only model: ngram_range=(2, 2) keeps two-word
# sequences and drops single words from the vocabulary.
count_vect = CountVectorizer(ngram_range=(2, 2))
count_vect.fit(docs)

In [19]:
# Inspect the term -> column-index mapping of the bigram-only model.
bigram_vocabulary = count_vect.vocabulary_
print("Vocabulary: ", bigram_vocabulary)

Vocabulary:  {'the coffee': 7, 'coffee smells': 1, 'smells delicious': 6, 'need my': 5, 'my coffee': 4, 'my cat': 3, 'cat loves': 0, 'loves coffee': 2}


In [20]:
# With a bigram-only vocabulary, "my cat" is the only two-word sequence of this
# sentence that was learned from the corpus, so a single column is non-zero.
# The sentence is held in one variable so the printed label and the encoded
# input cannot drift apart (the f-string previously had no placeholder at all).
query = "my cat is lovely"
print(f"Bow representation for `{query}`:", count_vect.transform([query]).toarray())

Bow representation for `my cat is lovely`: [[0 0 0 1 0 0 0 0]]


## TF-IDF ##

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
# Fit a TF-IDF vectorizer with default settings on the same three-document
# corpus used by the earlier sections.
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(docs)

In [23]:
# Weight two unseen sentences with the fitted model; the result has one row per
# sentence. The bare last expression is kept so the array is displayed.
tfidf_queries = ["my cat is lovely", "I need Coffee"]
tfidf_vect.transform(tfidf_queries).toarray()

array([[0.79596054, 0.        , 0.        , 0.        , 0.60534851,
        0.        , 0.        , 0.        ],
       [0.        , 0.50854232, 0.        , 0.        , 0.        ,
        0.861037  , 0.        , 0.        ]])

In [24]:
# idf_ holds the learned inverse-document-frequency weights as a flat array.
idf_weights = tfidf_vect.idf_
print("IDF for all words in the vocabulary", idf_weights)

IDF for all words in the vocabulary [1.69314718 1.         1.69314718 1.69314718 1.28768207 1.69314718
 1.69314718 1.69314718]
