In [1]:
# Toy corpus: three short documents (one string each). Every document
# contains the word "window", and "blue" occurs twice in the first one.
texts = [
    "blue car and blue window",
    "black crow in the window",
    "i see my reflection in the window"
]

In [5]:
# Binary encoding only records whether a word is present in a document.
# Build the vocabulary by hand first: every distinct whitespace-separated
# token across the corpus, in alphabetical order.
vocab = sorted({token for doc in texts for token in doc.split()})
print(len(vocab), vocab)

12 ['and', 'black', 'blue', 'car', 'crow', 'i', 'in', 'my', 'reflection', 'see', 'the', 'window']


In [6]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
# binary=True makes the vectorizer record presence (1) / absence (0) rather
# than a count. fit() returns the fitted vectorizer, so it can be chained.
vec = CountVectorizer(binary=True).fit(texts)
# Iterating the vocabulary_ dict yields its keys (the terms). Note that 'i'
# is missing compared with the hand-built vocabulary: the default
# token_pattern only keeps tokens of two or more word characters.
print(sorted(vec.vocabulary_))

['and', 'black', 'blue', 'car', 'crow', 'in', 'my', 'reflection', 'see', 'the', 'window']


In [7]:
import pandas as pd
# One row per document, one column per vocabulary term.
# vec.vocabulary_ maps each term to its column index, so order the labels by
# that index instead of assuming alphabetical order matches the column order
# (it would not if the vectorizer were given a custom vocabulary).
pd.DataFrame(
    vec.transform(texts).toarray(),
    columns=sorted(vec.vocabulary_, key=vec.vocabulary_.get),
)

Unnamed: 0,and,black,blue,car,crow,in,my,reflection,see,the,window
0,1,0,1,1,0,0,0,0,0,0,1
1,0,1,0,0,1,1,0,0,0,1,1
2,0,0,0,0,0,1,1,1,1,1,1


In [8]:
# Count encoding records how many times each word appears in a document.
# binary=False is the default, so the argument could be omitted.
vec = CountVectorizer(binary=False)
vec.fit(texts)

# vec.vocabulary_ maps each term to its column index, so order the labels by
# that index instead of assuming alphabetical order matches the column order
# (it would not if the vectorizer were given a custom vocabulary).
pd.DataFrame(
    vec.transform(texts).toarray(),
    columns=sorted(vec.vocabulary_, key=vec.vocabulary_.get),
)

Unnamed: 0,and,black,blue,car,crow,in,my,reflection,see,the,window
0,1,0,2,1,0,0,0,0,0,0,1
1,0,1,0,0,1,1,0,0,0,1,1
2,0,0,0,0,0,1,1,1,1,1,1


# TF-IDF stands for term frequency-inverse document frequency. 
## TF-IDF assigns more weight to words that occur in few documents than to words that occur in many. It is based on the assumption that rarer words are more informative about the documents they appear in.
### TF-IDF consists of two parts:

1) Term frequency, which is the same as the counting method we saw before
<br>
2) Inverse document frequency: this reduces the weights of words that occur in many documents and increases the weights of words that occur in few.
The formula to calculate TF-IDF is:

<i> tfidf(t, d, D) = tf(t, d) * idf(t, D) </i>, where
<ul>
    <li> t is a term (word)</li>
    <li> d is a document that this term is in </li>
    <li> D is a collection of all documents </li>
</ul>

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TfidfVectorizer with default settings: scikit-learn smooths the idf term and
# L2-normalises each row, so the values below are not raw count * idf products.
vec = TfidfVectorizer()
vec.fit(texts)

# vec.vocabulary_ maps each term to its column index, so order the labels by
# that index instead of assuming alphabetical order matches the column order
# (it would not if the vectorizer were given a custom vocabulary).
# "window" appears in every document and gets the lowest non-zero weight in
# each row.
pd.DataFrame(
    vec.transform(texts).toarray(),
    columns=sorted(vec.vocabulary_, key=vec.vocabulary_.get),
)

Unnamed: 0,and,black,blue,car,crow,in,my,reflection,see,the,window
0,0.396875,0.0,0.793749,0.396875,0.0,0.0,0.0,0.0,0.0,0.0,0.2344
1,0.0,0.534093,0.0,0.0,0.534093,0.406192,0.0,0.0,0.0,0.406192,0.315444
2,0.0,0.0,0.0,0.0,0.0,0.358291,0.47111,0.47111,0.47111,0.358291,0.278245
