In [2]:
import numpy as np
import pandas as pd

## 1. Bag of Words

In [3]:
# Toy corpus: four three-word sentences, each paired with a binary label.
df = pd.DataFrame(
    {
        'text': [
            'people watch friends',
            'friends watch friends',
            'people write comment',
            'friends write comment',
        ],
        'output': [1, 1, 0, 0],
    }
)
df

Unnamed: 0,text,output
0,people watch friends,1
1,friends watch friends,1
2,people write comment,0
3,friends write comment,0


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Unigram bag of words; (1, 1) is the library default, spelled out here so it
# contrasts with the bigram configuration used in the n-gram section.
cv = CountVectorizer(ngram_range=(1, 1))

In [5]:
# Learn the vocabulary from the corpus and build the sparse document-term
# count matrix in a single pass.
bow = cv.fit_transform(raw_documents=df['text'])

In [6]:
# Learned vocabulary: maps each token to its column index in the count matrix.
cv.vocabulary_

{'people': 2, 'watch': 3, 'friends': 1, 'write': 4, 'comment': 0}

In [7]:
# Dense count vectors for the first two sentences, one per line.
print(bow[0].toarray(), bow[1].toarray(), sep='\n')


[[0 1 1 1 0]]
[[0 2 0 1 0]]


In [8]:
# Vectorize a sentence containing words the vectorizer never saw ('and', 'of');
# they get no column, so only the five known tokens are counted.
cv.transform(
    ['friends watch and write comment of friends']
).toarray()

array([[1, 2, 0, 1, 1]])

### Advantages
1. Simple and intuitive
2. Easy to implement

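### Disadvantages
1. Sparsity: most entries of each vector are zero
2. Vector length is tied to the vocabulary size, so it grows as the corpus vocabulary grows
3. Out-of-vocabulary words are ignored at transform time (e.g. "and" and "of" in the example above)
4. Word order is lost: only the counts are kept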

## 2. Bag of N-Grams

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# ngram_range=(2, 2) keeps bigrams only. Note this rebinds `cv`, replacing the
# unigram vectorizer created in the Bag of Words section.
cv = CountVectorizer(ngram_range=(2, 2))

In [14]:
# Refit on the same corpus; `bow` now holds bigram counts instead of word counts.
bow = cv.fit_transform(raw_documents=df['text'])

In [15]:
# Bigram -> column-index mapping learned from the corpus.
print(cv.vocabulary_)

{'people watch': 2, 'watch friends': 4, 'friends watch': 0, 'people write': 3, 'write comment': 5, 'friends write': 1}


In [16]:
# Dense bigram count vectors for the first two sentences, one per line.
print(bow[0].toarray(), bow[1].toarray(), sep='\n')


[[0 0 1 0 1 0]]
[[1 0 0 0 1 0]]


### Benefits
1. Able to capture some of the semantics of a sentence, because word order within each n-gram is preserved
2. Easy to implement
3. Easy to understand intuitively

### Disadvantages
1. Dimension of the vocabulary increases
2. Slows down the algorithm
3. No solution for out-of-vocabulary words

## 3. TF-IDF

![TF.png](attachment:TF.png)

![IDF.png](attachment:IDF.png)

In [5]:
# Show the toy corpus again for reference before building TF-IDF features.
df

Unnamed: 0,text,output
0,people watch friends,1
1,friends watch friends,1
2,people write comment,0
3,friends write comment,0


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the corpus and show the dense weight matrix: one row per
# sentence, one column per vocabulary term.
tfidf = TfidfVectorizer()
(
    tfidf
    .fit_transform(df['text'])
    .toarray()
)

array([[0.        , 0.49681612, 0.61366674, 0.61366674, 0.        ],
       [0.        , 0.8508161 , 0.        , 0.52546357, 0.        ],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027],
       [0.61366674, 0.49681612, 0.        , 0.        , 0.61366674]])

### Advantages
1. Extensively used in Information Retrieval.

### Disadvantages
1. Sparsity
2. Out of vocabulary problem
3. Dimension increases
4. Cannot capture semantic relationship