### **Bag of Words (BoW) with CountVectorizer**
**BoW ignores the order of words; it only records how many times each word appears.**

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Toy corpus shared by every vectorizer example in this notebook (one document per string).
sentences = [
    'the brown dog dog',
    'dog eats cat',
    'dog eats food',
    'the hot dog',
    'cat inside the home',
    'dog outside the cat',
]

In [37]:
# Learn the vocabulary from the corpus and count every word per sentence in one pass.
# The fitted `vectorizer` is kept so that later cells can call transform() on new text.
vectorizer = CountVectorizer()

# Sparse document-term matrix holding the word counts of each sentence
mat_content = vectorizer.fit_transform(sentences)

# Each vocabulary word becomes one feature (column).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
features = vectorizer.get_feature_names_out()

# toarray() yields a plain ndarray (todense() returns the discouraged np.matrix type).
pd.DataFrame(mat_content.toarray(), columns=features, index=sentences)

Unnamed: 0,brown,cat,dog,eats,food,home,hot,inside,outside,the
the brown dog dog,1,0,2,0,0,0,0,0,0,1
dog eats cat,0,1,1,1,0,0,0,0,0,0
dog eats food,0,0,1,1,1,0,0,0,0,0
the hot dog,0,0,1,0,0,0,1,0,0,1
cat inside the home,0,1,0,0,0,1,0,1,0,1
dog outside the cat,0,1,1,0,0,0,0,0,1,1


In [25]:
# Note_1: the vector length is fixed by the fitted vocabulary, so repeating
# known words only raises their counts and never adds columns.
repeated_words_doc = ['cat cat cat dog dog dog hot']
vectorizer.transform(repeated_words_doc).todense()

matrix([[0, 3, 3, 0, 0, 1, 0]], dtype=int64)

In [26]:
# Note_2: a word that was not in the fitted vocabulary (here 'man') is
# ignored by transform() and does not show up in the counts.
unseen_word_doc = ['man cat cat cat dog dog dog hot']
vectorizer.transform(unseen_word_doc).todense()

matrix([[0, 3, 3, 0, 0, 1, 0]], dtype=int64)

In [38]:
# The stop word 'the' was counted as a regular feature in the previous table;
# exclude it with scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(stop_words='english')
mat_content = vectorizer.fit_transform(sentences)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
features = vectorizer.get_feature_names_out()

pd.DataFrame(mat_content.toarray(), columns=features, index=sentences)

Unnamed: 0,brown,cat,dog,eats,food,home,hot,inside,outside
the brown dog dog,1,0,2,0,0,0,0,0,0
dog eats cat,0,1,1,1,0,0,0,0,0
dog eats food,0,0,1,1,1,0,0,0,0
the hot dog,0,0,1,0,0,0,1,0,0
cat inside the home,0,1,0,0,0,1,0,1,0
dog outside the cat,0,1,1,0,0,0,0,0,1


In [39]:
# Custom stop words let you drop the terms that carry no signal in your own domain.
custom_stop_word = ['inside', 'outside', 'the']
vectorizer = CountVectorizer(stop_words=custom_stop_word)
mat_content = vectorizer.fit_transform(sentences)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
features = vectorizer.get_feature_names_out()

pd.DataFrame(mat_content.toarray(), columns=features, index=sentences)

Unnamed: 0,brown,cat,dog,eats,food,home,hot
the brown dog dog,1,0,2,0,0,0,0
dog eats cat,0,1,1,1,0,0,0
dog eats food,0,0,1,1,1,0,0
the hot dog,0,0,1,0,0,0,1
cat inside the home,0,1,0,0,0,1,0
dog outside the cat,0,1,1,0,0,0,0


### **N-Grams**
**Widening the n-gram range increases the number of features**

In [41]:
# The only difference from plain BoW is the ngram_range parameter:
# (1, 2) keeps the single words and adds every pair of adjacent words as a feature.
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2))
mat_content = vectorizer.fit_transform(sentences)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
features = vectorizer.get_feature_names_out()

pd.DataFrame(mat_content.toarray(), columns=features, index=sentences)

Unnamed: 0,brown,brown dog,cat,cat inside,dog,dog dog,dog eats,dog outside,eats,eats cat,eats food,food,home,hot,hot dog,inside,inside home,outside,outside cat
the brown dog dog,1,1,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0
dog eats cat,0,0,1,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0
dog eats food,0,0,0,0,1,0,1,0,1,0,1,1,0,0,0,0,0,0,0
the hot dog,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0
cat inside the home,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0
dog outside the cat,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1


**<h2>TF-IDF</h2>**
**<h6>TF-IDF can be used to build weighted word representations</h6>**
<img src="tf-idf.png">

In [None]:
# When to use TF-IDF:
# when your topics are separable (they contain different words).

# When not to use TF-IDF:
# when you care more about the context, or about sentiment analysis.

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [43]:
# TF-IDF over single words, with English stop words removed.
tf_idf_vectorizer = TfidfVectorizer(stop_words='english')
mat_content = tf_idf_vectorizer.fit_transform(sentences)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
features = tf_idf_vectorizer.get_feature_names_out()

# One row per sentence, one column per word, cells hold the TF-IDF weights.
pd.DataFrame(mat_content.toarray(), columns=features, index=sentences)

Unnamed: 0,brown,cat,dog,eats,food,home,hot,inside,outside
the brown dog dog,0.698445,0.0,0.715664,0.0,0.0,0.0,0.0,0.0,0.0
dog eats cat,0.0,0.582166,0.430816,0.689551,0.0,0.0,0.0,0.0,0.0
dog eats food,0.0,0.0,0.368313,0.589511,0.718903,0.0,0.0,0.0,0.0
the hot dog,0.0,0.0,0.455969,0.0,0.0,0.0,0.889996,0.0,0.0
cat inside the home,0.0,0.439681,0.0,0.0,0.0,0.635091,0.0,0.635091,0.0
dog outside the cat,0.0,0.524573,0.388196,0.0,0.0,0.0,0.0,0.0,0.757712


In [49]:
# TF-IDF with n-grams: single words plus pairs of adjacent words.
tf_idf_vectorizer_ng = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
mat_content = tf_idf_vectorizer_ng.fit_transform(sentences)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
features = tf_idf_vectorizer_ng.get_feature_names_out()

pd.DataFrame(mat_content.toarray(), columns=features, index=sentences)

Unnamed: 0,brown,brown dog,cat,cat inside,dog,dog dog,dog eats,dog outside,eats,eats cat,eats food,food,home,hot,hot dog,inside,inside home,outside,outside cat
the brown dog dog,0.496909,0.496909,0.0,0.0,0.50916,0.496909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dog eats cat,0.0,0.0,0.394058,0.0,0.291612,0.0,0.466745,0.0,0.466745,0.569191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dog eats food,0.0,0.0,0.0,0.0,0.269746,0.0,0.431746,0.0,0.431746,0.0,0.526511,0.526511,0.0,0.0,0.0,0.0,0.0,0.0,0.0
the hot dog,0.0,0.0,0.0,0.0,0.340608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.664826,0.664826,0.0,0.0,0.0,0.0
cat inside the home,0.0,0.0,0.327113,0.472493,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.472493,0.0,0.0,0.472493,0.472493,0.0,0.0
dog outside the cat,0.0,0.0,0.357901,0.0,0.264855,0.0,0.0,0.516965,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.516965,0.516965


In [48]:
# IDF value of every feature learned by the n-gram TF-IDF vectorizer.
# Names and weights are both read from the same fitted vectorizer: pairing the
# n-gram feature names with the unigram vectorizer's idf_ would make zip()
# silently truncate the list and attach the wrong weight to each name.
ngram_feature_names = tf_idf_vectorizer_ng.get_feature_names_out()
[(word, idf) for word, idf in zip(ngram_feature_names, tf_idf_vectorizer_ng.idf_)]

[('brown', 2.252762968495368),
 ('brown dog', 2.252762968495368),
 ('cat', 1.5596157879354227),
 ('cat inside', 2.252762968495368),
 ('dog', 1.1541506798272583),
 ('dog dog', 2.252762968495368),
 ('dog eats', 1.8472978603872037),
 ('dog outside', 2.252762968495368),
 ('eats', 1.8472978603872037),
 ('eats cat', 2.252762968495368),
 ('eats food', 2.252762968495368),
 ('food', 2.252762968495368),
 ('home', 2.252762968495368),
 ('hot', 2.252762968495368),
 ('hot dog', 2.252762968495368),
 ('inside', 2.252762968495368),
 ('inside home', 2.252762968495368),
 ('outside', 2.252762968495368),
 ('outside cat', 2.252762968495368)]