# Sklearn Count Vectorizer

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

### Initialize CountVectorizer

In [72]:
# Bag-of-words vectorizer that filters out scikit-learn's built-in English
# stop-word list (so words such as "the" never become features).
cv = CountVectorizer(
    stop_words="english",
)

### Create Example Data

In [60]:
# Toy corpus: five short sentences stored under a single 'Article' key,
# one sentence per future DataFrame row.
doc = {
    'Article': [
        'the house had a tiny little mouse',
        'the cat saw the mouse',
        'the mouse ran away from the house',
        'the cat finally ate the mouse',
        'the end of the mouse story',
    ],
}

In [61]:
# Wrap the corpus dict in a DataFrame: the 'Article' key becomes the column.
df = pd.DataFrame(data=doc)

In [62]:
# Preview the first rows (5 is the default, which covers the whole corpus here).
df.head(n=5)

Unnamed: 0,Article
0,the house had a tiny little mouse
1,the cat saw the mouse
2,the mouse ran away from the house
3,the cat finally ate the mouse
4,the end of the mouse story


In [73]:
# Learn the vocabulary from the articles and encode each one as a row of
# term counts in a single step.
count_vec = cv.fit_transform(raw_documents=df['Article'])

### Vocabulary learned from the Article column (term → feature index)

In [74]:
# Fitted vocabulary: maps each kept term to its column index in the count
# matrix. The dict is shown in insertion order, not sorted by index.
cv.vocabulary_

{'house': 5,
 'tiny': 11,
 'little': 6,
 'mouse': 7,
 'cat': 2,
 'saw': 9,
 'ran': 8,
 'away': 1,
 'finally': 4,
 'ate': 0,
 'end': 3,
 'story': 10}

In [76]:
# Feature (term) names ordered by column index.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns a NumPy array, so
# `.tolist()` is used to keep the displayed result a plain list of strings.
cv.get_feature_names_out().tolist()

['ate',
 'away',
 'cat',
 'end',
 'finally',
 'house',
 'little',
 'mouse',
 'ran',
 'saw',
 'story',
 'tiny']

In [77]:
# Densify the sparse count matrix and label its columns with the learned terms.
# - `get_feature_names_out()` replaces `get_feature_names()`, which was removed
#   in scikit-learn 1.2.
# - `.toarray()` is the documented dense conversion; the `.A` shorthand is not
#   available on SciPy's newer sparse array classes.
dfc = pd.DataFrame(
    count_vec.toarray(),
    columns=cv.get_feature_names_out(),
)

### Document-term count matrix (one row per article, one column per term)

In [78]:
# Show the whole count table; it is small enough here (5 rows x 12 columns)
# to display in full.
dfc

Unnamed: 0,ate,away,cat,end,finally,house,little,mouse,ran,saw,story,tiny
0,0,0,0,0,0,1,1,1,0,0,0,1
1,0,0,1,0,0,0,0,1,0,1,0,0
2,0,1,0,0,0,1,0,1,1,0,0,0
3,1,0,1,0,1,0,0,1,0,0,0,0
4,0,0,0,1,0,0,0,1,0,0,1,0


# Sklearn TfidfTransformer

### Initialize tf-idf 

In [79]:
from sklearn.feature_extraction.text import TfidfTransformer

In [81]:
# TF-IDF re-weighter with IDF weighting and IDF smoothing both switched on
# explicitly.
tf_idf = TfidfTransformer(
    use_idf=True,
    smooth_idf=True,
)

In [84]:
# Fit the IDF weights on the count table and display the resulting TF-IDF
# matrix (the fitted weights are read from `tf_idf.idf_` afterwards).
tf_idf.fit_transform(X=dfc)

<5x12 sparse matrix of type '<class 'numpy.float64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [85]:
# One row per term holding its fitted IDF weight.
# `get_feature_names_out()` replaces `get_feature_names()`, which was removed
# in scikit-learn 1.2; its array result is accepted directly as the index.
df_idf = pd.DataFrame(
    tf_idf.idf_,
    index=cv.get_feature_names_out(),
    columns=["idf_weights"],
)
# Ascending sort: terms that appear in the most documents (lowest IDF) come first.
df_idf.sort_values(by=['idf_weights'])

Unnamed: 0,idf_weights
mouse,1.0
cat,1.693147
house,1.693147
ate,2.098612
away,2.098612
end,2.098612
finally,2.098612
little,2.098612
ran,2.098612
saw,2.098612
