# DataFrame - Tokenize - TF-IDF

In [1]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Toy corpus: five short sentences, each treated as one document.
docs = [
    "the house had a tiny little mouse",
    "the cat saw the mouse",
    "the mouse ran away from the house",
    "the cat finally ate the mouse",
    "the end of the mouse story",
]

In [8]:
# Wrap the corpus in a one-column frame; each row holds one document under 'Doc'.
df = pd.DataFrame({'Doc': docs})

In [46]:
# Preview the first five rows of the document frame.
df.head(n=5)

Unnamed: 0,Doc
0,the house had a tiny little mouse
1,the cat saw the mouse
2,the mouse ran away from the house
3,the cat finally ate the mouse
4,the end of the mouse story


In [62]:
# Bag-of-words vectorizer; scikit-learn's built-in English stop-word list
# is dropped from the vocabulary (e.g. "the", "a", "of").
cv = CountVectorizer(
    stop_words='english',
)

In [63]:
# Learn the vocabulary from the 'Doc' column and build the
# document-term count matrix in one pass.
cvw = cv.fit_transform(df.loc[:, 'Doc'])

In [64]:
# Dense view of the count matrix, one column per vocabulary term.
# - `.toarray()` replaces the sparse-matrix `.A` shortcut, which SciPy 1.14 removed.
# - `get_feature_names_out()` replaces `get_feature_names()`, which
#   scikit-learn 1.2 removed (the replacement requires scikit-learn >= 1.0).
dfc = pd.DataFrame(cvw.toarray(), columns=cv.get_feature_names_out())

#### Tokenized documents as a count matrix (dense view of the sparse matrix)

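Each document is tokenized, and the number of times each vocabulary term occurs in it is stored in a sparse document-term matrix. The table below shows that matrix in dense form: one row per document, one column per term.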

In [65]:
# Display the document-term count table built from the count vectorizer output.
dfc

Unnamed: 0,ate,away,cat,end,finally,house,little,mouse,ran,saw,story,tiny
0,0,0,0,0,0,1,1,1,0,0,0,1
1,0,0,1,0,0,0,0,1,0,1,0,0
2,0,1,0,0,0,1,0,1,1,0,0,0
3,1,0,1,0,1,0,0,1,0,0,0,0
4,0,0,0,1,0,0,0,1,0,0,1,0


In [66]:
# Re-weight the raw counts with TF-IDF. IDF weighting is enabled, and
# smoothing adds one to every document frequency to avoid division by zero.
tf_idf = TfidfTransformer(
    use_idf=True,
    smooth_idf=True,
)
# Fit the IDF weights on the count matrix and transform it in the same call.
tf_idf_v = tf_idf.fit_transform(cvw)

#### TF-IDF for each document

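TF-IDF evaluates how relevant a word is to a document in a collection of documents. It is computed by multiplying two metrics: how many times the word appears in the document (term frequency), and the inverse document frequency of the word across the whole collection. By default, scikit-learn's `TfidfTransformer` also L2-normalizes each row, which is why the values below are not raw products.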

In [67]:
# Dense TF-IDF table: one row per document, one column per vocabulary term.
# - `.toarray()` returns a plain ndarray instead of the discouraged
#   `np.matrix` produced by `.todense()`.
# - `get_feature_names_out()` replaces `get_feature_names()`, which
#   scikit-learn 1.2 removed (the replacement requires scikit-learn >= 1.0).
pd.DataFrame(tf_idf_v.toarray(), columns=cv.get_feature_names_out())

Unnamed: 0,ate,away,cat,end,finally,house,little,mouse,ran,saw,story,tiny
0,0.0,0.0,0.0,0.0,0.0,0.475575,0.589463,0.280882,0.0,0.0,0.0,0.589463
1,0.0,0.0,0.588732,0.0,0.0,0.0,0.0,0.347715,0.0,0.729718,0.0,0.0
2,0.0,0.589463,0.0,0.0,0.0,0.475575,0.0,0.280882,0.589463,0.0,0.0,0.0
3,0.589463,0.0,0.475575,0.0,0.589463,0.0,0.0,0.280882,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.670092,0.0,0.0,0.0,0.319302,0.0,0.0,0.670092,0.0
