In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

In [2]:
# Toy corpus: three short documents, one string per document.
corpus = [
    'The series depicts the story of three poor sisters who are close',
    'their involvement in the case of 70 billion won which goes missing',
    'how they bravely face the richest family who are their opponents',
]

# Bag-of-words term counts using CountVectorizer (input to TF-IDF)

In [3]:
# Create a CountVectorizer with its default settings (no arguments passed).
cv = CountVectorizer()
# Display the (still unfitted) estimator.
cv

CountVectorizer()

In [7]:
# Fit the vectorizer on the corpus and build the document-term count matrix.
# The result is a sparse matrix with one row per document.
word_count_vect = cv.fit_transform(corpus)
word_count_vect

<3x28 sparse matrix of type '<class 'numpy.int64'>'
	with 34 stored elements in Compressed Sparse Row format>

In [8]:
# Printing the sparse matrix lists only the stored entries as
# "(document_index, term_index)  count".
print(word_count_vect)

  (0, 21)	2
  (0, 18)	1
  (0, 6)	1
  (0, 20)	1
  (0, 14)	1
  (0, 24)	1
  (0, 16)	1
  (0, 19)	1
  (0, 26)	1
  (0, 1)	1
  (0, 5)	1
  (1, 21)	1
  (1, 14)	1
  (1, 22)	1
  (1, 12)	1
  (1, 11)	1
  (1, 4)	1
  (1, 0)	1
  (1, 2)	1
  (1, 27)	1
  (1, 25)	1
  (1, 9)	1
  (1, 13)	1
  (2, 21)	1
  (2, 26)	1
  (2, 1)	1
  (2, 22)	1
  (2, 10)	1
  (2, 23)	1
  (2, 3)	1
  (2, 7)	1
  (2, 17)	1
  (2, 8)	1
  (2, 15)	1


In [9]:
print(word_count_vect.shape) # (rows, columns) = (3 documents, 28 vocabulary terms)

(3, 28)


In [10]:
# View the dense representation: convert the sparse count matrix to a regular
# NumPy array with one row per document and one column per vocabulary term.
cv_matrix = word_count_vect.toarray()
cv_matrix

array([[0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 2,
        0, 0, 1, 0, 1, 0],
       [1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
        1, 1, 0, 0, 1, 0]])

In [11]:
# Get all unique words in the corpus, in the same order as the matrix columns.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# It returns a NumPy array, so .tolist() converts it to a plain list of str
# and the cell still displays an ordinary Python list.
vocab = cv.get_feature_names_out().tolist()
vocab



['70',
 'are',
 'billion',
 'bravely',
 'case',
 'close',
 'depicts',
 'face',
 'family',
 'goes',
 'how',
 'in',
 'involvement',
 'missing',
 'of',
 'opponents',
 'poor',
 'richest',
 'series',
 'sisters',
 'story',
 'the',
 'their',
 'they',
 'three',
 'which',
 'who',
 'won']

In [12]:
# Bag-of-words model: one row per document, one column per vocabulary term,
# each cell holding the raw count of that term in that document.
pd.DataFrame(cv_matrix, columns=vocab)

Unnamed: 0,70,are,billion,bravely,case,close,depicts,face,family,goes,...,series,sisters,story,the,their,they,three,which,who,won
0,0,1,0,0,0,1,1,0,0,0,...,1,1,1,2,0,0,1,0,1,0
1,1,0,1,0,1,0,0,0,0,1,...,0,0,0,1,1,0,0,1,0,1
2,0,1,0,1,0,0,0,1,1,0,...,0,0,0,1,1,1,0,0,1,0


# TF-IDF using TfidfTransformer

In [13]:
# Re-weight the raw term counts with TF-IDF (IDF weighting explicitly enabled).
tf_idf = TfidfTransformer(use_idf=True)
# Fit on the count matrix, transform it, and densify the result in one chain
# so `tf_idf_matrix` is only ever bound to the dense array.
tf_idf_matrix = tf_idf.fit_transform(cv_matrix).toarray()
tf_idf_matrix

array([[0.        , 0.23894521, 0.        , 0.        , 0.        ,
        0.31418424, 0.31418424, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.23894521,
        0.        , 0.31418424, 0.        , 0.31418424, 0.31418424,
        0.31418424, 0.37112454, 0.        , 0.        , 0.31418424,
        0.        , 0.23894521, 0.        ],
       [0.30852405, 0.        , 0.30852405, 0.        , 0.30852405,
        0.        , 0.        , 0.        , 0.        , 0.30852405,
        0.        , 0.30852405, 0.30852405, 0.30852405, 0.23464049,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.18221927, 0.23464049, 0.        , 0.        ,
        0.30852405, 0.        , 0.30852405],
       [0.        , 0.25233341, 0.        , 0.33178811, 0.        ,
        0.        , 0.        , 0.33178811, 0.33178811, 0.        ,
        0.33178811, 0.        , 0.        , 0.        , 0.        ,
        0.33178811, 0.    

In [14]:
# Tabulate the TF-IDF weights per document and term, rounded to 4 decimals.
pd.DataFrame(tf_idf_matrix, columns=vocab).round(4)

Unnamed: 0,70,are,billion,bravely,case,close,depicts,face,family,goes,...,series,sisters,story,the,their,they,three,which,who,won
0,0.0,0.2389,0.0,0.0,0.0,0.3142,0.3142,0.0,0.0,0.0,...,0.3142,0.3142,0.3142,0.3711,0.0,0.0,0.3142,0.0,0.2389,0.0
1,0.3085,0.0,0.3085,0.0,0.3085,0.0,0.0,0.0,0.0,0.3085,...,0.0,0.0,0.0,0.1822,0.2346,0.0,0.0,0.3085,0.0,0.3085
2,0.0,0.2523,0.0,0.3318,0.0,0.0,0.0,0.3318,0.3318,0.0,...,0.0,0.0,0.0,0.196,0.2523,0.3318,0.0,0.0,0.2523,0.0
