In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Toy corpus: four short sentences, one document per list entry.
corpus = [
    'Data science is a study of data and arts and science.',
    'Generally art graduates are right brain and science graduate are left brain',
    'Excelling in both art and science at a time is difficult',
    'Natural Language processing is a part of data science',
]

In [3]:
# Echo the corpus so the four input documents are visible in the notebook.
corpus

['Data science is a study of data and arts and science.',
 'Generally art graduates are right brain and science graduate are left brain',
 'Excelling in both art and science at a time is difficult',
 'Natural Language processing is a part of data science']

In [4]:
# Learn the vocabulary and IDF weights from the corpus first ...
tfidfModel = TfidfVectorizer().fit(corpus)
# ... then encode those same documents as a sparse TF-IDF matrix
# (one row per document, one column per vocabulary term).
data = tfidfModel.transform(corpus)

In [5]:
# Show the sparse-matrix summary (shape, dtype, number of stored elements)
# rather than the individual weights.
data

<4x25 sparse matrix of type '<class 'numpy.float64'>'
	with 35 stored elements in Compressed Sparse Row format>

In [7]:
# List the learned terms in column order.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns a NumPy array, so
# `.tolist()` converts it back to a plain list of `str` for display.
tfidfModel.get_feature_names_out().tolist()

['and',
 'are',
 'art',
 'arts',
 'at',
 'both',
 'brain',
 'data',
 'difficult',
 'excelling',
 'generally',
 'graduate',
 'graduates',
 'in',
 'is',
 'language',
 'left',
 'natural',
 'of',
 'part',
 'processing',
 'right',
 'science',
 'study',
 'time']

In [8]:
# Mapping from each term to its column index in the TF-IDF matrix.
tfidfModel.vocabulary_

{'data': 7,
 'science': 22,
 'is': 14,
 'study': 23,
 'of': 18,
 'and': 0,
 'arts': 3,
 'generally': 10,
 'art': 2,
 'graduates': 12,
 'are': 1,
 'right': 21,
 'brain': 6,
 'graduate': 11,
 'left': 16,
 'excelling': 9,
 'in': 13,
 'both': 5,
 'at': 4,
 'time': 24,
 'difficult': 8,
 'natural': 17,
 'language': 15,
 'processing': 20,
 'part': 19}

In [9]:
# Expand the sparse matrix so every TF-IDF weight is visible:
# one row per document, one column per vocabulary index.
# NOTE(review): `todense()` yields a `numpy.matrix` (see the `matrix(...)`
# repr below); consider `toarray()` if a plain ndarray is preferred — confirm.
data.todense()

matrix([[0.44486984, 0.        , 0.        , 0.34848729, 0.        ,
         0.        , 0.        , 0.54950276, 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.22243492,
         0.        , 0.        , 0.        , 0.27475138, 0.        ,
         0.        , 0.        , 0.36371025, 0.34848729, 0.        ],
        [0.16878271, 0.52886145, 0.20848022, 0.        , 0.        ,
         0.        , 0.52886145, 0.        , 0.        , 0.        ,
         0.26443072, 0.26443072, 0.26443072, 0.        , 0.        ,
         0.        , 0.26443072, 0.        , 0.        , 0.        ,
         0.        , 0.26443072, 0.13799092, 0.        , 0.        ],
        [0.22989237, 0.        , 0.28396281, 0.        , 0.36017082,
         0.36017082, 0.        , 0.        , 0.36017082, 0.36017082,
         0.        , 0.        , 0.        , 0.36017082, 0.22989237,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        ,

In [10]:
# Learned IDF weight for each term, in the same order as the matrix columns.
tfidfModel.idf_

array([1.22314355, 1.91629073, 1.51082562, 1.91629073, 1.91629073,
       1.91629073, 1.91629073, 1.51082562, 1.91629073, 1.91629073,
       1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.22314355,
       1.91629073, 1.91629073, 1.91629073, 1.51082562, 1.91629073,
       1.91629073, 1.91629073, 1.        , 1.91629073, 1.91629073])

In [14]:
# Pair every vocabulary term with its learned IDF weight.
# `get_feature_names()` was removed in scikit-learn 1.2, so use
# `get_feature_names_out()`. Both arrays are in column order, so zipping
# them lines each term up with its weight. `.tolist()` turns the NumPy
# scalars into plain `str` / `float` so the dict displays cleanly.
words_idf = dict(
    zip(
        tfidfModel.get_feature_names_out().tolist(),
        tfidfModel.idf_.tolist(),
    )
)


In [15]:
# Rank terms from most common (lowest IDF) to rarest (highest IDF).
# The sort is stable, so terms with equal weight keep their dict order.
sorted(
    words_idf.items(),
    key=lambda term_and_weight: term_and_weight[1],
)

[('science', 1.0),
 ('and', 1.2231435513142097),
 ('is', 1.2231435513142097),
 ('art', 1.5108256237659907),
 ('data', 1.5108256237659907),
 ('of', 1.5108256237659907),
 ('are', 1.916290731874155),
 ('arts', 1.916290731874155),
 ('at', 1.916290731874155),
 ('both', 1.916290731874155),
 ('brain', 1.916290731874155),
 ('difficult', 1.916290731874155),
 ('excelling', 1.916290731874155),
 ('generally', 1.916290731874155),
 ('graduate', 1.916290731874155),
 ('graduates', 1.916290731874155),
 ('in', 1.916290731874155),
 ('language', 1.916290731874155),
 ('left', 1.916290731874155),
 ('natural', 1.916290731874155),
 ('part', 1.916290731874155),
 ('processing', 1.916290731874155),
 ('right', 1.916290731874155),
 ('study', 1.916290731874155),
 ('time', 1.916290731874155)]

In [18]:
# Build a labelled document-term table: one row per document, one column
# per term.
#
# Column labels are ordered by each term's column index in `vocabulary_`,
# not alphabetically. An alphabetical sort matches the matrix columns only
# when the vocabulary was itself built in sorted order, so ordering by
# index keeps the labels right regardless.
# `toarray()` gives a plain ndarray instead of the legacy `numpy.matrix`
# returned by `todense()`; the resulting DataFrame is the same.
tfidf_df = pd.DataFrame(
    data.toarray(),
    columns=sorted(tfidfModel.vocabulary_, key=tfidfModel.vocabulary_.get),
)
tfidf_df.head()

Unnamed: 0,and,are,art,arts,at,both,brain,data,difficult,excelling,...,language,left,natural,of,part,processing,right,science,study,time
0,0.44487,0.0,0.0,0.348487,0.0,0.0,0.0,0.549503,0.0,0.0,...,0.0,0.0,0.0,0.274751,0.0,0.0,0.0,0.36371,0.348487,0.0
1,0.168783,0.528861,0.20848,0.0,0.0,0.0,0.528861,0.0,0.0,0.0,...,0.0,0.264431,0.0,0.0,0.0,0.0,0.264431,0.137991,0.0,0.0
2,0.229892,0.0,0.283963,0.0,0.360171,0.360171,0.0,0.0,0.360171,0.360171,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.187952,0.0,0.360171
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.323955,0.0,0.0,...,0.410896,0.0,0.410896,0.323955,0.410896,0.410896,0.0,0.214423,0.0,0.0
