In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np

In [3]:
# Sample sentences for the text-vectorization examples
# (original note: "text vectorization.txt").
txt = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
    'The last document?',
]

# Wrap the sentences in a Series so the notebook renders them with an index.
corpus = pd.Series(txt)
corpus

0            This is the first document.
1    This is the second second document.
2                     And the third one.
3            Is this the first document?
4                     The last document?
dtype: object

In [4]:
# Small Korean corpus for the CountVectorizer example: four short documents,
# each a whitespace-separated phrase. The word '딸기' appears twice in the
# third document, which makes the per-document counts easy to check by eye.
docs = [
    '먹고 싶은 사과',
    '먹고 싶은 딸기',
    '맛있고 빨간 딸기 딸기',
    '저는 과일이 좋아요'
]

In [5]:
# Bag-of-words model with default settings.
vect = CountVectorizer()
# Learn the vocabulary from `docs` and build the document-term matrix in one
# step; the result is a sparse matrix (one row per document, one column per term).
vv = vect.fit_transform(docs)

In [6]:
# Show the sparse matrix summary (shape, dtype, number of stored entries).
vv

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [7]:
# Densify the sparse matrix to inspect the per-document term counts directly.
vv.toarray()

array([[0, 0, 0, 1, 0, 1, 1, 0, 0],
       [0, 1, 0, 1, 0, 0, 1, 0, 0],
       [0, 2, 1, 0, 1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 1]], dtype=int64)

In [8]:
# Fitted mapping from each term to its column index in the matrix.
vect.vocabulary_

{'먹고': 3,
 '싶은': 6,
 '사과': 5,
 '딸기': 1,
 '맛있고': 2,
 '빨간': 4,
 '저는': 7,
 '과일이': 0,
 '좋아요': 8}

In [11]:
# List the (term, column index) pairs in column order so they line up with
# the columns of the matrix shown above.
sorted(
    vect.vocabulary_.items(),
    key=lambda term_and_index: term_and_index[1],
)

[('과일이', 0),
 ('딸기', 1),
 ('맛있고', 2),
 ('먹고', 3),
 ('빨간', 4),
 ('사과', 5),
 ('싶은', 6),
 ('저는', 7),
 ('좋아요', 8)]

In [13]:
# Present the count matrix as a labelled table: one row per document, one
# column per term.
# The labels must follow the matrix's column indices, so order the terms by
# their index in vocabulary_ instead of alphabetically. The two orders only
# coincide when the vectorizer built the vocabulary itself; with a
# user-supplied vocabulary an alphabetical sort would mislabel the columns.
df = pd.DataFrame(
    vv.toarray(),
    columns=sorted(vect.vocabulary_, key=vect.vocabulary_.get),
)
df

Unnamed: 0,과일이,딸기,맛있고,먹고,빨간,사과,싶은,저는,좋아요
0,0,0,0,1,0,1,1,0,0
1,0,1,0,1,0,0,1,0,0
2,0,2,1,0,1,0,0,0,0
3,1,0,0,0,0,0,0,1,1


## TF-IDF

In [None]:
# Sample sentences for the TF-IDF example (original note: "text
# vectorization.txt"). Same sentences as the corpus defined at the top of the
# notebook, repeated for this section.
txt = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
    'The last document?',
]

# Wrap the sentences in a Series so the notebook renders them with an index.
corpus = pd.Series(txt)
corpus

In [14]:
# Show the raw sentence list that is passed to the TF-IDF vectorizer below.
txt

['This is the first document.',
 'This is the second second document.',
 'And the third one.',
 'Is this the first document?',
 'The last document?']

In [15]:
# TF-IDF model with default settings. NOTE: this rebinds `vect` and `vv`,
# replacing the CountVectorizer objects created earlier in the notebook.
vect = TfidfVectorizer()
# Learn the vocabulary from `txt` and compute the TF-IDF weighted
# document-term matrix in one step.
vv = vect.fit_transform( txt )

In [17]:
# Fitted mapping from each (lower-cased) term to its column index.
vect.vocabulary_

{'this': 9,
 'is': 3,
 'the': 7,
 'first': 2,
 'document': 1,
 'second': 6,
 'and': 0,
 'third': 8,
 'one': 5,
 'last': 4}

In [18]:
# List the (term, column index) pairs in column order so they line up with
# the columns of the TF-IDF matrix.
sorted(
    vect.vocabulary_.items(),
    key=lambda term_and_index: term_and_index[1],
)

[('and', 0),
 ('document', 1),
 ('first', 2),
 ('is', 3),
 ('last', 4),
 ('one', 5),
 ('second', 6),
 ('the', 7),
 ('third', 8),
 ('this', 9)]

In [21]:
# Densify the sparse matrix to inspect the TF-IDF weight of every term in
# every document.
vv.toarray()

array([[0.        , 0.38947624, 0.55775063, 0.4629834 , 0.        ,
        0.        , 0.        , 0.32941651, 0.        , 0.4629834 ],
       [0.        , 0.24151532, 0.        , 0.28709733, 0.        ,
        0.        , 0.85737594, 0.20427211, 0.        , 0.28709733],
       [0.55666851, 0.        , 0.        , 0.        , 0.        ,
        0.55666851, 0.        , 0.26525553, 0.55666851, 0.        ],
       [0.        , 0.38947624, 0.55775063, 0.4629834 , 0.        ,
        0.        , 0.        , 0.32941651, 0.        , 0.4629834 ],
       [0.        , 0.45333103, 0.        , 0.        , 0.80465933,
        0.        , 0.        , 0.38342448, 0.        , 0.        ]])

In [22]:
# Present the TF-IDF matrix as a labelled table: one row per document, one
# column per term. NOTE: this rebinds `df`, replacing the count table built
# earlier in the notebook.
# The labels must follow the matrix's column indices, so order the terms by
# their index in vocabulary_ instead of alphabetically. The two orders only
# coincide when the vectorizer built the vocabulary itself; with a
# user-supplied vocabulary an alphabetical sort would mislabel the columns.
df = pd.DataFrame(
    vv.toarray(),
    columns=sorted(vect.vocabulary_, key=vect.vocabulary_.get),
)
df

Unnamed: 0,and,document,first,is,last,one,second,the,third,this
0,0.0,0.389476,0.557751,0.462983,0.0,0.0,0.0,0.329417,0.0,0.462983
1,0.0,0.241515,0.0,0.287097,0.0,0.0,0.857376,0.204272,0.0,0.287097
2,0.556669,0.0,0.0,0.0,0.0,0.556669,0.0,0.265256,0.556669,0.0
3,0.0,0.389476,0.557751,0.462983,0.0,0.0,0.0,0.329417,0.0,0.462983
4,0.0,0.453331,0.0,0.0,0.804659,0.0,0.0,0.383424,0.0,0.0
