## Bag of Words
### 1. CountVectorizer

In [1]:
# Single sample sentence used as the one-document corpus for the CountVectorizer examples.
text = 'My wife likes to watch baseball games and my daughter likes to watch baseball games too.'

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
# Create a vectorizer with default settings and display its parameters.
cvet = CountVectorizer()
cvet.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [3]:
# Learn the vocabulary and count each word; toarray() shows the counts as a dense array.
# NOTE(review): `ouput` looks like a typo for `output` — left unchanged here.
ouput = cvet.fit_transform([text])      # [text]: a corpus usually holds many documents, so the input is passed as a list
ouput.toarray()

array([[1, 2, 1, 2, 2, 2, 2, 1, 2, 1]], dtype=int64)

In [4]:
# How to read array([[1, 2, 1, 2, 2, 2, 2, 1, 2, 1]]): vocabulary_ maps each word
# to its column index in that array.
cvet.vocabulary_

{'my': 5,
 'wife': 9,
 'likes': 4,
 'to': 6,
 'watch': 8,
 'baseball': 1,
 'games': 3,
 'and': 0,
 'daughter': 2,
 'too': 7}

- 불용어(Stopwords) 처리

In [5]:
# Specify the stop words directly: the listed words are left out of the vocabulary.
my_stopwords = ['my', 'to', 'and']
cvet = CountVectorizer(stop_words=my_stopwords)
print(cvet.fit_transform([text]).toarray())
print(cvet.vocabulary_)

[[2 1 2 2 1 2 1]]
{'wife': 6, 'likes': 3, 'watch': 5, 'baseball': 0, 'games': 2, 'daughter': 1, 'too': 4}


In [6]:
# Use the English stop-word list provided by NLTK.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available
# locally — confirm the environment setup.
from nltk.corpus import stopwords
sw = stopwords.words('english')
cvet = CountVectorizer(stop_words=sw)
print(cvet.fit_transform([text]).toarray())
print(cvet.vocabulary_)

[[2 1 2 2 2 1]]
{'wife': 5, 'likes': 3, 'watch': 4, 'baseball': 0, 'games': 2, 'daughter': 1}


In [7]:
# Use the English stop-word list built into scikit-learn.
cvet = CountVectorizer(stop_words='english')
print(cvet.fit_transform([text]).toarray())
print(cvet.vocabulary_)

[[2 1 2 2 2 1]]
{'wife': 5, 'likes': 3, 'watch': 4, 'baseball': 0, 'games': 2, 'daughter': 1}


- 인덱스에 해당하는 단어를 알려주는 함수

In [12]:
# Print every (word, column index) pair of the fitted vocabulary.
voca = cvet.vocabulary_
for key, value in voca.items():
    print(key, value)

wife 5
likes 3
watch 4
baseball 0
games 2
daughter 1


In [14]:
def get_word(index, voca):
    """Return the word that ``voca`` maps to ``index``.

    Args:
        index: Column index to look up.
        voca: Mapping of word -> index, e.g. a fitted vectorizer's
            ``vocabulary_``.

    Returns:
        The first word (in the mapping's iteration order) whose value
        equals ``index``, or ``None`` when no word has that index.
    """
    matches = (word for word, position in voca.items() if index == position)
    return next(matches, None)
        
# Look up the word stored at column index 4.
# The vocabulary here is
# {'wife': 5, 'likes': 3, 'watch': 4, 'baseball': 0, 'games': 2, 'daughter': 1},
# so the key whose value equals the given number is returned.
get_word(4, cvet.vocabulary_)

'watch'

### 2. TF-IDF (Term Frequency - Inverse Document Frequency) Vectorizer

In [16]:
# Two-document corpus for the TF-IDF example; `text` is now a list of strings.
text = ['My wife likes to watch baseball games and my daughter likes to watch baseball games too.',
        'My wife likes to play baseball.']

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF vectorizer (English stop words removed) and show the weights as a dense array.
tvect = TfidfVectorizer(stop_words='english')
tvect.fit_transform(text).toarray()             # text is already a list this time, so it is not wrapped in another list

# 'baseball' appears in both documents, so its weight is reduced.

array([[0.38649245, 0.27160082, 0.54320165, 0.38649245, 0.        ,
        0.54320165, 0.19324622],
       [0.44832087, 0.        , 0.        , 0.44832087, 0.63009934,
        0.        , 0.44832087]])

In [22]:
# Show which word each column of the TF-IDF array corresponds to.
print(tvect.vocabulary_)

{'wife': 6, 'likes': 3, 'watch': 5, 'baseball': 0, 'games': 2, 'daughter': 1, 'play': 4}


In [23]:
# Plain word counts for the same two documents, for comparison with the TF-IDF weights.
cvet = CountVectorizer(stop_words='english')
cvet.fit_transform(text).toarray()

array([[2, 1, 2, 2, 0, 2, 1],
       [1, 0, 0, 1, 1, 0, 1]], dtype=int64)

### 3. N-gram

In [24]:
# Two sentences built from the same words in a different order, used for the n-gram example.
text = ['Machine learning is fun and not boring',
        'Machine is boring and learning is not fun']

In [25]:
# Unigram counts with English stop words removed; both sentences produce the same vector.
cvet = CountVectorizer(stop_words='english')
cvet.fit_transform(text).toarray()

array([[1, 1, 1, 1],
       [1, 1, 1, 1]], dtype=int64)

In [26]:
cvet = CountVectorizer(stop_words='english', ngram_range=(1,2))
# ngram_range=(1,2): use unigrams (1) and bigrams (2) together
cvet.fit_transform(text).toarray()

array([[1, 0, 1, 1, 1, 1, 1, 0, 1],
       [1, 1, 1, 0, 1, 1, 1, 1, 0]], dtype=int64)

In [27]:
# Show which unigram or bigram each column index corresponds to.
cvet.vocabulary_

{'machine': 6,
 'learning': 4,
 'fun': 2,
 'boring': 0,
 'machine learning': 8,
 'learning fun': 5,
 'fun boring': 3,
 'machine boring': 7,
 'boring learning': 1}

### 4. Hyperparameters

In [28]:
# Display the parameters of the TfidfVectorizer created earlier (`tvect`).
tvect.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': 'english',
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}