# 자연어 처리

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import preprocessing

# Six toy sentences used as the sentiment-classification corpus.
samples = [
    '너 오늘 이뻐 보인다',
    '나는 오늘 기분이 더러워',
    '끝내주는데, 좋은 일이 있나봐',
    '나 좋은 일이 생겼어',
    '아 오늘 진짜 짜증나',
    '환상적인데, 정말 좋은거 같아',
]

# One label per sentence, each wrapped in its own list. 1: positive, 0: negative
targets = [[label] for label in (1, 0, 1, 1, 0, 1)]

# Tokenization
tokenizer = preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(samples)                    # learn the word index from the sentences
sequences = tokenizer.texts_to_sequences(samples)  # returned as a list of integer-id lists
sequences

[[4, 1, 5, 6],
 [7, 1, 8, 9],
 [10, 2, 3, 11],
 [12, 2, 3, 13],
 [14, 1, 15, 16],
 [17, 18, 19, 20]]

In [None]:
# Convert the Python lists into NumPy arrays
import numpy as np

input_sequences, labels = np.array(sequences), np.array(targets)
input_sequences, labels

(array([[ 4,  1,  5,  6],
        [ 7,  1,  8,  9],
        [10,  2,  3, 11],
        [12,  2,  3, 13],
        [14,  1, 15, 16],
        [17, 18, 19, 20]]), array([[1],
        [0],
        [1],
        [1],
        [0],
        [1]]))

In [None]:
# Inspect the word -> integer-id mapping the tokenizer learned from the samples
word_index = tokenizer.word_index
word_index

{'오늘': 1,
 '좋은': 2,
 '일이': 3,
 '너': 4,
 '이뻐': 5,
 '보인다': 6,
 '나는': 7,
 '기분이': 8,
 '더러워': 9,
 '끝내주는데': 10,
 '있나봐': 11,
 '나': 12,
 '생겼어': 13,
 '아': 14,
 '진짜': 15,
 '짜증나': 16,
 '환상적인데': 17,
 '정말': 18,
 '좋은거': 19,
 '같아': 20}

In [None]:
# 책의 오탈자.
# (전) 앞에서 다룬 내용이므로 전처리 과정에 대한 설명은 생략한다.
# (후) 전처리 과정은 뒤에서 다시 다룰 예정이므로 여기서는 간단하게 넘어가도록 한다. 간단히 설명하면 텍스트를 모델에 들어갈 수 있는 구조로 만든 것이다.

In [None]:
# Settings
batch_size = 2
num_epochs = 100
vocab_size = len(word_index) + 1            # +1 because the tokenizer's ids start at 1, so id 0 must also fit in the embedding table
emb_size = 128                              # embedding dimension: length of the vector each word id is mapped to
hidden_dimension = 256                      # number of units in the hidden Dense layer
output_dimension = 1                        # single output unit

## (1) Keras Sequential API (p.40)
### Sequential API를 활용해 심층 신경망 모델 생성

In [None]:
# modeling 1: build the network layer by layer with Sequential.add()
model1 = tf.keras.Sequential()
model1.add(layers.Embedding(vocab_size, emb_size, input_length = 4))     # every sample sentence tokenizes to exactly 4 word ids
model1.add(layers.Lambda(lambda x: tf.reduce_mean(x, axis = 1)))         # average the word vectors along axis 1
model1.add(layers.Dense(hidden_dimension, activation = 'relu'))
model1.add(layers.Dense(output_dimension, activation = 'sigmoid'))       # labels use 1 = positive, 0 = negative
model1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 128)            2688      
                                                                 
 lambda (Lambda)             (None, 128)               0         
                                                                 
 dense (Dense)               (None, 256)               33024     
                                                                 
 dense_1 (Dense)             (None, 1)                 257       
                                                                 
Total params: 35,969
Trainable params: 35,969
Non-trainable params: 0
_________________________________________________________________


### 모델 컴파일

In [None]:
# modeling 1 (alternative form): pass the full layer list to the Sequential constructor
model2 = tf.keras.Sequential([
    layers.Embedding(vocab_size, emb_size, input_length = 4),
    layers.Lambda(lambda x: tf.reduce_mean(x, axis = 1)),       # average the word vectors along axis 1
    layers.Dense(hidden_dimension, activation = 'relu'),
    layers.Dense(output_dimension, activation = 'sigmoid')
])
model2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 4, 128)            2688      
                                                                 
 lambda_1 (Lambda)           (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 256)               33024     
                                                                 
 dense_3 (Dense)             (None, 1)                 257       
                                                                 
Total params: 35,969
Trainable params: 35,969
Non-trainable params: 0
_________________________________________________________________


In [None]:
# model compile: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
model1.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### 모델 학습

In [None]:
# model fit (supervised learning: token-id sequences in, sentiment labels out)
model1.fit(
    x=input_sequences,
    y=labels,
    batch_size=batch_size,
    epochs=num_epochs,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f19445fa6d0>

In [None]:
# predict: model1 has a single input, so the array is passed directly
# (no list wrapper needed). One output row per input sentence.
model1.predict(input_sequences)



array([[9.9906105e-01],
       [6.9417118e-04],
       [9.9973649e-01],
       [9.9972683e-01],
       [7.5124658e-04],
       [9.9941421e-01]], dtype=float32)

## (2) Keras Functional API로 모델 구현

In [None]:
# modeling 2: the same architecture expressed with the Functional API
inputs = layers.Input(shape = (4, ))                     # symbolic input: 4 token ids per sentence

embed_output = layers.Embedding(vocab_size, emb_size)(inputs)
pooled_output = tf.reduce_mean(embed_output, axis = 1)   # average the word vectors along axis 1
hidden_layer = layers.Dense(hidden_dimension, activation = 'relu')(pooled_output)
outputs = layers.Dense(output_dimension, activation = 'sigmoid')(hidden_layer)

# NOTE: this rebinds `model2`, replacing the Sequential model that used the same name
model2 = tf.keras.Model(inputs = inputs, outputs = outputs)
model2.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 4)]               0         
                                                                 
 embedding_6 (Embedding)     (None, 4, 128)            2688      
                                                                 
 tf.math.reduce_mean_1 (TFOp  (None, 128)              0         
 Lambda)                                                         
                                                                 
 dense_12 (Dense)            (None, 256)               33024     
                                                                 
 dense_13 (Dense)            (None, 1)                 257       
                                                                 
Total params: 35,969
Trainable params: 35,969
Non-trainable params: 0
_______________________________________________________

In [None]:
# model compile: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
model2.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# model fit (supervised learning: token-id sequences in, sentiment labels out)
model2.fit(
    x=input_sequences,
    y=labels,
    batch_size=batch_size,
    epochs=num_epochs,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f31c9646e10>

In [None]:
# predict: model2 has a single input, so the array is passed directly
# (no list wrapper needed). One output row per input sentence.
model2.predict(input_sequences)



array([[9.9917138e-01],
       [7.6435902e-04],
       [9.9970919e-01],
       [9.9975199e-01],
       [7.1718614e-04],
       [9.9955386e-01]], dtype=float32)

## (3) Subclass로 구현

In [None]:
# modeling 3
class CustomModel(tf.keras.Model):
    """Mean-pooled embedding classifier written as a Keras Model subclass.

    Args:
        vocab_size: Number of entries in the embedding table.
        embed_dimension: Length of each embedding vector.
        hidden_dimension: Number of units in the ReLU hidden layer.
        output_dimension: Number of sigmoid output units.
    """

    def __init__(self, vocab_size, embed_dimension, hidden_dimension, output_dimension):
        super().__init__(name='my_model')
        self.embedding = layers.Embedding(vocab_size, embed_dimension)
        self.dense_layer = layers.Dense(hidden_dimension, activation='relu')
        self.output_layer = layers.Dense(output_dimension, activation='sigmoid')

    def call(self, inputs):
        """Embed the ids, average over axis 1, then apply the two Dense layers."""
        embedded = self.embedding(inputs)
        pooled = tf.reduce_mean(embedded, axis=1)
        hidden = self.dense_layer(pooled)
        return self.output_layer(hidden)

# Instantiate the subclassed model with the shared hyper-parameters
# (arguments given positionally, in the constructor's declared order).
model3 = CustomModel(vocab_size, emb_size, hidden_dimension, output_dimension)

In [None]:
# Invoke the model through __call__ instead of calling .call() directly:
# .call() is the hook a subclass overrides, while __call__ is the public entry point.
# `inputs` is the symbolic layers.Input tensor created in the Functional API section.
model3(inputs)

<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'dense_29')>

In [None]:
# model compile: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
model3.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# model fit (supervised learning: token-id sequences in, sentiment labels out)
model3.fit(
    x=input_sequences,
    y=labels,
    batch_size=batch_size,
    epochs=num_epochs,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f31c88e7f10>

In [None]:
# Print the layer-by-layer summary of the subclassed model (called here after training)
model3.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_14 (Embedding)    (None, 4, 128)            2688      
                                                                 
 dense_28 (Dense)            (None, 256)               33024     
                                                                 
 dense_29 (Dense)            (None, 1)                 257       
                                                                 
Total params: 35,969
Trainable params: 35,969
Non-trainable params: 0
_________________________________________________________________


# 사이킷런

In [None]:
import sklearn
sklearn.__version__

'1.0.2'

## Iris 데이터

In [None]:
from sklearn.datasets import load_iris

# Load the bundled iris dataset and list the fields it provides
iris_data = load_iris()
iris_data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [None]:
# Uncomment to dump the raw feature matrix:
# print(iris_data['data'])
print(f"shape of data: {iris_data['data'].shape}")

shape of data: (150, 4)


In [None]:
# Names of the feature columns
print(iris_data['feature_names'])

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [None]:
# Names of the target classes
print(iris_data['target_names'])

['setosa' 'versicolor' 'virginica']


In [None]:
# Print the description text that ships with the dataset
print(iris_data['DESCR'])

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

### 데이터 분리

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix and class labels taken from the iris bunch
inputs, target = iris_data['data'], iris_data['target']

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible
train_X, test_X, train_y, test_y = train_test_split(
    inputs, target, test_size=0.25, random_state=42
)

print(train_X.shape, test_X.shape, train_y.shape, test_y.shape)

(112, 4) (38, 4) (112,) (38,)


### 지도 학습(K-nearest neighbor classifier)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# knn1: 1-nearest-neighbour classifier, only n_neighbors is specified
knn1 = KNeighborsClassifier(n_neighbors = 1)
# knn2: also a 1-nearest-neighbour classifier, with the other hyper-parameters written out explicitly
knn2 = KNeighborsClassifier(algorithm = 'auto', leaf_size = 30, metric = 'minkowski',
                            metric_params = None, n_jobs = 1, n_neighbors = 1, p = 2, weights = 'uniform')
# Supervised training: both classifiers see the training features together with their labels
knn1.fit(train_X, train_y)
knn2.fit(train_X, train_y)

KNeighborsClassifier(n_jobs=1, n_neighbors=1)

In [None]:
# Predict the class of one unseen sample with each fitted classifier.
# (The classifiers are named knn1 / knn2; there is no `knn` object.)
new_input = np.array([[6.1, 2.8, 4.7, 1.2]])
predict_y1 = knn1.predict(new_input)
predict_y2 = knn2.predict(new_input)

print(predict_y1, predict_y2)

[1] [1]


In [None]:
# Classify the whole held-out test set with both classifiers
predict_y1, predict_y2 = knn1.predict(test_X), knn2.predict(test_X)
print(predict_y1)
print(predict_y2)

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0]
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0]


In [None]:
# Accuracy = fraction of test samples whose prediction equals the true label
accuracy1 = np.mean(test_y == predict_y1)
accuracy2 = np.mean(test_y == predict_y2)
print(f'Accuracy rate = {accuracy1}')
print(f'Accuracy rate = {accuracy2}')

Accuracy rate = 1.0
Accuracy rate = 1.0


### 비지도학습(Kmeans Clustering)

In [None]:
from sklearn.cluster import KMeans
# Cluster into 3 groups; only n_clusters is specified
k_means1 = KMeans(n_clusters = 3)
# Kept for reference only (disabled): the same estimator with its options written out.
# NOTE(review): `n_jobs` and `precompute_distances` look like arguments from older
# scikit-learn releases - confirm against the installed version before re-enabling.
# k_means2 = KMeans(algorithm = 'auto', copy_x = True, init = 'k-means++', max_iter = 300,
#                   n_clusters = 3, n_init = 10, n_jobs = 1, precompute_distances = 'auto',
#                   random_state = None, tol = 0.0001, verbose = 0)

In [None]:
# Fit the clusterer on the training features only (no labels are passed)
k_means1.fit(train_X)
# k_means2.fit(train_X)

KMeans(n_clusters=3)

In [None]:
# Cluster id assigned to each training row
k_means1.labels_

array([1, 1, 2, 2, 2, 1, 1, 2, 2, 0, 2, 0, 2, 0, 2, 1, 0, 2, 1, 1, 1, 2,
       2, 1, 1, 1, 2, 1, 2, 0, 1, 2, 2, 1, 2, 2, 2, 2, 0, 2, 1, 2, 0, 1,
       1, 2, 0, 1, 2, 1, 1, 2, 2, 0, 2, 0, 0, 2, 1, 1, 2, 0, 1, 1, 1, 2,
       0, 1, 0, 0, 1, 2, 2, 2, 0, 0, 1, 0, 2, 0, 2, 2, 2, 1, 2, 2, 1, 2,
       0, 0, 1, 2, 0, 0, 1, 0, 1, 0, 0, 0, 2, 0, 2, 2, 2, 2, 1, 2, 2, 1,
       2, 0], dtype=int32)

In [None]:
# Check the distribution: for each cluster id, print the true labels of the
# training rows that were assigned to it
for cluster_id in range(3):
    print(train_y[k_means1.labels_ == cluster_id])

[2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 2]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[2 1 1 1 2 1 1 1 1 1 2 1 1 1 2 2 2 1 1 1 1 1 2 1 1 1 1 2 1 1 1 2 1 1 1 1 1
 1 1 1 1 1 1 2 2 1 2 1]


In [None]:
# Assign the new sample to a cluster (the result is a cluster id, not a class label)
predict_y1 = k_means1.predict(new_input)
predict_y1


array([2], dtype=int32)

In [None]:
# Assign every test row to a cluster (the results are cluster ids, not class labels)
predict_y1 = k_means1.predict(test_X)
predict_y1

array([2, 1, 0, 2, 2, 1, 2, 0, 2, 2, 0, 1, 1, 1, 1, 2, 0, 2, 2, 0, 1, 2,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1], dtype=int32)

In [None]:
# Ground-truth test labels, shown for comparison with the predicted cluster ids
test_y

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0])

In [None]:
# Regroup: translate K-means cluster ids into class labels.
# Cluster ids are arbitrary and KMeans was created without a fixed random_state, so the
# id -> class mapping is derived from the training data instead of being hard-coded:
# each cluster is mapped to the class that occurs most often among its training rows.
cluster_to_class = {
    cluster_id: np.bincount(train_y[k_means1.labels_ == cluster_id]).argmax()
    for cluster_id in np.unique(k_means1.labels_)
}
arr_pred1 = np.array([cluster_to_class[cluster_id] for cluster_id in predict_y1],
                     dtype = predict_y1.dtype)

arr_pred1

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 1,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0], dtype=int32)

In [None]:
# Fraction of test samples whose remapped cluster label equals the true class
accuracy1 = np.mean(test_y == arr_pred1)
print(f'Accuracy rate = {accuracy1}')

Accuracy rate = 0.9473684210526315


## 사이킷런을 이용한 특징 추출

### CountVectorizer
* 텍스트에서 단어를 기준으로 횟수를 측정하여 벡터로 만든다.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

text_data = [
    '나는 배가 고프다',
    '내일 점심 뭐먹지',
    '내일 공부 해야겠다',
    '점심 먹고 공부해야지',
]

# Build the vocabulary; fit() hands back the fitted vectorizer itself
count_vectorizer = CountVectorizer().fit(text_data)
print(count_vectorizer.vocabulary_)

{'나는': 3, '배가': 7, '고프다': 0, '내일': 4, '점심': 8, '뭐먹지': 6, '공부': 1, '해야겠다': 9, '먹고': 5, '공부해야지': 2}


In [None]:
# Turn each sentence into its count vector, one sentence at a time
for text in text_data:
    counts = count_vectorizer.transform([text])
    print(counts.toarray())

[[1 0 0 1 0 0 0 1 0 0]]
[[0 0 0 0 1 0 1 0 1 0]]
[[0 1 0 0 1 0 0 0 0 1]]
[[0 0 1 0 0 1 0 0 1 0]]


In [None]:
# Transform a sentence that was not part of the fitted data: only words already
# in the learned vocabulary ('고프다' and '먹고' here) get a non-zero count
count_vectorizer.transform(['아이고 배 고프다. 일단 밥 먹고 해야지']).toarray()

array([[1, 0, 0, 0, 0, 1, 0, 0, 0, 0]])

In [None]:
# Vocabulary terms ordered by their column index.
# get_feature_names_out() replaces get_feature_names(), which scikit-learn
# deprecated in 1.0 and removed in 1.2.
count_vectorizer.get_feature_names_out()



['고프다', '공부', '공부해야지', '나는', '내일', '먹고', '뭐먹지', '배가', '점심', '해야겠다']

In [None]:
# Turn every sentence into a count vector: one row per sentence, one column per term.
# get_feature_names_out() replaces get_feature_names(), which scikit-learn
# deprecated in 1.0 and removed in 1.2.
import pandas as pd
df = pd.DataFrame(count_vectorizer.transform(text_data).toarray(),
                  columns=count_vectorizer.get_feature_names_out())
df




Unnamed: 0,고프다,공부,공부해야지,나는,내일,먹고,뭐먹지,배가,점심,해야겠다
0,1,0,0,1,0,0,0,1,0,0
1,0,0,0,0,1,0,1,0,1,0
2,0,1,0,0,1,0,0,0,0,1
3,0,0,1,0,0,1,0,0,1,0


CountVectorizer는 다음과 같은 세가지 작업을 수행한다.

1. 문서를 토큰 리스트로 변환한다.

2. 각 문서에서 토큰의 출현 빈도를 센다.

3. 각 문서를 BOW 인코딩 벡터로 변환한다.


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
    'The last document?',
]

# Learn the vocabulary of the English corpus; fit() hands back the fitted vectorizer
vect = CountVectorizer().fit(corpus)
vect.vocabulary_

{'this': 9,
 'is': 3,
 'the': 7,
 'first': 2,
 'document': 1,
 'second': 6,
 'and': 0,
 'third': 8,
 'one': 5,
 'last': 4}

In [None]:
# None of these words are in the vocabulary learned from `corpus`, so every count is 0.
# Uses `vect`, the vectorizer fitted on the English corpus, rather than the
# Korean `count_vectorizer`.
vect.transform(['Something completely new.']).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [None]:
# Turn each document into its count vector, one document at a time
for text in corpus:
    counts = vect.transform([text])
    print(counts.toarray())

[[0 1 1 1 0 0 0 1 0 1]]
[[0 1 0 1 0 0 2 1 0 1]]
[[1 0 0 0 0 1 0 1 1 0]]
[[0 1 1 1 0 0 0 1 0 1]]
[[0 1 0 0 1 0 0 1 0 0]]


In [None]:
# Vocabulary terms ordered by their column index.
# get_feature_names_out() replaces get_feature_names(), which scikit-learn
# deprecated in 1.0 and removed in 1.2.
vect.get_feature_names_out()



['and',
 'document',
 'first',
 'is',
 'last',
 'one',
 'second',
 'the',
 'third',
 'this']

In [None]:
# Turn every document into a count vector: one row per document, one column per term.
# `vect` was fitted on the English `corpus`, so that is what must be transformed here;
# feeding it the Korean `text_data` yields a table containing only zeros.
# get_feature_names_out() replaces get_feature_names(), which scikit-learn
# deprecated in 1.0 and removed in 1.2.
import pandas as pd
df = pd.DataFrame(vect.transform(corpus).toarray(),
                  columns=vect.get_feature_names_out())
df




Unnamed: 0,and,document,first,is,last,one,second,the,third,this
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0


In [None]:
#### Stop Words
# Words listed in stop_words are left out of the vocabulary
vect = CountVectorizer(stop_words=['and', 'is', 'the', 'this']).fit(corpus)
vect.vocabulary_

{'first': 1, 'document': 0, 'second': 4, 'third': 5, 'one': 3, 'last': 2}

In [None]:
#### Token
# Word-level analyzer
vect = CountVectorizer(analyzer='word').fit(corpus)
print(vect.vocabulary_)
# Custom token pattern: a 't' followed by one or more word characters.
# Written as a raw string so that `\w` reaches the regex engine untouched instead of
# being read as an (invalid) Python string escape.
vect = CountVectorizer(token_pattern=r't\w+').fit(corpus)
print(vect.vocabulary_)

{'this': 9, 'is': 3, 'the': 7, 'first': 2, 'document': 1, 'second': 6, 'and': 0, 'third': 8, 'one': 5, 'last': 4}
{'this': 2, 'the': 0, 'third': 1}


In [None]:
#### Ngram
# ngram_range sets the size of the token groups used to build the vocabulary: a unigram
# uses a single token as one entry, a bigram treats two consecutive tokens as one entry.
# With (2, 3) the vocabulary is made of two- and three-token sequences.
vect = CountVectorizer(ngram_range=(2, 3)).fit(corpus)
vect.vocabulary_

{'this is': 21,
 'is the': 3,
 'the first': 12,
 'first document': 2,
 'this is the': 22,
 'is the first': 4,
 'the first document': 13,
 'the second': 16,
 'second second': 10,
 'second document': 9,
 'is the second': 5,
 'the second second': 17,
 'second second document': 11,
 'and the': 0,
 'the third': 18,
 'third one': 20,
 'and the third': 1,
 'the third one': 19,
 'is this': 6,
 'this the': 23,
 'is this the': 7,
 'this the first': 24,
 'the last': 14,
 'last document': 8,
 'the last document': 15}

### TfidfVectorizer
* TF-IDF라는 특정한 값을 사용해서 추출한다.
* TF: 한 데이터 안에서 특정 단어가 나타나는 횟수
* DF: 문서 빈도값, 특정 단어가 여러 데이터에 얼마나 자주 나타나는지 알려주는 척도

(TF-IDF: Term Frequency - Inverse Document Frequency)

TF-IDF(Term Frequency – Inverse Document Frequency) 인코딩은 단어를 개수 그대로 카운트하지 않고, 모든 문서에 공통적으로 들어 있는 단어는 문서 구별 능력이 떨어진다고 보아 가중치를 축소하는 방법이다.


구체적으로는 문서 $d$(document)와 단어 $t$ 에 대해 다음과 같이 계산한다.

$$ \text{tf-idf}(d, t) = \text{tf}(d, t) \cdot \text{idf}(t) $$


여기에서

* $\text{tf}(d, t)$: term frequency. 특정한 단어의 빈도수
* $\text{idf}(t)$ : inverse document frequency. 특정한 단어가 들어 있는 문서의 수에 반비례하는 수

 $$ \text{idf}(t) = \log \dfrac{n}{1 + \text{df}(t)} $$

* $n$ : 전체 문서의 수
* $\text{df}(t)$:  단어 $t$를 가진 문서의 수


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

text_data = [
    '나는 배가 고프다',
    '내일 점심 뭐먹지',
    '내일 공부 해야겠다',
    '점심 먹고 공부해야지',
]

# Build the vocabulary; fit() hands back the fitted vectorizer itself
tfidf_vectorizer = TfidfVectorizer().fit(text_data)
print(tfidf_vectorizer.vocabulary_)

{'나는': 3, '배가': 7, '고프다': 0, '내일': 4, '점심': 8, '뭐먹지': 6, '공부': 1, '해야겠다': 9, '먹고': 5, '공부해야지': 2}


In [None]:
# Turn each sentence into its TF-IDF vector, one sentence at a time
for text in text_data:
    weights = tfidf_vectorizer.transform([text])
    print(weights.toarray())

[[0.57735027 0.         0.         0.57735027 0.         0.
  0.         0.57735027 0.         0.        ]]
[[0.         0.         0.         0.         0.52640543 0.
  0.66767854 0.         0.52640543 0.        ]]
[[0.         0.61761437 0.         0.         0.48693426 0.
  0.         0.         0.         0.61761437]]
[[0.         0.         0.61761437 0.         0.         0.61761437
  0.         0.         0.48693426 0.        ]]


In [None]:
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
    'The last document?',
]

# Build the vocabulary
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(corpus)

# Turn every document into a TF-IDF vector: one row per document, one column per term.
# get_feature_names_out() replaces get_feature_names(), which scikit-learn
# deprecated in 1.0 and removed in 1.2.
import pandas as pd
df = pd.DataFrame(tfidf_vectorizer.transform(corpus).toarray(),
                  columns=tfidf_vectorizer.get_feature_names_out())
df



Unnamed: 0,and,document,first,is,last,one,second,the,third,this
0,0.0,0.389476,0.557751,0.462983,0.0,0.0,0.0,0.329417,0.0,0.462983
1,0.0,0.241515,0.0,0.287097,0.0,0.0,0.857376,0.204272,0.0,0.287097
2,0.556669,0.0,0.0,0.0,0.0,0.556669,0.0,0.265256,0.556669,0.0
3,0.0,0.389476,0.557751,0.462983,0.0,0.0,0.0,0.329417,0.0,0.462983
4,0.0,0.453331,0.0,0.0,0.804659,0.0,0.0,0.383424,0.0,0.0
