In [4]:
import os
import nltk
import konlpy
import numpy as np
import pandas as pd
import re
from konlpy.tag import Okt

### 1. Bag of Words (BOW)

In [7]:
# Toy corpus for the bag-of-words demo: five short sentences, one document each.
# NOTE(review): "Comptuer" looks like a typo for "Computer"; it is kept verbatim
# because the text is runtime data.
corpus = [
    "Comptuer scientists are just nerds",
    "Social scientists are attractive",
    "Are nerds attractive",
    "Probably not",
    "THE BOOM shall rise",
]

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Five-document toy corpus, this time with sentence punctuation ('.', '?').
# NOTE(review): "Comptuer" looks like a typo for "Computer"; kept verbatim
# because the recorded vocabulary contains the token 'comptuer'.
corpus = [
    "Comptuer scientists are just nerds.",
    "Social scientists are attractive.",
    "Are nerds attractive?",
    "Probably not.",
    "THE BOOM shall rise",
]

# Learn the vocabulary from the corpus; fit() hands back the fitted vectorizer.
vect = CountVectorizer().fit(corpus)

# Mapping of token -> column index; the recorded output shows the tokens
# lower-cased and without punctuation.
vect.vocabulary_

{'comptuer': 3,
 'scientists': 9,
 'are': 0,
 'just': 4,
 'nerds': 5,
 'social': 11,
 'attractive': 1,
 'probably': 7,
 'not': 6,
 'the': 12,
 'boom': 2,
 'shall': 10,
 'rise': 8}

In [35]:
# Turn the fitted vocabulary (token -> column index) into a two-column table.
# from_dict(orient='index') puts the tokens in the index, so reset_index()
# moves them into a regular column before the columns are named.
dictdf = pd.DataFrame.from_dict(vect.vocabulary_, orient='index').reset_index()
dictdf.columns = ['word', 'num']

# Order the rows by column index so row i describes feature column i.
dictdf = dictdf.sort_values('num').reset_index(drop=True)

# to_csv does not create missing parent directories, so make sure ./data
# exists; otherwise this cell fails on a fresh checkout.
os.makedirs('./data', exist_ok=True)
dictdf.to_csv('./data/20210206dictionary.csv')

In [12]:
# Encode an unseen sentence with the already-fitted vocabulary.
# 'economists' was never seen during fit, so only 'are' and 'nerds' are counted.
vect.transform(['Economists are nerds']).toarray()

array([[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [13]:
# Only the in-vocabulary words ('social', 'boom', 'rise') produce counts;
# 'science' and 'will' are not in the vocabulary and are silently dropped.
vect.transform(['Social science boom will rise']).toarray()

array([[0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0]], dtype=int64)

In [14]:
# None of these words are in the fitted vocabulary, so the result is an all-zero row.
vect.transform(['I majored in management engineering']).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)

In [6]:
# Document-term count matrix for the training corpus itself:
# one row per document, one column per vocabulary entry.
vect.transform(corpus).toarray()

array([[1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0],
       [1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1]], dtype=int64)

In [7]:
# Shape of the document-term matrix: (number of documents, vocabulary size).
# The sparse result exposes .shape directly, so no dense copy is built just to read it.
vect.transform(corpus).shape

(5, 13)

In [8]:
# Stop words: tokens listed here are excluded from the vocabulary.
vect2 = CountVectorizer(stop_words=["are", "the"])
vect2.fit(corpus)

# Same mapping as before minus 'are' and 'the', with column indices renumbered.
vect2.vocabulary_

{'comptuer': 2,
 'scientists': 8,
 'just': 3,
 'nerds': 4,
 'social': 10,
 'attractive': 0,
 'probably': 6,
 'not': 5,
 'boom': 1,
 'shall': 9,
 'rise': 7}

### 중요하지 않은 어휘(혹은 noise)를 어떻게 제거할 것인가
1. 형태소분석기(tokenizer, pos-tagger)를 사용하면서 명사/동사/형용사 중 특정 pos만 추출
2. Stop Words를 설정 (제외할 불용어)
3. 형태소분석기를 사용 후 Stop Words 제거 (1+2)

### Why TF-IDF?
1. 위의 전처리 단계에서 하던 불용어 여부(binary) + 사전 포함 여부(binary) 판별을 자동화하자
2. "가중치" 개념을 부여하자

### 2. TF-IDF

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with all-default settings; it is fitted on the corpus in the next cell.
TfVect = TfidfVectorizer()

In [57]:
# Learn the vocabulary/IDF weights and encode the corpus in a single call.
# The result is sparse (one row per document); it is densified only for display.
tfidv_result = TfVect.fit_transform(corpus)
tfidv_result.toarray()

array([[0.34582166, 0.        , 0.        , 0.51637397, 0.51637397,
        0.41660727, 0.        , 0.        , 0.        , 0.41660727,
        0.        , 0.        , 0.        ],
       [0.40382593, 0.48648432, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.48648432,
        0.        , 0.60298477, 0.        ],
       [0.50620441, 0.60981846, 0.        , 0.        , 0.        ,
        0.60981846, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.70710678, 0.70710678, 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.5       , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.5       , 0.        ,
        0.5       , 0.        , 0.5       ]])

In [59]:
# Pairwise document comparison: matrix product of the TF-IDF rows with themselves.
# TfidfVectorizer L2-normalises each row by default (the diagonal of the result
# is 1), so every entry is a cosine similarity: higher means more alike. Despite
# the variable name this is a similarity, not a distance.
# `@` is used rather than `*` because `*` is matrix multiplication only for
# scipy.sparse *matrix* classes and is element-wise for sparse *arrays*.
doc_distance = tfidv_result @ tfidv_result.T
doc_distance.toarray()

array([[1.        , 0.34232466, 0.42911125, 0.        , 0.        ],
       [0.34232466, 1.        , 0.50108558, 0.        , 0.        ],
       [0.42911125, 0.50108558, 1.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 1.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 1.        ]])

### 3. N-gram

In [13]:
# Unigrams only: ngram_range=(1, 1) gives single-word features, i.e. the same
# 13-entry vocabulary as the CountVectorizer fitted without ngram_range.
unigram_vect = CountVectorizer(ngram_range=(1, 1))
unigram_vect.fit(corpus)
unigram_vect.vocabulary_

{'comptuer': 3,
 'scientists': 9,
 'are': 0,
 'just': 4,
 'nerds': 5,
 'social': 11,
 'attractive': 1,
 'probably': 7,
 'not': 6,
 'the': 12,
 'boom': 2,
 'shall': 10,
 'rise': 8}

In [14]:
# Bigrams only: every feature is a pair of adjacent words from the same document.
bigram_vect = CountVectorizer(ngram_range=(2, 2))
bigram_vect.fit(corpus)
bigram_vect.vocabulary_

{'comptuer scientists': 4,
 'scientists are': 8,
 'are just': 1,
 'just nerds': 5,
 'social scientists': 10,
 'are attractive': 0,
 'are nerds': 2,
 'nerds attractive': 6,
 'probably not': 7,
 'the boom': 11,
 'boom shall': 3,
 'shall rise': 9}

In [15]:
# Trigrams only: three consecutive words per feature. Documents shorter than
# three words (e.g. 'Probably not.') contribute nothing to this vocabulary.
trigram_vect = CountVectorizer(ngram_range=(3, 3))
trigram_vect.fit(corpus)
trigram_vect.vocabulary_

{'comptuer scientists are': 3,
 'scientists are just': 5,
 'are just nerds': 0,
 'social scientists are': 6,
 'scientists are attractive': 4,
 'are nerds attractive': 1,
 'the boom shall': 7,
 'boom shall rise': 2}

In [16]:
# ngram_range=(1, 3) keeps unigrams, bigrams and trigrams together in one
# vocabulary (13 + 12 + 8 = 33 features for this corpus).
uni_to_trigram_vect = CountVectorizer(ngram_range=(1, 3))
uni_to_trigram_vect.fit(corpus)
uni_to_trigram_vect.vocabulary_

{'comptuer': 10,
 'scientists': 21,
 'are': 0,
 'just': 13,
 'nerds': 15,
 'comptuer scientists': 11,
 'scientists are': 22,
 'are just': 2,
 'just nerds': 14,
 'comptuer scientists are': 12,
 'scientists are just': 24,
 'are just nerds': 3,
 'social': 27,
 'attractive': 6,
 'social scientists': 28,
 'are attractive': 1,
 'social scientists are': 29,
 'scientists are attractive': 23,
 'are nerds': 4,
 'nerds attractive': 16,
 'are nerds attractive': 5,
 'probably': 18,
 'not': 17,
 'probably not': 19,
 'the': 30,
 'boom': 7,
 'shall': 25,
 'rise': 20,
 'the boom': 31,
 'boom shall': 8,
 'shall rise': 26,
 'the boom shall': 32,
 'boom shall rise': 9}