# 자연어 처리 NLP | 텍스트 분석 Text Analysis

In [None]:
# !conda install nltk -y

In [None]:
import nltk
nltk.__version__

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
from nltk.tokenize import word_tokenize, sent_tokenize

text = 'NLTK is a powerful library for NLP!!!'
word_tokenize(text)     # 단어 토큰화

In [None]:
text = '''The Matrix is everywhere its all around us, here even in this room.
You can see it out your window or on your television.
You feel it when you go to work, or go to church or pay your taxes.'''

sent_tokenize(text)

In [None]:
# 문장별 단어 토큰화
def tokenize_text(text):
    sentences = sent_tokenize(text)
    return [word_tokenize(sentence) for sentence in sentences ]

result = tokenize_text(text)
print(len(result), len(result[0]), len(result[2]))

In [None]:
# n-gram
from nltk import ngrams

text = 'The Matrix is everywhere its all around us, here even in this room.'
tokens = word_tokenize(text)

bigram = ngrams(tokens, 2)
print([token for token in bigram])

trigram = ngrams(tokens, 3)
print([token for token in trigram])

In [None]:
# 불용어(stopwords) 제거
from nltk.corpus import stopwords

stopwords.fileids()

print(len(stopwords.words('english')))
stopwords.words('english')

In [None]:
text = 'The Matrix is everywhere its all around us, here even in this room.'
stopwords_list = stopwords.words('english')

tokens = []
for word in word_tokenize(text):    # 토큰화
    word = word.lower()     # 소문자 변환
    if word not in stopwords_list:  # 불용어 처리
        tokens.append(word)

tokens

### BOW -CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

text1 = 'The Matrix is everywhere its all around us, here even in this room. \
You can see it out your window or on your television. \
You feel it when you go to work, or go to church or pay your taxes.'

text2 = 'You take the blue pill and the story ends.  You wake in your bed and you believe whatever you want to believe\
You take the red pill and you stay in Wonderland and I show you how deep the rabbit-hole goes.'

texts = [text1, text2]

count_vectorizer = CountVectorizer()
count_vectorizer.fit(texts)
text_vecs = count_vectorizer.transform(texts)

print(text_vecs)

In [None]:
import pandas as pd

print(count_vectorizer.get_feature_names_out())
print(count_vectorizer.vocabulary_)

vocab = sorted(count_vectorizer.vocabulary_.items(), key=lambda x:x[1])
vocab_df = pd.DataFrame(vocab, columns=['word', 'idx'])
vocab_df

In [None]:
# 단어별 등장 횟수 구하기
word_counts = text_vecs.toarray().sum(axis=0)

# 단어 데이터프레임에 빈도 추가
vocab_df['count'] = vocab_df['idx'].apply(lambda i: word_counts[i])

# idx 열 제거 (불필요함)
vocab_df = vocab_df.drop(columns=['idx'])
vocab_df

In [None]:
count_vectorizer = CountVectorizer(stop_words='english')
texts_vecs = count_vectorizer.fit_transform(texts)
print(text_vecs.toarray().shape)

vocab = sorted(count_vectorizer.vocabulary_.items(), key=lambda x:x[1])
vocab_df = pd.DataFrame(vocab, columns=['word', 'idx'])
vocab_df

(2, 52)


In [None]:
count_vectorizer = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),      # n-gram 범위 지정 (최소값, 최대값) / 기본값 (1, 1)
    max_features=30          # 빈도 수 상위인 n개의 데이터 사용
)   

texts_vecs = count_vectorizer.fit_transform(texts)
print(text_vecs.toarray().shape)

count_vectorizer.get_feature_names_out()

(2, 52)


array(['bed', 'bed believe', 'believe', 'believe want', 'believeyou',
       'believeyou red', 'blue', 'blue pill', 'church', 'church pay',
       'deep', 'deep rabbit', 'ends', 'ends wake', 'feel', 'feel work',
       'goes', 'hole', 'hole goes', 'matrix', 'matrix room', 'pay',
       'pay taxes', 'pill', 'pill stay', 'pill story', 'rabbit',
       'rabbit hole', 'red', 'red pill', 'room', 'room window', 'stay',
       'stay wonderland', 'story', 'story ends', 'taxes', 'television',
       'television feel', 'wake', 'wake bed', 'want', 'want believeyou',
       'window', 'window television', 'wonderland', 'wonderland deep',
       'work', 'work church'], dtype=object)

### BOW -TfidfVectorizer

- TF-IDF == Term Frequency-Inverse Document Frequency

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(stop_words='english')
texts_vecs = tfidf_vectorizer.fit_transform(texts)

display(texts_vecs.toarray())
tfidf_vectorizer.get_feature_names_out()

array([[0.        , 0.        , 0.        , 0.        , 0.33333333,
        0.        , 0.        , 0.33333333, 0.        , 0.        ,
        0.33333333, 0.33333333, 0.        , 0.        , 0.        ,
        0.33333333, 0.        , 0.        , 0.33333333, 0.33333333,
        0.        , 0.        , 0.33333333, 0.        , 0.33333333],
       [0.22941573, 0.22941573, 0.22941573, 0.22941573, 0.        ,
        0.22941573, 0.22941573, 0.        , 0.22941573, 0.22941573,
        0.        , 0.        , 0.45883147, 0.22941573, 0.22941573,
        0.        , 0.22941573, 0.22941573, 0.        , 0.        ,
        0.22941573, 0.22941573, 0.        , 0.22941573, 0.        ]])

array(['bed', 'believe', 'believeyou', 'blue', 'church', 'deep', 'ends',
       'feel', 'goes', 'hole', 'matrix', 'pay', 'pill', 'rabbit', 'red',
       'room', 'stay', 'story', 'taxes', 'television', 'wake', 'want',
       'window', 'wonderland', 'work'], dtype=object)