# 자연어전처리과정

## 단어 단위의 전처리
* 단어 토큰화(word tokenization); 띄어쓰기, 문장기호(', , 등)을 기준으로 단어 리스트화
* 정제(cleaning); 코퍼스(분석에 활용하기 위한 자연어 데이터 (말뭉치))에서 의미 없거나 목적에 적합하지 않은 단어를 제거 (예. 빈도수 2 이하인 단어, 길이가 2 이하인 단어 등)
* 불용어(stopwords) 정의; 의미 없거나 목적에서 벗어나는 단어(목적에 맞게 정의하여)를 제거 (예. do, then, what, she, am, are ...)
* 정규화(normalization); 형태는 다르지만 같은 의미로 사용되는 단어를 하나로 통일 (예. US, USA, U.S., United States of America ...)
* 어간추출(stemming); 특정단어의 핵심이 되는 부분(어간)을 찾아 정규화(예. alize->al, ational->ate, ate->제거 ment->제거 등). porter stemmer, lancaster stemmer 등이 있음. 단, 단순히 어미만 잘라내는 방식으로, activate->activ 가 사전에 없어, 섬세하지 못함


## 문장 단위의 전처리
* 품사 태깅 등, 문장안에서 단어가 사용된 위치에 따라 품사가 달라지는 경우, 단어 단위가 아니라 문장 간의 구분이 된 상태에서 단어의 품사를 정해야 함
* 문장 토큰화(sentence tokenization); 코퍼스를 문장 단위로 토큰화. 마침표를 기준으로 토큰화. dr. mr. 의 마침표는 문장으로 인식하지 않아야 함
* 품사 태깅(POS; part of speech tagging); 문장 안에서의 단어의 품사를 태깅
    * wordnet; 거대한 영어 어휘 데이터베이스. n(wn.NOUN) 명사, a(wn.ADJ) 형용사, r(wn.ADV) 부사, v(wn.VERB) 동사 태그가 존재
* 표제어추출(Lemmatization); 단어의 사전적 어원 태깅 (예. happyiest->happy, am, are, is->be)을 통해 단어를 정규화


In [1]:
# library import and function definition. preprocess.py
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
nltk.download('punkt')  # needed for punctuation and acronym such as Mr. Dr. ...
nltk.download('averaged_perceptron_tagger')  # for tagger
from collections import Counter
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords  # 코퍼스의 종류에 상관없이 많이 사용되는 불용어 179개를 제공
nltk.download('stopwords')
nltk.download('sentiwordnet')
from nltk.corpus import sentiwordnet as swn
#
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

stopwords_set = set(stopwords.words('english'))

def clean_by_freq(tokenized_words, cut_off_count):
    vocab = Counter(tokenized_words)

    uncommon_words = [key for key, value in vocab.items() if value <= cut_off_count]
    cleaned_words = [word for word in tokenized_words if word not in uncommon_words]

    return cleaned_words

def clean_by_len(tokenized_words, cut_off_length):
    cleaned_words = []

    for word in tokenized_words:
        if len(word) > cut_off_length:
            cleaned_words.append(word)

    return cleaned_words

def clean_by_stopwords(tokenized_words, stopwords_set):
    cleaned_words = []

    for word in tokenized_words:
        # 여기에 코드를 작성하세요
        if word not in stopwords_set:
            cleaned_words.append(word)

    return cleaned_words

# 포터 스테머 어간 추출 함수
def stemming_by_porter(tokenized_words):
    porter_stemmer = PorterStemmer()
    porter_stemmed_words = []

    for word in tokenized_words:
        # porter_stemmed_words.append(porter_stemmer.stem(word))
        stem = porter_stemmer.stem(word)
        porter_stemmed_words.append(stem)

    return porter_stemmed_words

# 품사 태깅 함수
def pos_tagger(tokenized_sents):
    pos_tagged_words = []
    for sentence in tokenized_sents:
        # word tokenize
        tokenized_words = word_tokenize(sentence)

        # pos
        pos_tagged = pos_tag(tokenized_words)
        pos_tagged_words.extend(pos_tagged)
    return pos_tagged_words

# 품사 태깅 변환
def penn_to_wn(tag):
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    else:
        return

# 표제어(lemmatization)
def word_lemmatizer(pos_tagged_words):
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = []
    for word, tag in pos_tagged_words:
        wn_tag = penn_to_wn(tag)
        if wn_tag in (wn.NOUN, wn.ADJ, wn.ADV, wn.VERB):
            lemmatized_words.append(lemmatizer.lemmatize(word, wn_tag))
        else:
            lemmatized_words.append(word)
    return lemmatized_words

def combine(sentence):
    return ' '.join(sentence)

def idx_encoder(tokens, word_to_idx):
    encoded_idx = []
    for token in tokens:
        idx = word_to_idx[token]
        encoded_idx.append(idx)
    return encoded_idx

def get_sentiment_score(word, pos):
    # 단어와 품사 태그를 기반으로 Synsets 구하기
    word_sentisynsets = list(swn.senti_synsets(word, pos))

    # Synsets의 첫 번째 요소의 이름으로 단일 SentiSynset 구하기

    # SentiSynset의 긍정 지수, 부정 지수 구하기
    pos_score = word_sentisynsets[0].pos_score()
    neg_score = word_sentisynsets[0].neg_score()

    # 긍정 지수 - 부정 지수로 감성 지수 값 계산해 반환하기
    sentiment_score = pos_score-neg_score

    return sentiment_score

def swn_polarity(pos_tagged_words):  # 감성 지수를 구하는 코드
    senti_score = 0

    for word, tag in pos_tagged_words:
        # PennTreeBank 기준 품사를 WordNet 기준 품사로 변경
        wn_tag = penn_to_wn(tag)
        if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV, wn.VERB):
            continue

        # Synset 확인, 어휘 사전에 없을 경우에는 스킵
        if not wn.synsets(word, wn_tag):
            continue
        else:
            synsets = wn.synsets(word, wn_tag)

        # SentiSynset 확인
        synset = synsets[0]
        swn_synset = swn.senti_synset(synset.name())

        # 감성 지수 계산. pos 감성에서 neg 감성을 빼고, 그 총합을 senti_score로 집계
        word_senti_score = (swn_synset.pos_score() - swn_synset.neg_score())
        senti_score += word_senti_score

    return senti_score

def vader_sentiment(text):
    analyzer = SentimentIntensityAnalyzer()

    senti_score = analyzer.polarity_scores(text)['compound']

    return senti_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## chapter 2. 단어 단위 전처리

In [26]:
# chapter 2, lesson 2 단어 토큰화
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')  # needed for acronym such as Mr. Dr. ...

text = "Although it's not a happily-ever-after ending, it is very realistic."

# 단어 토큰화  https://www.nltk.org/api/nltk.tokenize.html
tokenized_words = word_tokenize(text)

print(tokenized_words)

['Although', 'it', "'s", 'not', 'a', 'happily-ever-after', 'ending', ',', 'it', 'is', 'very', 'realistic', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [27]:
# chapter 2, lesson 3 단어 토큰화 실습
import nltk
#from text import TEXT
from nltk.tokenize import word_tokenize
#nltk.download('punkt')  # needed for acronym such as Mr. Dr. ...

TEXT = """Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.
There was nothing so very remarkable in that; nor did Alice think it so very much out of the way to hear the Rabbit say to itself, 'Oh dear! Oh dear! I shall be late!' (when she thought it over afterwards, it occurred to her that she ought to have wondered at this, but at the time it all seemed quite natural); but when the Rabbit actually took a watch out of its waistcoat-pocket, and looked at it, and then hurried on, Alice started to her feet, for it flashed across her mind that she had never before seen a rabbit with either a waistcoat-pocket, or a watch to take out of it, and burning with curiosity, she ran across the field after it, and fortunately was just in time to see it pop down a large rabbit-hole under the hedge.
In another moment down went Alice after it, never once considering how in the world she was to get out again.
The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down a very deep well.
Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.
"""

corpus = TEXT

# 단어 토큰화
tokenized_words = word_tokenize(corpus)

print(tokenized_words)

['Alice', 'was', 'beginning', 'to', 'get', 'very', 'tired', 'of', 'sitting', 'by', 'her', 'sister', 'on', 'the', 'bank', ',', 'and', 'of', 'having', 'nothing', 'to', 'do', ':', 'once', 'or', 'twice', 'she', 'had', 'peeped', 'into', 'the', 'book', 'her', 'sister', 'was', 'reading', ',', 'but', 'it', 'had', 'no', 'pictures', 'or', 'conversations', 'in', 'it', ',', "'and", 'what', 'is', 'the', 'use', 'of', 'a', 'book', ',', "'", 'thought', 'Alice', "'without", 'pictures', 'or', 'conversation', '?', "'", 'So', 'she', 'was', 'considering', 'in', 'her', 'own', 'mind', '(', 'as', 'well', 'as', 'she', 'could', ',', 'for', 'the', 'hot', 'day', 'made', 'her', 'feel', 'very', 'sleepy', 'and', 'stupid', ')', ',', 'whether', 'the', 'pleasure', 'of', 'making', 'a', 'daisy-chain', 'would', 'be', 'worth', 'the', 'trouble', 'of', 'getting', 'up', 'and', 'picking', 'the', 'daisies', ',', 'when', 'suddenly', 'a', 'White', 'Rabbit', 'with', 'pink', 'eyes', 'ran', 'close', 'by', 'her', '.', 'There', 'was',

In [28]:
# chapter 2, lesson 4 정제(cleaning); cleaning by word counts
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

TEXT = """After reading the comments for this movie, I am not sure whether I should be angry, sad or sickened. Seeing comments typical of people who a)know absolutely nothing about the military or b)who base everything they think they know on movies like this or on CNN reports about Abu-Gharib makes me wonder about the state of intellectual stimulation in the world. At the time I type this the number of people in the US military: 1.4 million on Active Duty with another almost 900,000 in the Guard and Reserves for a total of roughly 2.3 million. The number of people indicted for abuses at at Abu-Gharib: Currently less than 20 That makes the total of people indicted .00083% of the total military. Even if you indict every single military member that ever stepped in to Abu-Gharib, you would not come close to making that a whole number.  The flaws in this movie would take YEARS to cover. I understand that it's supposed to be sarcastic, but in reality, the writer and director are trying to make commentary about the state of the military without an enemy to fight. In reality, the US military has been at its busiest when there are not conflicts going on. The military is the first called for disaster relief and humanitarian aid missions. When the tsunami hit Indonesia, devestating the region, the US military was the first on the scene. When the chaos of the situation overwhelmed the local governments, it was military leadership who looked at their people, the same people this movie mocks, and said make it happen. Within hours, food aid was reaching isolated villages. Within days, airfields were built, cargo aircraft started landing and a food distribution system was up and running. Hours and days, not weeks and months. Yes there are unscrupulous people in the US military. But then, there are in every walk of life, every occupation. But to see people on this website decide that 2.3 million men and women are all criminal, with nothing on their minds but thoughts of destruction or mayhem is an absolute disservice to the things that they do every day. One person on this website even went so far as to say that military members are in it for personal gain. Wow! Entry level personnel make just under $8.00 an hour assuming a 40 hour work week. Of course, many work much more than 40 hours a week and those in harm's way typically put in 16-18 hour days for months on end. That makes the pay well under minimum wage. So much for personal gain. I beg you, please make yourself familiar with the world around you. Go to a nearby base, get a visitor pass and meet some of the men and women you are so quick to disparage. You would be surprised. The military no longer accepts people in lieu of prison time. They require a minimum of a GED and prefer a high school diploma. The middle ranks are expected to get a minimum of undergraduate degrees and the upper ranks are encouraged to get advanced degrees.
"""

corpus = TEXT

# token list
tokenized_words = word_tokenize(corpus)
print(len(tokenized_words))

# token count
vocab = Counter(tokenized_words)  # {words: counter}
print('vocab is ', vocab)

uncommon_words = [key for key, value in vocab.items() if value <= 2]
print('uncommon_words are ', uncommon_words)
print('frequency <= 2;', len(uncommon_words))

cleaned_by_freq = [word for word in tokenized_words if word not in uncommon_words]
print('common_words are ', cleaned_by_freq)
print('frequency >= 3; ', len(cleaned_by_freq))

579
vocab is  Counter({'the': 30, '.': 28, ',': 21, 'of': 15, 'and': 14, 'to': 13, 'a': 12, 'military': 12, 'in': 12, 'people': 9, 'on': 9, 'are': 9, 'for': 7, 'this': 7, 'that': 6, 'I': 5, 'The': 5, 'you': 5, 'not': 4, 'or': 4, 'about': 4, 'US': 4, 'at': 4, 'every': 4, 'it': 4, 'make': 4, 'was': 4, 'movie': 3, 'be': 3, 'who': 3, 'they': 3, 'Abu-Gharib': 3, 'makes': 3, 'number': 3, 'million': 3, 'with': 3, 'total': 3, 'would': 3, 'an': 3, 'there': 3, 'days': 3, 'hour': 3, 'minimum': 3, 'get': 3, 'comments': 2, ')': 2, 'know': 2, 'nothing': 2, 'base': 2, 'state': 2, 'world': 2, 'time': 2, ':': 2, '2.3': 2, 'indicted': 2, 'than': 2, 'That': 2, "'s": 2, 'but': 2, 'reality': 2, 'is': 2, 'first': 2, 'aid': 2, 'When': 2, 'their': 2, 'Within': 2, 'hours': 2, 'food': 2, 'months': 2, 'But': 2, 'website': 2, 'men': 2, 'women': 2, 'so': 2, 'personal': 2, 'gain': 2, 'under': 2, '40': 2, 'work': 2, 'week': 2, 'much': 2, 'ranks': 2, 'degrees': 2, 'After': 1, 'reading': 1, 'am': 1, 'sure': 1, 'whethe

In [29]:
# chapter 2, lesson 4 정제(cleaning)-lesson 5 실습; cleaning by word lengths
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
#from text import TEXT
#nltk.download('punkt')

TEXT = """Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.
There was nothing so very remarkable in that; nor did Alice think it so very much out of the way to hear the Rabbit say to itself, 'Oh dear! Oh dear! I shall be late!' (when she thought it over afterwards, it occurred to her that she ought to have wondered at this, but at the time it all seemed quite natural); but when the Rabbit actually took a watch out of its waistcoat-pocket, and looked at it, and then hurried on, Alice started to her feet, for it flashed across her mind that she had never before seen a rabbit with either a waistcoat-pocket, or a watch to take out of it, and burning with curiosity, she ran across the field after it, and fortunately was just in time to see it pop down a large rabbit-hole under the hedge.
In another moment down went Alice after it, never once considering how in the world she was to get out again.
The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down a very deep well.
Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.
"""

corpus = TEXT
tokenized_words = word_tokenize(corpus)

def clean_by_freq(tokenized_words, cut_off_count):
    vocab = Counter(tokenized_words)

    uncommon_words = [key for key, value in vocab.items() if value <= cut_off_count]
    cleaned_words = [word for word in tokenized_words if word not in uncommon_words]

    return cleaned_words

def clean_by_len(tokenized_words, cut_off_length):
    cleaned_words = []

    for word in tokenized_words:
        if len(word) > cut_off_length:
            cleaned_words.append(word)

    return cleaned_words

clean_by_freq = clean_by_freq(tokenized_words, 2)
cleaned_words = clean_by_len(clean_by_freq, 2)

#cleaned_words

In [30]:
# chapter 2, lesson 6 불용어(stopwords)
from nltk.corpus import stopwords
#nltk.download('stopwords')  # 코퍼스의 종류에 상관없이 많이 사용되는 불용어 179개를 제공

stopwords_set = set(stopwords.words('english'))

print('stopwords count :', len(stopwords_set))
print(stopwords_set)

stopwords_set.add('hello')
stopwords_set.remove('the')
stopwords_set.remove('me')

#print('stopwords count is', len(stopwords_set))
#print('stopwords are', stopwords_set)

cleaned_words = []

for word in cleaned_by_freq:
    if word not in stopwords_set:
        cleaned_words.append(word)

print('불용어 제거 전; ', len(cleaned_by_freq))
print('불용어 제거 후; ', len(cleaned_words))

stopwords count : 179
{'i', 'me', 'in', 'here', 'from', 'on', 'any', 'off', 'at', 'each', 'when', 'very', 'own', 'does', 'because', 'why', 'who', 'your', 'most', 'those', 'her', "aren't", 'were', 'where', 'll', 'yours', 'against', 'am', 'some', 'hadn', 'being', 'same', 'above', 'these', 'did', 'was', 'with', 'not', 'once', 'ourselves', 'doesn', 'more', 'wasn', 'myself', "don't", 'ours', "shan't", 'having', 'while', 'about', 'what', 'which', 's', "you're", 'should', 'didn', "hadn't", 'hasn', 'wouldn', "you'll", 'our', 'out', 'ma', 'y', 'shouldn', "that'll", 'mustn', "shouldn't", 'have', "wouldn't", "it's", 'do', 'both', 'won', 'over', 'themselves', 'couldn', 'for', 'been', 'whom', 'theirs', 'himself', 'she', 'itself', 'than', "weren't", "you'd", 'further', 'isn', "didn't", 'haven', 'is', 're', 'during', 'herself', 'all', 'just', 't', 'mightn', 'my', 'or', 'them', 'a', 'will', 'can', 'between', 'below', 'few', 'weren', "won't", 've', "mustn't", 'no', "should've", 'we', 'by', 'yourselves'

In [31]:
# 챕터 02.단어단위전처리 07.불용어제거실습
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#from text import TEXT
nltk.download('stopwords')
nltk.download('punkt')

TEXT = """After reading the comments for this movie, I am not sure whether I should be angry, sad or sickened. Seeing comments typical of people who a)know absolutely nothing about the military or b)who base everything they think they know on movies like this or on CNN reports about Abu-Gharib makes me wonder about the state of intellectual stimulation in the world. At the time I type this the number of people in the US military: 1.4 million on Active Duty with another almost 900,000 in the Guard and Reserves for a total of roughly 2.3 million. The number of people indicted for abuses at at Abu-Gharib: Currently less than 20 That makes the total of people indicted .00083% of the total military. Even if you indict every single military member that ever stepped in to Abu-Gharib, you would not come close to making that a whole number.  The flaws in this movie would take YEARS to cover. I understand that it's supposed to be sarcastic, but in reality, the writer and director are trying to make commentary about the state of the military without an enemy to fight. In reality, the US military has been at its busiest when there are not conflicts going on. The military is the first called for disaster relief and humanitarian aid missions. When the tsunami hit Indonesia, devestating the region, the US military was the first on the scene. When the chaos of the situation overwhelmed the local governments, it was military leadership who looked at their people, the same people this movie mocks, and said make it happen. Within hours, food aid was reaching isolated villages. Within days, airfields were built, cargo aircraft started landing and a food distribution system was up and running. Hours and days, not weeks and months. Yes there are unscrupulous people in the US military. But then, there are in every walk of life, every occupation. But to see people on this website decide that 2.3 million men and women are all criminal, with nothing on their minds but thoughts of destruction or mayhem is an absolute disservice to the things that they do every day. One person on this website even went so far as to say that military members are in it for personal gain. Wow! Entry level personnel make just under $8.00 an hour assuming a 40 hour work week. Of course, many work much more than 40 hours a week and those in harm's way typically put in 16-18 hour days for months on end. That makes the pay well under minimum wage. So much for personal gain. I beg you, please make yourself familiar with the world around you. Go to a nearby base, get a visitor pass and meet some of the men and women you are so quick to disparage. You would be surprised. The military no longer accepts people in lieu of prison time. They require a minimum of a GED and prefer a high school diploma. The middle ranks are expected to get a minimum of undergraduate degrees and the upper ranks are encouraged to get advanced degrees.
"""
corpus = TEXT
tokenized_words = word_tokenize(TEXT)

# NLTK에서 제공하는 불용어 목록을 세트 자료형으로 받아와 주세요
stopwords_set = set(stopwords.words('english'))

def clean_by_stopwords(tokenized_words, stopwords_set):
    cleaned_words = []

    for word in tokenized_words:
        # 여기에 코드를 작성하세요
        if word not in stopwords_set:
            cleaned_words.append(word)

    return cleaned_words

# 테스트 코드
#clean_by_stopwords(tokenized_words, stopwords_set)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [32]:
# 챕터 02.단어단위전처리  레슨 08.정규화
text = "What can I do for you? Do your homework now."
print(text.lower()) # 대소문자 통합

synonym_dict = {'US':'USA', 'U.S':'USA', 'Ummm':'Umm', 'Ummmm':'Umm'}
text = "She became a US citizen. Ummmm, I think, maybe and or."
normalized_words = []

tokenized_words = nltk.word_tokenize(text)

for word in tokenized_words:
    if word in synonym_dict.keys():
        word = synonym_dict[word]

    normalized_words.append(word)
print(normalized_words)

what can i do for you? do your homework now.
['She', 'became', 'a', 'USA', 'citizen', '.', 'Umm', ',', 'I', 'think', ',', 'maybe', 'and', 'or', '.']


In [6]:
# 챕터 02.단어단위전처리 레슨 09.어간추출 (stemming)
from nltk.tokenize import word_tokenize
import nltk
#nltk.download('punkt')  # needed for acronym such as Mr. Dr. ...

from nltk.stem import PorterStemmer  # 단순히 어미만 잘라내는 방식
from nltk.stem import LancasterStemmer

porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()
text = "You are so lovely. I am loving you now."
porter_stemmed_words = []
lancaster_stemmed_words = []

tokenized_words = nltk.word_tokenize(text)

for word in tokenized_words:
#    stem = porter_stemmer.stem(word)
#    porter_stemmed_words.append(stem)
    porter_stemmed_words.append(porter_stemmer.stem(word))
    lancaster_stemmed_words.append(lancaster_stemmer.stem(word))

#for word in tokenized_words:
#    stem = lancaster_stemmer.stem(word)
#    lancaster_stemmed_words.append(stem)

print('before; ', tokenized_words)
print('porter; ', porter_stemmed_words)
print('lancaster; ', lancaster_stemmed_words)



before;  ['You', 'are', 'so', 'lovely', '.', 'I', 'am', 'loving', 'you', 'now', '.']
porter;  ['you', 'are', 'so', 'love', '.', 'i', 'am', 'love', 'you', 'now', '.']
lancaster;  ['you', 'ar', 'so', 'lov', '.', 'i', 'am', 'lov', 'you', 'now', '.']


In [34]:
from nltk.stem import PorterStemmer

# 포터 스테머 어간 추출 함수
def stemming_by_porter(tokenized_words):
    porter_stemmer = PorterStemmer()
    porter_stemmed_words = []

    for word in tokenized_words:
        # porter_stemmed_words.append(porter_stemmer.stem(word))
        stem = porter_stemmer.stem(word)
        porter_stemmed_words.append(stem)

    return porter_stemmed_words

In [None]:
# chapter 2, lesson 10 어간 추출 실습
# 필요한 패키지와 함수 불러오기
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
#from text import TEXT
nltk.download('punkt')

TEXT = """After reading the comments for this movie, I am not sure whether I should be angry, sad or sickened. Seeing comments typical of people who a)know absolutely nothing about the military or b)who base everything they think they know on movies like this or on CNN reports about Abu-Gharib makes me wonder about the state of intellectual stimulation in the world. At the time I type this the number of people in the US military: 1.4 million on Active Duty with another almost 900,000 in the Guard and Reserves for a total of roughly 2.3 million. The number of people indicted for abuses at at Abu-Gharib: Currently less than 20 That makes the total of people indicted .00083% of the total military. Even if you indict every single military member that ever stepped in to Abu-Gharib, you would not come close to making that a whole number.  The flaws in this movie would take YEARS to cover. I understand that it's supposed to be sarcastic, but in reality, the writer and director are trying to make commentary about the state of the military without an enemy to fight. In reality, the US military has been at its busiest when there are not conflicts going on. The military is the first called for disaster relief and humanitarian aid missions. When the tsunami hit Indonesia, devestating the region, the US military was the first on the scene. When the chaos of the situation overwhelmed the local governments, it was military leadership who looked at their people, the same people this movie mocks, and said make it happen. Within hours, food aid was reaching isolated villages. Within days, airfields were built, cargo aircraft started landing and a food distribution system was up and running. Hours and days, not weeks and months. Yes there are unscrupulous people in the US military. But then, there are in every walk of life, every occupation. But to see people on this website decide that 2.3 million men and women are all criminal, with nothing on their minds but thoughts of destruction or mayhem is an absolute disservice to the things that they do every day. One person on this website even went so far as to say that military members are in it for personal gain. Wow! Entry level personnel make just under $8.00 an hour assuming a 40 hour work week. Of course, many work much more than 40 hours a week and those in harm's way typically put in 16-18 hour days for months on end. That makes the pay well under minimum wage. So much for personal gain. I beg you, please make yourself familiar with the world around you. Go to a nearby base, get a visitor pass and meet some of the men and women you are so quick to disparage. You would be surprised. The military no longer accepts people in lieu of prison time. They require a minimum of a GED and prefer a high school diploma. The middle ranks are expected to get a minimum of undergraduate degrees and the upper ranks are encouraged to get advanced degrees.
"""

corpus = TEXT
tokenized_words = word_tokenize(corpus)

# 포터 스테머의 어간 추출
def stemming_by_porter(tokenized_words):
    porter_stemmer = PorterStemmer()
    porter_stemmed_words = []

    for word in tokenized_words:
        porter_stemmed_words.append(porter_stemmer.stem(word))

    return porter_stemmed_words

stemming_by_porter(tokenized_words)

In [8]:
# chapter 2, lesson 11 자연어 전처리 적용1
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
#nltk.download('punkt')  # needed for acronym such as Mr. Dr. ...
from nltk.corpus import stopwords
nltk.download('stopwords')

# .py 모듈 수정 시 자동 리로드
#%load_ext autoreload
#%autoreload 2

#from preprocess import clean_by_freq
#from preprocess import clean_by_len
#from preprocess import clean_by_stopwords


df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/codeitnlp/nlp/imdb.tsv', delimiter='\\t')
# 1. normalization
df['review'] = df['review'].str.lower()

# 2. tokenize
df['word_tokens'] = df['review'].apply(word_tokenize)

# 3. cleaning
stopwords_set = set(stopwords.words('english'))
df['cleaned_tokens'] = df['word_tokens'].apply(lambda x: clean_by_freq(x, 1))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_len(x, 2))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_stopwords(x, stopwords_set))

# 4. stemming
df['stemmed_tokens'] = df['cleaned_tokens'].apply(stemming_by_porter)

df['stemmed_tokens'][0]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/codeitnlp/nlp/imdb.tsv', delimiter='\\t')


['one',
 'film',
 'said',
 'realli',
 'bad',
 'movi',
 'like',
 'said',
 'realli',
 'bad',
 'movi',
 'bad',
 'one',
 'film',
 'like']

In [9]:
# .py 모듈 수정 시 자동 리로드
%load_ext autoreload
%autoreload 2

## chapter 3. 문장 단위 전처리

In [None]:
# chapter 3, lesson 2 문장 토큰화 실습
# sentence tokenization
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

TEXT = """Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.
There was nothing so very remarkable in that; nor did Alice think it so very much out of the way to hear the Rabbit say to itself, 'Oh dear! Oh dear! I shall be late!' (when she thought it over afterwards, it occurred to her that she ought to have wondered at this, but at the time it all seemed quite natural); but when the Rabbit actually took a watch out of its waistcoat-pocket, and looked at it, and then hurried on, Alice started to her feet, for it flashed across her mind that she had never before seen a rabbit with either a waistcoat-pocket, or a watch to take out of it, and burning with curiosity, she ran across the field after it, and fortunately was just in time to see it pop down a large rabbit-hole under the hedge.
In another moment down went Alice after it, never once considering how in the world she was to get out again.
The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down a very deep well.
Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.
"""

corpus = TEXT
tokenized_sents = sent_tokenize(corpus)

tokenized_sents

In [20]:
# chapter 3, lesson 3 품사태깅(POS tagging)
from nltk.tag import pos_tag  # part of speech tagging 품사태깅. 각 품사태그는 Penn Treebank POS Tags 기준
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import nltk
#nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


TEXT = "Watching Time Chasers, it obvious that it was made by a bunch of friends. Maybe they were sitting around one day in film school and said, \"Hey, let\'s pool our money together and make a really bad movie!\" Or something like that."

corpus = TEXT
pos_tagged_words = []

tokenized_sents = sent_tokenize(corpus)
for sentence in tokenized_sents:
    # word tokenize
    tokenized_words = word_tokenize(sentence)
#    print(tokenized_words)

    # pos
    pos_tagged = pos_tag(tokenized_words)
    pos_tagged_words.extend(pos_tagged)

print(pos_tagged_words)

[('Watching', 'VBG'), ('Time', 'NNP'), ('Chasers', 'NNPS'), (',', ','), ('it', 'PRP'), ('obvious', 'VBZ'), ('that', 'IN'), ('it', 'PRP'), ('was', 'VBD'), ('made', 'VBN'), ('by', 'IN'), ('a', 'DT'), ('bunch', 'NN'), ('of', 'IN'), ('friends', 'NNS'), ('.', '.'), ('Maybe', 'RB'), ('they', 'PRP'), ('were', 'VBD'), ('sitting', 'VBG'), ('around', 'IN'), ('one', 'CD'), ('day', 'NN'), ('in', 'IN'), ('film', 'NN'), ('school', 'NN'), ('and', 'CC'), ('said', 'VBD'), (',', ','), ('``', '``'), ('Hey', 'NNP'), (',', ','), ('let', 'VB'), ("'s", 'POS'), ('pool', 'VB'), ('our', 'PRP$'), ('money', 'NN'), ('together', 'RB'), ('and', 'CC'), ('make', 'VB'), ('a', 'DT'), ('really', 'RB'), ('bad', 'JJ'), ('movie', 'NN'), ('!', '.'), ("''", "''"), ('Or', 'CC'), ('something', 'NN'), ('like', 'IN'), ('that', 'DT'), ('.', '.')]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [21]:
# chapter3, lesson5 표제어 추출 (Lemmatization); 표제어(lemma)란 사전적 어원. am, are, is -> be. 단어의 정규화

from nltk.tokenize import word_tokenize
import nltk
#nltk.download('punkt')
from nltk.tag import pos_tag  # Penn Treebank POS(part of speech) Tag
from nltk.corpus import wordnet as wn  # WordNet POS Tag
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

text = 'You are the happiest person.'
tokenize_words = word_tokenize(text)

tagged_words = pos_tag(tokenize_words)
print(tagged_words)

def penn_to_wn(tag):
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    else:
        return

lemmatizer = WordNetLemmatizer()
lemmatized_words = []

for word, tag in tagged_words:
    wn_tag = penn_to_wn(tag)
    if wn_tag in (wn.NOUN, wn.ADJ, wn.ADV, wn.VERB):
        lemmatized_words.append(lemmatizer.lemmatize(word, wn_tag))
    else:
        lemmatized_words.append(word)

print('before; ', tokenize_words)
print('after; ', lemmatized_words)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


[('You', 'PRP'), ('are', 'VBP'), ('the', 'DT'), ('happiest', 'JJS'), ('person', 'NN'), ('.', '.')]
before;  ['You', 'are', 'the', 'happiest', 'person', '.']
after;  ['You', 'be', 'the', 'happy', 'person', '.']


In [46]:
# chapter 3, lesson 7 자연어 전처리 적용 II ~ lesson 8 자연어 전처리 후 통합하기
# nlp preprocessing II

import pandas as pd
#import nltk
#from nltk.tokenize import word_tokenize
#from nltk.tokenize import sent_tokenize
#nltk.download('punkt')  # needed for acronym such as Mr. Dr. ...
#from nltk.corpus import stopwords
#nltk.download('stopwords')

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/codeitdata/imdb.tsv', delimiter="\\t")

# sentence tokenization
df['review'] = df['review'].str.lower()
df['sent_tokens'] = df['review'].apply(sent_tokenize)
#print(df['sent_tokens'][0])

# 품사 태깅 pos_tagging
df['pos_tagged_tokens'] = df['sent_tokens'].apply(pos_tagger)
#print(df['pos_tagged_tokens'][0])

# 표제어 추출 Lemmatization
df['lemmatized_tokens'] = df['pos_tagged_tokens'].apply(word_lemmatizer)
#print(df['lemmatized_tokens'][0])

#
stopwords_set = set(stopwords.words('english'))
df['cleaned_tokens'] = df['lemmatized_tokens'].apply(lambda x: clean_by_freq(x, 1))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_len(x, 2))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_stopwords(x, stopwords_set))
df[['cleaned_tokens']]

# combination
#df['combined_corpus'] = df['cleaned_tokens'].apply(combine)
#df[['combined_corpus']]

  df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/codeitdata/imdb.tsv', delimiter="\\t")


Unnamed: 0,cleaned_tokens
0,"[make, one, film, say, make, really, bad, movi..."
1,"[film, film]"
2,"[new, york, joan, barnard, elvire, audrey, bar..."
3,"[film, film, jump, send, n't, jump, radio, n't..."
4,"[site, movie, bad, even, movie, movie, make, m..."
5,"[ehle, northam, wonderful, wonderful, ehle, no..."
6,"[role, movie, n't, author, book, funny, author..."
7,"[plane, ceo, search, rescue, mission, call, ce..."
8,"[gritty, movie, movie, keep, sci-fi, good, kee..."
9,"[girl, girl]"


In [44]:
# Test APPLY function
# apply를 하면, 데이터 구조를 따로 고려하지 않고도, 리스트의 엔티티별로 해당 함수를 적용해줌

import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
#nltk.download('punkt')  # needed for acronym such as Mr. Dr. ...

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/codeitdata/imdb.tsv', delimiter="\\t")
tokens = []

df['review'] = df['review'].str.lower()
df['review'][0]
for i in range(0, 10):
    # print(sent_tokenize(df['review'][i]))
    tokens.append(sent_tokenize(df['review'][i]))

print(tokens[4])
#df['sent_tokens'] = sent_tokenize(df['review'][0])
#df['sent_tokens']

['"yes, i agree with everyone on this site this movie is very very bad.', 'to even call this a movie is an insult to all movies ever made.', "it's 40 minutes long.", 'someone compares this movie to an after school special.', 'b-i-n-g-o!', 'that describes is perfectly.', 'the packaging for this movie intentionally is misleading.', 'for example, the title of this movie should describe the movie.', 'rubberface???', 'that should be the first hint.', 'it was retitled with a new package of some goofy face jim probably made in his stand-up days.', 'i was hoping for more stand-up from jim.', 'if you like jim now as an actor.', 'you would love him in his stand up days.', 'still trying to locate the rodney dangerfield young comedians special from hbo that featured jim in his early career days.', "it isn't even mentioned on this site.", "i'd love to find anything jim did stand-up wise.", 'also jim carrey is a supporting actor in this movie.', 'the main character is very very annoying.', 'she is s

  df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/codeitdata/imdb.tsv', delimiter="\\t")


## chapter 4. 자연어 숫자로 표현하기

In [55]:
# chapter 4, lesson 1 정수 인코딩 Integer Encoding; 텍스트를 숫자데이터로 변환하는 방법. 토큰화된 각 단어에 특정 정수를 매핑

#tokens = df['cleaned_tokens'][4]
tokens = sum(df['cleaned_tokens'], [])  # whole token summed list
#print('tokens are ', tokens)

vocab = Counter(tokens)
#print('vocab is ', vocab)
vocab = vocab.most_common()
#print('most_common vocab is ', vocab)

word_to_idx = {}
i = 0

for (word, frequency) in vocab:
    i += 1  # 0은 아무 의미 없는 (무시되는) 정수를 위해 남겨두고, 1부터 시작
    word_to_idx[word] = i

print(word_to_idx)

{'movie': 1, 'film': 2, "n't": 3, 'scene': 4, 'bad': 5, 'time': 6, 'reason': 7, 'make': 8, 'jim': 9, 'good': 10, 'one': 11, 'like': 12, 'could': 13, "'re": 14, 'quastel': 15, 'really': 16, 'even': 17, 'monster': 18, 'joan': 19, 'love': 20, 'author': 21, 'try': 22, 'dialogue': 23, 'idea': 24, 'italy': 25, 'colleague': 26, 'maggot': 27, 'end': 28, 'watch': 29, 'jump': 30, 'radio': 31, 'stand-up': 32, 'day': 33, 'terrible': 34, 'ehle': 35, 'northam': 36, 'search': 37, 'rescue': 38, 'call': 39, 'knowles': 40, 'henriksen': 41, 'easily': 42, 'see': 43, 'appear': 44, 'get': 45, 'character': 46, 'think': 47, 'use': 48, 'whether': 49, 'need': 50, 'though': 51, 'sci-fi': 52, 'look': 53, 'say': 54, 'new': 55, 'york': 56, 'barnard': 57, 'elvire': 58, 'audrey': 59, 'john': 60, 'saxon': 61, 'etruscan': 62, 'tomb': 63, 'drug': 64, 'story': 65, 'romantic': 66, 'waste': 67, 'etrusco': 68, 'send': 69, 'reporter': 70, 'fear': 71, 'site': 72, 'special': 73, 'describe': 74, 'actor': 75, 'stand': 76, 'comed

In [None]:
def idx_encoder(tokens, word_to_idx):
    encoded_idx = []
    for token in tokens:
        idx = word_to_idx[token]
        encoded_idx.append(idx)
    return encoded_idx

df['integer_encoded'] = df['cleaned_tokens'].apply(lambda x: idx_encoder(x, word_to_idx))
print(df[['integer_encoded']])


In [None]:
# chapter 4, lesson 2 정수 인코딩 실습
TEXT = """Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.
There was nothing so very remarkable in that; nor did Alice think it so very much out of the way to hear the Rabbit say to itself, 'Oh dear! Oh dear! I shall be late!' (when she thought it over afterwards, it occurred to her that she ought to have wondered at this, but at the time it all seemed quite natural); but when the Rabbit actually took a watch out of its waistcoat-pocket, and looked at it, and then hurried on, Alice started to her feet, for it flashed across her mind that she had never before seen a rabbit with either a waistcoat-pocket, or a watch to take out of it, and burning with curiosity, she ran across the field after it, and fortunately was just in time to see it pop down a large rabbit-hole under the hedge.
In another moment down went Alice after it, never once considering how in the world she was to get out again.
The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down a very deep well.
Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.
"""

word_to_idx = {} # 단어 별 인덱스 부여하기 위한 딕셔너리
i = 0
encoded_idx = [] # 각 토큰의 정수 인덱스를 부여하기 위한 리스트
corpus = TEXT

tokenized_words = word_tokenize(corpus)

# 단어의 빈도수를 계산하여 정렬하는 코드를 작성하세요
vocab = Counter(tokenized_words)
vocab = vocab.most_common()

for (word, frequency) in vocab:
    # 여기에 코드를 작성하세요
    i += 1
    word_to_idx[word] = i

for word in tokenized_words:
    # 여기에 코드를 작성하세요
    idx = word_to_idx[word]
    encoded_idx.append(idx)

# 테스트 코드
encoded_idx

## chapter 5. 감성 분석 sentiment analysis
감성 분석을 위해선 코퍼스에 포함되는 특정 단어의 감성을 판단하는 기준이 필요

### 기준을 만드는 접근법 2가지
* 규칙 기반 감성 분석; 감성 어휘 사전(사람이 특정 단어를 보고 직접 긍정/부정/중립 수치를 기재해 놓은 단어들의 집합)을 기준으로 단어의 긍부정을 분류
* 머신러닝 기반 감성 분석; 다수의 코퍼스들을 통해 긍정단어와 부정단어를 구분하는 모델을 (당연히 정답 데이터로) 학습시켜 그 모델을 기반으로 감성지수를 확인

### 어휘사전 (nltk.corpus)
대표적인 감성 어휘 사전으로 SentiWordNet이 있음. WordNet/SentiWordNet은 NLTK에서 제공하는 대규모 영어 어휘 사전. 단어의 품사에 따라 감성 지수는 달라지므로, 품사의 synset을 정확히 지정하는 것이 중요
* WordNet/Synset(Sets of Cognitive Synonyms)
    * 단어, 품사, 순번; 일반적으로 더 많이 사용되는 의미가 앞 순번으로 부여. 의미가 비슷한 다른 단어들도 목록에 포함
* SentiWordNet/SentiSynset(WordNet과 유사하나, 0~1사이의 긍정지수/부정지수/객관성지수를 할당. 긍정$-$부정으로 판단)
    * 긍정지수 pos_score, 부정지수 neg_score, 객관성지수 obj_score

### **VADER(Valence Aware Dictionary and sEntiment Reasoner)**
* 감성 분석을 위한 어휘 사전이자 알고리즘
* SentiWordNet과의 큰 차이점은 일반적인 감성 어휘 사전의 규칙 외에도 축약형과 기호 등을 고려해 감성 지수를 추출할 수 있다는 점
* 그래서 주로 축약형 표현이나 특수 문자가 많이 사용된 소셜 미디어 텍스트를 분석할 때 자주 사용됨
* SentiWordNet은 단어의 감성지수만 확인할 수 있기때문에, 코퍼스의 감성 지수도 각 단어의 감성 지수 합으로 계산했었으나, VADER는 단어, 문장, 여러문장으로 이루어진 코퍼스로 바로 감성 지수를 계산할 수 있음
* 심지어 코퍼스를 단어 단위로 토큰화해 파라미터로 전달할 필요 없이, VADER 내부 동작에서 필요한 토큰화와 감성 지수 추출 작업을 알아서 해 줌
* 결과를 딕셔너리 형태의 점수들로 반환 (neg;부정, neu;중립, pos;긍정, compound; 세 지수 조합 (-1 부정~1 긍정)

In [1]:
# chapter 5, lesson 3 SentiWordNet
# WordNet의 Synset과 SentiWordNet의 SentiSynset의 결과 비교
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import pos_tag

nltk.download('wordnet')
nltk.download('sentiwordnet')
nltk.download('omw-1.4')

#word = 'happy'
word = 'hard'
#print("wordnet-{}: ".format(word), wn.synsets(word))
#print("sentiwordnet-{}: ".format(word), list(swn.senti_synsets(word)))

# happy의 긍정, 부정, 중립 지수 확인하기
word_sentisynsets = list(swn.senti_synsets(word))

pos_score = word_sentisynsets[0].pos_score()
neg_score = word_sentisynsets[0].neg_score()
obj_score = word_sentisynsets[0].obj_score()

print(pos_score, neg_score, obj_score)
sentiment_score = pos_score - neg_score
print('sentiment_score = pos_score - neg_score: ', sentiment_score)


# 품사 별 감성 지수 비교
adj_synsets = wn.synsets(word, wn.ADJ)
print('adj_synsets of {} is ...\n'.format(word), adj_synsets)
adv_synsets = wn.synsets(word, wn.ADV)
print('adv_synsets of {} is ...\n'.format(word), adv_synsets)

adj_synset = adj_synsets[0]
adv_synset = adv_synsets[0]

adj_senti_synset = swn.senti_synset(adj_synset.name())
adv_senti_synset = swn.senti_synset(adv_synset.name())
print(adj_senti_synset, adv_senti_synset)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


0.0 0.75 0.25
sentiment_score = pos_score - neg_score:  -0.75
adj_synsets of hard is ...
 [Synset('difficult.a.01'), Synset('hard.a.02'), Synset('hard.a.03'), Synset('hard.s.04'), Synset('arduous.s.01'), Synset('unvoiced.a.01'), Synset('hard.a.07'), Synset('hard.a.08'), Synset('intemperate.s.03'), Synset('hard.s.10'), Synset('hard.s.11'), Synset('hard.s.12')]
adv_synsets of hard is ...
 [Synset('hard.r.01'), Synset('hard.r.02'), Synset('hard.r.03'), Synset('hard.r.04'), Synset('hard.r.05'), Synset('heavily.r.07'), Synset('hard.r.07'), Synset('hard.r.08'), Synset('hard.r.09'), Synset('hard.r.10')]
<difficult.a.01: PosScore=0.0 NegScore=0.75> <hard.r.01: PosScore=0.125 NegScore=0.125>


In [25]:
# chapter 5,
from nltk.corpus import sentiwordnet as swn
word = 'love'
pos = wn.VERB

word_synsets = wn.synsets(word, pos)

word_synset = word_synsets[0]
word_senti_synset = swn.senti_synset(word_synset.name())

pos_score = word_senti_synset.pos_score()
neg_score = word_senti_synset.neg_score()

sentiment_score = pos_score - neg_score
print(sentiment_score)

# 또는 --------
word_sentisynsets = list(swn.senti_synsets(word, pos))
pos_score = word_sentisynsets[0].pos_score()
neg_score = word_sentisynsets[0].neg_score()
print(pos_score-neg_score)
# --------

#word_synsets =  wn.synsets(word, pos)
#word_synset = word_synsets[0]
#print(word_synset)
#word_senti_synset = swn.senti_synset(word_synsets[0].name())
#word_senti_synset

0.5
0.5


In [11]:
# chapter 5, lesson 5 감성 분석 적용
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/codeitnlp/nlp/imdb.tsv', delimiter='\\t')
#print('df is \n', df)

# sentence tokenization; sentence 별로 분리하는 작업
df['review'] = df['review'].str.lower()
df['sent_tokens'] = df['review'].apply(sent_tokenize)
#print("df['sent_tokens'] is \n", df['sent_tokens'])

# 품사 태깅 pos_tagging
df['pos_tagged_tokens'] = df['sent_tokens'].apply(pos_tagger)
#print("df['pos_tagged_tokens'] is \n", df['pos_tagged_tokens'])

# 표제어 추출 Lemmatization
df['lemmatized_tokens'] = df['pos_tagged_tokens'].apply(word_lemmatizer)
#print(df['lemmatized_tokens'][0])

#
stopwords_set = set(stopwords.words('english'))
df['cleaned_tokens'] = df['lemmatized_tokens'].apply(lambda x: clean_by_freq(x, 1))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_len(x, 2))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_stopwords(x, stopwords_set))

pos_tagged_words = df['pos_tagged_tokens'][0]
senti_score = 0

for word, tag in pos_tagged_words:
    wn_tag = penn_to_wn(tag)  # penn tree bank 기준 품사를 wordnet 기준 품사로 변경
    if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV, wn.VERB):
        continue

    if not wn.synsets(word, wn_tag):
        continue
    else:
        synsets = wn.synsets(word, wn_tag)

    synset = synsets[0]
    #    print(synset.name())
    swn_synset = swn.senti_synset(synset.name())

    word_senti_score = (swn_synset.pos_score() - swn_synset.neg_score())
    senti_score += word_senti_score

print(senti_score)

-0.375


  df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/codeitnlp/nlp/imdb.tsv', delimiter='\\t')


In [37]:
# chapter 5, lesson 6 감성 분석 결과 확인
#from preprocess import swn_polarity
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/codeitnlp/nlp/imdb.tsv', delimiter='\\t')
#print('df is \n', df)
index = 8

# sentence tokenization; sentence 별로 분리하는 작업
df['review'] = df['review'].str.lower()
df['sent_tokens'] = df['review'].apply(sent_tokenize)
#print("df['sent_tokens'] is \n", df['sent_tokens'][index])

# 문장 안에서의 품사 태깅 pos_tagging (pos ; part of speech)
df['pos_tagged_tokens'] = df['sent_tokens'].apply(pos_tagger)
#print("df['pos_tagged_tokens'][{}] is {}\n".format(index, df['pos_tagged_tokens'][index]))
#print("df['pos_tagged_tokens'] is \n", df['pos_tagged_tokens'])

df['swn_sentiment'] = df['pos_tagged_tokens'].apply(swn_polarity)
print('index; {}. sentiment; {}'.format(index, df.iloc[index][['review', 'swn_sentiment']]))

#df['review'][index]
print("df['review'] is ", df['review'])
#swn_polarity['review'][1]

index; 8. sentiment; review           a well made, gritty science fiction movie, it ...
swn_sentiment                                                  4.5
Name: 8, dtype: object
df['review'] is  0    "watching time chasers, it obvious that it was...
1    i saw this film about 20 years ago and remembe...
2    minor spoilers in new york, joan barnard (elvi...
3    i went to see this film with a great deal of e...
4    "yes, i agree with everyone on this site this ...
5    "jennifer ehle was sparkling in \""pride and p...
6    amy poehler is a terrific comedian on saturday...
7    "a plane carrying employees of a large biotech...
8    a well made, gritty science fiction movie, it ...
9    "incredibly dumb and utterly predictable story...
Name: review, dtype: object


  df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/codeitnlp/nlp/imdb.tsv', delimiter='\\t')


In [None]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize
# import pos_tagger, penn_to_wn from preprocess
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
# download nltk.download('punkt', 'wordnet', 'sentiwordnet', averaged_perception_tagger')

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/codeitnlp/nlp/imdb.tsv', delimiter='\\t')
df['sent_tokens'] = df['review'].apply(sent_tokenize)  # tokenize sentense
df['pos_tagged_tokens'] = df['sent_tokens'].apply(pos_tagger)  # tag 품사 to part of speech
#df['pos_tagged_tokens']

  df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/codeitnlp/nlp/imdb.tsv', delimiter='\\t')


Unnamed: 0,review
0,"""Watching Time Chasers, it obvious that it was..."
1,I saw this film about 20 years ago and remembe...
2,"Minor Spoilers In New York, Joan Barnard (Elvi..."
3,I went to see this film with a great deal of e...
4,"""Yes, I agree with everyone on this site this ..."
5,"""Jennifer Ehle was sparkling in \""""Pride and P..."
6,Amy Poehler is a terrific comedian on Saturday...
7,"""A plane carrying employees of a large biotech..."
8,"A well made, gritty science fiction movie, it ..."
9,"""Incredibly dumb and utterly predictable story..."


In [None]:
# chapter 5, lesson 8 VADER
!pip install vaderSentiment

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 KB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [38]:
# chapter 5, lesson 8 VADER
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

senti_analyzer = SentimentIntensityAnalyzer()

text1 = "This is a great movie!"
text2 = "This is a terrible movie!"
text3 = "This movie was just okay."

senti_scores_text1 = senti_analyzer.polarity_scores(text1)
senti_scores_text2 = senti_analyzer.polarity_scores(text2)
senti_scores_text3 = senti_analyzer.polarity_scores(text3)

print(senti_scores_text1)
print(senti_scores_text2)
print(senti_scores_text3)

def vader_sentiment(text):
    analyzer = SentimentIntensityAnalyzer()

    senti_score = analyzer.polarity_scores(text)['compound']

    return senti_score

df['vader_sentiment'] = df['review'].apply(vader_sentiment)
df[['review', 'swn_sentiment', 'vader_sentiment']]

{'neg': 0.0, 'neu': 0.406, 'pos': 0.594, 'compound': 0.6588}
{'neg': 0.531, 'neu': 0.469, 'pos': 0.0, 'compound': -0.5255}
{'neg': 0.0, 'neu': 0.678, 'pos': 0.322, 'compound': 0.2263}


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,review,swn_sentiment,vader_sentiment
0,"""watching time chasers, it obvious that it was...",-0.375,-0.9095
1,i saw this film about 20 years ago and remembe...,-1.5,-0.9694
2,"minor spoilers in new york, joan barnard (elvi...",-2.25,-0.2794
3,i went to see this film with a great deal of e...,-0.5,-0.9707
4,"""yes, i agree with everyone on this site this ...",3.0,0.8049
5,"""jennifer ehle was sparkling in \""""pride and p...",6.75,0.9494
6,amy poehler is a terrific comedian on saturday...,0.75,0.8473
7,"""a plane carrying employees of a large biotech...",8.75,0.9885
8,"a well made, gritty science fiction movie, it ...",4.5,0.9887
9,"""incredibly dumb and utterly predictable story...",-1.125,-0.7375


In [40]:
# chapter 5, lesson 9 VADER 감정 분석 실습
import nltk
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/codeitnlp/nlp/imdb.tsv', delimiter='\\t')
analyzer = SentimentIntensityAnalyzer()

def vader_sentiment(text):
    # 여기에 코드를 작성하세요
    senti_score = analyzer.polarity_scores(text)['compound']
    return senti_score

df['senti_score'] = df['review'].apply(vader_sentiment)
df

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
  df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/codeitnlp/nlp/imdb.tsv', delimiter='\\t')


Unnamed: 0,review,senti_score
0,"""Watching Time Chasers, it obvious that it was...",-0.9095
1,I saw this film about 20 years ago and remembe...,-0.9694
2,"Minor Spoilers In New York, Joan Barnard (Elvi...",-0.2794
3,I went to see this film with a great deal of e...,-0.9707
4,"""Yes, I agree with everyone on this site this ...",0.398
5,"""Jennifer Ehle was sparkling in \""""Pride and P...",0.9494
6,Amy Poehler is a terrific comedian on Saturday...,0.8473
7,"""A plane carrying employees of a large biotech...",0.9864
8,"A well made, gritty science fiction movie, it ...",0.9887
9,"""Incredibly dumb and utterly predictable story...",-0.7375


## chapter 6. 한국어 자연어 처리
### 띄어쓰기교정
* 한국어의 경우 띄어쓰기가 지켜지지 않아도 이해하기가 힘들지 않아, 오히려 띄어쓰기가 잘 안 지켜지는 경우가 많음.
* 띄어쓰기가 어긋난 데이터로 분석을 진행하면 전처리의 토큰화 과정부터 오류가 발생할 수 있어
* 전처리 단계에서 띄어쓰기를 미리 교정해야 함
* 무료 띄어쓰기 도구; py-hanspell (네이버 맞춤법 검사기를 이용)
    * colab에서 인스톨 한 후, spell_checker의 동작이 정상적이지 않을 수 있음
    * 이는, 네이버 맞춤법 검사기의 passportKey와 _callback 값이 매일 바뀌기 때문
    * passportKey와 _callback 값은 아래 방법으로 취득
        * 네이버 맞춤법 web 접근
        * F12 로 코드 확인
        * passportKey와 _callback을 검색
    * 그날 그날 올바른 passportKey와 _callback 값을, spell_checker.py (/usr/local/lib/python3.10/dist-packages/hanspell/spell_checkr.py) 의 payload 변수에 다음과 같이 변경해 주어야 함
    ```
    payload = {
            'passportKey': '737d31055a3116cb5b66f233997f2bb7f86d988b',
            '_callback': 'jQuery224020524192873615132_1701671014045',
            'color_blindness': '0',
            'q': text
        }

    ```
    * **변경 후 반드시 런타임을 재실행해야함**
    

### 형태소분석
* 단어의 어근과 접사를 분리
* 형태소 분석을 위한 많은 분석기가 공개되어 있고, 대표적인 한국어 형태소 분석기는 KoNLPy
* KoNLPy로, 문장분리, 형태소분석, 어간추출, 의미역추출, 개체명인식 등을 손쉽게 할 수 있음
* 그 외의 형태소 분석 도구들
    * [soynlp](https://github.com/lovit/soynlp); L tokenizer, MaxScoreTokenizer 등 다양한 형태소 분석기 제공
    * [Khaiii](https://tech.kakao.com/2018/12/13/khaiii/); 2018년 카카오가 공개한 오픈소스 한국어 형태소 분석기
    * [Google sentencepiece](https://github.com/google/sentencepiece); 2018년에 구굴에서 공개한 형태소 분석 패키지

### 양질의데이터확보
* 사용인구가 다른 대표적인 언어보다 적고, 한국어의 독특한 특징때문에 전처리 작업이 까다로와 양질의 데이터 확보가 어려움
* 공개되어 있는 양질의 한국어 데이터
    * [KorQuAD](https://korquad.github.io/) (Korean Questions and Answers Dataset); 2018 LG CNS. 위키피디아와 전통적인 인쇄 및 전자 출판물 자료 기반. version2.0은 1.0보다 2만쌍 추가된 12만쌍
    * [네이버 영화 리뷰](https://github.com/e9t/nsmc/); 감성 지수에 대한 레이블이 함께 기록. 한국어 감성 분석용. 20만개 데이터중 10만개가 긍정, 10만개가 부정
    * [한국어 위키](https://ko.wikipedia.org/wiki/%EC%9C%84%ED%82%A4%EB%B0%B1%EA%B3%BC:%EB%8C%80%EB%AC%B8); 현재 한국어 코퍼스 중 가장 많은 양의 데이터를 보유. 다운로드할 수 있게 되어 있지 않아서, 필요하면 크롤링을 해야 함


In [8]:
!pip3 install --upgrade pip
#!pip install py-hanspell
# colab의 경우 아래와 같이 인스톨
#!pip install git+https://github.com/ssut/py-hanspell.git

!pip3 install konlpy

[0m

In [3]:
# chapter 6, lesson 1 띄어쓰기 교정
#import sys
#sys.path.append('/content/drive/MyDrive/Colab Notebooks/py-hanspell-master/')
#sys.path.append('/content/drive/MyDrive/Colab Notebooks/py-hanspell-master/hanspell/')
from hanspell import spell_checker

text = "아버지가방에들어가신다나는오늘코딩을했다"

hanspell_sent = spell_checker.check(text)
print(hanspell_sent.checked)

아버지가 방에 들어가신다 나는 오늘 코딩을 했다


In [9]:
# chapter 6, lesson 2 형태소 분석
import konlpy
from konlpy.tag import Kkma, Komoran, Okt, Hannanum  # 형태소 분석기. 이 외에도 mecab가 유명

kkma = Kkma()
komoran = Komoran()
okt = Okt()
hannanum = Hannanum()

text = "아버지가 방에 들어가신다 나는 오늘 코딩을 했다"

print("Kkma: ", kkma.morphs(text))
print("Komoran: ", komoran.morphs(text))
print("Okt: ", okt.morphs(text))
print("Hannanum: ", hannanum.morphs(text))

Kkma:  ['아버지', '가', '방', '에', '들어가', '시', 'ㄴ다', '나', '는', '오늘', '코딩', '을', '하', '었', '다']
Komoran:  ['아버지', '가', '방', '에', '들어가', '시', 'ㄴ다', '나', '는', '오늘', '코', '딩', '을', '하', '았', '다']
Okt:  ['아버지', '가', '방', '에', '들어가신다', '나', '는', '오늘', '코딩', '을', '했다']
Hannanum:  ['아버지', '가', '방', '에', '들', '어', '가', '시ㄴ다', '나', '는', '오늘', '코딩', '을', '하', '었다']
