# 자연어전처리과정

## 단어 단위의 전처리
* 단어 토큰화(word tokenization); 띄어쓰기, 문장기호(', , 등)을 기준으로 단어 리스트화
* 정제(cleaning); 코퍼스(말뭉치)에서 의미 없거나 목적에 접합하지 않은 단어를 제거 (예. 빈도수 2 이하인 단어, 길이가 2 이하인 단어 등)
* 불용어(stopwords) 정의; 의미 없거나 목적에서 벗어나는 단어(목적에 맞게 정의하여)를 제거 (예. do, then, wha, she, am, are ...)
* 정규화(normalization); 형태는 다르지만 같은 의미로 사용되는 단어를 하나로 통일 (예. US, USA, U.S., United States of America ...)
* 어간추출(stemming); 특정단어의 핵심이 되는 부분(어간)을 찾아 정규화(예. alize->al, ational->ate, ate->제거 ment->제거 등). porter stemmer, lancaster stemmer 등이 있음

## 문장 단위의 전처리
* 문장 토큰화(sentence tokenization); 코퍼스를 문장 단위로 토큰화. 마침표를 기준으로 토큰화. dr. mr. 의 마침표는 문장으로 인식하지 않아야 함
* 품사 태깅(POS; part of speech tagging); 문장 안에서의 단어의 품사를 태깅
* 표제어추출(Lemmatization); 단어의 사전적 어원 태깅 (예. happyiest->happy, am, are, is->be)
 

In [3]:
# library import and function definition. preprocess.py
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
nltk.download('punkt')  # needed for acronym such as Mr. Dr. ...
nltk.download('averaged_perceptron_tagger')  # for tagger
from collections import Counter
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
nltk.download('stopwords')

stopwords_set = set(stopwords.words('english'))

def clean_by_freq(tokenized_words, cut_off_count):
    vocab = Counter(tokenized_words)

    uncommon_words = [key for key, value in vocab.items() if value <= cut_off_count]
    cleaned_words = [word for word in tokenized_words if word not in uncommon_words]

    return cleaned_words

def clean_by_len(tokenized_words, cut_off_length):
    cleaned_words = []

    for word in tokenized_words:
        if len(word) > cut_off_length:
            cleaned_words.append(word)

    return cleaned_words

def clean_by_stopwords(tokenized_words, stopwords_set):
    cleaned_words = []

    for word in tokenized_words:
        # 여기에 코드를 작성하세요
        if word not in stopwords_set:
            cleaned_words.append(word)

    return cleaned_words

# 포터 스테머 어간 추출 함수
def stemming_by_porter(tokenized_words):
    porter_stemmer = PorterStemmer()
    porter_stemmed_words = []

    for word in tokenized_words:
        # porter_stemmed_words.append(porter_stemmer.stem(word))
        stem = porter_stemmer.stem(word)
        porter_stemmed_words.append(stem)

    return porter_stemmed_words

# 품사 태깅 함수
def pos_tagger(tokenized_sents):
    pos_tagged_words = []
    for sentence in tokenized_sents:
        # word tokenize
        tokenized_words = word_tokenize(sentence)

        # pos
        pos_tagged = pos_tag(tokenized_words)
        pos_tagged_words.extend(pos_tagged)
    return pos_tagged_words

# 품사 태깅 변환
def penn_to_wn(tag):
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    else:
        return

# 표제어(lemmatization)
def word_lemmatizer(pos_tagged_words):
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = []
    for word, tag in pos_tagged_words:
        wn_tag = penn_to_wn(tag)
        if wn_tag in (wn.NOUN, wn.ADJ, wn.ADV, wn.VERB):
            lemmatized_words.append(lemmatizer.lemmatize(word, wn_tag))
        else:
            lemmatized_words.append(word)
    return lemmatized_words

def combine(sentence):
    return ' '.join(sentence)

def idx_encoder(tokens, word_to_idx):
    encoded_idx = []
    for token in tokens:
        idx = word_to_idx[token]
        encoded_idx.append(idx)
    return encoded_idx

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')  # needed for acronym such as Mr. Dr. ...

text = "Although it's not a happily-ever-after ending, it is very realistic."

# 단어 토큰화  https://www.nltk.org/api/nltk.tokenize.html
tokenized_words = word_tokenize(text)

print(tokenized_words)

In [None]:
import nltk
#from text import TEXT
from nltk.tokenize import word_tokenize
#nltk.download('punkt')  # needed for acronym such as Mr. Dr. ...

TEXT = """Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.
There was nothing so very remarkable in that; nor did Alice think it so very much out of the way to hear the Rabbit say to itself, 'Oh dear! Oh dear! I shall be late!' (when she thought it over afterwards, it occurred to her that she ought to have wondered at this, but at the time it all seemed quite natural); but when the Rabbit actually took a watch out of its waistcoat-pocket, and looked at it, and then hurried on, Alice started to her feet, for it flashed across her mind that she had never before seen a rabbit with either a waistcoat-pocket, or a watch to take out of it, and burning with curiosity, she ran across the field after it, and fortunately was just in time to see it pop down a large rabbit-hole under the hedge.
In another moment down went Alice after it, never once considering how in the world she was to get out again.
The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down a very deep well.
Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.
"""

corpus = TEXT

# 단어 토큰화
tokenized_words = word_tokenize(corpus)

print(tokenized_words)

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter

TEXT = """After reading the comments for this movie, I am not sure whether I should be angry, sad or sickened. Seeing comments typical of people who a)know absolutely nothing about the military or b)who base everything they think they know on movies like this or on CNN reports about Abu-Gharib makes me wonder about the state of intellectual stimulation in the world. At the time I type this the number of people in the US military: 1.4 million on Active Duty with another almost 900,000 in the Guard and Reserves for a total of roughly 2.3 million. The number of people indicted for abuses at at Abu-Gharib: Currently less than 20 That makes the total of people indicted .00083% of the total military. Even if you indict every single military member that ever stepped in to Abu-Gharib, you would not come close to making that a whole number.  The flaws in this movie would take YEARS to cover. I understand that it's supposed to be sarcastic, but in reality, the writer and director are trying to make commentary about the state of the military without an enemy to fight. In reality, the US military has been at its busiest when there are not conflicts going on. The military is the first called for disaster relief and humanitarian aid missions. When the tsunami hit Indonesia, devestating the region, the US military was the first on the scene. When the chaos of the situation overwhelmed the local governments, it was military leadership who looked at their people, the same people this movie mocks, and said make it happen. Within hours, food aid was reaching isolated villages. Within days, airfields were built, cargo aircraft started landing and a food distribution system was up and running. Hours and days, not weeks and months. Yes there are unscrupulous people in the US military. But then, there are in every walk of life, every occupation. But to see people on this website decide that 2.3 million men and women are all criminal, with nothing on their minds but thoughts of destruction or mayhem is an absolute disservice to the things that they do every day. One person on this website even went so far as to say that military members are in it for personal gain. Wow! Entry level personnel make just under $8.00 an hour assuming a 40 hour work week. Of course, many work much more than 40 hours a week and those in harm's way typically put in 16-18 hour days for months on end. That makes the pay well under minimum wage. So much for personal gain. I beg you, please make yourself familiar with the world around you. Go to a nearby base, get a visitor pass and meet some of the men and women you are so quick to disparage. You would be surprised. The military no longer accepts people in lieu of prison time. They require a minimum of a GED and prefer a high school diploma. The middle ranks are expected to get a minimum of undergraduate degrees and the upper ranks are encouraged to get advanced degrees.
"""

corpus = TEXT

# token list
tokenized_words = word_tokenize(corpus)
#print(tokenized_words)

# token count
vocab = Counter(tokenized_words)
print(len(vocab))

uncommon_words = [key for key, value in vocab.items() if value <= 2]
print('frequency <= 2;', len(uncommon_words))

cleaned_by_freq = [word for word in tokenized_words if word not in uncommon_words]
print('frequency >= 3; ', len(cleaned_by_freq))

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
#from text import TEXT
#nltk.download('punkt')

TEXT = """Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.
There was nothing so very remarkable in that; nor did Alice think it so very much out of the way to hear the Rabbit say to itself, 'Oh dear! Oh dear! I shall be late!' (when she thought it over afterwards, it occurred to her that she ought to have wondered at this, but at the time it all seemed quite natural); but when the Rabbit actually took a watch out of its waistcoat-pocket, and looked at it, and then hurried on, Alice started to her feet, for it flashed across her mind that she had never before seen a rabbit with either a waistcoat-pocket, or a watch to take out of it, and burning with curiosity, she ran across the field after it, and fortunately was just in time to see it pop down a large rabbit-hole under the hedge.
In another moment down went Alice after it, never once considering how in the world she was to get out again.
The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down a very deep well.
Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.
"""

corpus = TEXT
tokenized_words = word_tokenize(corpus)

def clean_by_freq(tokenized_words, cut_off_count):
    vocab = Counter(tokenized_words)

    uncommon_words = [key for key, value in vocab.items() if value <= cut_off_count]
    cleaned_words = [word for word in tokenized_words if word not in uncommon_words]

    return cleaned_words

def clean_by_len(tokenized_words, cut_off_length):
    cleaned_words = []

    for word in tokenized_words:
        if len(word) > cut_off_length:
            cleaned_words.append(word)

    return cleaned_words

clean_by_freq = clean_by_freq(tokenized_words, 2)
cleaned_words = clean_by_len(clean_by_freq, 2)

#cleaned_words

In [None]:
from nltk.corpus import stopwords
#nltk.download('stopwords')

stopwords_set = set(stopwords.words('english'))

print('stopwords count :', len(stopwords_set))
#print(stopwords_set)

stopwords_set.add('hello')
stopwords_set.remove('the')
stopwords_set.remove('me')

#print('stopwords count is', len(stopwords_set))
#print('stopwords are', stopwords_set)

cleaned_words = []

for word in cleaned_by_freq:
    if word not in stopwords_set:
        cleaned_words.append(word)

print('불용어 제거 전; ', len(cleaned_by_freq))
print('불용어 제거 후; ', len(cleaned_words))

In [None]:
# 챕터 02.단어단위전처리 07.불용어제거실습
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#from text import TEXT
nltk.download('stopwords')
nltk.download('punkt')

TEXT = """After reading the comments for this movie, I am not sure whether I should be angry, sad or sickened. Seeing comments typical of people who a)know absolutely nothing about the military or b)who base everything they think they know on movies like this or on CNN reports about Abu-Gharib makes me wonder about the state of intellectual stimulation in the world. At the time I type this the number of people in the US military: 1.4 million on Active Duty with another almost 900,000 in the Guard and Reserves for a total of roughly 2.3 million. The number of people indicted for abuses at at Abu-Gharib: Currently less than 20 That makes the total of people indicted .00083% of the total military. Even if you indict every single military member that ever stepped in to Abu-Gharib, you would not come close to making that a whole number.  The flaws in this movie would take YEARS to cover. I understand that it's supposed to be sarcastic, but in reality, the writer and director are trying to make commentary about the state of the military without an enemy to fight. In reality, the US military has been at its busiest when there are not conflicts going on. The military is the first called for disaster relief and humanitarian aid missions. When the tsunami hit Indonesia, devestating the region, the US military was the first on the scene. When the chaos of the situation overwhelmed the local governments, it was military leadership who looked at their people, the same people this movie mocks, and said make it happen. Within hours, food aid was reaching isolated villages. Within days, airfields were built, cargo aircraft started landing and a food distribution system was up and running. Hours and days, not weeks and months. Yes there are unscrupulous people in the US military. But then, there are in every walk of life, every occupation. But to see people on this website decide that 2.3 million men and women are all criminal, with nothing on their minds but thoughts of destruction or mayhem is an absolute disservice to the things that they do every day. One person on this website even went so far as to say that military members are in it for personal gain. Wow! Entry level personnel make just under $8.00 an hour assuming a 40 hour work week. Of course, many work much more than 40 hours a week and those in harm's way typically put in 16-18 hour days for months on end. That makes the pay well under minimum wage. So much for personal gain. I beg you, please make yourself familiar with the world around you. Go to a nearby base, get a visitor pass and meet some of the men and women you are so quick to disparage. You would be surprised. The military no longer accepts people in lieu of prison time. They require a minimum of a GED and prefer a high school diploma. The middle ranks are expected to get a minimum of undergraduate degrees and the upper ranks are encouraged to get advanced degrees.
"""
corpus = TEXT
tokenized_words = word_tokenize(TEXT)

# NLTK에서 제공하는 불용어 목록을 세트 자료형으로 받아와 주세요
stopwords_set = set(stopwords.words('english'))

def clean_by_stopwords(tokenized_words, stopwords_set):
    cleaned_words = []

    for word in tokenized_words:
        # 여기에 코드를 작성하세요
        if word not in stopwords_set:
            cleaned_words.append(word)

    return cleaned_words

# 테스트 코드
#clean_by_stopwords(tokenized_words, stopwords_set)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# 챕터 02.단어단위전처리  레슨 08.정규화
text = "What can I do for you? Do your homework now."
print(text.lower()) # 대소문자 통합

synonym_dict = {'US':'USA', 'U.S':'USA', 'Ummm':'Umm', 'Ummmm':'Umm'}
text = "She became a US citizen. Ummmm, I think, maybe and or."
normalized_words = []

tokenized_words = nltk.word_tokenize(text)

for word in tokenized_words:
    if word in synonym_dict.keys():
        word = synonym_dict[word]

    normalized_words.append(word)
print(normalized_words)

what can i do for you? do your homework now.
['She', 'became', 'a', 'USA', 'citizen', '.', 'Umm', ',', 'I', 'think', ',', 'maybe', 'and', 'or', '.']


In [None]:
# 챕터 02.단어단위전처리 레슨 09.어간추출
from nltk.tokenize import word_tokenize
import nltk
#nltk.download('punkt')  # needed for acronym such as Mr. Dr. ...

from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer

porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()
text = "You are so lovely. I am loving you now."
porter_stemmed_words = []
lancaster_stemmed_words = []

tokenized_words = nltk.word_tokenize(text)

for word in tokenized_words:
    stem = porter_stemmer.stem(word)
    porter_stemmed_words.append(stem)

for word in tokenized_words:
    stem = lancaster_stemmer.stem(word)
    lancaster_stemmed_words.append(stem)

print('before; ', tokenized_words)
print('porter; ', porter_stemmed_words)
print('lancaster; ', lancaster_stemmed_words)



before;  ['You', 'are', 'so', 'lovely', '.', 'I', 'am', 'loving', 'you', 'now', '.']
porter;  ['you', 'are', 'so', 'love', '.', 'i', 'am', 'love', 'you', 'now', '.']
lancaster;  ['you', 'ar', 'so', 'lov', '.', 'i', 'am', 'lov', 'you', 'now', '.']


In [None]:
from nltk.stem import PorterStemmer

# 포터 스테머 어간 추출 함수
def stemming_by_porter(tokenized_words):
    porter_stemmer = PorterStemmer()
    porter_stemmed_words = []

    for word in tokenized_words:
        # porter_stemmed_words.append(porter_stemmer.stem(word))
        stem = porter_stemmer.stem(word)
        porter_stemmed_words.append(stem)

    return porter_stemmed_words

In [None]:
# 필요한 패키지와 함수 불러오기
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
#from text import TEXT
nltk.download('punkt')

TEXT = """After reading the comments for this movie, I am not sure whether I should be angry, sad or sickened. Seeing comments typical of people who a)know absolutely nothing about the military or b)who base everything they think they know on movies like this or on CNN reports about Abu-Gharib makes me wonder about the state of intellectual stimulation in the world. At the time I type this the number of people in the US military: 1.4 million on Active Duty with another almost 900,000 in the Guard and Reserves for a total of roughly 2.3 million. The number of people indicted for abuses at at Abu-Gharib: Currently less than 20 That makes the total of people indicted .00083% of the total military. Even if you indict every single military member that ever stepped in to Abu-Gharib, you would not come close to making that a whole number.  The flaws in this movie would take YEARS to cover. I understand that it's supposed to be sarcastic, but in reality, the writer and director are trying to make commentary about the state of the military without an enemy to fight. In reality, the US military has been at its busiest when there are not conflicts going on. The military is the first called for disaster relief and humanitarian aid missions. When the tsunami hit Indonesia, devestating the region, the US military was the first on the scene. When the chaos of the situation overwhelmed the local governments, it was military leadership who looked at their people, the same people this movie mocks, and said make it happen. Within hours, food aid was reaching isolated villages. Within days, airfields were built, cargo aircraft started landing and a food distribution system was up and running. Hours and days, not weeks and months. Yes there are unscrupulous people in the US military. But then, there are in every walk of life, every occupation. But to see people on this website decide that 2.3 million men and women are all criminal, with nothing on their minds but thoughts of destruction or mayhem is an absolute disservice to the things that they do every day. One person on this website even went so far as to say that military members are in it for personal gain. Wow! Entry level personnel make just under $8.00 an hour assuming a 40 hour work week. Of course, many work much more than 40 hours a week and those in harm's way typically put in 16-18 hour days for months on end. That makes the pay well under minimum wage. So much for personal gain. I beg you, please make yourself familiar with the world around you. Go to a nearby base, get a visitor pass and meet some of the men and women you are so quick to disparage. You would be surprised. The military no longer accepts people in lieu of prison time. They require a minimum of a GED and prefer a high school diploma. The middle ranks are expected to get a minimum of undergraduate degrees and the upper ranks are encouraged to get advanced degrees.
"""

corpus = TEXT
tokenized_words = word_tokenize(corpus)

# 포터 스테머의 어간 추출
def stemming_by_porter(tokenized_words):
    porter_stemmer = PorterStemmer()
    porter_stemmed_words = []

    for word in tokenized_words:
        porter_stemmed_words.append(porter_stemmer.stem(word))

    return porter_stemmed_words

stemming_by_porter(tokenized_words)

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
#nltk.download('punkt')  # needed for acronym such as Mr. Dr. ...
from nltk.corpus import stopwords
nltk.download('stopwords')

#from preprocess import clean_by_freq
#from preprocess import clean_by_len
#from preprocess import clean_by_stopwords


df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/codeitnlp/nlp/imdb.tsv', delimiter='\\t')
# normalization
df['review'] = df['review'].str.lower()  

# tokenize
df['word_tokens'] = df['review'].apply(word_tokenize)  

# cleaning
stopwords_set = set(stopwords.words('english'))
df['cleaned_tokens'] = df['word_tokens'].apply(lambda x: clean_by_freq(x, 1))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_len(x, 2))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_stopwords(x, stopwords_set))

# stemming
df['stemmed_tokens'] = df['cleaned_tokens'].apply(stemming_by_porter)

df['stemmed_tokens'][0]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/codeitnlp/nlp/imdb.tsv', delimiter='\\t')


['one',
 'film',
 'said',
 'realli',
 'bad',
 'movi',
 'like',
 'said',
 'realli',
 'bad',
 'movi',
 'bad',
 'one',
 'film',
 'like']

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# sentence tokenization
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

TEXT = """Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.
There was nothing so very remarkable in that; nor did Alice think it so very much out of the way to hear the Rabbit say to itself, 'Oh dear! Oh dear! I shall be late!' (when she thought it over afterwards, it occurred to her that she ought to have wondered at this, but at the time it all seemed quite natural); but when the Rabbit actually took a watch out of its waistcoat-pocket, and looked at it, and then hurried on, Alice started to her feet, for it flashed across her mind that she had never before seen a rabbit with either a waistcoat-pocket, or a watch to take out of it, and burning with curiosity, she ran across the field after it, and fortunately was just in time to see it pop down a large rabbit-hole under the hedge.
In another moment down went Alice after it, never once considering how in the world she was to get out again.
The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down a very deep well.
Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.
"""

corpus = TEXT
tokenized_sents = sent_tokenize(corpus)

tokenized_sents

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


["Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'",
 'So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.',
 "There was nothing so very remarkable in that; nor did Alice think it so very much out of the way to hear the Rabbit say to itself, 'Oh dear!",
 'Oh dear!',
 "I shall be late!'",
 '(when she thought it over afterwards, it occurred to her that she ought to have wondered at this, but at the time it all seemed quite natural); but when the Rabbit actually\xa0took a watch out of its waistcoat-pocket, 

In [None]:
from nltk.tag import pos_tag  # part of speech tagging
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
import nltk
#nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')


text = "Watching Time Chasers, it obvious that it was made by a bunch of friends. Maybe they were sitting around one day in film school and said, \"Hey, let\'s pool our money together and make a really bad movie!\" Or something like that."
pos_tagged_words = []

tokenized_sents = sent_tokenize(text)
for sentence in tokenized_sents:
    # word tokenize
    tokenized_words = word_tokenize(sentence)

    # pos
    pos_tagged = pos_tag(tokenized_words)
    pos_tagged_words.extend(pos_tagged)

print(pos_tagged_words)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[('Watching', 'VBG'), ('Time', 'NNP'), ('Chasers', 'NNPS'), (',', ','), ('it', 'PRP'), ('obvious', 'VBZ'), ('that', 'IN'), ('it', 'PRP'), ('was', 'VBD'), ('made', 'VBN'), ('by', 'IN'), ('a', 'DT'), ('bunch', 'NN'), ('of', 'IN'), ('friends', 'NNS'), ('.', '.'), ('Maybe', 'RB'), ('they', 'PRP'), ('were', 'VBD'), ('sitting', 'VBG'), ('around', 'IN'), ('one', 'CD'), ('day', 'NN'), ('in', 'IN'), ('film', 'NN'), ('school', 'NN'), ('and', 'CC'), ('said', 'VBD'), (',', ','), ('``', '``'), ('Hey', 'NNP'), (',', ','), ('let', 'VB'), ("'s", 'POS'), ('pool', 'VB'), ('our', 'PRP$'), ('money', 'NN'), ('together', 'RB'), ('and', 'CC'), ('make', 'VB'), ('a', 'DT'), ('really', 'RB'), ('bad', 'JJ'), ('movie', 'NN'), ('!', '.'), ("''", "''"), ('Or', 'CC'), ('something', 'NN'), ('like', 'IN'), ('that', 'DT'), ('.', '.')]


In [None]:
# chapter3, lesson5 표제어 추출 (Lemmatization); 표제어(lemma)란 사전적 어원. am, are, is -> be

from nltk.tokenize import word_tokenize
import nltk
#nltk.download('punkt')
from nltk.tag import pos_tag  # Penn Treebank POS Tag
from nltk.corpus import wordnet as wn # WordNet POS Tag
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

text = 'You are the happiest person.'
tokenize_words = word_tokenize(text)

tagged_words = pos_tag(tokenize_words)
print(tagged_words)

def penn_to_wn(tag):
    if tag.startswith('J'):
        return wn.ADJ
    elif tag.startswith('N'):
        return wn.NOUN
    elif tag.startswith('R'):
        return wn.ADV
    elif tag.startswith('V'):
        return wn.VERB
    else:
        return

lemmatizer = WordNetLemmatizer()
lemmatized_words = []

for word, tag in tagged_words:
    wn_tag = penn_to_wn(tag)
    if wn_tag in (wn.NOUN, wn.ADJ, wn.ADV, wn.VERB):
        lemmatized_words.append(lemmatizer.lemmatize(word, wn_tag))
    else:
        lemmatized_words.append(word)

print('before; ', tokenize_words)
print('after; ', lemmatized_words)

[('You', 'PRP'), ('are', 'VBP'), ('the', 'DT'), ('happiest', 'JJS'), ('person', 'NN'), ('.', '.')]
before;  ['You', 'are', 'the', 'happiest', 'person', '.']
after:  ['You', 'be', 'the', 'happy', 'person', '.']


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
# nlp preprocessing II

import pandas as pd
#import nltk
#from nltk.tokenize import word_tokenize
#from nltk.tokenize import sent_tokenize
#nltk.download('punkt')  # needed for acronym such as Mr. Dr. ...
#from nltk.corpus import stopwords
#nltk.download('stopwords')

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/codeitdata/imdb.tsv', delimiter="\\t")

# sentence tokenization
df['review'] = df['review'].str.lower()
df['sent_tokens'] = df['review'].apply(sent_tokenize)
#df['sent_tokens'][0]

# 품사 태깅 pos_tagging
df['pos_tagged_tokens'] = df['sent_tokens'].apply(pos_tagger)
print(df['pos_tagged_tokens'][0])

# 표제어 추출 Lemmatization
df['lemmatized_tokens'] = df['pos_tagged_tokens'].apply(word_lemmatizer)
print(df['lemmatized_tokens'][0])

# 
stopwords_set = set(stopwords.words('english'))
df['cleaned_tokens'] = df['lemmatized_tokens'].apply(lambda x: clean_by_freq(x, 1))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_len(x, 2))
df['cleaned_tokens'] = df['cleaned_tokens'].apply(lambda x: clean_by_stopwords(x, stopwords_set))

# combination
df['combined_corpus'] = df['cleaned_tokens'].apply(combine)
df[['combined_corpus']]

  df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/codeitdata/imdb.tsv', delimiter="\\t")


[('``', '``'), ('watching', 'JJ'), ('time', 'NN'), ('chasers', 'NNS'), (',', ','), ('it', 'PRP'), ('obvious', 'VBZ'), ('that', 'IN'), ('it', 'PRP'), ('was', 'VBD'), ('made', 'VBN'), ('by', 'IN'), ('a', 'DT'), ('bunch', 'NN'), ('of', 'IN'), ('friends', 'NNS'), ('.', '.'), ('maybe', 'RB'), ('they', 'PRP'), ('were', 'VBD'), ('sitting', 'VBG'), ('around', 'IN'), ('one', 'CD'), ('day', 'NN'), ('in', 'IN'), ('film', 'NN'), ('school', 'NN'), ('and', 'CC'), ('said', 'VBD'), (',', ','), ('\\', 'FW'), ("''", "''"), ("''", "''"), ('hey', 'NN'), (',', ','), ('let', 'VB'), ("'s", 'POS'), ('pool', 'VB'), ('our', 'PRP$'), ('money', 'NN'), ('together', 'RB'), ('and', 'CC'), ('make', 'VB'), ('a', 'DT'), ('really', 'RB'), ('bad', 'JJ'), ('movie', 'NN'), ('!', '.'), ('\\', 'NN'), ("''", "''"), ("''", "''"), ('or', 'CC'), ('something', 'NN'), ('like', 'IN'), ('that', 'DT'), ('.', '.'), ('what', 'WP'), ('ever', 'RB'), ('they', 'PRP'), ('said', 'VBD'), (',', ','), ('they', 'PRP'), ('still', 'RB'), ('ended',

Unnamed: 0,combined_corpus
0,make one film say make really bad movie like s...
1,film film
2,new york joan barnard elvire audrey barnard jo...
3,film film jump send n't jump radio n't send re...
4,site movie bad even movie movie make movie spe...
5,ehle northam wonderful wonderful ehle northam ...
6,role movie n't author book funny author author...
7,plane ceo search rescue mission call ceo harla...
8,gritty movie movie keep sci-fi good keep suspe...
9,girl girl


In [2]:
# Test APPLY function
# apply를 하면, 데이터 구조를 따로 고려하지 않고도, 리스트의 엔티티별로 해당 함수를 적용해줌

import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
#nltk.download('punkt')  # needed for acronym such as Mr. Dr. ...

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/codeitdata/imdb.tsv', delimiter="\\t")
tokens = []

df['review'] = df['review'].str.lower()
df['review'][0]
for i in range(0, 10):
    # print(sent_tokenize(df['review'][i]))
    tokens.append(sent_tokenize(df['review'][i]))

print(tokens[5])
#df['sent_tokens'] = sent_tokenize(df['review'][0])
#df['sent_tokens']

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/codeitdata/imdb.tsv', delimiter="\\t")


['"jennifer ehle was sparkling in \\""pride and prejudice.\\"" jeremy northam was simply wonderful in \\""the winslow boy.\\"" with actors of this caliber, this film had to have a lot going for it.', 'even those who were critical of the movie spoke of the wonderful sequences involving these two.', 'i was eager to see it.', 'it is with bitter disappointment, however, that i must report that this flick is a piece of trash.', 'the scenes between ehle and northam had no depth or tenderness or real passion; they consisted of hackneyed and unsubtle latter-day cinematic lust--voracious open-mouthed kissing and soft-porn humping.', "lust can be entertaining if it's done with originality; this was tasteless and awful.", 'ehle and northam have sullied their craft; they should be ashamed.', 'as for the modern part of the romance, i was unnerved by the effeminate appearance of the male lead.', "aren't there any masculine men left in hollywood?", 'the plot was kind of interesting; with a better scr

In [24]:
# 정수 인코딩 Integer Encoding; 텍스트를 숫자데이터로 변환하는 방법. 토큰화된 각 단어에 특정 정수를 매핑

#tokens = df['cleaned_tokens'][4]
tokens = sum(df['cleaned_tokens'], [])

vocab = Counter(tokens)
vocab = vocab.most_common()

word_to_idx = {}
i = 0

for (word, frequency) in vocab:
    i += 1  # 0은 아무 의미 없는 (무시되는) 정수를 위해 남겨두고, 1부터 시작
    word_to_idx[word] = i

print(word_to_idx)

{'movie': 1, 'film': 2, "n't": 3, 'scene': 4, 'bad': 5, 'time': 6, 'reason': 7, 'make': 8, 'jim': 9, 'good': 10, 'one': 11, 'like': 12, 'could': 13, "'re": 14, 'quastel': 15, 'really': 16, 'even': 17, 'monster': 18, 'joan': 19, 'love': 20, 'author': 21, 'try': 22, 'dialogue': 23, 'idea': 24, 'italy': 25, 'colleague': 26, 'maggot': 27, 'end': 28, 'watch': 29, 'jump': 30, 'radio': 31, 'stand-up': 32, 'day': 33, 'terrible': 34, 'ehle': 35, 'northam': 36, 'search': 37, 'rescue': 38, 'call': 39, 'knowles': 40, 'henriksen': 41, 'easily': 42, 'see': 43, 'appear': 44, 'get': 45, 'character': 46, 'think': 47, 'use': 48, 'whether': 49, 'need': 50, 'though': 51, 'sci-fi': 52, 'look': 53, 'say': 54, 'new': 55, 'york': 56, 'barnard': 57, 'elvire': 58, 'audrey': 59, 'john': 60, 'saxon': 61, 'etruscan': 62, 'tomb': 63, 'drug': 64, 'story': 65, 'romantic': 66, 'waste': 67, 'etrusco': 68, 'send': 69, 'reporter': 70, 'fear': 71, 'site': 72, 'special': 73, 'describe': 74, 'actor': 75, 'stand': 76, 'comed

In [33]:
def idx_encoder(tokens, word_to_idx):
    encoded_idx = []
    for token in tokens:
        idx = word_to_idx[token]
        encoded_idx.append(idx)
    return encoded_idx

df['integer_encoded'] = df['cleaned_tokens'].apply(lambda x: idx_encoder(x, word_to_idx))
#print(df[['integer_encoded']])

# padding to make a matrix with tokens x max_length
max_len = max(len(item) for item in df['integer_encoded'])
print(max_len)

for tokens in df['integer_encoded']:
    while len(tokens) < max_len:
        tokens.append(0)

df[['integer_encoded']]

200


Unnamed: 0,integer_encoded
0,"[8, 11, 2, 54, 8, 16, 5, 1, 12, 54, 8, 16, 5, ..."
1,"[2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[55, 56, 19, 57, 58, 59, 57, 60, 61, 25, 62, 6..."
3,"[2, 2, 30, 69, 3, 30, 31, 3, 69, 70, 71, 30, 7..."
4,"[72, 1, 5, 17, 1, 1, 8, 1, 73, 74, 1, 1, 74, 1..."
5,"[35, 36, 78, 78, 35, 36, 79, 79, 35, 36, 0, 0,..."
6,"[80, 1, 3, 21, 81, 82, 21, 21, 80, 3, 82, 83, ..."
7,"[85, 86, 37, 38, 87, 39, 86, 88, 40, 89, 41, 9..."
8,"[120, 1, 1, 121, 52, 10, 121, 122, 53, 1, 52, ..."
9,"[123, 123, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [21]:
TEXT = """Alice was beginning to get very tired of sitting by her sister on the bank, and of having nothing to do: once or twice she had peeped into the book her sister was reading, but it had no pictures or conversations in it, 'and what is the use of a book,' thought Alice 'without pictures or conversation?'
So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.
There was nothing so very remarkable in that; nor did Alice think it so very much out of the way to hear the Rabbit say to itself, 'Oh dear! Oh dear! I shall be late!' (when she thought it over afterwards, it occurred to her that she ought to have wondered at this, but at the time it all seemed quite natural); but when the Rabbit actually took a watch out of its waistcoat-pocket, and looked at it, and then hurried on, Alice started to her feet, for it flashed across her mind that she had never before seen a rabbit with either a waistcoat-pocket, or a watch to take out of it, and burning with curiosity, she ran across the field after it, and fortunately was just in time to see it pop down a large rabbit-hole under the hedge.
In another moment down went Alice after it, never once considering how in the world she was to get out again.
The rabbit-hole went straight on like a tunnel for some way, and then dipped suddenly down, so suddenly that Alice had not a moment to think about stopping herself before she found herself falling down a very deep well.
Either the well was very deep, or she fell very slowly, for she had plenty of time as she went down to look about her and to wonder what was going to happen next. First, she tried to look down and make out what she was coming to, but it was too dark to see anything; then she looked at the sides of the well, and noticed that they were filled with cupboards and book-shelves; here and there she saw maps and pictures hung upon pegs. She took down a jar from one of the shelves as she passed; it was labelled 'ORANGE MARMALADE', but to her great disappointment it was empty: she did not like to drop the jar for fear of killing somebody, so managed to put it into one of the cupboards as she fell past it.
"""

word_to_idx = {} # 단어 별 인덱스 부여하기 위한 딕셔너리
i = 0
encoded_idx = [] # 각 토큰의 정수 인덱스를 부여하기 위한 리스트
corpus = TEXT

tokenized_words = word_tokenize(corpus)

# 단어의 빈도수를 계산하여 정렬하는 코드를 작성하세요
vocab = Counter(tokenized_words)
vocab = vocab.most_common()

for (word, frequency) in vocab:
    # 여기에 코드를 작성하세요
    i += 1
    word_to_idx[word] = i

for word in tokenized_words:
    # 여기에 코드를 작성하세요
    idx = word_to_idx[word]
    encoded_idx.append(idx)

# 테스트 코드
encoded_idx

[14,
 8,
 79,
 2,
 39,
 11,
 80,
 7,
 81,
 40,
 10,
 41,
 29,
 3,
 82,
 1,
 6,
 7,
 83,
 42,
 2,
 84,
 43,
 44,
 15,
 85,
 4,
 16,
 86,
 45,
 3,
 46,
 10,
 41,
 8,
 87,
 1,
 17,
 5,
 16,
 88,
 30,
 15,
 89,
 18,
 5,
 1,
 90,
 31,
 91,
 3,
 92,
 7,
 9,
 46,
 1,
 24,
 47,
 14,
 93,
 30,
 15,
 94,
 95,
 24,
 96,
 4,
 8,
 48,
 18,
 10,
 97,
 49,
 50,
 19,
 25,
 19,
 4,
 98,
 1,
 20,
 3,
 99,
 100,
 101,
 10,
 102,
 11,
 103,
 6,
 104,
 51,
 1,
 105,
 3,
 106,
 7,
 107,
 9,
 108,
 109,
 52,
 110,
 3,
 111,
 7,
 112,
 113,
 6,
 114,
 3,
 115,
 1,
 32,
 33,
 9,
 116,
 34,
 26,
 117,
 118,
 53,
 119,
 40,
 10,
 12,
 120,
 8,
 42,
 27,
 11,
 121,
 18,
 21,
 22,
 122,
 54,
 14,
 55,
 5,
 27,
 11,
 123,
 23,
 7,
 3,
 56,
 2,
 124,
 3,
 34,
 125,
 2,
 126,
 1,
 127,
 57,
 35,
 128,
 57,
 35,
 129,
 130,
 52,
 131,
 35,
 24,
 50,
 32,
 4,
 47,
 5,
 132,
 133,
 1,
 5,
 134,
 2,
 10,
 21,
 4,
 135,
 2,
 136,
 137,
 28,
 138,
 1,
 17,
 28,
 3,
 36,
 5,
 139,
 140,
 141,
 142,
 51,
 22,
 17,
 32,
 3,
 