# NLP Data Preprocessing Examples
- [Datasets](#datasets)
- [Basic](#basic)
- [Word Embedding](#word-embedding)

In [None]:
import warnings
warnings.filterwarnings("ignore")

import nltk
import konlpy
import gensim
import tensorflow as tf

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import re
import urllib.request

## Datasets
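- [20 Newsgroups Dataset](#20-newsgroups-dataset)
- [SMS Spam Collection Dataset](#sms-spam-collection-dataset)
- [Book Description Dataset](#book-description-dataset)
- [Naver Movie Review Dataset](#naver-movie-review-dataset)
- [English Tokenization](#english-tokenization)
- [Korean Tokenization](#korean-tokenization)
- [Tokenized Dataset Encoding](#tokenized-dataset-encoding)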

### 20 Newsgroups Dataset

In [None]:
from sklearn.datasets import fetch_20newsgroups
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer

# Fetch the 20 Newsgroups corpus, asking scikit-learn to strip headers, footers and quotes,
# then show how many documents were loaded and the first document.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
print(len(dataset.data))
dataset.data[0]

11314


"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

### SMS Spam Collection Dataset

In [None]:
# Download the SMS Spam Collection CSV into the working directory and load it.
# The file is decoded as latin-1.
spam_url = 'https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv'
urllib.request.urlretrieve(spam_url, filename='spam.csv')
dataset = pd.read_csv('spam.csv', encoding='latin-1')
print(dataset.shape)
dataset.head()

(5572, 5)


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


### Book Description Dataset

In [None]:
# Download the book-description CSV into the working directory and load it.
book_url = "https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/09.%20Word%20Embedding/dataset/data.csv"
urllib.request.urlretrieve(book_url, filename="data.csv")
dataset = pd.read_csv("data.csv")
print(dataset.shape)
dataset.head()

(2382, 8)


Unnamed: 0.2,Unnamed: 0.1,Desc,Unnamed: 0,author,genre,image_link,rating,title
0,0,We know that power is shifting: From West to E...,0.0,Moisés Naím,Business,https://i.gr-assets.com/images/S/compressed.ph...,3.63,The End of Power: From Boardrooms to Battlefie...
1,1,Following the success of The Accidental Billio...,1.0,Blake J. Harris,Business,https://i.gr-assets.com/images/S/compressed.ph...,3.94,"Console Wars: Sega, Nintendo, and the Battle t..."
2,2,How to tap the power of social software and ne...,2.0,Chris Brogan,Business,https://i.gr-assets.com/images/S/compressed.ph...,3.78,Trust Agents: Using the Web to Build Influence...
3,3,William J. Bernstein is an American financial ...,3.0,William J. Bernstein,Business,https://i.gr-assets.com/images/S/compressed.ph...,4.2,The Four Pillars of Investing
4,4,Amazing book. And I joined Steve Jobs and many...,4.0,Akio Morita,Business,https://i.gr-assets.com/images/S/compressed.ph...,4.05,Made in Japan: Akio Morita and Sony


### Naver Movie Review Dataset

In [None]:
from konlpy.tag import Okt

# Download the Naver movie review file (tab-separated) and load it.
nsmc_url = 'https://raw.githubusercontent.com/e9t/nsmc/master/ratings.txt'
urllib.request.urlretrieve(nsmc_url, filename='ratings.txt')
dataset = pd.read_csv('ratings.txt', sep='\t')
print(dataset.shape)
dataset.head()

(200000, 3)


Unnamed: 0,id,document,label
0,8112052,어릴때보고 지금다시봐도 재밌어요ㅋㅋ,1
1,8132799,"디자인을 배우는 학생으로, 외국디자이너와 그들이 일군 전통을 통해 발전해가는 문화산...",1
2,4655635,폴리스스토리 시리즈는 1부터 뉴까지 버릴께 하나도 없음.. 최고.,1
3,9251303,와.. 연기가 진짜 개쩔구나.. 지루할거라고 생각했는데 몰입해서 봤다.. 그래 이런...,1
4,10067386,안개 자욱한 밤하늘에 떠 있는 초승달 같은 영화.,1


### English Tokenization

In [None]:
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups

# Load the 20 Newsgroups texts explicitly instead of reading `dataset.data`.
# By the time this cell runs top-to-bottom, `dataset` has been rebound to the Naver
# review DataFrame (which has no `.data`), so the cell used to fail on Restart & Run All.
# scikit-learn keeps a local copy of the corpus, so this does not download it again.
newsgroups = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
documents = newsgroups.data

# Replace everything except ASCII letters and spaces with a space, then lower-case and
# keep only documents that have more than three whitespace-separated words.
documents = [re.sub('[^a-zA-Z ]', ' ', document) for document in documents]
documents = [document.lower() for document in documents if len(document.split())>3]

# Build one regex that deletes every stop word together with the whitespace after it.
# re.escape keeps the pattern valid should a stop word ever contain a regex metacharacter.
stop_words = stopwords.words('english')
pattern = re.compile(r'\b(' + r'|'.join(map(re.escape, stop_words)) + r')\b\s*')
tokenized = [pattern.sub('', document).split() for document in documents]
# Drop documents that are left with at most one token.
tokenized = [document for document in tokenized if len(document)>1]
print('Samples:', len(tokenized))

Samples: 10930


### Korean Tokenization

In [None]:
from nltk.corpus import stopwords

# Placeholder corpus: fill this with raw Korean strings before running the cell
# (e.g. the `document` column of the Naver review data — drop missing values first).
# NOTE: this cell rebinds `tokenized`, which later cells read.
documents = list()

# Keep only Hangul (jamo and syllables) and spaces, then keep documents with more
# than three whitespace-separated words.
documents = [re.sub('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', '', document) for document in documents]
documents = [document for document in documents if len(document.split())>3]

okt = Okt()
# NLTK's stopwords corpus has no 'korean' entry, so stopwords.words('korean') raises.
# Use an explicit list of common particles/fillers instead; extend it for your corpus.
stop_words = {'의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도', '를', '으로', '자', '에', '와', '한', '하다'}
tokenized = list()

for document in documents:
    # Morphological analysis with stemming, followed by stop-word removal.
    tokenized_sentence = okt.morphs(document, stem=True)
    tokenized_sentence = [word for word in tokenized_sentence if word not in stop_words]
    tokenized.append(tokenized_sentence)
print('Samples:', len(tokenized))

### Tokenized Dataset Encoding

In [None]:
# Fit a Keras Tokenizer on the token lists held in `tokenized`.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized)

# Build both lookup directions and integer-encode every document.
word2idx = tokenizer.word_index
idx2word = dict(zip(word2idx.values(), word2idx.keys()))
encoded = tokenizer.texts_to_sequences(tokenized)

## Basic
- [Tokenization](#tokenization)
- [Stemming](#stemming)
- [Lemmatization](#lemmatization)
- [Stopword](#stopword)
- [Integer Encoding](#integer-encoding)
- [Padding](#padding)
- [One-Hot Encoding](#one-hot-encoding)
- [Korean Tools](#korean-tools)

### Tokenization

In [None]:
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# Word-level tokenization followed by part-of-speech tagging of the same tokens.
text = "I am actively looking for Ph.D. students. and you are a Ph.D. student."
tokenized_sentence = word_tokenize(text)

for label, value in (
    ('Word tokenization:', tokenized_sentence),
    ('POS tagging:', pos_tag(tokenized_sentence)),
):
    print(label, value)

Word tokenization: ['I', 'am', 'actively', 'looking', 'for', 'Ph.D.', 'students', '.', 'and', 'you', 'are', 'a', 'Ph.D.', 'student', '.']
POS tagging: [('I', 'PRP'), ('am', 'VBP'), ('actively', 'RB'), ('looking', 'VBG'), ('for', 'IN'), ('Ph.D.', 'NNP'), ('students', 'NNS'), ('.', '.'), ('and', 'CC'), ('you', 'PRP'), ('are', 'VBP'), ('a', 'DT'), ('Ph.D.', 'NNP'), ('student', 'NN'), ('.', '.')]


In [None]:
from nltk.tokenize import sent_tokenize

# The sample paragraph is written one sentence per source line; the adjacent string
# literals are concatenated into a single string.
text = (
    "His barber kept his word. "
    "But keeping such a huge secret to himself was driving him crazy. "
    "Finally, the barber went up a mountain and almost to the edge of a cliff. "
    "He dug a hole in the midst of some reeds. "
    "He looked about, to make sure no one was near."
)
print('Sentence Tokenization:', sent_tokenize(text))

Sentence Tokenization: ['His barber kept his word.', 'But keeping such a huge secret to himself was driving him crazy.', 'Finally, the barber went up a mountain and almost to the edge of a cliff.', 'He dug a hole in the midst of some reeds.', 'He looked about, to make sure no one was near.']


In [None]:
from konlpy.tag import Okt

okt = Okt()
text = '열심히 코딩한 당신, 연휴에는 여행을 가봐요'

# Run the three Okt analyses in turn: morphemes, POS tags, nouns.
for label, analyze in (
    ('OKT 형태소 분석:', okt.morphs),
    ('OKT 품사 태깅:', okt.pos),
    ('OKT 명사 추출:', okt.nouns),
):
    print(label, analyze(text))

OKT 형태소 분석: ['열심히', '코딩', '한', '당신', ',', '연휴', '에는', '여행', '을', '가봐요']
OKT 품사 태깅: [('열심히', 'Adverb'), ('코딩', 'Noun'), ('한', 'Josa'), ('당신', 'Noun'), (',', 'Punctuation'), ('연휴', 'Noun'), ('에는', 'Josa'), ('여행', 'Noun'), ('을', 'Josa'), ('가봐요', 'Verb')]
OKT 명사 추출: ['코딩', '당신', '연휴', '여행']


### Stemming

In [None]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import word_tokenize

# Compare the Porter and Lancaster stemmers on the same words.
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()
words = ['formalize', 'allowance', 'electricical']

# Labels are left-aligned to 18 columns so the three lists line up.
print(f"{'Original:':<18}", words)
print(f"{'Porter Stemmer:':<18}", list(map(porter_stemmer.stem, words)))
print(f"{'Lancaster Stemmer:':<18}", list(map(lancaster_stemmer.stem, words)))

Original:          ['formalize', 'allowance', 'electricical']
Porter Stemmer:    ['formal', 'allow', 'electric']
Lancaster Stemmer: ['form', 'allow', 'elect']


### Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer

# Lemmatize each word with WordNet, using the lemmatizer's default arguments.
lemmatizer = WordNetLemmatizer()
words = ['policy', 'doing', 'organization', 'have', 'going', 'love', 'lives', 'fly', 'dies', 'watched', 'has', 'starting']

print(f"{'Original:':<14}", words)
print(f"{'Lemmatization:':<14}", list(map(lemmatizer.lemmatize, words)))

Original:      ['policy', 'doing', 'organization', 'have', 'going', 'love', 'lives', 'fly', 'dies', 'watched', 'has', 'starting']
Lemmatization: ['policy', 'doing', 'organization', 'have', 'going', 'love', 'life', 'fly', 'dy', 'watched', 'ha', 'starting']


### Stopword

In [None]:
from nltk.corpus import stopwords

# Show the first ten entries of NLTK's English stop-word list.
stop_words_list = stopwords.words('english')
first_ten = stop_words_list[:10]
print('Stopwords:', first_ten)

Stopwords: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [None]:
example = "Family is not an important thing. It's everything."
stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example)

# The stop-word entries shown above are lower-case, so compare on the lower-cased token;
# otherwise capitalised stop words such as 'It' slip through the filter.
# The original casing of the surviving tokens is kept.
print('Original:'.ljust(9,' '), word_tokens)
print('Result:'.ljust(9,' '), [word for word in word_tokens if word.lower() not in stop_words])

Origianl: ['Family', 'is', 'not', 'an', 'important', 'thing', '.', 'It', "'s", 'everything', '.']
Result:   ['Family', 'important', 'thing', '.', 'It', "'s", 'everything', '.']


In [None]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

raw_text = 'A barber is a person. a barber is good person. a barber is huge person. he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. a barber kept his word. His barber kept his secret. But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain.'
stop_words = set(stopwords.words('english'))

# Split into sentences, then into word tokens per sentence.
sentences = sent_tokenize(raw_text)
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

# Per sentence: keep tokens longer than two characters, lower-case them,
# then drop the ones that are stop words.
preprocessed_sentences = [
    [
        lowered
        for lowered in (word.lower() for word in sentence if len(word) > 2)
        if lowered not in stop_words
    ]
    for sentence in tokenized_sentences
]

### Integer Encoding

In [None]:
from collections import Counter
from itertools import chain
from nltk import FreqDist

# Flatten the token lists with itertools.chain instead of np.hstack: hstack converts the
# tokens to NumPy string scalars (which then become the vocabulary keys) and would turn
# an empty sentence into a float array. chain keeps plain Python strings throughout.
words = list(chain.from_iterable(preprocessed_sentences))
vocab_size = 5
# Keep only the `vocab_size` most frequent words, mapped to their counts.
vocab = dict(Counter(words).most_common(vocab_size))
# Alternative with NLTK: vocab = dict(FreqDist(words).most_common(vocab_size))
print('Word counts:', vocab)

# Indices start at 1 in frequency order; the next free index is reserved for 'OOV'.
word_to_index = {word:i for i,word in enumerate(vocab.keys(),1)}
word_to_index['OOV'] = len(word_to_index)+1 # add the Out-Of-Vocabulary token
print('Word index:', word_to_index)

# Every word that is not in the vocabulary is encoded with the OOV index.
oov_index = word_to_index['OOV']
encoded_sentences = [
    [word_to_index.get(word, oov_index) for word in sentence]
    for sentence in preprocessed_sentences
]

print('Encoded sentences:', encoded_sentences)

Word counts: {'barber': 8, 'secret': 6, 'huge': 5, 'kept': 4, 'person': 3}
Word index: {'barber': 1, 'secret': 2, 'huge': 3, 'kept': 4, 'person': 5, 'OOV': 6}
Encoded sentences: [[1, 5], [1, 6, 5], [1, 3, 5], [6, 2], [2, 4, 3, 2], [3, 2], [1, 4, 6], [1, 4, 6], [1, 4, 2], [6, 6, 3, 2, 6, 1, 6], [1, 6, 3, 6]]


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

vocab_size = 5
# num_words is vocab_size + 2; in the recorded output index 1 is the OOV token and
# index 0 is the value pad_sequences fills with.
tokenizer = Tokenizer(num_words=vocab_size+2, oov_token='OOV')
tokenizer.fit_on_texts(preprocessed_sentences)
encoded = tokenizer.texts_to_sequences(preprocessed_sentences)

# num_words only takes effect in texts_to_sequences(); word_counts and word_index
# still list every word.
for label, value in (
    ('Word counts:', dict(tokenizer.word_counts)),
    ('Word index:', tokenizer.word_index),
    ('Encoded sentences:', encoded),
):
    print(label, value)

Word counts: {'barber': 8, 'person': 3, 'good': 1, 'huge': 5, 'knew': 1, 'secret': 6, 'kept': 4, 'word': 2, 'keeping': 2, 'driving': 1, 'crazy': 1, 'went': 1, 'mountain': 1}
Word index: {'OOV': 1, 'barber': 2, 'secret': 3, 'huge': 4, 'kept': 5, 'person': 6, 'word': 7, 'keeping': 8, 'good': 9, 'knew': 10, 'driving': 11, 'crazy': 12, 'went': 13, 'mountain': 14}
Encoded sentences: [[2, 6], [2, 1, 6], [2, 4, 6], [1, 3], [3, 5, 4, 3], [4, 3], [2, 5, 1], [2, 5, 1], [2, 5, 3], [1, 1, 4, 3, 1, 2, 1], [2, 1, 4, 1]]


### Padding

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Force every sequence to length 5, padding and truncating at the end ('post').
padded = pad_sequences(
    encoded,
    maxlen=5,
    padding='post',
    truncating='post',
)
padded

array([[2, 6, 0, 0, 0],
       [2, 1, 6, 0, 0],
       [2, 4, 6, 0, 0],
       [1, 3, 0, 0, 0],
       [3, 5, 4, 3, 0],
       [4, 3, 0, 0, 0],
       [2, 5, 1, 0, 0],
       [2, 5, 1, 0, 0],
       [2, 5, 3, 0, 0],
       [1, 1, 4, 3, 1],
       [2, 1, 4, 1, 0]], dtype=int32)

### One-Hot Encoding

In [None]:
from tensorflow.keras.utils import to_categorical

print('encoded:', encoded[0])

# Pin the one-hot width to the tokenizer's num_words. Without num_classes the width is
# inferred from the largest index in this one sentence, so different sentences would
# produce vectors of different widths.
one_hot = to_categorical(encoded[0], num_classes=tokenizer.num_words)
one_hot

endoced: [2, 6]


array([[0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1.]], dtype=float32)

### Korean Tools

In [None]:
# PyKoSpacing
# Korean word-spacing correction
# pip install git+https://github.com/haven-jeon/PyKoSpacing.git

from pykospacing import Spacing

# Placeholder input: replace with the Korean sentence to process.
sentence = ''

spacing = Spacing()
kospacing_sent = spacing(sentence)

# Py-Hanspell
# Naver Korean spell checker
# pip install git+https://github.com/ssut/py-hanspell.git

from hanspell import spell_checker

spelled_sent = spell_checker.check(sentence)
hanspell_sent = spelled_sent.checked

# SOYNLP
# Training-based tokenizer; cleans up repeated characters
# pip install soynlp

## Word Embedding
- [DTM](#dtm)
- [TF-IDF](#tf-idf)
- [Word2Vec](#word2vec)
- [GloVe](#glove)
- [FastText](#fasttext)
- [Keras Embedding Layer](#keras-embedding-layer)
- [ELMo](#elmo)

### DTM

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Document-term matrix: one row per document, one column per vocabulary entry.
corpus = ['you know I want your love', 'I like you', 'what should I do ']
vector = CountVectorizer()
dtm = vector.fit_transform(corpus)

print(dtm.toarray())
print(vector.vocabulary_)

[[0 1 0 1 0 1 0 1 1]
 [0 0 1 0 0 0 0 1 0]
 [1 0 0 0 1 0 1 0 0]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights for the same three-document corpus.
corpus = ['you know I want your love', 'I like you', 'what should I do ']
vector = TfidfVectorizer()
tfidf = vector.fit_transform(corpus)

print(tfidf.toarray())
print(vector.vocabulary_)

[[0.         0.46735098 0.         0.46735098 0.         0.46735098
  0.         0.35543247 0.46735098]
 [0.         0.         0.79596054 0.         0.         0.
  0.         0.60534851 0.        ]
 [0.57735027 0.         0.         0.         0.57735027 0.
  0.57735027 0.         0.        ]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


### Word2Vec

In [None]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# Train Word2Vec on the token lists in `tokenized`.
# sg=0: CBOW, sg=1: Skip-gram
model = Word2Vec(
    sentences=tokenized,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
    sg=0,
)

In [None]:
# The ten nearest neighbours of 'like' (topn=10 is the library default, spelled out here).
model_result = model.wv.most_similar(positive=['like'], topn=10)
print(model_result)

[('nice', 0.6690633893013), ('better', 0.6060903668403625), ('probably', 0.5894891619682312), ('prefer', 0.5708296298980713), ('bad', 0.5548878908157349), ('done', 0.5529392957687378), ('ferrari', 0.5524083375930786), ('want', 0.539306104183197), ('think', 0.5361883640289307), ('guess', 0.5306634306907654)]


In [None]:
w2v_path = 'eng_w2v'
# Save the word vectors in word2vec format ...
model.wv.save_word2vec_format(w2v_path)
# ... and load them back as KeyedVectors.
loaded_model = KeyedVectors.load_word2vec_format(w2v_path)

In [None]:
import os

# Google's pre-trained Word2Vec model.
# The archive is large, so download it only when it is not already on disk. The download
# goes to a temporary name and is renamed on success, so an interrupted transfer is
# never mistaken for a complete file on the next run.
googlenews_path = 'GoogleNews-vectors-negative300.bin.gz'
if not os.path.exists(googlenews_path):
    googlenews_partial = googlenews_path + '.part'
    urllib.request.urlretrieve('https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz', \
                               filename=googlenews_partial)
    os.replace(googlenews_partial, googlenews_path)
word2vec_model = KeyedVectors.load_word2vec_format(googlenews_path, binary=True)

In [None]:
# Embedding Visualization
# @ https://projector.tensorflow.org/
# Runs gensim's word2vec2tensor script on the `eng_w2v` file saved earlier, using
# `eng_w2v` as the output prefix as well.
# NOTE(review): presumably this writes the tensor/metadata TSV files that the projector
# accepts for upload — confirm against the gensim script documentation.

!python -m gensim.scripts.word2vec2tensor --input eng_w2v --output eng_w2v

### GloVe

In [None]:
from urllib.request import urlretrieve, urlopen
import os
import zipfile

# Pre-trained GloVe vectors.
# Download and unpack only when needed, so re-running the notebook does not fetch the
# archive again, and close the archive deterministically with a context manager.
glove_zip = 'glove.6B.zip'
glove_txt = 'glove.6B.100d.txt'

if not os.path.exists(glove_txt):
    if not os.path.exists(glove_zip):
        urlretrieve('http://nlp.stanford.edu/data/glove.6B.zip', filename=glove_zip)
    with zipfile.ZipFile(glove_zip) as zf:
        zf.extractall()

# Map each word to its vector: the first field of a line is the word, the remaining
# fields are parsed as float32 components.
embedding_dict = dict()
with open(glove_txt, 'r', encoding='utf8') as f:
    for line in f:
        word_vector = line.split()
        if not word_vector:
            # Skip blank lines instead of failing on word_vector[0].
            continue
        word = word_vector[0]
        word_vector_arr = np.asarray(word_vector[1:], dtype='float32')
        embedding_dict[word] = word_vector_arr

### FastText

In [None]:
from gensim.models import FastText

# Train FastText on the token lists in `tokenized` with sg=1.
# Note that this rebinds `model`, replacing the Word2Vec model trained earlier.
model = FastText(
    sentences=tokenized,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
    sg=1,
)

In [None]:
# The ten nearest neighbours of 'like' (topn=10 is the library default, spelled out here).
model_result = model.wv.most_similar(positive=['like'], topn=10)
print(model_result)

[('likes', 0.795409083366394), ('likud', 0.7720040082931519), ('liked', 0.7430700659751892), ('unlike', 0.74107825756073), ('alike', 0.7275046110153198), ('likewise', 0.7235626578330994), ('dislike', 0.6973028182983398), ('nike', 0.687152087688446), ('pike', 0.6598094701766968), ('unhappy', 0.6429505944252014)]


### Keras Embedding Layer

In [None]:
from tensorflow.keras.layers import Embedding

# Size parameters passed to the Embedding layer below.
# Note that this rebinds `vocab_size`, which earlier cells set to 5.
vocab_size = 20000
output_dim = 128
input_length = 500

v = Embedding(
    input_dim=vocab_size,
    output_dim=output_dim,
    input_length=input_length,
)

### ELMo

In [None]:
import tensorflow_hub as hub
from tensorflow.keras.layers import Lambda

# NOTE(review): hub.Module looks like the TF1-style hub API — confirm it works with the
# TensorFlow version this notebook runs on.
elmo = hub.Module("https://tfhub.dev/google/elmo/1", trainable=True)

def ELMoEmbedding(x):
    """Return the "default" output of the ELMo module for `x`.

    `x` is cast to tf.string and squeezed (size-1 dimensions removed) before being
    passed to the module's "default" signature; the "default" entry of the returned
    dict is handed back.
    """
    sentences = tf.squeeze(tf.cast(x, tf.string))
    outputs = elmo(sentences, as_dict=True, signature="default")
    return outputs["default"]

# Usage sketch (requires an `input_text` tensor defined elsewhere):
# embedding_layer = Lambda(ELMoEmbedding, output_shape=(1024, ))(input_text)