# 텍스트 마이닝 기본 문법
### 문장 토큰화
- 파이썬 머신러닝 완벽가이드 (492p)

In [1]:
from nltk import sent_tokenize
import nltk

nltk.download('punkt') # download the Punkt data set (handles periods and line breaks)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# NOTE: the backslash continuations sit inside the string literal, so the
# leading indentation of the next two lines becomes part of text_sample.
text_sample = 'The Matrix is everywhere its all around us, here even in this room. \
               You can see it out your window or on your television. \
               You feel it when you go to work, or go to church or pay your taxes.'

# Split the paragraph into sentences and show the container type, the number
# of sentences, and the sentences themselves (one item per output line).
sentences = sent_tokenize(text=text_sample)
print(type(sentences), len(sentences), sentences, sep='\n')

<class 'list'>
3
['The Matrix is everywhere its all around us, here even in this room.', 'You can see it out your window or on your television.', 'You feel it when you go to work, or go to church or pay your taxes.']


### 단어 토큰화

In [4]:
from nltk import word_tokenize

# Tokenize one sentence into words and show the container type, the token
# count, and the tokens themselves (one item per output line).
sentence = "The Matrix is everywhere its all around us, here even in this room."
words = word_tokenize(sentence)
print(type(words), len(words), words, sep='\n')

<class 'list'>
15
['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.']


### 문서를 문장별 단어로 토큰화하는 함수

In [5]:
from nltk import word_tokenize, sent_tokenize

def tokenize_text(text):
    """Tokenize a multi-sentence document into words, sentence by sentence.

    Args:
        text: The document as a single string.

    Returns:
        A list with one entry per sentence returned by ``sent_tokenize``;
        each entry is the list of tokens ``word_tokenize`` produced for
        that sentence.
    """
    tokens_per_sentence = []
    # sent_tokenize gives one string per sentence; tokenize each in turn.
    for single_sentence in sent_tokenize(text):
        tokens_per_sentence.append(word_tokenize(single_sentence))
    return tokens_per_sentence

# Tokenize the sample document into words, one token list per sentence
word_tokens = tokenize_text(text_sample)
print(type(word_tokens), len(word_tokens))
print(word_tokens)

<class 'list'> 3
[['The', 'Matrix', 'is', 'everywhere', 'its', 'all', 'around', 'us', ',', 'here', 'even', 'in', 'this', 'room', '.'], ['You', 'can', 'see', 'it', 'out', 'your', 'window', 'or', 'on', 'your', 'television', '.'], ['You', 'feel', 'it', 'when', 'you', 'go', 'to', 'work', ',', 'or', 'go', 'to', 'church', 'or', 'pay', 'your', 'taxes', '.']]


In [13]:
# English sample that uses typographic quotes/apostrophes (’ and ”).
# The printed tokens show these are split off as separate tokens
# (e.g. 'don', '’', 't') rather than kept as contractions.
text_sample = '"We don’t advertise” is a common refrain I see in founder stories. \
It’s usually a way to communicate product quality. Our product is so good we didn’t have to advertise. \
Ads are expensive. So not advertising is an attractive business strategy. \
But just because you don’t advertise, doesn’t mean you don’t market. Every company needs some sort of customer acquisition strategy. It just doesn’t need to be advertising.'
word_tokens = tokenize_text(text_sample)
print(type(word_tokens), len(word_tokens))
print(word_tokens)

<class 'list'> 8
[['``', 'We', 'don', '’', 't', 'advertise', '”', 'is', 'a', 'common', 'refrain', 'I', 'see', 'in', 'founder', 'stories', '.'], ['It', '’', 's', 'usually', 'a', 'way', 'to', 'communicate', 'product', 'quality', '.'], ['Our', 'product', 'is', 'so', 'good', 'we', 'didn', '’', 't', 'have', 'to', 'advertise', '.'], ['Ads', 'are', 'expensive', '.'], ['So', 'not', 'advertising', 'is', 'an', 'attractive', 'business', 'strategy', '.'], ['But', 'just', 'because', 'you', 'don', '’', 't', 'advertise', ',', 'doesn', '’', 't', 'mean', 'you', 'don', '’', 't', 'market', '.'], ['Every', 'company', 'needs', 'some', 'sort', 'of', 'customer', 'acquisition', 'strategy', '.'], ['It', 'just', 'doesn', '’', 't', 'need', 'to', 'be', 'advertising', '.']]


In [7]:
# Korean news sample run through the same tokenizer. Per the printed result,
# tokens are cut at whitespace and punctuation only, so particles stay
# attached to their word (e.g. '자녀들이' is a single token).
text_sample = "이균용 대법원장 후보자(사진)의 자녀들이 9세, 11세 때 가액이 2억5000만원(2023년 현재 기준)에 가까운 비상장주식을 각각 취득한 것으로 보여 취득 경위나 증여세 납부 여부가 소명돼야 한다는 지적이 나온다. 이 후보자는 공직자 재산공개 때 가액이 10억원 가까이 되는 비상장주식을 신고하지 않았는데, 앞서 대법원은 고위법관이 재산 신고를 누락하면 징계하겠다고 경고한 것으로 드러났다."
word_tokens = tokenize_text(text_sample)
print(type(word_tokens), len(word_tokens))
print(word_tokens)

<class 'list'> 2
[['이균용', '대법원장', '후보자', '(', '사진', ')', '의', '자녀들이', '9세', ',', '11세', '때', '가액이', '2억5000만원', '(', '2023년', '현재', '기준', ')', '에', '가까운', '비상장주식을', '각각', '취득한', '것으로', '보여', '취득', '경위나', '증여세', '납부', '여부가', '소명돼야', '한다는', '지적이', '나온다', '.'], ['이', '후보자는', '공직자', '재산공개', '때', '가액이', '10억원', '가까이', '되는', '비상장주식을', '신고하지', '않았는데', ',', '앞서', '대법원은', '고위법관이', '재산', '신고를', '누락하면', '징계하겠다고', '경고한', '것으로', '드러났다', '.']]


## StopWords 제거
- 불용어(stop words): 분석에 큰 의미가 없는 단어로, 분석 전에 제거한다
- 495p

In [9]:
import nltk
# Download the NLTK stop word corpus read via nltk.corpus.stopwords below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
# Fetch the English stop word list once and reuse it for both prints:
# first the total count, then the first 20 entries as a preview.
english_stopwords = nltk.corpus.stopwords.words('english')
print('영어 불용어 개수: ', len(english_stopwords))
print(english_stopwords[:20])

영어 불용어 개수:  179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


### 불용어 제거

In [28]:
text_sample = '"We don’t advertise” is a common refrain I see in founder stories. \
It’s usually a way to communicate product quality. Our product is so good we didn’t have to advertise. \
Ads are expensive. So not advertising is an attractive business strategy. \
But just because you don’t advertise, doesn’t mean you don’t market. Every company needs some sort of customer acquisition strategy. It just doesn’t need to be advertising.'

word_tokens = tokenize_text(text_sample)

# Domain-specific analysis: start from NLTK's English stop words and append
# extra words that should also be dropped for this text.
stopwords = nltk.corpus.stopwords.words('english') + ["common", "advertise"]
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [29]:
# Membership tests against a list scan it entry by entry for every token;
# build a set once so each lookup is a constant-time hash probe.
stopword_set = set(stopwords)

all_tokens = []
# Remove stop words from every tokenized sentence in word_tokens
# (one inner list per sentence of the sample text).
for sentence in word_tokens:
    # Lowercase each token before the lookup because the stop word list is
    # lowercase; tokens that are not stop words are kept in their
    # lowercased form.
    filtered_words = [
        lowered
        for lowered in (word.lower() for word in sentence)
        if lowered not in stopword_set
    ]
    all_tokens.append(filtered_words)

print(all_tokens)

[['``', '’', '”', 'refrain', 'see', 'founder', 'stories', '.'], ['’', 'usually', 'way', 'communicate', 'product', 'quality', '.'], ['product', 'good', '’', '.'], ['ads', 'expensive', '.'], ['advertising', 'attractive', 'business', 'strategy', '.'], ['’', ',', '’', 'mean', '’', 'market', '.'], ['every', 'company', 'needs', 'sort', 'customer', 'acquisition', 'strategy', '.'], ['’', 'need', 'advertising', '.']]


### 어근 추출

In [24]:
from nltk.stem import LancasterStemmer
stemmer = LancasterStemmer()

# Each tuple is printed on its own line as the space-separated stems of its
# words, so inflected forms of the same word can be compared side by side.
word_groups = (
    ('working', 'works', 'worked'),
    ('amusing', 'amuses', 'amused'),
    ('happier', 'happiest'),
    ('fancier', 'fanciest'),
)
for group in word_groups:
    print(*map(stemmer.stem, group))

work work work
amus amus amus
happy happiest
fant fanciest


In [30]:
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')

lemma = WordNetLemmatizer()
# Each entry pairs a part-of-speech tag ('v' = verb, 'a' = adjective) with
# the inflected forms to lemmatize; one output line per entry.
for pos_tag, forms in (
    ('v', ('amusing', 'amuses', 'amused')),
    ('a', ('happier', 'happiest')),
    ('a', ('fancier', 'fanciest')),
):
    print(*(lemma.lemmatize(form, pos_tag) for form in forms))

amuse amuse amuse
happy happy
fancy fancy


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 데이터 불러오기

In [32]:
from google.colab import drive
# Mount Google Drive at /content/drive (requires the google.colab runtime).
drive.mount('/content/drive')

Mounted at /content/drive


In [38]:
import pandas as pd
DATA_PATH = '/content/drive/MyDrive/멀티캠퍼스/data/text_mining/'
labeled_train_path = DATA_PATH + './labeledTrainData.tsv'

# Tab-separated file with a header row. quoting=3 is csv.QUOTE_NONE, so
# double quotes in the file are kept as ordinary characters.
review_df = pd.read_csv(
    labeled_train_path,
    header=0,
    sep="\t",
    quoting=3,
)
review_df.head(3)

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."


In [39]:
review_df['review'][0]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

### 텍스트 전처리
- 참고 : 정규표현식
- https://wikidocs.net/4308
- https://www.w3schools.com/python/python_regex.asp
- https://regexr.com/

In [40]:
import re
# Replace the HTML line-break tag with a space. regex=False makes this a
# literal substring replacement regardless of the pandas version, because the
# default value of `regex` differs between pandas releases.
review_df['review'] = review_df['review'].str.replace('<br />', ' ', regex=False)
review_df['review'][0]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.  Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.  The actual feature film bit when it finally starts is only on f

In [41]:
# Replace every character that is not an ASCII letter (a-z, A-Z) with a space.
non_letter = re.compile("[^a-zA-Z]")
review_df['review'] = review_df['review'].apply(lambda text: non_letter.sub(" ", text))
review_df['review'][0]

' With all this stuff going down at the moment with MJ i ve started listening to his music  watching the odd documentary here and there  watched The Wiz and watched Moonwalker again  Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent  Moonwalker is part biography  part feature film which i remember going to see at the cinema when it was originally released  Some of it has subtle messages about MJ s feeling towards the press and also the obvious message of drugs are bad m kay   Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring  Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him   The actual feature film bit when it finally starts is only on for 

### 훈련/테스트 데이터 분리

In [43]:
from sklearn.model_selection import train_test_split

# Target labels, and the features left after dropping the id and label
# columns (only 'review' remains, per the shapes shown below).
class_df = review_df['sentiment']
feature_df = review_df.drop(columns=['id', 'sentiment'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    feature_df,
    class_df,
    test_size=0.3,
    random_state=156,
)
X_train.shape, X_test.shape

((17500, 1), (7500, 1))

### 모델 학습
- count 벡터화, 로지스틱 회귀 파이프라인

In [44]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score

# Bag-of-words counts over unigrams and bigrams (English stop words removed),
# fed into a logistic regression classifier.
count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2))
logistic_clf = LogisticRegression(solver='liblinear', C=10)
pipeline = Pipeline(steps=[
    ('cnt_vect', count_vectorizer),
    ('lr_clf', logistic_clf),
])

pipeline.fit(X_train['review'], y_train)

In [46]:
test_reviews = X_test['review']
pred = pipeline.predict(test_reviews)
# Column 1 of predict_proba is taken as the score passed to ROC-AUC.
pred_probs = pipeline.predict_proba(test_reviews)[:, 1]

accuracy = accuracy_score(y_test, pred)
roc_auc = roc_auc_score(y_test, pred_probs)
print('예측 정확도는 {0:.4f}, ROC-AUC는 {1:.4f}'.format(accuracy, roc_auc))

예측 정확도는 0.8861, ROC-AUC는 0.9503


- TF-IDF 벡터화, 로지스틱 회귀 파이프라인

In [47]:
# Same classifier as the count-based pipeline, but with TF-IDF weighted
# unigram and bigram features.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
logistic_clf = LogisticRegression(solver='liblinear', C=10)
pipeline = Pipeline(steps=[
    ('tfidf_vect', tfidf_vectorizer),
    ('lr_clf', logistic_clf),
])

pipeline.fit(X_train['review'], y_train)

test_reviews = X_test['review']
pred = pipeline.predict(test_reviews)
# Column 1 of predict_proba is taken as the score passed to ROC-AUC.
pred_probs = pipeline.predict_proba(test_reviews)[:, 1]

accuracy = accuracy_score(y_test, pred)
roc_auc = roc_auc_score(y_test, pred_probs)
print('예측 정확도는 {0:.4f}, ROC-AUC는 {1:.4f}'.format(accuracy, roc_auc))

예측 정확도는 0.8936, ROC-AUC는 0.9598
