In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [None]:
### stevejobs.txt

In [None]:
# Load the transcript: read every line of the file, then join the lines into
# a single string separated by spaces.
with open('../data/stevejobs.txt', 'r', encoding='utf8') as f:
    rows = f.readlines()
    lines = list(rows)
text = ' '.join(lines)

In [None]:
# 1-1. Text preprocessing (normalization): cleansing
import re

# Delete every run of characters that is not a space, an ASCII letter, a digit
# or a period, then lower-case the result. Periods are kept in the text.
# The pattern is a raw string (the old literal contained the invalid escape
# "\.") and is no longer bound to the name `compile`, which shadowed the
# built-in compile() for the rest of the kernel session.
cleanse_pattern = re.compile(r'[^ a-zA-Z0-9.]+')
text = cleanse_pattern.sub('', text).lower()

In [None]:
# 1-2. Tokenization (sentence level)
import nltk

# Download the 'punkt' resource before tokenizing.
nltk.download('punkt')

# Split the text into a list of sentence strings.
sentences = nltk.sent_tokenize(text)

In [None]:
# Word tokenization, flattened: the tokens of every sentence are collected
# into one flat list (the comprehension plays the role of list.extend).
word_token = [
    token
    for sent in sentences
    for token in nltk.word_tokenize(sent)
]
print(word_token[:3])

In [None]:
# Word tokenization, nested: one token list per sentence, giving a
# two-level list of lists (the comprehension plays the role of list.append).
word_token2 = [nltk.word_tokenize(sent) for sent in sentences]
print(word_token2[:3])

In [None]:
# Tokenization helper: split a document into sentences, then every sentence into words
from nltk import sent_tokenize, word_tokenize

def tokenize_text(text):
    """Split ``text`` into sentences and each sentence into word tokens.

    Args:
        text: The document to tokenize, passed to ``nltk.sent_tokenize``.

    Returns:
        A list with one entry per sentence, in order; each entry is whatever
        ``nltk.word_tokenize`` returned for that sentence.
    """
    tokens_per_sentence = []
    for sent in nltk.sent_tokenize(text):
        tokens_per_sentence.append(nltk.word_tokenize(sent))
    return tokens_per_sentence
# Tokenize `text` with the helper and preview the token lists of the first two sentences.
word_tokens = tokenize_text(text)
print(word_tokens[:2])

In [None]:
# 1-3. Stop-word removal
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')

# The original tested every token against the stop-word list, which scans the
# list each time. A set gives the same membership answer with hashed lookups.
# `stopwords` itself is left untouched for any later use.
stopword_set = set(stopwords)

# Keep the sentence structure: one filtered token list per sentence.
all_tokens = [
    [word for word in sentence if word not in stopword_set]
    for sentence in word_tokens
]

# Preview the filtered tokens of the first sentence.
all_tokens[:1]

In [None]:
# 1-4. Root-form (morpheme) extraction: stemming / lemmatization
# Only stemming is shown here, using NLTK's LancasterStemmer.
from nltk import LancasterStemmer
stemmer=LancasterStemmer()

# Display the stem produced for a single sample word.
stemmer.stem('working')

In [None]:
# 2. Feature vectorization / extraction (BOW: CountVectorizer / TfidfVectorizer)
# 2-1. Count-based vectorization: CountVectorizer
# Build the train / test sets
from sklearn.datasets import fetch_20newsgroups

# Training split, with headers, footers and quoted replies stripped.
train_news = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), random_state=156)
X_train = train_news.data
y_train = train_news.target

# Held-out split. This previously requested subset='train' again, so the model
# was evaluated on the very documents it was trained on and the reported
# accuracy was inflated by leakage.
test_news = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), random_state=156)
X_test = test_news.data
y_test = test_news.target

# Vectorize: the vectorizer is fitted on the training documents only (no
# labels are passed to fit), then both splits are transformed with it.
from sklearn.feature_extraction.text import CountVectorizer
cnt_vct = CountVectorizer().fit(X_train)
X_train_cnt_vct = cnt_vct.transform(X_train)
X_test_cnt_vct = cnt_vct.transform(X_test)

# 3. Classification: fit logistic regression on the count vectors and report
# accuracy on the transformed test documents.
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression(solver='liblinear').fit(X_train_cnt_vct, y_train)
pred = lr_clf.predict(X_test_cnt_vct)
print('예측 정확도 : ', accuracy_score(y_true=y_test, y_pred=pred))

In [None]:
# 2-2. TF-IDF: TfidfVectorizer
# (terms frequent within a document are weighted up; terms that are frequent
#  across all documents are penalized)
from sklearn.feature_extraction.text import TfidfVectorizer
tfdf_vct = TfidfVectorizer().fit(X_train)
X_train_tfdf_vct = tfdf_vct.transform(X_train)
X_test_tfdf_vct = tfdf_vct.transform(X_test)

# 3. Classification: fit logistic regression on the TF-IDF vectors and report
# accuracy on the transformed test documents.
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression(solver='liblinear').fit(X_train_tfdf_vct, y_train)
pred = lr_clf.predict(X_test_tfdf_vct)
print('예측 정확도 : ', accuracy_score(y_true=y_test, y_pred=pred))

In [None]:
# 3-1. Improve performance by tuning C with GridSearchCV
params = {'C': [0.01, 0.1, 1, 5, 10]}
gr_lr_clf = GridSearchCV(
    estimator=lr_clf,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    verbose=1,
)
gr_lr_clf.fit(X_train_tfdf_vct, y_train)
print('LogisticRegression의 최적 파라미터: ', gr_lr_clf.best_params_)

# Predict with the fitted grid search and evaluate accuracy on the test vectors.
pred = gr_lr_clf.predict(X_test_tfdf_vct)
print('예측 정확도 : ', accuracy_score(y_true=y_test, y_pred=pred))

In [None]:
# Pipeline version: TF-IDF vectorization plus a GridSearchCV parameter search
from sklearn.datasets import fetch_20newsgroups

# Training split, with headers, footers and quoted replies stripped.
train_news = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), random_state=156)
X_train = train_news.data
y_train = train_news.target

# Held-out split. This previously requested subset='train' again, which made
# the evaluation set identical to the training set.
test_news = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), random_state=156)
X_test = test_news.data
y_test = test_news.target

from sklearn.pipeline import Pipeline

# Chain the TF-IDF vectorizer and the classifier so the grid search can tune
# both steps together; step parameters are addressed as '<step>__<param>'.
pipeline = Pipeline([('tfdf_vect', TfidfVectorizer(stop_words='english')),
                     ('lr_clf', LogisticRegression())])
params = {'tfdf_vect__ngram_range': [(1, 1), (1, 2)],
          'tfdf_vect__max_df': [100, 200, 300],
          'lr_clf__C': [1, 5, 7]}
grid_cv_pipe = GridSearchCV(pipeline, param_grid=params, cv=3, scoring='accuracy')
grid_cv_pipe.fit(X_train, y_train)
print('최적 파라미터: ', grid_cv_pipe.best_params_)

# Predict with the pipeline search that was just fitted. The original called
# gr_lr_clf.predict(X_test); that is the earlier grid search, which was fitted
# on TF-IDF matrices and cannot take raw text documents, and it ignored the
# pipeline tuned above.
pred = grid_cv_pipe.predict(X_test)
print('예측 정확도 : ', accuracy_score(y_test, pred))

In [None]:
# Korean word tokenization with KoNLPy's Okt analyzer
from konlpy.tag import Okt
okt = Okt()

# Run okt.morphs over every entry of `sentences`; one token list per sentence.
words = [okt.morphs(sent) for sent in sentences]
print(words[:2])

# Run each Okt method on the same sample sentence and print the result next to
# its label: normalize (normalized text), morphs (morphemes), nouns (nouns
# only), phrases (phrases), pos (tokens paired with part-of-speech tags).
test_text = '나는 정말로 파이썬을 좋아한다. 아니 머신러닝을 더 좋아한다.'
okt_demos = (
    ('normalize :', okt.normalize),
    ('morphs :', okt.morphs),
    ('nouns :', okt.nouns),
    ('phrases :', okt.phrases),
    ('pos :', okt.pos),
)
for label, analyze in okt_demos:
    print(label, analyze(test_text))