In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, RegexpTokenizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from konlpy.tag import Okt
import numpy as np
import pandas as pd
import re



In [2]:
# The four newsgroup topics used for the English classification experiments.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

In [3]:
# Fetch the train and test splits restricted to `categories`; headers,
# footers and quoted text are removed from every post.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)
newsgroups_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

In [4]:
# Display the class names together with the distinct integer labels in the training targets.
(newsgroups_train.target_names, {*newsgroups_train.target})

(['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc'],
 {0, 1, 2, 3})

In [5]:
# Raw post texts are the inputs; the integer topic ids are the labels.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

In [9]:
# Bag-of-words vectorizer: tokens are runs of word characters and apostrophes.
# The pattern is a raw string because "\w" inside a plain string literal is an
# invalid escape sequence (a warning today, an error in a future Python).
cv = CountVectorizer(
    token_pattern=r"[\w']+",
    max_df=0.5,            # drop terms found in more than half of the documents
    min_df=3,              # drop terms found in fewer than 3 documents
    stop_words='english',
)

In [10]:
# Learn the vocabulary from the training texts only, then encode the test
# texts with that same vocabulary; display both matrix shapes.
X_train_cv, X_test_cv = cv.fit_transform(X_train), cv.transform(X_test)
tuple(matrix.shape for matrix in (X_train_cv, X_test_cv))

((2034, 8455), (1353, 8455))

In [11]:
# Print the first 50 vocabulary terms with their counts in the first training document.
first_terms = cv.get_feature_names_out()[:50]
first_doc_counts = X_train_cv[0].toarray()[0, :50]
for term, occurrences in zip(first_terms, first_doc_counts):
    print(term, ':', occurrences, end=', ')

' : 0, '91 : 0, '92 : 0, '93 : 0, 'em : 0, 'i : 0, 'official : 0, 's : 0, 'the : 0, 'you : 0, 0 : 0, 00 : 0, 000 : 0, 01 : 0, 02 : 0, 03 : 0, 04 : 0, 05 : 0, 06 : 0, 0674 : 0, 07 : 0, 08 : 0, 09 : 0, 1 : 0, 10 : 0, 100 : 0, 1000 : 0, 101 : 0, 101010 : 0, 102 : 0, 1024x768 : 0, 1030 : 0, 104 : 0, 105 : 0, 107 : 0, 109 : 0, 11 : 0, 110 : 0, 111 : 0, 112 : 0, 113 : 0, 115 : 0, 1150 : 0, 12 : 0, 120 : 0, 1200 : 0, 121 : 0, 125 : 0, 128 : 0, 129 : 0, 

In [18]:
def my_tokenizer(doc):
    """Split ``doc`` with NLTK's ``word_tokenize`` and lemmatize each token.

    Args:
        doc: The text to tokenize.

    Returns:
        A list with the WordNet-lemmatized form of every token, in order.
    """
    raw_tokens = word_tokenize(doc)
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, raw_tokens))

In [19]:
# Bag-of-words vectorizer that delegates tokenization to `my_tokenizer`.
# `token_pattern` is not used once a custom tokenizer is supplied, so it is set
# to None explicitly; this avoids scikit-learn's "token_pattern will not be
# used" warning and removes the previous non-raw "\w" string literal.
cv = CountVectorizer(
    tokenizer=my_tokenizer,
    token_pattern=None,
    max_df=0.5,            # drop terms found in more than half of the documents
    min_df=3,              # drop terms found in fewer than 3 documents
    stop_words='english',
)

In [20]:
# Re-encode both splits with the lemmatizing vectorizer (vocabulary learned
# from the training texts only) and display the resulting matrix shapes.
X_train_cv, X_test_cv = cv.fit_transform(X_train), cv.transform(X_test)
tuple(matrix.shape for matrix in (X_train_cv, X_test_cv))



((2034, 7430), (1353, 7430))

In [21]:
# Print the first 50 vocabulary terms with their counts in the first training document.
first_terms = cv.get_feature_names_out()[:50]
first_doc_counts = X_train_cv[0].toarray()[0, :50]
for term, occurrences in zip(first_terms, first_doc_counts):
    print(term, ':', occurrences, end=', ')

! : 0, # : 0, $ : 0, % : 0, & : 0, ' : 0, '' : 0, '91 : 0, '92 : 0, 'cause : 0, 'd : 1, 'em : 0, 'll : 0, 'm : 0, 'official : 0, 're : 0, 's : 0, 'the : 0, 've : 1, 'what : 0, 'you : 0, ( : 1, * : 0, + : 0, +1 : 0, +41 : 0, +the : 0, - : 0, -+ : 0, -- : 0, -1 : 0, -bill : 0, -d : 0, -ekr : 0, -end : 0, -s : 0, -tommy : 0, .. : 0, ... : 0, .... : 0, ..... : 0, ...... : 0, ....... : 0, ........ : 0, .......... : 0, ............ : 0, .3ds : 2, .bmp : 0, .clp : 0, .gif : 0, 

In [31]:
# Multinomial naive Bayes on the count features; display (train accuracy, test accuracy).
nb_clf = MultinomialNB(alpha=0.1)
nb_clf.fit(X_train_cv, y_train)
(nb_clf.score(X_train_cv, y_train), nb_clf.score(X_test_cv, y_test))

(0.9203539823008849, 0.7686622320768662)

In [32]:
# Show the predicted topic name for the first ten test documents.
# Slicing (rather than indexing with range(10)) also works when fewer than
# ten predictions are available.
pred = nb_clf.predict(X_test_cv)
for label_id in pred[:10]:
    print(newsgroups_train.target_names[label_id])

sci.space
comp.graphics
comp.graphics
comp.graphics
comp.graphics
comp.graphics
sci.space
sci.space
alt.atheism
sci.space


In [33]:
# TF-IDF vectorizer that delegates tokenization to `my_tokenizer`.
# `token_pattern` is not used once a custom tokenizer is supplied, so it is set
# to None explicitly; this avoids scikit-learn's "token_pattern will not be
# used" warning and removes the previous non-raw "\w" string literal.
tf = TfidfVectorizer(
    tokenizer=my_tokenizer,
    token_pattern=None,
    max_df=0.5,            # drop terms found in more than half of the documents
    min_df=3,              # drop terms found in fewer than 3 documents
    stop_words='english',
)

In [34]:
# Fit the TF-IDF vocabulary/weights on the training texts, encode the test
# texts with the fitted vectorizer, and display both matrix shapes.
X_train_tf, X_test_tf = tf.fit_transform(X_train), tf.transform(X_test)
tuple(matrix.shape for matrix in (X_train_tf, X_test_tf))



((2034, 7430), (1353, 7430))

In [40]:
# Multinomial naive Bayes on the TF-IDF features; display (train accuracy, test accuracy).
nb_clf = MultinomialNB(alpha=0.1)
nb_clf.fit(X_train_tf, y_train)
(nb_clf.score(X_train_tf, y_train), nb_clf.score(X_test_tf, y_test))

(0.9552605703048181, 0.7775314116777532)

In [46]:
# Logistic regression (default penalty) on the TF-IDF features;
# display (train accuracy, test accuracy).
logreg = LogisticRegression(C=10, max_iter=1000)
logreg.fit(X_train_tf, y_train)
(logreg.score(X_train_tf, y_train), logreg.score(X_test_tf, y_test))

(0.9744346116027532, 0.754619364375462)

In [51]:
# L1-penalized logistic regression (liblinear solver) on the TF-IDF features;
# display (train accuracy, test accuracy).
logreg_l1 = LogisticRegression(
    C=10,
    penalty='l1',
    solver='liblinear',
    max_iter=1000,
)
logreg_l1.fit(X_train_tf, y_train)
(logreg_l1.score(X_train_tf, y_train), logreg_l1.score(X_test_tf, y_test))

(0.9749262536873157, 0.7250554323725056)

In [52]:
def top10_features(classifier, vectorizer, categories, top_n=10):
    """Print the highest-weighted features of a linear classifier per category.

    One line is printed per category, formatted as
    ``"<category>: <feature>, <feature>, ..."`` with the features ordered from
    the largest coefficient downwards.

    Args:
        classifier: Fitted model exposing ``coef_`` with one row of feature
            weights per class. A binary model whose ``coef_`` has a single row
            is also accepted when two categories are given: that row is used
            for the second category and its negation for the first.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        categories: Class names, in the same order as the rows of ``coef_``.
        top_n: Number of features to list per category (non-negative,
            default 10).

    Raises:
        IndexError: If there are more categories than coefficient rows.
    """
    feature_names = np.asarray(vectorizer.get_feature_names_out())
    coefs = np.asarray(classifier.coef_)

    # Binary linear models store a single weight row (for the second class);
    # the first class is described by the negated weights.
    if coefs.ndim == 2 and coefs.shape[0] == 1 and len(categories) == 2:
        coefs = np.vstack([-coefs[0], coefs[0]])

    for i, category in enumerate(categories):
        # argsort is ascending, so reverse it and keep the first `top_n` indices.
        top = np.argsort(coefs[i])[::-1][:top_n]
        print(f"{category}: {', '.join(feature_names[top])}")

In [53]:
# List the strongest logistic-regression features for each newsgroup topic.
top10_features(
    classifier=logreg,
    vectorizer=tf,
    categories=newsgroups_train.target_names,
)

alt.atheism: atheist, atheism, religion, deletion, n't, bobby, islam, post, claim, islamic
comp.graphics: graphic, image, file, computer, hi, package, 3d, 42, ftp, card
sci.space: space, orbit, nasa, .., launch, sci.space, spacecraft, mar, cost, shuttle
talk.religion.misc: christian, god, jesus, child, order, fbi, koresh, blood, christ, hare


In [54]:
# Column indices of the ten largest coefficients of class 1, largest first.
np.argsort(logreg.coef_[1])[::-1][:10]

array([3139, 3478, 2787, 1635, 3311, 4902,  279,  296, 2962, 1316],
      dtype=int64)

In [66]:
# TF-IDF vectorizer over unigrams and bigrams, tokenized by `my_tokenizer`.
# `token_pattern` is not used once a custom tokenizer is supplied, so it is set
# to None explicitly; this avoids scikit-learn's "token_pattern will not be
# used" warning and removes the previous non-raw "\w" string literal.
tf = TfidfVectorizer(
    tokenizer=my_tokenizer,
    lowercase=True,
    token_pattern=None,
    max_df=0.5,            # drop terms found in more than half of the documents
    min_df=3,              # drop terms found in fewer than 3 documents
    ngram_range=(1, 2),
    stop_words='english',
)

In [67]:
# Fit the unigram+bigram TF-IDF vectorizer on the training texts, encode the
# test texts with it, and display both matrix shapes.
X_train_tf, X_test_tf = tf.fit_transform(X_train), tf.transform(X_test)
tuple(matrix.shape for matrix in (X_train_tf, X_test_tf))



((2034, 18088), (1353, 18088))

In [68]:
# Multinomial naive Bayes on the unigram+bigram TF-IDF features;
# display (train accuracy, test accuracy).
nb_clf = MultinomialNB(alpha=0.05)
nb_clf.fit(X_train_tf, y_train)
(nb_clf.score(X_train_tf, y_train), nb_clf.score(X_test_tf, y_test))

(0.967551622418879, 0.7782705099778271)

In [69]:
# Load the Daum movie reviews and preview the first five rows
# (five is the default row count of `head`).
df = pd.read_csv('./data/daum_movie_review.csv')
df.head()

Unnamed: 0,review,rating,date,title
0,돈 들인건 티가 나지만 보는 내내 하품만,1,2018.10.29,인피니티 워
1,몰입할수밖에 없다. 어렵게 생각할 필요없다. 내가 전투에 참여한듯 손에 땀이남.,10,2018.10.26,인피니티 워
2,이전 작품에 비해 더 화려하고 스케일도 커졌지만.... 전국 맛집의 음식들을 한데 ...,8,2018.10.24,인피니티 워
3,이 정도면 볼만하다고 할 수 있음!,8,2018.10.22,인피니티 워
4,재미있다,10,2018.10.20,인피니티 워


In [72]:
# Split the reviews (inputs) and movie titles (labels) with a fixed seed.
# NOTE: this rebinds X_train/X_test/y_train/y_test, which previously held the
# newsgroups data.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['title'],
    random_state=0,
)

In [73]:
# Shared KoNLPy Okt analyzer instance; the Korean tokenizer functions in this
# notebook call `okt.pos(...)` on it.
okt = Okt()

In [88]:
def okt_tokenizer(doc, pos_list=['Noun', 'Verb', 'Adjective']):
    """Tokenize ``doc`` with Okt and keep only words with the given POS tags.

    Args:
        doc: The text to analyze.
        pos_list: POS tags to keep (read only, never modified).

    Returns:
        The normalized, stemmed words whose tag is in ``pos_list``, in order.
    """
    kept_words = []
    for word, pos in okt.pos(doc, norm=True, stem=True):
        if pos in pos_list:
            kept_words.append(word)
    return kept_words

In [89]:
# TF-IDF vectorizer over unigrams and bigrams of the tokens from `okt_tokenizer`.
tf = TfidfVectorizer(
    tokenizer=okt_tokenizer,
    max_df=0.5,
    min_df=5,
    ngram_range=(1, 2),
)

In [90]:
# Fit the vectorizer on the training reviews, encode the test reviews with it,
# and display both matrix shapes.
X_train_tf, X_test_tf = tf.fit_transform(X_train), tf.transform(X_test)
tuple(matrix.shape for matrix in (X_train_tf, X_test_tf))



((11043, 5905), (3682, 5905))

In [95]:
# Multinomial naive Bayes predicting the movie title from the review features;
# display (train accuracy, test accuracy).
nb_clf = MultinomialNB(alpha=0.1)
nb_clf.fit(X_train_tf, y_train)
(nb_clf.score(X_train_tf, y_train), nb_clf.score(X_test_tf, y_test))

(0.8252286516345196, 0.7224334600760456)

In [99]:
def okt_tokenizer2(doc, pos_list=None):
    """Tokenize ``doc`` with Okt and return ``'word/POS'`` strings.

    Previously ``pos_list`` was accepted but never used, so every tag was
    kept. The parameter is now honored when supplied; the default of ``None``
    keeps every tag, so single-argument calls (as made by the vectorizer)
    behave exactly as before.

    Args:
        doc: The text to analyze.
        pos_list: Optional collection of POS tags to keep. ``None`` keeps all
            tags.

    Returns:
        A list of ``'word/POS'`` strings for the normalized, stemmed tokens.
    """
    tagged = okt.pos(doc, norm=True, stem=True)
    if pos_list is not None:
        tagged = [(word, pos) for word, pos in tagged if pos in pos_list]
    return ['/'.join([word, pos]) for word, pos in tagged]

In [101]:
# TF-IDF vectorizer over unigrams and bigrams of the 'word/POS' tokens from `okt_tokenizer2`.
tf = TfidfVectorizer(
    tokenizer=okt_tokenizer2,
    max_df=0.5,
    min_df=5,
    ngram_range=(1, 2),
)

In [102]:
# Fit the vectorizer on the training reviews, encode the test reviews with it,
# and display both matrix shapes.
X_train_tf, X_test_tf = tf.fit_transform(X_train), tf.transform(X_test)
tuple(matrix.shape for matrix in (X_train_tf, X_test_tf))



((11043, 9832), (3682, 9832))

In [112]:
# Multinomial naive Bayes on the 'word/POS' TF-IDF features;
# display (train accuracy, test accuracy).
nb_clf = MultinomialNB(alpha=0.1)
nb_clf.fit(X_train_tf, y_train)
(nb_clf.score(X_train_tf, y_train), nb_clf.score(X_test_tf, y_test))

(0.8529385130852124, 0.7140141227593699)

In [120]:
# Logistic regression on the 'word/POS' TF-IDF features;
# display (train accuracy, test accuracy).
logreg = LogisticRegression(C=2, max_iter=1000)
logreg.fit(X_train_tf, y_train)
(logreg.score(X_train_tf, y_train), logreg.score(X_test_tf, y_test))

(0.8844516888526669, 0.7213470939706681)