In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer

from konlpy.tag import Okt

import re

import numpy as np
import pandas as pd

In [2]:
# Four of the 20 newsgroups are used as the classification targets.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

# Both subsets are fetched with the same options: headers, footers and quoted
# replies are removed and only the selected categories are kept.
fetch_options = dict(remove=('headers', 'footers', 'quotes'), categories=categories)
newsgroups_train = fetch_20newsgroups(subset='train', **fetch_options)
newsgroups_test = fetch_20newsgroups(subset='test', **fetch_options)

In [3]:
# Show the category names followed by the distinct integer labels in the training targets.
for label_summary in (newsgroups_train.target_names, set(newsgroups_train.target)):
    print(label_summary)

['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']
{np.int64(0), np.int64(1), np.int64(2), np.int64(3)}


In [4]:
# Short aliases for the raw documents and integer targets of each subset.
X_train = newsgroups_train.data
y_train = newsgroups_train.target
X_test = newsgroups_test.data
y_test = newsgroups_test.target

In [5]:
# Baseline TF-IDF: vocabulary capped at 2000 terms, with the min_df / max_df
# document-frequency limits applied. The vocabulary is learned on the training
# documents only; the test documents are merely transformed.
tfidf = TfidfVectorizer(min_df=3, max_df=0.5, max_features=2000)
X_train_tf, X_test_tf = tfidf.fit_transform(X_train), tfidf.transform(X_test)
print(X_train_tf.shape, X_test_tf.shape)

(2034, 2000) (1353, 2000)


In [6]:
# Print the first 50 vocabulary terms next to their TF-IDF weight in the first training document.
leading_terms = tfidf.get_feature_names_out()[:50]
leading_weights = X_train_tf[0].toarray()[0, :50]
for word, count in zip(leading_terms, leading_weights):
    print(word, ':', count, end=', ')

00 : 0.0, 000 : 0.0, 01 : 0.0, 04 : 0.0, 05 : 0.0, 10 : 0.0, 100 : 0.0, 1000 : 0.0, 11 : 0.0, 12 : 0.0, 128 : 0.0, 129 : 0.0, 13 : 0.0, 130 : 0.0, 14 : 0.0, 15 : 0.0, 16 : 0.0, 17 : 0.0, 18 : 0.0, 19 : 0.0, 1988 : 0.0, 1989 : 0.0, 1990 : 0.0, 1991 : 0.0, 1992 : 0.0, 1993 : 0.0, 20 : 0.0, 200 : 0.0, 202 : 0.0, 21 : 0.0, 22 : 0.0, 23 : 0.0, 24 : 0.0, 25 : 0.0, 256 : 0.0, 26 : 0.0, 27 : 0.0, 28 : 0.0, 2d : 0.0, 30 : 0.0, 300 : 0.0, 31 : 0.0, 32 : 0.0, 33 : 0.0, 34 : 0.0, 35 : 0.0, 39 : 0.0, 3d : 0.0, 40 : 0.0, 400 : 0.0, 

In [7]:
# Multinomial Naive Bayes with default settings on the baseline TF-IDF features.
# fit() returns the estimator itself, so it can be chained onto the constructor.
nb_clf = MultinomialNB().fit(X_train_tf, y_train)

# Mean accuracy on the training set, then on the test set (one value per line).
print(nb_clf.score(X_train_tf, y_train), nb_clf.score(X_test_tf, y_test), sep='\n')

0.8623402163225172
0.7390983000739099


In [8]:
# Predict the first ten test documents and print the predicted category names, one per line.
pred = nb_clf.predict(X_test_tf[:10])
print(*(newsgroups_train.target_names[label] for label in pred), sep='\n')

sci.space
comp.graphics
comp.graphics
comp.graphics
comp.graphics
comp.graphics
sci.space
sci.space
alt.atheism
sci.space


In [9]:
# Second TF-IDF variant: tokens are runs of at least two word characters or
# apostrophes, and the vocabulary cap is raised to 5000 terms.
tfidf = TfidfVectorizer(
    token_pattern=r"[\w']{2,}",
    lowercase=True,
    max_features=5000,
    min_df=3,
    max_df=0.5,
)
X_train_tf, X_test_tf = tfidf.fit_transform(X_train), tfidf.transform(X_test)
print(X_train_tf.shape, X_test_tf.shape)

(2034, 5000) (1353, 5000)


In [10]:
# Refit Naive Bayes on the 5000-term features with alpha=0.1.
nb_clf = MultinomialNB(alpha=0.1).fit(X_train_tf, y_train)

# Training accuracy, then test accuracy.
train_accuracy = nb_clf.score(X_train_tf, y_train)
test_accuracy = nb_clf.score(X_test_tf, y_test)
print(train_accuracy)
print(test_accuracy)

0.9444444444444444
0.7753141167775314


In [11]:
# Compare predicted and true category names for the first ten test documents.
pred = nb_clf.predict(X_test_tf[:10])
label_names = newsgroups_train.target_names
# zip() stops after the ten predictions, so only the first ten true labels are used.
for predicted, actual in zip(pred, y_test):
    print('pred:', label_names[predicted], '/ y_test:', label_names[actual])

pred: sci.space / y_test: sci.space
pred: comp.graphics / y_test: comp.graphics
pred: comp.graphics / y_test: comp.graphics
pred: comp.graphics / y_test: comp.graphics
pred: comp.graphics / y_test: comp.graphics
pred: comp.graphics / y_test: comp.graphics
pred: sci.space / y_test: sci.space
pred: sci.space / y_test: sci.space
pred: alt.atheism / y_test: alt.atheism
pred: sci.space / y_test: sci.space


In [12]:
def top_n_features(classifier, vectorizer, categories, n):
    """Print the ``n`` highest-scoring vocabulary terms for each category.

    One line is printed per category in the form
    ``<category>: term1, term2, ...``, with terms ordered from the highest
    score downwards.

    Parameters
    ----------
    classifier : fitted estimator
        If it exposes ``feature_count_`` (as ``MultinomialNB`` does), the
        per-class feature counts are used as scores; otherwise the rows of
        ``coef_`` are used (as for ``LogisticRegression``).
    vectorizer : fitted vectorizer
        Must provide ``get_feature_names_out()``; its terms label the columns
        of the score matrix.
    categories : sequence of str
        Category names; the name at position ``i`` labels score row ``i``.
    n : int
        Number of terms to print per category.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    feature_names = np.asarray(vectorizer.get_feature_names_out())

    if hasattr(classifier, 'feature_count_'):
        scores = np.asarray(classifier.feature_count_)
    else:
        scores = np.atleast_2d(classifier.coef_)
        if scores.shape[0] == 1 and len(categories) == 2:
            # A binary linear model stores a single coefficient row for the
            # second class; the first class is characterised by the negated
            # coefficients. Without this, indexing row 1 raises IndexError.
            scores = np.vstack([-scores[0], scores[0]])

    for i, category in enumerate(categories):
        # Negating the scores makes argsort return the largest values first.
        top_n = np.argsort(-scores[i])[:n]
        # Double quotes outside / single quotes inside keep this f-string
        # valid on Python versions older than 3.12.
        print(f"{category}: {', '.join(feature_names[top_n])}")

In [13]:
# List the 20 top-ranked terms per newsgroup for the Naive Bayes model.
top_n_features(
    classifier=nb_clf,
    vectorizer=tfidf,
    categories=newsgroups_train.target_names,
    n=20,
)

alt.atheism: you, not, are, be, this, have, as, what, if, they, do, god, but, your, or, so, was, on, an, we
comp.graphics: you, graphics, on, this, have, any, or, thanks, with, if, can, be, but, there, image, are, files, file, me, anyone
sci.space: space, on, be, was, you, this, as, are, have, they, at, would, or, if, from, not, but, with, by, nasa
talk.religion.misc: you, not, he, are, this, as, be, was, god, they, have, with, your, but, who, jesus, or, by, his, what


In [14]:
# Logistic regression on the same 5000-term TF-IDF features.
logreg_clf = LogisticRegression(C=2, max_iter=1000).fit(X_train_tf, y_train)

# Training accuracy and test accuracy on one line.
logreg_train_accuracy = logreg_clf.score(X_train_tf, y_train)
logreg_test_accuracy = logreg_clf.score(X_test_tf, y_test)
print(logreg_train_accuracy, logreg_test_accuracy)

0.9680432645034415 0.7627494456762749


In [15]:
# Compare the logistic-regression predictions with the true labels for the first ten test documents.
pred = logreg_clf.predict(X_test_tf[:10])
label_names = newsgroups_train.target_names
# zip() stops after the ten predictions, so only the first ten true labels are used.
for predicted, actual in zip(pred, y_test):
    print('pred:', label_names[predicted], '/ y_test:', label_names[actual])

pred: sci.space / y_test: sci.space
pred: comp.graphics / y_test: comp.graphics
pred: comp.graphics / y_test: comp.graphics
pred: comp.graphics / y_test: comp.graphics
pred: comp.graphics / y_test: comp.graphics
pred: comp.graphics / y_test: comp.graphics
pred: sci.space / y_test: sci.space
pred: sci.space / y_test: sci.space
pred: alt.atheism / y_test: alt.atheism
pred: sci.space / y_test: sci.space


In [16]:
# List the 20 terms with the largest coefficients per newsgroup for the logistic-regression model.
top_n_features(
    classifier=logreg_clf,
    vectorizer=tfidf,
    categories=newsgroups_train.target_names,
    n=20,
)

alt.atheism: atheism, religion, atheists, islam, bobby, deletion, islamic, motto, atheist, god, up, must, punishment, post, him, satan, people, cruel, bible, isn't
comp.graphics: graphics, image, file, computer, 3d, files, hi, looking, points, code, format, package, video, 68070, anyone, images, screen, color, card, windows
sci.space: space, orbit, nasa, launch, moon, spacecraft, shuttle, dc, lunar, solar, earth, flight, sci, mars, cost, get, satellite, like, at, year
talk.religion.misc: christian, christians, god, jesus, he, objective, fbi, his, blood, christ, children, see, order, koresh, rosicrucian, who, amorc, abortion, kent, values


In [17]:
# Holds the single (stop-word set, regex tokenizer, lemmatizer) triple once built.
_MY_TOKENIZER_RESOURCES = []


def _my_tokenizer_resources():
    """Return the shared stop-word set, regex tokenizer and lemmatizer, building them on first use."""
    if not _MY_TOKENIZER_RESOURCES:
        _MY_TOKENIZER_RESOURCES.append((
            set(stopwords.words('english')),
            RegexpTokenizer(r"[\w']{2,}"),
            WordNetLemmatizer(),
        ))
    return _MY_TOKENIZER_RESOURCES[0]


def my_tokenizer(text):
    """Tokenize English text into lemmatized, stop-word-free tokens.

    The text is lower-cased, split into runs of at least two word characters
    or apostrophes, filtered against the NLTK English stop-word list, and each
    remaining token is passed through the WordNet lemmatizer.

    The stop-word set, tokenizer and lemmatizer are created once and reused,
    because a vectorizer calls this function once per document.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order.
    """
    eng_stops, regex_tokenizer, lemma = _my_tokenizer_resources()
    return [
        lemma.lemmatize(word)
        for word in regex_tokenizer.tokenize(text.lower())
        if word not in eng_stops
    ]

In [18]:
# TF-IDF driven by the custom tokenizer (stop-word removal + lemmatization).
# token_pattern is set to None explicitly: it is ignored whenever a tokenizer
# is supplied, and leaving it at its default makes scikit-learn emit an
# "unused parameter" warning.
my_tfidf = TfidfVectorizer(
    tokenizer=my_tokenizer,
    token_pattern=None,
    max_features=10000,
    min_df=3,
    max_df=0.5,
)
X_train_my_tf = my_tfidf.fit_transform(X_train)
X_test_my_tf = my_tfidf.transform(X_test)



In [19]:
# Naive Bayes on the custom-tokenizer features.
my_nb_clf = MultinomialNB(alpha=0.1).fit(X_train_my_tf, y_train)

# Training accuracy, then test accuracy (one value per line).
print(
    my_nb_clf.score(X_train_my_tf, y_train),
    my_nb_clf.score(X_test_my_tf, y_test),
    sep='\n',
)

0.956243854473943
0.7871396895787139


In [20]:
# Top 20 terms per newsgroup for Naive Bayes trained on the custom-tokenizer features.
top_n_features(
    classifier=my_nb_clf,
    vectorizer=my_tfidf,
    categories=newsgroups_train.target_names,
    n=20,
)

alt.atheism: god, one, people, think, would, say, religion, atheist, atheism, could, thing, islam, know, argument, well, belief, make, said, system, believe
comp.graphics: file, graphic, image, thanks, program, know, anyone, format, would, window, color, help, looking, need, please, hi, 3d, use, code, software
sci.space: space, would, nasa, like, launch, orbit, one, year, get, moon, shuttle, think, could, time, cost, thing, satellite, much, earth, also
talk.religion.misc: christian, god, jesus, people, would, one, say, think, bible, know, see, objective, child, believe, word, u, may, koresh, good, life


In [21]:
# Logistic regression on the custom-tokenizer features.
my_logreg_clf = LogisticRegression(C=5, max_iter=1000).fit(X_train_my_tf, y_train)

# Training accuracy and test accuracy on one line.
my_logreg_train_accuracy = my_logreg_clf.score(X_train_my_tf, y_train)
my_logreg_test_accuracy = my_logreg_clf.score(X_test_my_tf, y_test)
print(my_logreg_train_accuracy, my_logreg_test_accuracy)

0.9749262536873157 0.7605321507760532


In [22]:
# Top 20 terms per newsgroup for logistic regression trained on the custom-tokenizer features.
top_n_features(
    classifier=my_logreg_clf,
    vectorizer=my_tfidf,
    categories=newsgroups_train.target_names,
    n=20,
)

alt.atheism: atheist, atheism, religion, deletion, motto, islam, islamic, bobby, post, god, bible, argument, satan, people, claim, must, punishment, text, define, right
comp.graphics: graphic, image, file, 3d, computer, hi, anyone, looking, package, card, code, polygon, work, point, format, 42, 68070, video, algorithm, window
sci.space: space, orbit, nasa, launch, spacecraft, moon, flight, shuttle, satellite, rocket, cost, mar, get, dc, star, earth, solar, data, idea, lunar
talk.religion.misc: christian, god, jesus, child, fbi, objective, christ, order, blood, rosicrucian, story, mr, amorc, context, hudson, commandment, koresh, see, abortion, fire


In [23]:
# TF-IDF over unigrams, bigrams and trigrams with the built-in English stop
# list and no cap on the vocabulary size.
# NOTE(review): the token pattern keeps apostrophes, and contractions such as
# "don't" and "it's" still appear among this model's top features further
# down, so the built-in stop list evidently does not filter them.
ngram_tfidf = TfidfVectorizer(
    token_pattern=r"[\w']{2,}",
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 3),
    min_df=3,
    max_df=0.5,
)
X_train_ngram_tf, X_test_ngram_tf = ngram_tfidf.fit_transform(X_train), ngram_tfidf.transform(X_test)
print(X_train_ngram_tf.shape, X_test_ngram_tf.shape)

(2034, 12959) (1353, 12959)


In [24]:
# Naive Bayes on the n-gram features, with a smaller alpha than the earlier models.
ngram_nb_clf = MultinomialNB(alpha=0.01).fit(X_train_ngram_tf, y_train)

# Training accuracy, then test accuracy.
ngram_nb_train_accuracy = ngram_nb_clf.score(X_train_ngram_tf, y_train)
ngram_nb_test_accuracy = ngram_nb_clf.score(X_test_ngram_tf, y_test)
print(ngram_nb_train_accuracy)
print(ngram_nb_test_accuracy)

0.9695181907571289
0.7878787878787878


In [25]:
# Logistic regression on the n-gram features.
ngram_logreg_clf = LogisticRegression(C=10, max_iter=1000).fit(X_train_ngram_tf, y_train)

# Training accuracy and test accuracy on one line.
print(
    ngram_logreg_clf.score(X_train_ngram_tf, y_train),
    ngram_logreg_clf.score(X_test_ngram_tf, y_test),
)

0.976401179941003 0.7612712490761271


In [26]:
# top_n_features() prints its report itself and returns None, so wrapping the
# call in print() only adds a stray "None" line after each listing.
top_n_features(ngram_nb_clf, ngram_tfidf, newsgroups_train.target_names, 20)
print()  # blank line between the Naive Bayes and logistic-regression listings
top_n_features(ngram_logreg_clf, ngram_tfidf, newsgroups_train.target_names, 20)

alt.atheism: god, people, don't, think, just, say, religion, atheism, does, islam, know, atheists, it's, bible, like, believe, i'm, said, time, true
comp.graphics: graphics, thanks, image, files, file, know, program, does, looking, hi, i'm, need, format, use, windows, software, help, 3d, like, code
sci.space: space, nasa, like, launch, just, orbit, moon, think, shuttle, earth, lunar, don't, time, know, data, people, spacecraft, cost, it's, year
talk.religion.misc: god, jesus, people, christian, christians, don't, just, bible, think, know, objective, did, say, believe, good, koresh, does, life, christ, like
None
alt.atheism: atheism, religion, atheists, islam, deletion, atheist, motto, islamic, bible, bobby, post, loans, cruel, nanci, risk, you're right, people, punishment, perfect, define
comp.graphics: graphics, image, file, computer, hi, 3d, looking, files, package, 68070, points, video, card, code, thanks, tiff, format, screen, 42, ftp
sci.space: space, orbit, nasa, launch, spacecra

In [27]:
# Load the movie-review CSV and show how many reviews each title has.
df = pd.read_csv('./data/daum_movie_review.csv')
title_counts = df['title'].value_counts()
print(title_counts)

# Preview the first five rows (head() defaults to five).
df.head()

title
신과함께      4947
택시운전사     2322
인피니티 워    2042
범죄도시      1939
곤지암       1547
라라랜드      1150
코코         778
Name: count, dtype: int64


Unnamed: 0,review,rating,date,title
0,돈 들인건 티가 나지만 보는 내내 하품만,1,2018.10.29,인피니티 워
1,몰입할수밖에 없다. 어렵게 생각할 필요없다. 내가 전투에 참여한듯 손에 땀이남.,10,2018.10.26,인피니티 워
2,이전 작품에 비해 더 화려하고 스케일도 커졌지만.... 전국 맛집의 음식들을 한데 ...,8,2018.10.24,인피니티 워
3,이 정도면 볼만하다고 할 수 있음!,8,2018.10.22,인피니티 워
4,재미있다,10,2018.10.20,인피니티 워


In [28]:
# Predict the movie title from the review text: 75% train / 25% test, fixed seed.
# Note that this rebinds X_train / X_test / y_train / y_test, which held the
# newsgroup data in the earlier cells.
reviews, titles = df['review'], df['title']
X_train, X_test, y_train, y_test = train_test_split(reviews, titles, random_state=0, test_size=0.25)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((11043,), (3682,), (11043,), (3682,))

In [29]:
okt = Okt()

# Take the demo sentence straight from the source frame. `X_train[1]` is a
# label-based lookup on the shuffled training Series, so it only works when the
# row labelled 1 happens to land in the training split and raises KeyError
# otherwise. `df.loc[1, 'review']` selects the same row regardless of the split.
sample_review = df.loc[1, 'review']

print(okt.morphs(sample_review))  # all morphemes
print(okt.nouns(sample_review))   # nouns only
print(okt.pos(sample_review))     # (morpheme, part-of-speech tag) pairs

['몰입', '할수밖에', '없다', '.', '어렵게', '생각', '할', '필요없다', '.', '내', '가', '전투', '에', '참여', '한', '듯', '손', '에', '땀', '이남', '.']
['몰입', '생각', '내', '전투', '참여', '듯', '손', '땀', '이남']
[('몰입', 'Noun'), ('할수밖에', 'Verb'), ('없다', 'Adjective'), ('.', 'Punctuation'), ('어렵게', 'Adjective'), ('생각', 'Noun'), ('할', 'Verb'), ('필요없다', 'Adjective'), ('.', 'Punctuation'), ('내', 'Noun'), ('가', 'Josa'), ('전투', 'Noun'), ('에', 'Josa'), ('참여', 'Noun'), ('한', 'Determiner'), ('듯', 'Noun'), ('손', 'Noun'), ('에', 'Josa'), ('땀', 'Noun'), ('이남', 'Noun'), ('.', 'Punctuation')]


In [30]:
def kor_tokenizer(text):
    """Tokenize Korean text into ``word/tag`` strings for selected parts of speech.

    The text is tagged by the module-level ``okt`` tagger with normalization
    and stemming enabled. Only tokens tagged Noun, Verb or Adjective are kept,
    each rendered as ``"<word>/<tag>"`` in the order the tagger returned them.
    """
    kept_tags = ['Noun', 'Verb', 'Adjective']
    tagged_tokens = []
    for morpheme, pos_tag in okt.pos(text, norm=True, stem=True):
        if pos_tag not in kept_tags:
            continue
        tagged_tokens.append(f'{morpheme}/{pos_tag}')
    return tagged_tokens

In [31]:
# TF-IDF over unigrams and bigrams of the word/tag tokens produced by kor_tokenizer.
# token_pattern is set to None explicitly: it is ignored whenever a tokenizer
# is supplied, and leaving it at its default makes scikit-learn emit an
# "unused parameter" warning.
kor_tfidf = TfidfVectorizer(
    tokenizer=kor_tokenizer,
    token_pattern=None,
    min_df=3,
    max_df=0.5,
    ngram_range=(1, 2),
)
X_train_kor_tf = kor_tfidf.fit_transform(X_train)
X_test_kor_tf = kor_tfidf.transform(X_test)



In [32]:
# Shapes of the training and test TF-IDF matrices for the review data.
print(*(matrix.shape for matrix in (X_train_kor_tf, X_test_kor_tf)))

(11043, 10897) (3682, 10897)


In [33]:
# Naive Bayes predicting the movie title from the review TF-IDF features.
kor_nb_clf = MultinomialNB(alpha=0.1).fit(X_train_kor_tf, y_train)

# Training accuracy and test accuracy on one line.
kor_nb_train_accuracy = kor_nb_clf.score(X_train_kor_tf, y_train)
kor_nb_test_accuracy = kor_nb_clf.score(X_test_kor_tf, y_test)
print(kor_nb_train_accuracy, kor_nb_test_accuracy)

0.8573757131214343 0.72270505160239


In [34]:
# Logistic regression predicting the movie title from the review TF-IDF features.
kor_logreg_clf = LogisticRegression(C=10, max_iter=1000).fit(X_train_kor_tf, y_train)

# Training accuracy and test accuracy on one line.
print(
    kor_logreg_clf.score(X_train_kor_tf, y_train),
    kor_logreg_clf.score(X_test_kor_tf, y_test),
)

0.9393280811373721 0.7175448126018468
