In [34]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, RegexpTokenizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from konlpy.tag import Okt
import numpy as np
import pandas as pd
import re

In [3]:
# Work on four newsgroups only; the loader is asked to drop headers, footers
# and quoted text from every post.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split_name,
                       remove=('headers', 'footers', 'quotes'),
                       categories=categories)
    for split_name in ('train', 'test')
)

In [4]:
# Overview: train size, test size, class names, and the distinct label ids.
print(len(newsgroups_train.data),
      len(newsgroups_test.data),
      newsgroups_train.target_names,
      set(newsgroups_train.target),
      sep=' \n\n ')

2034 

 1353 

 ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc'] 

 {0, 1, 2, 3}


In [5]:
# First training document followed by its integer label.
print(newsgroups_train.data[0], newsgroups_train.target[0], sep=' \n\n ')

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych 

 1


In [6]:
# Raw texts are the inputs; the integer newsgroup ids are the labels.
X_train = newsgroups_train.data
y_train = newsgroups_train.target
X_test = newsgroups_test.data
y_test = newsgroups_test.target

In [8]:
def my_tokenizer(doc):
    """Split ``doc`` into lemmatized tokens.

    A token is any run of two or more word characters or apostrophes; each
    token is passed through ``WordNetLemmatizer.lemmatize`` with its default
    arguments.

    Returns:
        list of str, in the order the tokens occur in ``doc``.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    raw_tokens = RegexpTokenizer(r"[\w']{2,}").tokenize(doc)
    return list(map(lemmatize, raw_tokens))

In [9]:
# Bag-of-words counts built with the custom tokenizer.
cv = CountVectorizer(
    tokenizer=my_tokenizer,
    max_features=2000,
    min_df=5,
    max_df=0.5,
)
X_train_cv = cv.fit_transform(X_train)  # vocabulary comes from the training texts
X_test_cv = cv.transform(X_test)        # test texts reuse that fitted vocabulary
print(X_train_cv.shape, X_test_cv.shape, sep=' \n\n ')



(2034, 2000) 

 (1353, 2000)


In [11]:
# Peek at vocabulary entries 51..99 and their counts in the first training document.
for word, count in zip(cv.get_feature_names_out()[51:100],
                       X_train_cv[0].toarray()[0, 51:100]):
    print(f'{word} : {count}', end=', ')

400 : 0, 42 : 0, 45 : 0, 50 : 0, 500 : 0, 60 : 0, 600 : 0, 65 : 0, 70 : 0, 75 : 0, 80 : 0, 800 : 0, 90 : 0, 900 : 0, 91 : 0, 92 : 0, 93 : 0, 95 : 0, _the : 0, a : 0, ability : 0, able : 1, abortion : 0, about : 1, above : 0, absolute : 0, absolutely : 0, abstract : 0, ac : 0, acceleration : 0, accept : 0, acceptable : 0, accepted : 0, access : 0, according : 0, account : 0, accurate : 0, acronym : 0, across : 0, act : 0, action : 0, active : 0, activity : 0, actual : 0, actually : 0, ad : 0, adam : 0, add : 0, added : 0, 

In [12]:
# Naive Bayes on raw term counts; the cell displays (train accuracy, test accuracy).
nb_clf = MultinomialNB()
nb_clf.fit(X_train_cv, y_train)
(nb_clf.score(X_train_cv, y_train), nb_clf.score(X_test_cv, y_test))

(0.831858407079646, 0.7405764966740577)

In [14]:
# Predicted newsgroup names for the first three test documents (count features).
pred = nb_clf.predict(X_test_cv)
print(*(newsgroups_test.target_names[pred[k]] for k in range(3)), sep='\n')

sci.space
comp.graphics
comp.graphics


In [15]:
# TF-IDF features with the same tokenizer and vocabulary limits as the count model.
tfidf = TfidfVectorizer(
    tokenizer=my_tokenizer,
    max_features=2000,
    min_df=5,
    max_df=0.5,
)
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)
print(X_train_tf.shape, X_test_tf.shape, sep=' \n\n ')



(2034, 2000) 

 (1353, 2000)


In [20]:
# Naive Bayes refit on the TF-IDF matrix with smoothing alpha=0.05;
# the cell displays (train accuracy, test accuracy).
nb_clf = MultinomialNB(alpha=0.05)
nb_clf.fit(X_train_tf, y_train)
(nb_clf.score(X_train_tf, y_train), nb_clf.score(X_test_tf, y_test))

(0.9090462143559489, 0.7590539541759054)

In [21]:
# nb_clf was refit on TF-IDF features in the previous cell, so it must be
# evaluated on the TF-IDF test matrix. The count matrix has the same number of
# columns, so passing it raises no error but feeds the model the wrong values.
pred = nb_clf.predict(X_test_tf)
for i in range(3):
    print(newsgroups_test.target_names[pred[i]])

sci.space
comp.graphics
comp.graphics


In [33]:
# Logistic regression on the TF-IDF features; displays (train accuracy, test accuracy).
logreg = LogisticRegression(C=2, max_iter=5000)
logreg.fit(X_train_tf, y_train)
(logreg.score(X_train_tf, y_train), logreg.score(X_test_tf, y_test))

(0.9532940019665683, 0.7487065779748706)

In [35]:
# 5-fold grid search over the penalty type and five log-spaced C values (1e-2 .. 1e2).
param_grid = dict(penalty=('l1', 'l2'), C=np.logspace(-2, 2, 5))
grid = GridSearchCV(
    LogisticRegression(solver='liblinear', max_iter=5000),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train_tf, y_train)
(grid.best_score_, grid.best_estimator_, grid.best_params_)

(0.7954866196245507,
 LogisticRegression(C=10.0, max_iter=5000, solver='liblinear'),
 {'C': 10.0, 'penalty': 'l2'})

In [36]:
# Accuracy of the selected model on the training and test splits, in that order.
tuple(grid.score(features, labels)
      for features, labels in ((X_train_tf, y_train), (X_test_tf, y_test)))

(0.9729596853490659, 0.7398373983739838)

In [49]:
def top_n_features(classifier, vectorizer, categories, n):
    """Print the ``n`` highest-weighted features for each category.

    Args:
        classifier: fitted model exposing ``coef_`` with one row of feature
            weights per category. A single-row ``coef_`` paired with exactly
            two categories is also accepted (see below).
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()``;
            its feature order must match the columns of ``coef_``.
        categories: category labels, in the same order as the rows of ``coef_``.
        n: number of features to print per category.

    Prints one line per category: the label followed by its top features,
    highest weight first. Returns ``None``.

    Raises:
        IndexError: if there are more categories than coefficient rows
            (other than the two-category / single-row case).
    """
    feature_names = np.asarray(vectorizer.get_feature_names_out())
    categories = list(categories)
    coefs = np.asarray(classifier.coef_)
    if coefs.ndim == 2 and coefs.shape[0] == 1 and len(categories) == 2:
        # Binary linear models keep a single weight row that scores the second
        # class; the negated row ranks the features that favour the first one.
        coefs = np.vstack([-coefs[0], coefs[0]])
    for i, category in enumerate(categories):
        # argsort is ascending, so walk it backwards to take the n largest weights.
        top_n = np.argsort(coefs[i])[:-(n+1):-1]
        print(f'{category}, {", ".join(feature_names[top_n])}')

In [50]:
# Five strongest TF-IDF terms per newsgroup according to the logistic regression.
top_n_features(classifier=logreg,
               vectorizer=tfidf,
               categories=newsgroups_train.target_names,
               n=5)

alt.atheism, atheist, atheism, religion, bobby, deletion
comp.graphics, graphic, image, file, computer, 3d
sci.space, space, orbit, nasa, launch, moon
talk.religion.misc, christian, order, jesus, objective, he


In [51]:
# Second TF-IDF variant: no max_features cap, min_df lowered to 3, and
# scikit-learn's built-in English stop-word list enabled.
tfidf = TfidfVectorizer(
    tokenizer=my_tokenizer,
    min_df=3,
    max_df=0.5,
    stop_words='english',
)
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)
print(X_train_tf.shape, X_test_tf.shape, sep=' \n\n ')



(2034, 7581) 

 (1353, 7581)


In [52]:
# Same 5-fold search as before (penalty type x five log-spaced C values),
# rerun on the current TF-IDF matrix.
param_grid = dict(penalty=('l1', 'l2'), C=np.logspace(-2, 2, 5))
grid = GridSearchCV(
    LogisticRegression(solver='liblinear', max_iter=5000),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train_tf, y_train)
(grid.best_score_, grid.best_estimator_, grid.best_params_)

(0.8289091151160116,
 LogisticRegression(C=10.0, max_iter=5000, solver='liblinear'),
 {'C': 10.0, 'penalty': 'l2'})

In [53]:
# Accuracy of the selected model on the training and test splits, in that order.
tuple(grid.score(features, labels)
      for features, labels in ((X_train_tf, y_train), (X_test_tf, y_test)))

(0.9759095378564405, 0.7597930524759793)

In [57]:
# Naive Bayes (alpha=0.05) on the current TF-IDF matrix;
# displays (train accuracy, test accuracy).
nb_clf = MultinomialNB(alpha=0.05)
nb_clf.fit(X_train_tf, y_train)
(nb_clf.score(X_train_tf, y_train), nb_clf.score(X_test_tf, y_test))

(0.9601769911504425, 0.7849223946784922)

In [58]:
# Third TF-IDF variant: adds bigrams and trigrams on top of the unigrams.
tfidf = TfidfVectorizer(
    tokenizer=my_tokenizer,
    min_df=3,
    max_df=0.5,
    lowercase=True,
    ngram_range=(1, 3),
    stop_words='english',
)
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)
print(X_train_tf.shape, X_test_tf.shape, sep=' \n\n ')



(2034, 12944) 

 (1353, 12944)


In [66]:
# Naive Bayes (alpha=0.11) on the n-gram TF-IDF matrix;
# displays (train accuracy, test accuracy).
nb_clf = MultinomialNB(alpha=0.11)
nb_clf.fit(X_train_tf, y_train)
(nb_clf.score(X_train_tf, y_train), nb_clf.score(X_test_tf, y_test))

(0.9641101278269419, 0.7915742793791575)

In [67]:
# Load the movie-review CSV and preview its first five rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved index; confirm whether it should be read with index_col=0.
df = pd.read_csv('./data/daum_movie_review.csv')
df.head()

Unnamed: 0,review,rating,date,title
0,돈 들인건 티가 나지만 보는 내내 하품만,1,2018.10.29,인피니티 워
1,몰입할수밖에 없다. 어렵게 생각할 필요없다. 내가 전투에 참여한듯 손에 땀이남.,10,2018.10.26,인피니티 워
2,이전 작품에 비해 더 화려하고 스케일도 커졌지만.... 전국 맛집의 음식들을 한데 ...,8,2018.10.24,인피니티 워
3,이 정도면 볼만하다고 할 수 있음!,8,2018.10.22,인피니티 워
4,재미있다,10,2018.10.20,인피니티 워


In [68]:
# Predict the movie title from the review text; the split is stratified on
# the title and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['title'],
    stratify=df['title'],
    random_state=0,
)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [69]:
# Create one KoNLPy Okt instance; the tokenizers in the following cells use it.
okt = Okt()

In [70]:
# TF-IDF over the nouns Okt extracts from each review, with unigrams and bigrams.
# NOTE(review): stop_words='english' is applied to Korean reviews here and
# presumably has little effect; confirm whether it is intended.
tfidf = TfidfVectorizer(
    tokenizer=okt.nouns,
    min_df=3,
    max_df=0.5,
    lowercase=True,
    ngram_range=(1, 2),
    stop_words='english',
)
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)
print(X_train_tf.shape, X_test_tf.shape, sep=' \n\n ')



(11043, 6716) 

 (3682, 6716)


In [81]:
# Naive Bayes (alpha=0.15) on the noun-based TF-IDF matrix;
# displays (train accuracy, test accuracy).
nb_clf = MultinomialNB(alpha=0.15)
nb_clf.fit(X_train_tf, y_train)
(nb_clf.score(X_train_tf, y_train), nb_clf.score(X_test_tf, y_test))

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


(0.8091098433396722, 0.6917436175991309)

In [83]:
# First ten test reviews as (true title, predicted title, review text) tuples.
print(*zip(y_test[:10], nb_clf.predict(X_test_tf)[:10], X_test[:10]), sep='\n')

('신과함께', '신과함께', '핵  꿀잼')
('범죄도시', '범죄도시', '윤계상  진심 지렸다!')
('범죄도시', '범죄도시', '잔인하긴 하지만 몰입도는 최고 !  다들 연기력이 예술입니다 ㅋ 윤계상씨도  이제는 배우같네요 ㅋ')
('라라랜드', '라라랜드', '마지막 연주에 담긴 수많은 생각과 감정들..')
('인피니티 워', '인피니티 워', '이번엔 어벤져스보다 타노스 보다 블랙오더중 한명인 에보니모 가 더 멋졌음 다음을 기대하게 되는 영화..')
('범죄도시', '범죄도시', '내래  간만에 스트레쓰 푸러쓰. 영화는 이래야 제맛이지~~~♡♡♡♡♡♡♡♡♡')
('신과함께', '신과함께', '이제 우리나라는 신인 배우 발굴 안하나요?? 너무 다 겹치기 출연이라 식상하기까지 합니다... 외국 사람들이 한국 영화보면 배우들이 저 사람들 밖에 없나봐~ 라고 할 정도... 조연급만이라도 신선한 배우들 발굴좀 했으면... 물론 제작비 지원하는 갑들이 시키는대로 해야겠지만, 감독이 자기 작품인데 그런 욕심이 없는지.. 관객들은 톱배우들이 전부 조주연 출연하는 그런 영화를 원하지 않는다고 생각한다ㅋ 이영화는 그래서 더더욱 매력이 없엇다.. 관객은 새로운 인물을 원한다. 김동욱은 신인이 아님에도 영화계에선 신선한 인물이었기에 그의 연기밖에 기억이 안남았다.')
('택시운전사', '곤지암', '좋네요.  좀 더 욕심내고 싶지만 그러기엔 시간이 짧고..')
('신과함께', '신과함께', '취향껏 보는 거지만... 전 도저히 이해 못하겠어요.. 이런 영화도 천만이라고... 개나 소나 다천만이라네.. 마케팅,독점으로 이룬 영화..')
('곤지암', '신과함께', '.....당황그자체 ㅁ 뭐지 이영화')


In [89]:
def okt_tokenizer(doc, pos_list=['Noun', 'Verb', 'Adjective']):
    """Tokenize ``doc`` with Okt and keep only the requested parts of speech.

    The text is tagged via ``okt.pos`` with ``norm=True`` and ``stem=True``.
    Each (word, tag) pair whose tag is in ``pos_list`` becomes one
    ``"word/tag"`` string.

    Args:
        doc: text to tokenize.
        pos_list: tags to keep; defaults to nouns, verbs and adjectives.

    Returns:
        list of ``"word/tag"`` strings in tagging order.
    """
    tagged = okt.pos(doc, norm=True, stem=True)
    return [f'{word}/{pos}' for word, pos in tagged if pos in pos_list]

In [90]:
# TF-IDF over POS-tagged tokens (nouns, verbs, adjectives), unigrams and bigrams.
tfidf = TfidfVectorizer(
    tokenizer=okt_tokenizer,
    min_df=3,
    max_df=0.5,
    ngram_range=(1, 2),
)
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)
print(X_train_tf.shape, X_test_tf.shape, sep=' \n\n ')



(11043, 10861) 

 (3682, 10861)


In [101]:
# Naive Bayes (alpha=0.1) on the POS-tagged TF-IDF matrix;
# displays (train accuracy, test accuracy).
nb_clf = MultinomialNB(alpha=0.1)
nb_clf.fit(X_train_tf, y_train)
(nb_clf.score(X_train_tf, y_train), nb_clf.score(X_test_tf, y_test))

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


(0.8590962600742552, 0.7115697990222705)

In [105]:
# Logistic regression on the POS-tagged TF-IDF matrix;
# displays (train accuracy, test accuracy).
logreg = LogisticRegression(C=3, max_iter=1000)
logreg.fit(X_train_tf, y_train)
(logreg.score(X_train_tf, y_train), logreg.score(X_test_tf, y_test))

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


(0.9031060400253554, 0.717001629549158)

In [111]:
# Take the labels from the fitted model: classes_ is in the same order as the
# rows of coef_, so each printed title is guaranteed to match its weights even
# if some title were absent from the training split.
top_n_features(logreg, tfidf, logreg.classes_, 5)

곤지암, 무섭다/Adjective, 공포영화/Noun, 공포/Noun, 귀신/Noun, 곤지암/Noun
라라랜드, 음악/Noun, 뮤지컬/Noun, 사랑/Noun, 꿈/Noun, 아름답다/Adjective
범죄도시, 마동석/Noun, 윤계상/Noun, 잔인하다/Adjective, 조선족/Noun, 마블리/Noun
신과함께, 원작/Noun, 신파/Noun, 차태현/Noun, 웹툰/Noun, 김동욱/Noun
인피니티 워, 마블/Noun, 노스/Noun, 어벤져스/Noun, 히어로/Noun, 번역/Noun
코코, 디즈니/Noun, 코코/Noun, 감동/Noun, 겨울왕국/Noun, 애니/Noun
택시운전사, 광주/Noun, 송강호/Noun, 역사/Noun, 택시/Noun, 전두환/Noun
