In [116]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, RegexpTokenizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from konlpy.tag import Okt
import numpy as np
import pandas as pd
import re

In [2]:
# The four newsgroups used for the classification experiments below.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

In [3]:
# Training split, limited to the selected categories; the `remove` option asks the
# loader to strip headers, footers and quoted replies from every post.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

In [4]:
# Test split, loaded with the same categories and the same `remove` option as the
# training split.
newsgroups_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

In [5]:
# Sanity check: size of each split, the class names, and the distinct label ids.
len(newsgroups_train.data), len(newsgroups_test.data), newsgroups_train.target_names, set(newsgroups_train.target)

(2034,
 1353,
 ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc'],
 {0, 1, 2, 3})

In [12]:
# Peek at the first training document together with its class name,
# then at the second test document.
print(newsgroups_train.data[0])
print(newsgroups_train.target_names[newsgroups_train.target[0]])
print(newsgroups_test.data[1])

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych
comp.graphics
The Vatican library recently made a tour of the US.
 Can anyone help me in finding a FTP site where this collection is 
 available.


In [25]:
# Raw documents are the features; the newsgroup ids are the labels.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

In [26]:
# Bag-of-words counts: the vocabulary is learned from the training documents only
# and then reused unchanged for the test documents.
cv = CountVectorizer(
    stop_words='english',
    min_df=5,
    max_df=0.5,
    max_features=5000,
)
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)
print(X_train_cv.shape, X_test_cv.shape)

(2034, 5000) (1353, 5000)


In [28]:
# Multinomial naive Bayes on raw term counts; prints train accuracy, then test accuracy.
nb_clf = MultinomialNB().fit(X_train_cv, y_train)
print(
    nb_clf.score(X_train_cv, y_train),
    nb_clf.score(X_test_cv, y_test),
)

0.8913470993117011 0.7738359201773836


In [29]:
# Re-weight the term counts with TF-IDF; the transformer is fitted on the training
# matrix and only applied to the test matrix.
tfidf = TfidfTransformer()
X_train_tf, X_test_tf = tfidf.fit_transform(X_train_cv), tfidf.transform(X_test_cv)
print(X_train_tf.shape, X_test_tf.shape)

(2034, 5000) (1353, 5000)


In [30]:
# Same naive Bayes model, now trained on the TF-IDF weighted features.
nb_clf = MultinomialNB().fit(X_train_tf, y_train)
print(
    nb_clf.score(X_train_tf, y_train),
    nb_clf.score(X_test_tf, y_test),
)

0.9154375614552606 0.754619364375462


In [36]:
# Tokens are runs of 3 or more word characters or apostrophes.
# Raw string: "\w" inside a normal literal is an invalid escape sequence, which
# Python reports as a DeprecationWarning (SyntaxWarning on newer versions).
reg_token = RegexpTokenizer(r"[\w']{3,}")

In [38]:
def tokenizer(text):
    """Split ``text`` into lowercase, lemmatized tokens.

    The text is lowercased, tokenized with the notebook-level ``reg_token``
    tokenizer, and each token is passed through ``WordNetLemmatizer.lemmatize``.

    Args:
        text: Raw document string.

    Returns:
        list: The lemmatized tokens, in the order they were tokenized.
    """
    # Build the lemmatizer once per call rather than once per token.
    lemmatize = WordNetLemmatizer().lemmatize
    return [lemmatize(token) for token in reg_token.tokenize(text.lower())]

In [75]:
# A custom tokenizer replaces the built-in token pattern; token_pattern=None states
# that explicitly and avoids scikit-learn's "token_pattern will not be used" warning.
tfidf = TfidfVectorizer(tokenizer=tokenizer, token_pattern=None, ngram_range=(1, 3),
                        min_df=3, max_df=0.5, stop_words='english')

In [76]:
# Fit the vectorizer on the training documents only, then apply it to the test documents.
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)



In [77]:
# Naive Bayes with alpha=0.1 on the lemmatized n-gram TF-IDF features.
nb_clf = MultinomialNB(alpha=0.1).fit(X_train_tf, y_train)
print(
    nb_clf.score(X_train_tf, y_train),
    nb_clf.score(X_test_tf, y_test),
)

0.9626352015732547 0.7886178861788617


In [78]:
# Shape of the training matrix produced by the vectorizer above.
X_train_tf.shape

(2034, 12411)

In [94]:
# Same token rule as the regex tokenizer, but handled by the vectorizer itself.
# Raw string so that "\w" is not parsed as an (invalid) string escape sequence.
tfidf = TfidfVectorizer(token_pattern=r"[\w']{3,}", decode_error='ignore', lowercase=True,
                        stop_words='english', max_df=0.5, min_df=3, ngram_range=(1, 3))

In [95]:
# Fit on the training documents, then transform the test documents with the fitted vectorizer.
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)

In [96]:
# Naive Bayes with alpha=0.2; also prints the training matrix shape for reference.
nb_clf = MultinomialNB(alpha=0.2).fit(X_train_tf, y_train)
print(
    nb_clf.score(X_train_tf, y_train),
    nb_clf.score(X_test_tf, y_test),
    X_train_tf.shape,
)

0.95968534906588 0.78640059127864 (2034, 12453)


In [98]:
# Load the movie-review CSV and preview the first five rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like an index
# that was written to the file; index_col=0 would likely drop it — confirm.
df = pd.read_csv('./data/daum_movie_review.csv')
df.head(5)

Unnamed: 0,review,rating,date,title
0,돈 들인건 티가 나지만 보는 내내 하품만,1,2018.10.29,인피니티 워
1,몰입할수밖에 없다. 어렵게 생각할 필요없다. 내가 전투에 참여한듯 손에 땀이남.,10,2018.10.26,인피니티 워
2,이전 작품에 비해 더 화려하고 스케일도 커졌지만.... 전국 맛집의 음식들을 한데 ...,8,2018.10.24,인피니티 워
3,이 정도면 볼만하다고 할 수 있음!,8,2018.10.22,인피니티 워
4,재미있다,10,2018.10.20,인피니티 워


In [99]:
# Number of reviews per movie title.
df.title.value_counts()

신과함께      4947
택시운전사     2322
인피니티 워    2042
범죄도시      1939
곤지암       1547
라라랜드      1150
코코         778
Name: title, dtype: int64

In [101]:
# Hold out a test split: review text is the feature, movie title is the label;
# random_state=0 fixes the shuffle.
# NOTE(review): no stratify= is passed even though the title counts are uneven —
# consider stratify=df.title.
X_train, X_test, y_train, y_test = train_test_split(df.review, df.title, random_state=0)

In [103]:
okt = Okt()
# Take the sample from the full frame by label. X_train keeps df's index but holds
# only the training rows, so X_train[1] works only if label 1 landed in that split.
sample_review = df.review.loc[1]
print(okt.morphs(sample_review), okt.nouns(sample_review))

['몰입', '할수밖에', '없다', '.', '어렵게', '생각', '할', '필요없다', '.', '내', '가', '전투', '에', '참여', '한', '듯', '손', '에', '땀', '이남', '.'] ['몰입', '생각', '내', '전투', '참여', '듯', '손', '땀', '이남']


In [104]:
# Nouns extracted by Okt are the tokens. token_pattern=None makes it explicit that the
# built-in pattern is not used and avoids scikit-learn's unused-parameter warning.
tfidf = TfidfVectorizer(tokenizer=okt.nouns, token_pattern=None, min_df=3, max_df=0.5)

In [105]:
# Fit on the training reviews, then transform the test reviews with the fitted vectorizer.
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)



In [115]:
nb_clf = MultinomialNB(alpha=0.15)
nb_clf.fit(X_train_tf, y_train)
print(nb_clf.score(X_train_tf, y_train), nb_clf.score(X_test_tf, y_test), X_train_tf.shape)

0.7724350267137553 0.6922868006518197 (11043, 3302)


In [122]:
# Logistic regression on the current TF-IDF features, for comparison with naive Bayes.
lr_clf = LogisticRegression(C=5, max_iter=1000).fit(X_train_tf, y_train)
print(
    lr_clf.score(X_train_tf, y_train),
    lr_clf.score(X_test_tf, y_test),
    X_train_tf.shape,
)

0.8342841619125237 0.6925583921781641 (11043, 3302)


In [123]:
def twit_tokenizer(text, target_tags=('Noun', 'Verb', 'Adjective')):
    """Return the words of ``text`` whose part-of-speech tag is in ``target_tags``.

    Words and tags come from ``okt.pos(text, norm=True, stem=True)``.

    Args:
        text: Review string to analyse.
        target_tags: Tags to keep. Any container that supports ``in`` works; the
            default is a tuple so no mutable object is shared between calls.

    Returns:
        list: The selected words, in the order ``okt.pos`` produced them.
    """
    return [
        word
        for word, tag in okt.pos(text, norm=True, stem=True)
        if tag in target_tags
    ]

In [124]:
# Tokens come from twit_tokenizer. token_pattern=None makes it explicit that the
# built-in pattern is not used and avoids scikit-learn's unused-parameter warning.
tfidf = TfidfVectorizer(tokenizer=twit_tokenizer, token_pattern=None, min_df=3, max_df=0.5)

In [125]:
# Fit on the training reviews, then transform the test reviews with the fitted vectorizer.
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)



In [126]:
# Naive Bayes (alpha=0.15) on the features built from the filtered POS tokens.
nb_clf = MultinomialNB(alpha=0.15).fit(X_train_tf, y_train)
print(
    nb_clf.score(X_train_tf, y_train),
    nb_clf.score(X_test_tf, y_test),
    X_train_tf.shape,
)

0.8012315493978086 0.7167300380228137 (11043, 4254)


In [127]:
def twit_tokenizer2(text, target_tags=('Noun', 'Verb', 'Adjective')):
    """Return ``"word/tag"`` tokens for the words of ``text`` tagged in ``target_tags``.

    Words and tags come from ``okt.pos(text, norm=True, stem=True)``. Keeping the tag
    in the token means the same word with different tags yields different tokens.

    Args:
        text: Review string to analyse.
        target_tags: Tags to keep. Any container that supports ``in`` works; the
            default is a tuple so no mutable object is shared between calls.

    Returns:
        list: ``"word/tag"`` strings, in the order ``okt.pos`` produced them.
    """
    return [
        '/'.join([word, tag])
        for word, tag in okt.pos(text, norm=True, stem=True)
        if tag in target_tags
    ]

In [128]:
# Tokens come from twit_tokenizer2. token_pattern=None makes it explicit that the
# built-in pattern is not used and avoids scikit-learn's unused-parameter warning.
tfidf = TfidfVectorizer(tokenizer=twit_tokenizer2, token_pattern=None, min_df=3, max_df=0.5)

In [129]:
# Fit on the training reviews, then transform the test reviews with the fitted vectorizer.
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)



In [130]:
# Naive Bayes (alpha=0.15) on the features built from the "word/tag" tokens.
nb_clf = MultinomialNB(alpha=0.15).fit(X_train_tf, y_train)
print(
    nb_clf.score(X_train_tf, y_train),
    nb_clf.score(X_test_tf, y_test),
    X_train_tf.shape,
)

0.8016843249117088 0.7178164041281913 (11043, 4256)
