In [20]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to four of the twenty newsgroup topics.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

# Headers, footers and quoted replies are stripped so that classification
# has to rely on the message body alone rather than on metadata hints.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
)

# The held-out evaluation split is fetched with exactly the same filtering.
newsgroups_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
)

print(f'#Train set size: {len(newsgroups_train.data)}')
print(f'#Test set size: {len(newsgroups_test.data)}')
print(f'#Selected categories: {newsgroups_train.target_names}')
print(f'#Train labels: {set(newsgroups_train.target)}')

#Train set size: 2034
#Test set size: 1353
#Selected categories: ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']
#Train labels: {0, 1, 2, 3}


In [21]:
# Show the first document and its integer label from each split.
print(f'#Train set text samples: {newsgroups_train.data[0]}')
print(f'#Train set label samples: {newsgroups_train.target[0]}')
print(f'#Test set text samples: {newsgroups_test.data[0]}')
print(f'#Test set label samples: {newsgroups_test.target[0]}')

#Train set text samples: Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych
#Train set label samples: 1
#Test set text samples: TRry the SKywatch project in  Arizona.
#Test set label samples: 2


In [22]:
# Short aliases for the raw documents (X) and their integer labels (y).
X_train, y_train = newsgroups_train.data, newsgroups_train.target  # training split
X_test, y_test = newsgroups_test.data, newsgroups_test.target  # evaluation split

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: at most 2000 terms, ignoring terms that occur in
# fewer than 5 documents or in more than half of the documents.
cv = CountVectorizer(max_features=2000, min_df=5, max_df=0.5)

# The vocabulary is learned from the training documents only; the test
# documents are merely projected onto that vocabulary.
X_train_cv = cv.fit_transform(X_train)
print(f'Train set dimension: {X_train_cv.shape}')
X_test_cv = cv.transform(X_test)
print(f'Test set dimension: {X_test_cv.shape}')

Train set dimension: (2034, 2000)
Test set dimension: (1353, 2000)


In [23]:
# Print the first 100 vocabulary terms next to their counts in the first
# training document, all on one line.
for term, freq in zip(
    cv.get_feature_names_out()[:100], X_train_cv[0].toarray()[0, :100]
):
  print(f'{term} : {freq}', end=', ')

00 : 0, 000 : 0, 01 : 0, 04 : 0, 05 : 0, 10 : 0, 100 : 0, 1000 : 0, 11 : 0, 12 : 0, 128 : 0, 129 : 0, 13 : 0, 130 : 0, 14 : 0, 15 : 0, 16 : 0, 17 : 0, 18 : 0, 19 : 0, 1987 : 0, 1988 : 0, 1989 : 0, 1990 : 0, 1991 : 0, 1992 : 0, 1993 : 0, 20 : 0, 200 : 0, 202 : 0, 21 : 0, 22 : 0, 23 : 0, 24 : 0, 25 : 0, 256 : 0, 26 : 0, 27 : 0, 28 : 0, 2d : 0, 30 : 0, 300 : 0, 31 : 0, 32 : 0, 33 : 0, 34 : 0, 35 : 0, 39 : 0, 3d : 0, 40 : 0, 400 : 0, 42 : 0, 45 : 0, 50 : 0, 500 : 0, 60 : 0, 600 : 0, 65 : 0, 70 : 0, 75 : 0, 80 : 0, 800 : 0, 90 : 0, 900 : 0, 91 : 0, 92 : 0, 93 : 0, 95 : 0, _the : 0, ability : 0, able : 1, abortion : 0, about : 1, above : 0, absolute : 0, absolutely : 0, ac : 0, accept : 0, acceptable : 0, accepted : 0, access : 0, according : 0, account : 0, accurate : 0, across : 0, act : 0, action : 0, actions : 0, active : 0, activities : 0, activity : 0, acts : 0, actual : 0, actually : 0, ad : 0, add : 0, added : 0, addition : 0, additional : 0, address : 0, 

In [24]:
from sklearn.naive_bayes import MultinomialNB

# Create a multinomial Naive Bayes classifier and train it on the count
# vectors (fit returns the estimator itself, so the call can be chained).
NB_clf = MultinomialNB().fit(X_train_cv, y_train)

# Accuracy on the data the model was trained on ...
print(f'Train set score: {NB_clf.score(X_train_cv, y_train):.3f}')
# ... and on the held-out test split.
print(f'Test set score: {NB_clf.score(X_test_cv, y_test):.3f}')

Train set score: 0.824
Test set score: 0.732


In [25]:
# Show the first two test documents with their true labels.
print(f'#First document and label in test data: {X_test[0]} {y_test[0]}')
print(f'#Second document and label in test data: {X_test[1]} {y_test[1]}')

# Classify those same two documents from their count vectors.
pred = NB_clf.predict(X_test_cv[:2])

print(f'#Predicted labels: {pred}')
# Map each predicted integer label back to its newsgroup name.
print(
    '#Predicted categories:',
    *(newsgroups_train.target_names[label] for label in pred)
)

#First document and label in test data: TRry the SKywatch project in  Arizona. 2
#Second document and label in test data: The Vatican library recently made a tour of the US.
 Can anyone help me in finding a FTP site where this collection is 
 available. 1
#Predicted labels: [2 1]
#Predicted categories: sci.space comp.graphics


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weighting with the same vocabulary limits used for CountVectorizer.
tfidf = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.5)
X_train_tfidf = tfidf.fit_transform(X_train)  # learn vocabulary/IDF on train
X_test_tfidf = tfidf.transform(X_test)  # reuse them for test

# Re-train the existing Naive Bayes classifier on the TF-IDF features.
NB_clf.fit(X_train_tfidf, y_train)

# Accuracy on the training split.
print(f'Train set score: {NB_clf.score(X_train_tfidf, y_train):.3f}')

# Accuracy on the test split.
print(f'Test set score: {NB_clf.score(X_test_tfidf, y_test):.3f}')

Train set score: 0.862
Test set score: 0.741


In [27]:
from pandas.core.internals.managers import create_block_manager_from_arrays
import numpy as np
import warnings
# Ignore every FutureWarning raised from this point on.
# NOTE(review): presumably added to hide a library deprecation notice triggered
# by the feature-ranking code in this cell — confirm it is still needed.
warnings.simplefilter(action='ignore', category=FutureWarning)

def top10_features(classifier, vectorizer, categories):
  """Print the ten highest-weighted features for each category.

  Args:
    classifier: A fitted estimator exposing per-class feature weights, either
      as ``coef_`` (linear models) or as ``feature_log_prob_`` (Naive Bayes).
      Row ``i`` of that array is used for ``categories[i]``.
    vectorizer: A fitted vectorizer providing ``get_feature_names_out()``.
    categories: Category names, in the same order as the weight rows.

  Returns:
    None. One line per category is written to standard output in the form
    ``"<category>: <feature>, <feature>, ..."``.
  """
  feature_names = np.asarray(vectorizer.get_feature_names_out())
  # MultinomialNB only offered `coef_` as a deprecated alias of
  # `feature_log_prob_`, and recent scikit-learn releases dropped the alias,
  # so fall back to the log-probabilities when `coef_` is unavailable.
  weights = getattr(classifier, 'coef_', None)
  if weights is None:
    weights = classifier.feature_log_prob_
  for i, category in enumerate(categories):
    # Negate the weights so an ascending argsort lists the largest first,
    # then keep the first ten indices.
    top10 = np.argsort(-weights[i])[:10]
    # Print the category followed by its ten most influential features.
    print("%s: %s" % (category, ", ".join(feature_names[top10])))

# List the ten highest-weighted TF-IDF terms per newsgroup for the Naive Bayes
# model (NB_clf was last fitted on the TF-IDF features).
top10_features(NB_clf, tfidf, newsgroups_train.target_names)

alt.atheism: you, not, are, be, this, have, as, what, they, if
comp.graphics: you, on, graphics, this, have, any, can, or, with, thanks
sci.space: space, on, you, be, was, this, as, they, have, are
talk.religion.misc: you, not, he, are, as, this, be, god, was, they


In [28]:
# sklearn이 제공하는 logistic regression을 사용
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the TF-IDF features so its accuracy can be
# compared with Naive Bayes (fit returns the estimator, hence the chaining).
LR_clf = LogisticRegression().fit(X_train_tfidf, y_train)

# Accuracy on the training split.
print(f'Train set score: {LR_clf.score(X_train_tfidf, y_train):.3f}')
# Accuracy on the test split.
print(f'Test set score: {LR_clf.score(X_test_tfidf, y_test):.3f}')

Train set score: 0.930
Test set score: 0.734


In [29]:
from sklearn.linear_model import RidgeClassifier

# Ridge classifier with its default regularisation strength, trained on the
# TF-IDF features.
ridge_clf = RidgeClassifier().fit(X_train_tfidf, y_train)

print(f'Train set score: {ridge_clf.score(X_train_tfidf, y_train):.3f}')
print(f'Test set score: {ridge_clf.score(X_test_tfidf, y_test):.3f}')

Train set score: 0.960
Test set score: 0.735


In [30]:
import numpy as np
from sklearn.model_selection import train_test_split

# Hold out 20% of the (already vectorised) training data as a validation set
# for choosing the ridge regularisation strength.
X_train_ridge, X_val_ridge, y_train_ridge, y_val_ridge = train_test_split(
    X_train_tfidf, y_train, test_size=0.2, random_state=42)

max_alpha, max_score = 0, 0
# Candidate alphas run from 0.1 up to 9.9 in steps of 0.1; np.arange excludes
# the stop value, so 10 itself is never tried.
for alpha in np.arange(0.1, 10, 0.1):
  ridge_clf = RidgeClassifier(alpha=alpha).fit(X_train_ridge, y_train_ridge)
  # Accuracy on the validation split for this alpha.
  score = ridge_clf.score(X_val_ridge, y_val_ridge)
  # Strict comparison: on ties the earliest (smallest) alpha is kept.
  if score > max_score:
    max_score, max_alpha = score, alpha

print(f'Max alpha {max_alpha:.3f} at max validation score {max_score:.3f}')

Max alpha 1.600 at max validation score 0.826


In [31]:
# Re-train on the full training split with alpha=1.6, the value the
# validation search in the previous cell reported as best.
ridge_clf = RidgeClassifier(alpha=1.6).fit(X_train_tfidf, y_train)

print(f'Train set score: {ridge_clf.score(X_train_tfidf, y_train):.3f}')
print(f'Test set score: {ridge_clf.score(X_test_tfidf, y_test):.3f}')

Train set score: 0.948
Test set score: 0.739


In [32]:
# List the ten highest-coefficient TF-IDF terms per newsgroup for the ridge
# classifier fitted in the previous cell.
top10_features(ridge_clf, tfidf, newsgroups_train.target_names)

alt.atheism: bobby, religion, atheism, atheists, motto, punishment, islam, deletion, islamic, satan
comp.graphics: graphics, computer, 3d, file, image, hi, 42, using, screen, looking
sci.space: space, orbit, nasa, spacecraft, moon, sci, launch, flight, funding, idea
talk.religion.misc: christian, christians, fbi, blood, order, jesus, objective, children, christ, hudson


In [33]:
# An L1 ("lasso") penalty is obtained from the same LogisticRegression class
# purely through its constructor arguments.
lasso_clf = LogisticRegression(penalty='l1', solver='liblinear', C=1)
lasso_clf.fit(X_train_tfidf, y_train)  # train on the TF-IDF features

print(f'#Train set score: {lasso_clf.score(X_train_tfidf, y_train):.3f}')
print(f'#Test set score: {lasso_clf.score(X_test_tfidf, y_test):.3f}')

# Report how many coefficients are non-zero, next to the number of features.
print(
    f'#Used features count: {np.sum(lasso_clf.coef_ != 0)}'
    f' out of {X_train_tfidf.shape[1]}'
)

#Train set score: 0.819
#Test set score: 0.724
#Used features count: 437 out of 2000


In [34]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Three tree-based classifiers, all seeded identically for reproducibility.
tree = DecisionTreeClassifier(random_state=7)
forest = RandomForestClassifier(random_state=7)
gb = GradientBoostingClassifier(random_state=7)

# Fit each model on the TF-IDF features in turn and report its train/test
# accuracy immediately after fitting.
for label, model in (
    ('Decision Tree', tree),
    ('Random Forest', forest),
    ('Gradient Boosting', gb),
):
  model.fit(X_train_tfidf, y_train)
  print(f'#{label} train set score: {model.score(X_train_tfidf, y_train):.3f}')
  print(f'#{label} test set score: {model.score(X_test_tfidf, y_test):.3f}')

#Decision Tree train set score: 0.977
#Decision Tree test set score: 0.536
#Random Forest train set score: 0.977
#Random Forest test set score: 0.685
#Gradient Boosting train set score: 0.933
#Gradient Boosting test set score: 0.696


In [35]:
# Pair every vocabulary term with its gradient-boosting importance, then
# order the pairs from most to least important.
sorted_feature_importances = list(
    zip(tfidf.get_feature_names_out(), gb.feature_importances_)
)
sorted_feature_importances.sort(key=lambda pair: pair[1], reverse=True)

# Print the forty most important terms on a single line.
for feature, value in sorted_feature_importances[:40]:
  print(f'{feature}: {value:.3f}', end=', ')

space: 0.126, graphics: 0.080, atheism: 0.024, thanks: 0.023, file: 0.021, orbit: 0.020, jesus: 0.018, god: 0.018, hi: 0.017, nasa: 0.015, image: 0.015, files: 0.014, christ: 0.010, moon: 0.010, bobby: 0.010, launch: 0.010, christian: 0.010, looking: 0.010, atheists: 0.009, christians: 0.009, fbi: 0.009, 3d: 0.008, you: 0.008, not: 0.008, islamic: 0.007, religion: 0.007, spacecraft: 0.007, flight: 0.007, computer: 0.007, islam: 0.007, ftp: 0.006, color: 0.006, software: 0.005, atheist: 0.005, card: 0.005, people: 0.005, koresh: 0.005, his: 0.005, kent: 0.004, sphere: 0.004, 

In [36]:
import nltk
# Fetch NLTK's stop-word corpus; the cells below load its English list via
# nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [37]:
from nltk.corpus import stopwords
# English stop words as a plain list. NOTE(review): nothing in the visible
# cells reads this name; the tokenizer below uses `english_stops` instead.
cachedStopWords = stopwords.words("english")

from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import re

# Tokens are runs of three or more word characters or apostrophes. The pattern
# is a raw string so that `\w` is not treated as a (invalid) string escape.
RegTok = RegexpTokenizer(r"[\w']{3,}")
english_stops = set(stopwords.words('english'))  # English stop words, as a set for fast lookup
# One stemmer shared by every call instead of a new PorterStemmer per token.
_porter_stemmer = PorterStemmer()

def tokenizer(text):
  """Turn raw text into a list of stemmed, stop-word-free tokens.

  Args:
    text: The document to tokenize, as a string.

  Returns:
    A list of Porter-stemmed tokens, in document order. The text is
    lower-cased first and tokens found in ``english_stops`` are dropped.
  """
  tokens = RegTok.tokenize(text.lower())
  # No explicit length filter is needed: the regular expression already
  # guarantees that every token has at least three characters.
  return [
      _porter_stemmer.stem(token)
      for token in tokens
      if token not in english_stops
  ]

# Same vocabulary limits as before, but tokenising with the custom tokenizer
# (regex tokens, stop-word removal, Porter stemming).
tfidf = TfidfVectorizer(tokenizer=tokenizer, max_features=2000, min_df=5, max_df=0.5)

X_train_tfidf = tfidf.fit_transform(X_train)  # learn vocabulary/IDF on train
X_test_tfidf = tfidf.transform(X_test)  # reuse them for test

# Train a fresh logistic regression on the new TF-IDF features.
LR_clf = LogisticRegression().fit(X_train_tfidf, y_train)

# Accuracy on the training split.
print(f'#Train set score: {LR_clf.score(X_train_tfidf, y_train):.3f}')
# Accuracy on the test split.
print(f'#Test set score: {LR_clf.score(X_test_tfidf, y_test):.3f}')
# Number of coefficients per class, i.e. the number of features used.
LR_clf.coef_.shape[1]

#Train set score: 0.930
#Test set score: 0.751


2000

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

# No vocabulary limits this time: every token produced by the custom
# tokenizer becomes a feature.
tfidf = TfidfVectorizer(tokenizer=tokenizer)

X_train_tfidf = tfidf.fit_transform(X_train)  # learn vocabulary/IDF on train
# Report how many features were actually produced.
print(f'#Train set dimension: {X_train_tfidf.shape}')

X_test_tfidf = tfidf.transform(X_test)  # reuse them for test
print(f'#Test set dimension: {X_test_tfidf.shape}')

# Ridge classifier with alpha=2.4.
# NOTE(review): the search that produced 2.4 is not shown in the visible cells.
ridge_clf = RidgeClassifier(alpha=2.4).fit(X_train_tfidf, y_train)
print(f'#Train set score: {ridge_clf.score(X_train_tfidf, y_train):.3f}')
print(f'#Test set score: {ridge_clf.score(X_test_tfidf, y_test):.3f}')

# Multinomial Naive Bayes with a small smoothing value, on the same features.
NB_clf = MultinomialNB(alpha=0.01).fit(X_train_tfidf, y_train)

# Naive Bayes accuracy on the training split.
print(f'#Train set score: {NB_clf.score(X_train_tfidf, y_train):.3f}')
# Naive Bayes accuracy on the test split.
print(f'#Test set score: {NB_clf.score(X_test_tfidf, y_test):.3f}')

#Train set dimension: (2034, 20085)
#Test set dimension: (1353, 20085)
#Train set score: 0.968
#Test set score: 0.768
#Train set score: 0.971
#Test set score: 0.793


In [39]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the English stop-word list once and hand that same list to the
# vectorizer.
cachedStopWords = stopwords.words("english")
tfidf = TfidfVectorizer(
    token_pattern="[a-zA-Z']{3,}",  # tokens: 3+ letters or apostrophes
    decode_error='ignore',
    lowercase=True,
    stop_words=cachedStopWords,
    max_df=0.5,
    min_df=2,
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(X_train_tfidf.shape)

(2034, 11483)


In [40]:
from sklearn.linear_model import RidgeClassifier

# Default-strength ridge classifier trained on the current TF-IDF features.
ridge_clf = RidgeClassifier().fit(X_train_tfidf, y_train)
print(f'Train set score: {ridge_clf.score(X_train_tfidf, y_train):.3f}')
print(f'Test set score: {ridge_clf.score(X_test_tfidf, y_test):.3f}')

Train set score: 0.976
Test set score: 0.766


In [41]:
# Same vectorizer settings as the unigram version, extended to also emit
# two-word sequences (bigrams).
tfidf = TfidfVectorizer(
    token_pattern="[a-zA-Z']{3,}",
    decode_error='ignore',
    lowercase=True,
    stop_words=stopwords.words('english'),
    ngram_range=(1, 2),  # unigrams and bigrams
    max_df=0.5,
    min_df=2,
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(X_train_tfidf.shape)

(2034, 26550)


In [42]:
# Keep only the features made of at least two whitespace-separated words.
bigram_features = [
    name for name in tfidf.get_feature_names_out() if len(name.split()) >= 2
]
print(f'bi-gram samples: {bigram_features[:10]}')

# Re-fit the existing ridge classifier on the unigram+bigram features.
ridge_clf.fit(X_train_tfidf, y_train)
print(f'Train set score: {ridge_clf.score(X_train_tfidf, y_train):.3f}')
print(f'Test set score: {ridge_clf.score(X_test_tfidf, y_test):.3f}')

bi-gram samples: ["'cause can't", "'em better", "'expected errors'", "'karla' next", "'nodis' password", "'official doctrine", "'ok see", "'sci astro'", "'what's moonbase", 'aas american']
Train set score: 0.976
Test set score: 0.773


In [45]:
# Same vectorizer settings again, now emitting unigrams, bigrams and trigrams.
tfidf = TfidfVectorizer(
    token_pattern="[a-zA-Z']{3,}",
    decode_error='ignore',
    lowercase=True,
    stop_words=stopwords.words('english'),
    ngram_range=(1, 3),  # unigrams through trigrams
    max_df=0.5,
    min_df=2,
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)
print(X_train_tfidf.shape)

# Keep only the features made of at least three whitespace-separated words.
trigram_features = [
    name for name in tfidf.get_feature_names_out() if len(name.split()) >= 3
]
print(f'tri-gram samples: {trigram_features[:10]}')

# Re-fit the existing ridge classifier on the 1- to 3-gram features.
ridge_clf.fit(X_train_tfidf, y_train)
print(f'Train set score: {ridge_clf.score(X_train_tfidf, y_train):.3f}')
print(f'Test set score: {ridge_clf.score(X_test_tfidf, y_test):.3f}')

(2034, 32943)
Train set score: 0.976
Test set score: 0.775
