In [34]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, RegexpTokenizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from konlpy.tag import Okt
import numpy as np
import pandas as pd
import re

In [3]:
# Work with a four-topic subset of the 20 Newsgroups corpus.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

# Both splits are fetched with the same options: the loader is asked to remove
# headers, footers and quotes, and to keep only the categories listed above.
_fetch_options = dict(
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
)
newsgroups_train = fetch_20newsgroups(subset='train', **_fetch_options)
newsgroups_test = fetch_20newsgroups(subset='test', **_fetch_options)

In [4]:
# Summarize the download: train size, test size, category names, distinct labels.
# The separator reproduces the " \n\n " spacing between the printed values.
print(
    len(newsgroups_train.data),
    len(newsgroups_test.data),
    newsgroups_train.target_names,
    set(newsgroups_train.target),
    sep=' \n\n ',
)

2034 

 1353 

 ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc'] 

 {0, 1, 2, 3}


In [5]:
# Show the first training document followed by its integer label.
print(
    newsgroups_train.data[0],
    newsgroups_train.target[0],
    sep=' \n\n ',
)

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych 

 1


In [6]:
# Short aliases: X_* hold the documents, y_* hold the matching labels.
X_train = newsgroups_train.data
X_test = newsgroups_test.data
y_train = newsgroups_train.target
y_test = newsgroups_test.target

In [8]:
def my_tokenizer(doc):
    """Tokenize ``doc`` and lemmatize every token.

    The text is split with an NLTK ``RegexpTokenizer`` whose pattern keeps runs
    of two or more word characters / apostrophes; each resulting token is then
    passed through ``WordNetLemmatizer.lemmatize``.

    Args:
        doc: The text of one document.

    Returns:
        A list with one lemmatized token per regexp match, in document order.
    """
    word_splitter = RegexpTokenizer(r"[\w']{2,}")
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for piece in word_splitter.tokenize(doc):
        lemmas.append(lemmatizer.lemmatize(piece))
    return lemmas

In [9]:
# Bag-of-words counts built with the custom lemmatizing tokenizer.
# token_pattern=None: the built-in regex is not used once `tokenizer` is
# supplied, and leaving it at its default makes scikit-learn warn about the
# unused parameter on every fit.
cv = CountVectorizer(
    tokenizer=my_tokenizer,
    token_pattern=None,
    max_features=2000,
    min_df=5,
    max_df=0.5,
)
# Fit the vocabulary on the training documents only, then reuse it for test.
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)
print(X_train_cv.shape, '\n\n', X_test_cv.shape)



(2034, 2000) 

 (1353, 2000)


In [11]:
# Inspect vocabulary entries 51..99 next to their counts in the first
# training document.
_inspect = slice(51, 100)
_vocab_part = cv.get_feature_names_out()[_inspect]
_first_doc_counts = X_train_cv[0].toarray()[0, _inspect]
for word, count in zip(_vocab_part, _first_doc_counts):
    print(word, ':', count, end=', ')

400 : 0, 42 : 0, 45 : 0, 50 : 0, 500 : 0, 60 : 0, 600 : 0, 65 : 0, 70 : 0, 75 : 0, 80 : 0, 800 : 0, 90 : 0, 900 : 0, 91 : 0, 92 : 0, 93 : 0, 95 : 0, _the : 0, a : 0, ability : 0, able : 1, abortion : 0, about : 1, above : 0, absolute : 0, absolutely : 0, abstract : 0, ac : 0, acceleration : 0, accept : 0, acceptable : 0, accepted : 0, access : 0, according : 0, account : 0, accurate : 0, acronym : 0, across : 0, act : 0, action : 0, active : 0, activity : 0, actual : 0, actually : 0, ad : 0, adam : 0, add : 0, added : 0, 

In [12]:
# Multinomial naive Bayes on the raw count features.
nb_clf = MultinomialNB()
nb_clf.fit(X_train_cv, y_train)
# Cell output: (score on the training matrix, score on the test matrix).
nb_clf.score(X_train_cv, y_train), nb_clf.score(X_test_cv, y_test)

(0.831858407079646, 0.7405764966740577)

In [14]:
# Predict labels for the count-vectorized test set and print the category
# names of the first three predictions.
pred = nb_clf.predict(X_test_cv)
for i in range(3):
    predicted_name = newsgroups_test.target_names[pred[i]]
    print(predicted_name)

sci.space
comp.graphics
comp.graphics


In [15]:
# TF-IDF features built with the same custom tokenizer and vocabulary limits
# as the count vectorizer.
# token_pattern=None: the built-in regex is not used once `tokenizer` is
# supplied, and leaving it at its default makes scikit-learn warn about the
# unused parameter on every fit.
tfidf = TfidfVectorizer(
    tokenizer=my_tokenizer,
    token_pattern=None,
    max_features=2000,
    min_df=5,
    max_df=0.5,
)
# Fit on the training documents only, then apply the same transform to test.
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)
print(X_train_tf.shape, '\n\n', X_test_tf.shape)



(2034, 2000) 

 (1353, 2000)


In [20]:
# Refit naive Bayes, this time on the TF-IDF features with alpha=0.05.
# Note that this rebinds `nb_clf`, replacing the count-based model.
nb_clf = MultinomialNB(alpha=0.05)
nb_clf.fit(X_train_tf, y_train)
# Cell output: (score on the training matrix, score on the test matrix).
nb_clf.score(X_train_tf, y_train), nb_clf.score(X_test_tf, y_test)

(0.9090462143559489, 0.7590539541759054)

In [21]:
# `nb_clf` at this point is the model fitted on the TF-IDF matrix (X_train_tf),
# so it must be given the TF-IDF test matrix. Passing the raw-count matrix
# (X_test_cv) would feed it features on a different scale than it was trained on.
pred = nb_clf.predict(X_test_tf)
# Print the category names of the first three predictions.
for i in range(3):
    print(newsgroups_test.target_names[pred[i]])

sci.space
comp.graphics
comp.graphics


In [33]:
# Logistic regression on the TF-IDF features (C=2, up to 5000 iterations).
logreg = LogisticRegression(C=2, max_iter=5000)
logreg.fit(X_train_tf, y_train)
# Cell output: (score on the training matrix, score on the test matrix).
logreg.score(X_train_tf, y_train), logreg.score(X_test_tf, y_test)

(0.9532940019665683, 0.7487065779748706)

In [35]:
# Candidate settings: both penalties crossed with five log-spaced values of C
# between 10**-2 and 10**2.
param_grid = {
    'penalty': ('l1', 'l2'),
    'C': np.logspace(-2, 2, 5),
}
# 5-fold cross-validated search over the TF-IDF training features, using all
# available cores (n_jobs=-1).
grid = GridSearchCV(
    LogisticRegression(solver='liblinear', max_iter=5000),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train_tf, y_train)
# Cell output: best CV score, the refit best estimator, and its parameters.
grid.best_score_, grid.best_estimator_, grid.best_params_

(0.7954866196245507,
 LogisticRegression(C=10.0, max_iter=5000, solver='liblinear'),
 {'C': 10.0, 'penalty': 'l2'})

In [36]:
# Score the grid search object on the training and test TF-IDF matrices;
# the tuple is the cell output.
(
    grid.score(X_train_tf, y_train),
    grid.score(X_test_tf, y_test),
)

(0.9729596853490659, 0.7398373983739838)

In [49]:
def top_n_features(classifier, vectorizer, categories, n):
    """Print the ``n`` highest-coefficient features for each category.

    Args:
        classifier: Object exposing ``coef_``; row ``i`` is read for the
            ``i``-th entry of ``categories``.
        vectorizer: Object exposing ``get_feature_names_out()``; the returned
            names are indexed by coefficient position.
        categories: Iterable of category labels, one per ``coef_`` row used.
        n: Number of feature names to list per category.

    Returns:
        None. One line per category is written to stdout in the form
        ``"<category>, <feat1>, <feat2>, ..."``, ordered from the largest
        coefficient downwards.
    """
    vocabulary = np.asarray(vectorizer.get_feature_names_out())
    for row, label in enumerate(categories):
        # argsort is ascending, so reverse it and keep the leading n positions.
        descending = np.argsort(classifier.coef_[row])[::-1]
        strongest = descending[:n]
        listed = ", ".join(vocabulary[strongest])
        print(f'{label}, {listed}')

In [50]:
# List the five highest-coefficient TF-IDF terms per category for the
# logistic regression model.
top_n_features(
    classifier=logreg,
    vectorizer=tfidf,
    categories=newsgroups_train.target_names,
    n=5,
)

alt.atheism, atheist, atheism, religion, bobby, deletion
comp.graphics, graphic, image, file, computer, 3d
sci.space, space, orbit, nasa, launch, moon
talk.religion.misc, christian, order, jesus, objective, he
