# Document Classification using Naive Bayse Classifier

## 1. Preparing the 20 Newsgroups Data and Feature Extraction

http://scikit-learn.org/0.19/datasets/twenty_newsgroups.html

In [1]:
from sklearn.datasets import fetch_20newsgroups

# Create a list of categories to select from the 20 topics
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

# Fetch the training dataset
newsgroups_train = fetch_20newsgroups(subset='train',
# Remove parts of the email content that provide hints - classify purely based on content
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=categories)
# Fetch the validation dataset
newsgroups_test = fetch_20newsgroups(subset='test', 
                                     remove=('headers', 'footers', 'quotes'),
                                     categories=categories)

print('#Train set size:', len(newsgroups_train.data))
print('#Test set size:', len(newsgroups_test.data))
print('#Selected categories:', newsgroups_train.target_names)
print('#Train labels:', set(newsgroups_train.target))

#Train set size: 2034
#Test set size: 1353
#Selected categories: ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']
#Train labels: {0, 1, 2, 3}


In [2]:
print('#Train set text samples:', newsgroups_train.data[0])
print('#Train set label smaples:', newsgroups_train.target[0])
print('#Test set text samples:', newsgroups_test.data[0])
print('#Test set label smaples:', newsgroups_test.target[0])

#Train set text samples: Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych
#Train set label smaples: 1
#Test set text samples: TRry the SKywatch project in  Arizona.
#Test set label smaples: 2


In [3]:
X_train = newsgroups_train.data   # Training dataset documents
y_train = newsgroups_train.target  # Training dataset labels

X_test = newsgroups_test.data      # Validation dataset documents
y_test = newsgroups_test.target     # Validation dataset labels

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=2000, min_df=5, max_df=0.5)

X_train_cv = cv.fit_transform(X_train)  # Transform the training set
print('Train set dimension:', X_train_cv.shape) 
X_test_cv = cv.transform(X_test)  # Transform the test set
print('Test set dimension:', X_test_cv.shape)

Train set dimension: (2034, 2000)
Test set dimension: (1353, 2000)


In [4]:
for word, count in zip(cv.get_feature_names_out()[:100], X_train_cv[0].toarray()[0, :100]):
    print(word, ':', count, end=', ')

00 : 0, 000 : 0, 01 : 0, 04 : 0, 05 : 0, 10 : 0, 100 : 0, 1000 : 0, 11 : 0, 12 : 0, 128 : 0, 129 : 0, 13 : 0, 130 : 0, 14 : 0, 15 : 0, 16 : 0, 17 : 0, 18 : 0, 19 : 0, 1987 : 0, 1988 : 0, 1989 : 0, 1990 : 0, 1991 : 0, 1992 : 0, 1993 : 0, 20 : 0, 200 : 0, 202 : 0, 21 : 0, 22 : 0, 23 : 0, 24 : 0, 25 : 0, 256 : 0, 26 : 0, 27 : 0, 28 : 0, 2d : 0, 30 : 0, 300 : 0, 31 : 0, 32 : 0, 33 : 0, 34 : 0, 35 : 0, 39 : 0, 3d : 0, 40 : 0, 400 : 0, 42 : 0, 45 : 0, 50 : 0, 500 : 0, 60 : 0, 600 : 0, 65 : 0, 70 : 0, 75 : 0, 80 : 0, 800 : 0, 90 : 0, 900 : 0, 91 : 0, 92 : 0, 93 : 0, 95 : 0, _the : 0, ability : 0, able : 1, abortion : 0, about : 1, above : 0, absolute : 0, absolutely : 0, ac : 0, accept : 0, acceptable : 0, accepted : 0, access : 0, according : 0, account : 0, accurate : 0, across : 0, act : 0, action : 0, actions : 0, active : 0, activities : 0, activity : 0, acts : 0, actual : 0, actually : 0, ad : 0, add : 0, added : 0, addition : 0, additional : 0, address : 0, 

## 2. Document Classification using k-Nearest Neighbors Classifier

http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html

In [8]:
from sklearn.neighbors import KNeighborsClassifier  # Import KNN classifier
KNN_clf = KNeighborsClassifier(n_neighbors=2)  # You can choose the number of neighbors

KNN_clf.fit(X_train_cv, y_train)  # Train the classifier using the CountVectorizer training set

print('Train set score: {:.3f}'.format(KNN_clf.score(X_train_cv, y_train)))  # Check prediction accuracy on the training set
print('Test set score: {:.3f}'.format(KNN_clf.score(X_test_cv, y_test)))  # Check prediction accuracy on the test set

Train set score: 0.739
Test set score: 0.387


In [6]:
print('#First document and label in test data:', X_test[0], y_test[0])
print('#Second document and label in test data:', X_test[1], y_test[1])

pred = KNN_clf.predict(X_test_cv[:2])  # Predictions for the test set using CountVectorizer

print('#Predicted labels:', pred)
print('#Predicted categories:', newsgroups_train.target_names[pred[0]], newsgroups_train.target_names[pred[1]])

#First document and label in test data: TRry the SKywatch project in  Arizona. 2
#Second document and label in test data: The Vatican library recently made a tour of the US.
 Can anyone help me in finding a FTP site where this collection is 
 available. 1
#Predicted labels: [1 3]
#Predicted categories: comp.graphics talk.religion.misc


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Use the same arguments as CountVectorizer
tfidf = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.5) 
X_train_tfidf = tfidf.fit_transform(X_train)  # Transform the training set
X_test_tfidf = tfidf.transform(X_test)  # Transform the test set

KNN_clf.fit(X_train_tfidf, y_train)  # Train the classifier using the TF-IDF training set
print('Train set score: {:.3f}'.format(KNN_clf.score(X_train_tfidf, y_train)))  # Check prediction accuracy on the training set
print('Test set score: {:.3f}'.format(KNN_clf.score(X_test_tfidf, y_test)))  # Check prediction accuracy on the test set

Train set score: 0.830
Test set score: 0.336


In [11]:
from sklearn.ensemble import RandomForestClassifier

# Random Forest Classifier로 모델 재학습
rf_clf = RandomForestClassifier()
rf_clf.fit(tfidf.transform(newsgroups_train.data), newsgroups_train.target)

def top10_features(classifier, vectorizer, categories):
    feature_names = np.asarray(vectorizer.get_feature_names_out())
    for i, category in enumerate(categories):
        # Feature importance를 기반으로 정렬
        top10 = np.argsort(classifier.feature_importances_)[-10:]  # 상위 10개 특성
        # Print the category and the top 10 influential features
        print("%s: %s" % (category, ", ".join(feature_names[top10])))

top10_features(rf_clf, tfidf, newsgroups_train.target_names)

alt.atheism: image, orbit, 3d, file, thanks, you, nasa, god, graphics, space
comp.graphics: image, orbit, 3d, file, thanks, you, nasa, god, graphics, space
sci.space: image, orbit, 3d, file, thanks, you, nasa, god, graphics, space
talk.religion.misc: image, orbit, 3d, file, thanks, you, nasa, god, graphics, space
