In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer



# 備註--------------------------------------------------

In [2]:
# Note: the two-step approach (CountVectorizer followed by TfidfTransformer)
#
#     vectorizer = CountVectorizer()
#     transformer = TfidfTransformer()
#     tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
#
# is equivalent to the single-step TfidfVectorizer:
#
#     transformer = TfidfVectorizer()
#     tfidf2 = transformer.fit_transform(corpus)

# ---------------------------------------------------------------

In [3]:
# Directory layout of the "bydate" 20 newsgroups data (presumably as unpacked
# on disk -- confirm against the local scikit-learn data directory):
#|--20news-bydate
#            |--20news-bydate-train
#            |--20news-bydate-test

# Load the training and the test subsets as two separate objects.
twenty_train = fetch_20newsgroups(subset='train')
twenty_test = fetch_20newsgroups(subset='test')

In [4]:
from pprint import pprint
# The 20 newsgroups dataset: about 18,000 newsgroup posts on 20 topics, split
# into two subsets, one for training and one for testing.
# Pretty-print the category (newsgroup) names of the training subset.
pprint(list(twenty_train.target_names))

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [5]:
# Sizes, in order: category names, training posts, training file names, test posts.
tuple(map(len, (twenty_train.target_names,
                twenty_train.data,
                twenty_train.filenames,
                twenty_test.data)))

(20, 11314, 11314, 7532)

In [6]:
# Show only the first three lines of the first training post.
print("\n".join(twenty_train.data[0].split("\n")[:3]))


From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu


In [7]:
# target[0] is the integer label of the first training post; use it as an
# index into target_names to get the readable category name.
print(twenty_train.target_names[twenty_train.target[0]])



rec.autos


In [8]:
# Integer category labels of the first ten training posts.
twenty_train.target[:10]


array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

# 計算詞頻

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training posts and count term occurrences.
# English stop words are excluded and undecodable bytes are ignored.
count_vect = CountVectorizer(stop_words="english",decode_error='ignore')
X_train_counts = count_vect.fit_transform(twenty_train.data)
# One row per training post, one column per vocabulary term.
X_train_counts.shape

(11314, 129796)

# TF-IDF

In [10]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw term counts with TF-IDF. The transformer is fitted here on
# the training counts and reused later for new documents.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Same shape as the count matrix: only the values change.
X_train_tfidf.shape

(11314, 129796)

# 貝氏分類器

In [11]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the TF-IDF features, using the
# integer category labels of the training posts as targets.
clf = MultinomialNB().fit(X_train_tfidf,twenty_train.target)

In [12]:
# Predict the category of new, unseen documents.
docs_new = ['God is love','OpenGL on the GPU is fast']
# Use transform (not fit_transform) so the vocabulary and TF-IDF weights
# fitted on the training data are reused for the new documents.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

# Map every predicted integer label back to its newsgroup name.
for doc,category in zip(docs_new,predicted):
    print('%r => %s' % (doc,twenty_train.target_names[category]))


'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


# 分類效果

In [13]:
from sklearn.pipeline import Pipeline

# Bundle vectorising, TF-IDF weighting and the Naive Bayes classifier into a
# single estimator, then train it directly on the raw training posts.
# fit() returns the pipeline itself, so text_clf is the fitted pipeline.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words="english", decode_error='ignore')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]).fit(twenty_train.data, twenty_train.target)

In [14]:
from sklearn.metrics import accuracy_score

# Score the Naive Bayes pipeline on the held-out test posts.
docs_test = twenty_test.data
y_pred = text_clf.predict(docs_test)

print('Accuracy: %.2f' % accuracy_score(y_true=twenty_test.target,
                                        y_pred=y_pred))

Accuracy: 0.82


# 用svm分類

In [15]:
from sklearn.linear_model import SGDClassifier

# Same vectorise -> TF-IDF front end as the Naive Bayes pipeline, but with a
# linear classifier trained by stochastic gradient descent (hinge loss with an
# L2 penalty).
#
# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21, so passing
# it raises a TypeError on current versions. `max_iter=5` is its replacement;
# `tol=None` switches off the tolerance-based early stopping so that exactly
# five passes over the data are made, as `n_iter=5` used to do.
text_clf_2 = Pipeline([('vect', CountVectorizer(stop_words='english', decode_error='ignore')),
                       ('tfidf', TfidfTransformer()),
                       ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                             alpha=1e-3, max_iter=5, tol=None,
                                             random_state=42)),
                       ])

# Assign the return value to `_` so the fitted pipeline is not echoed.
_ = text_clf_2.fit(twenty_train.data, twenty_train.target)
# NOTE: this overwrites the Naive Bayes predictions held in `y_pred`; the
# report and confusion-matrix cells that follow read this variable.
y_pred = text_clf_2.predict(docs_test)
print('Accuracy: %.2f' % accuracy_score(twenty_test.target, y_pred))


Accuracy: 0.82


In [16]:
from sklearn import metrics
# Per-category precision / recall / F1 for the predictions currently held in
# `y_pred`; target_names labels the rows with the newsgroup names.
print(metrics.classification_report(twenty_test.target,y_pred,
                                   target_names = twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.72      0.71      0.71       319
           comp.graphics       0.79      0.70      0.74       389
 comp.os.ms-windows.misc       0.73      0.77      0.75       394
comp.sys.ibm.pc.hardware       0.71      0.68      0.69       392
   comp.sys.mac.hardware       0.82      0.82      0.82       385
          comp.windows.x       0.84      0.77      0.80       395
            misc.forsale       0.82      0.87      0.85       390
               rec.autos       0.91      0.89      0.90       396
         rec.motorcycles       0.92      0.97      0.94       398
      rec.sport.baseball       0.90      0.91      0.90       397
        rec.sport.hockey       0.86      0.98      0.92       399
               sci.crypt       0.85      0.96      0.90       396
         sci.electronics       0.81      0.62      0.70       393
                 sci.med       0.90      0.87      0.88       396
         

In [17]:
# Confusion matrix for the predictions currently held in `y_pred`
# (true labels are passed first, predicted labels second).
metrics.confusion_matrix(twenty_test.target,y_pred)


array([[225,   1,   0,   1,   0,   1,   2,   0,   2,   3,   0,   2,   1,
          8,   6,  47,   2,   6,   1,  11],
       [  2, 273,  20,   8,   9,  28,   3,   1,   4,   7,   3,   9,   4,
          1,   9,   2,   2,   3,   0,   1],
       [  0,  10, 304,  24,  11,  11,   1,   2,   1,   5,   3,   8,   2,
          1,   7,   1,   0,   1,   0,   2],
       [  3,   8,  32, 265,  19,   4,  17,   2,   3,   3,   2,   3,  20,
          1,   5,   0,   1,   2,   1,   1],
       [  1,   4,   8,  26, 315,   2,  10,   0,   1,   2,   3,   1,   6,
          1,   1,   0,   2,   0,   2,   0],
       [  1,  29,  41,   0,   3, 303,   2,   0,   1,   1,   1,   2,   1,
          1,   7,   1,   1,   0,   0,   0],
       [  0,   3,   0,  18,   6,   0, 340,   8,   1,   2,   3,   1,   3,
          2,   2,   0,   1,   0,   0,   0],
       [  1,   1,   1,   2,   1,   0,  10, 354,   7,   1,   0,   0,   9,
          1,   3,   0,   4,   0,   1,   0],
       [  0,   0,   0,   1,   0,   0,   4,   5, 385,   1,   0,  

# LDA

In [1]:
from __future__ import print_function

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

# Number of posts taken from the start of the (shuffled) dataset.
n_samples = 2000
# Upper bound on the vocabulary size kept by the CountVectorizer.
n_features = 1000
# Number of topics the LDA model is asked to find.
n_topics = 10
# Number of highest-weighted words printed for each topic.
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic of a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterating ``components_`` yields one weight vector per topic; each
        vector must support ``argsort()``.
    feature_names : sequence of str
        Term for each column index of the weight vectors.
    n_top_words : int
        How many terms to print per topic, heaviest first.

    A ``Topic #<index>:`` header and one space-separated line of terms are
    printed per topic, followed by a single blank line after the last topic.
    """
    for topic_no, weights in enumerate(model.components_):
        # argsort() is ascending, so walk it backwards from the end to get the
        # n_top_words largest weights in descending order.
        ranked_columns = weights.argsort()[:-n_top_words - 1:-1]
        top_terms = [feature_names[column] for column in ranked_columns]
        print("Topic #%d:" % topic_no)
        print(" ".join(top_terms))
    print()


# Load the 20 newsgroups dataset and vectorize it. We use a few heuristics
# to filter out useless terms early on: the posts are stripped of headers,
# footers and quoted replies, and common English words, words occurring in
# only one document or in at least 95% of the documents are removed.

print("Loading dataset...")
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
# Only the first n_samples posts of the shuffled dataset are used.
data_samples = dataset.data[:n_samples]

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(data_samples)

print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..."
      % (n_samples, n_features))
# `n_topics` was renamed to `n_components` in scikit-learn 0.19 and the old
# keyword was removed in 0.21, so the old spelling raises a TypeError today.
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

lda.fit(tf)

print("\nTopics in LDA model:")
# `get_feature_names()` was removed in scikit-learn 1.2 in favour of
# `get_feature_names_out()` (added in 1.0). Prefer the new method and fall
# back to the old one on releases that do not have it yet.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Loading dataset...
Extracting tf features for LDA...
Fitting LDA models with tf features, n_samples=2000 and n_features=1000...

Topics in LDA model:
Topic #0:
edu com mail send graphics ftp pub available contact university list faq ca information cs 1993 program sun uk mit
Topic #1:
don like just know think ve way use right good going make sure ll point got need really time doesn
Topic #2:
christian think atheism faith pittsburgh new bible radio games alt lot just religion like book read play time subject believe
Topic #3:
drive disk windows thanks use card drives hard version pc software file using scsi help does new dos controller 16
Topic #4:
hiv health aids disease april medical care research 1993 light information study national service test led 10 page new drug
Topic #5:
god people does just good don jesus say israel way life know true fact time law want believe make think
Topic #6:
55 10 11 18 15 team game 19 period play 23 12 13 flyers 20 25 22 17 24 16
Topic #7:
car year just