In [1]:
from sklearn.datasets import load_files

In [2]:
# Read the training split from disk. The returned bunch exposes the .data,
# .target and .target_names fields that the following cells inspect.
news_train = load_files(container_path='data/379/train')

In [3]:
# Integer class label of every training document; each value is an index
# into news_train.target_names.
news_train.target

array([18, 13,  1, ..., 14, 15,  4])

In [4]:
# Category names: a label value i in news_train.target refers to
# news_train.target_names[i].
news_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [6]:
# Look up the human-readable category of the first training document.
first_label = news_train.target[0]
news_train.target_names[first_label]

'talk.politics.misc'

In [8]:
# Raw content of the first training document, shown to sanity-check the load.
# NOTE(review): presumably undecoded bytes, since load_files was called without
# an encoding and the vectorizer is told to decode as latin-1 — confirm.
news_train.data[0]



In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF featurizer; encoding='latin-1' tells it how to decode byte input.
vect = TfidfVectorizer(encoding='latin-1')
# Fit on the training documents only and build the training matrix. The same
# fitted `vect` is reused (transform only) for the test split later on.
X_train = vect.fit_transform(news_train.data)

In [12]:
# NOTE(review): np is not referenced in any of the visible cells — confirm it
# is needed further down before keeping this import here.
import numpy as np
# (number of training documents, number of TF-IDF features).
X_train.shape

(13180, 130274)

In [14]:
# Printing a single row of the feature matrix lists its stored entries as
# "(row, column)  weight" pairs, i.e. the non-zero TF-IDF terms of document 0.
first_row = X_train[0, :]
print(first_row)

  (0, 56813)	0.0143326637736
  (0, 45689)	0.0837334394975
  (0, 46084)	0.0810973352979
  (0, 125882)	0.087315770484
  (0, 50150)	0.0206543137216
  (0, 87702)	0.0464323558506
  (0, 33334)	0.102540565819
  (0, 111805)	0.0143326637736
  (0, 115086)	0.0771294700855
  (0, 99721)	0.0503908022215
  (0, 109314)	0.117229781513
  (0, 89035)	0.171338561509
  (0, 117388)	0.0687904270169
  (0, 66565)	0.033735911672
  (0, 120409)	0.0422379709863
  (0, 62408)	0.366956013238
  (0, 36885)	0.179119457807
  (0, 113268)	0.0457228945534
  (0, 36634)	0.108873206102
  (0, 95990)	0.195258654337
  (0, 67717)	0.155844088862
  (0, 124607)	0.032838970724
  (0, 59746)	0.0524729088882
  (0, 115068)	0.0385059369694
  (0, 89566)	0.0332115808717
  :	:
  (0, 115836)	0.0544558533939
  (0, 28463)	0.0774370963823
  (0, 79247)	0.0406594603409
  (0, 92698)	0.0357445888645
  (0, 59404)	0.0417805466083
  (0, 55031)	0.0675798075811
  (0, 33945)	0.0904698434879
  (0, 22149)	0.100185665428
  (0, 115213)	0.0274666381668
  (0, 877

### 训练模型

In [15]:
from sklearn.naive_bayes import MultinomialNB

# Labels for the training documents.
y_train = news_train.target

# Multinomial naive Bayes with a very small smoothing term (alpha = 1e-4).
# fit() returns the estimator itself, so construction and fitting are chained.
clf = MultinomialNB(alpha=1e-4).fit(X_train, y_train)

# Accuracy on the very data the model was fitted on: an optimistic figure,
# kept only for comparison with the held-out score computed later.
train_score = clf.score(X_train, y_train)

In [16]:
# Display the accuracy measured on the training data.
train_score

0.99787556904400609

In [17]:
# Read the held-out test split with the same loader used for the training split.
news_test = load_files(container_path='data/379/test')

In [18]:
# Class indices are assigned separately by each load_files() call, so the
# integer labels of the two splits only mean the same thing when both splits
# expose the same category list. Fail loudly here rather than report silently
# wrong metrics further down.
assert list(news_test.target_names) == list(news_train.target_names), (
    "train/test category lists differ; label indices are not comparable"
)

# Reuse the vectorizer fitted on the training data (transform only, no refit)
# so the test features share the training vocabulary and column order.
X_test = vect.transform(news_test.data)
# Ground-truth labels for the test documents.
y_test = news_test.target

In [19]:
# Predict the category of the first test document only (a one-row slice of
# the test feature matrix).
first_test_doc = X_test[0]
pred = clf.predict(first_test_doc)

In [20]:
# Predicted label index for the first test document.
pred

array([7])

In [21]:
# True label index of the first test document, for comparison with the
# single-document prediction.
news_test.target[0]

7

### 模型评价

In [22]:
# Accuracy of the fitted classifier over the whole test split.
clf.score(X_test, y_test)

0.90881728045325783

In [24]:
from sklearn.metrics import classification_report

# Predict every test document. This rebinds `pred`, which until now held the
# single-document prediction.
pred = clf.predict(X_test)

# Per-category precision / recall / F1 / support, rows labelled by category name.
report = classification_report(
    y_true=y_test,
    y_pred=pred,
    target_names=news_test.target_names,
)
print(report)

                          precision    recall  f1-score   support

             alt.atheism       0.90      0.91      0.91       245
           comp.graphics       0.80      0.90      0.85       298
 comp.os.ms-windows.misc       0.82      0.79      0.80       292
comp.sys.ibm.pc.hardware       0.81      0.80      0.81       301
   comp.sys.mac.hardware       0.90      0.91      0.91       256
          comp.windows.x       0.88      0.88      0.88       297
            misc.forsale       0.87      0.81      0.84       290
               rec.autos       0.92      0.93      0.92       324
         rec.motorcycles       0.96      0.96      0.96       294
      rec.sport.baseball       0.97      0.94      0.96       315
        rec.sport.hockey       0.96      0.99      0.98       302
               sci.crypt       0.95      0.96      0.95       297
         sci.electronics       0.91      0.85      0.88       313
                 sci.med       0.96      0.96      0.96       277
         

In [25]:
from sklearn.metrics import confusion_matrix

# Confusion matrix over the test split: rows are true categories, columns are
# predicted categories, both in label-index order.
cm = confusion_matrix(y_true=y_test, y_pred=pred)
print(cm)

[[224   0   0   0   0   0   0   0   0   0   0   0   0   0   2   5   0   0
    1  13]
 [  1 267   5   5   2   8   1   1   0   0   0   2   3   2   1   0   0   0
    0   0]
 [  1  13 230  24   4  10   5   0   0   0   0   1   2   1   0   0   0   0
    1   0]
 [  0   9  21 242   7   2  10   1   0   0   1   1   7   0   0   0   0   0
    0   0]
 [  0   1   5   5 233   2   2   2   1   0   0   3   1   0   1   0   0   0
    0   0]
 [  0  20   6   3   1 260   0   0   0   2   0   1   0   0   2   0   2   0
    0   0]
 [  0   2   5  12   3   1 235  10   2   3   1   0   7   0   2   0   2   1
    4   0]
 [  0   1   0   0   1   0   8 300   4   1   0   0   1   2   3   0   2   0
    1   0]
 [  0   1   0   0   0   2   2   3 283   0   0   0   1   0   0   0   0   0
    1   1]
 [  0   1   1   0   1   2   1   2   0 297   8   1   0   1   0   0   0   0
    0   0]
 [  0   0   0   0   0   0   0   0   2   2 298   0   0   0   0   0   0   0
    0   0]
 [  0   1   2   0   0   1   1   0   0   0   0 284   2   1   0   0