In [None]:
# https://github.com/jukyellow/Text_Classification

In [1]:
from sklearn.datasets import fetch_20newsgroups
from keras.layers import  Dropout, Dense
from keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import metrics

Using TensorFlow backend.


In [2]:
def TFIDF(X_train, X_test,MAX_NB_WORDS=75000):
    """
    Turn raw training and test documents into dense TF-IDF feature matrices.

    The vectorizer is fitted on the training documents only; the test
    documents are transformed with that same fitted vocabulary.

    Parameters
    ----------
    X_train : iterable of str
        Training documents.
    X_test : iterable of str
        Test documents.
    MAX_NB_WORDS : int, default 75000
        Passed to TfidfVectorizer as ``max_features`` (upper bound on the
        vocabulary size).

    Returns
    -------
    tuple
        ``(X_train, X_test)`` as dense arrays produced by ``.toarray()``.
    """
    vectorizer_x = TfidfVectorizer(max_features=MAX_NB_WORDS)
    X_train = vectorizer_x.fit_transform(X_train).toarray()
    X_test = vectorizer_x.transform(X_test).toarray()
    # Read the width straight from the array: wrapping it in np.array() would
    # copy the whole dense matrix just to look at its shape.
    print("tf-idf with", X_train.shape[1], "features")
    return (X_train,X_test)

In [3]:
def Build_Model_DNN_Text(shape, nClasses, dropout=0.5, node=512, nLayers=4):
    """
    Build and compile a fully connected network for text classification.

    The network is an input Dense layer of ``node`` units followed by
    ``nLayers`` further Dense layers of the same width, each followed by
    Dropout, and a final softmax layer with ``nClasses`` units.

    Parameters
    ----------
    shape : int
        Number of input features (passed as ``input_dim`` of the first layer).
    nClasses : int
        Number of classes, i.e. the size of the softmax output layer.
    dropout : float, default 0.5
        Rate given to every Dropout layer.
    node : int, default 512
        Number of units in every hidden Dense layer.
    nLayers : int, default 4
        Number of Dense+Dropout pairs added after the input Dense layer.

    Returns
    -------
    The compiled ``Sequential`` model. ``model.summary()`` is printed as a
    side effect.
    """
    model = Sequential()

    # Input block: the only layer that needs to be told the input width.
    model.add(Dense(node, input_dim=shape, activation='relu'))
    model.add(Dropout(dropout))

    # Hidden blocks: the input size is inferred from the previous layer.
    for _ in range(nLayers):
        model.add(Dense(node, activation='relu'))
        model.add(Dropout(dropout))

    model.add(Dense(nClasses, activation='softmax'))

    # The sparse loss variant is compiled in, so callers pass integer class
    # ids as targets rather than one-hot vectors.
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.summary()

    return model

In [4]:
# Training split of the 20 Newsgroups corpus: raw documents and their labels.
newsgroups_train = fetch_20newsgroups(subset='train')
X_train, y_train = newsgroups_train.data, newsgroups_train.target

# Held-out test split, loaded the same way.
newsgroups_test = fetch_20newsgroups(subset='test')
X_test, y_test = newsgroups_test.data, newsgroups_test.target

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [5]:
# Vectorise both splits; TFIDF fits its vectorizer on the training texts and
# applies that same fit to the test texts.
X_train_tfidf,X_test_tfidf = TFIDF(X_train,X_test)

tf-idf with 75000 features


In [6]:
# Size the input layer from the TF-IDF matrix and the output layer from the
# dataset's own label list instead of hard-coding the class count.
model_DNN = Build_Model_DNN_Text(X_train_tfidf.shape[1],
                                 len(newsgroups_train.target_names))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               38400512  
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 512)               262656    
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 512)               262656    
_________________________________________________________________
dropout_3 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 512)              

In [7]:
# Train for 10 epochs in mini-batches of 128. The test split is supplied as
# validation data, so the val_* figures in the log are computed on it.
model_DNN.fit(
    X_train_tfidf,
    y_train,
    batch_size=128,
    epochs=10,
    verbose=2,
    validation_data=(X_test_tfidf, y_test),
)


Train on 11314 samples, validate on 7532 samples
Epoch 1/10
 - 9s - loss: 2.7730 - accuracy: 0.1085 - val_loss: 1.8301 - val_accuracy: 0.3759
Epoch 2/10
 - 7s - loss: 1.4199 - accuracy: 0.4808 - val_loss: 0.9671 - val_accuracy: 0.6826
Epoch 3/10
 - 7s - loss: 0.6188 - accuracy: 0.7724 - val_loss: 0.7831 - val_accuracy: 0.7643
Epoch 4/10
 - 7s - loss: 0.3080 - accuracy: 0.8955 - val_loss: 0.7867 - val_accuracy: 0.7812
Epoch 5/10
 - 7s - loss: 0.1584 - accuracy: 0.9481 - val_loss: 0.8195 - val_accuracy: 0.8054
Epoch 6/10
 - 7s - loss: 0.0977 - accuracy: 0.9686 - val_loss: 0.8653 - val_accuracy: 0.8151
Epoch 7/10
 - 7s - loss: 0.0725 - accuracy: 0.9808 - val_loss: 0.8776 - val_accuracy: 0.8072
Epoch 8/10
 - 7s - loss: 0.0514 - accuracy: 0.9862 - val_loss: 0.8967 - val_accuracy: 0.8234
Epoch 9/10
 - 7s - loss: 0.0496 - accuracy: 0.9869 - val_loss: 0.9250 - val_accuracy: 0.8140
Epoch 10/10
 - 7s - loss: 0.0485 - accuracy: 0.9866 - val_loss: 0.8970 - val_accuracy: 0.8176


<keras.callbacks.callbacks.History at 0x7fc4818bacf8>

In [12]:
# Per-class scores for every test document, followed by the index of the
# highest-scoring class in each row.
predicted = model_DNN.predict(X_test_tfidf)
print(predicted)
print(predicted.argmax(axis=-1))

[[4.6388433e-07 5.2116445e-04 3.4765373e-05 ... 2.2602130e-06
  1.5863210e-05 2.4647265e-05]
 [4.7071222e-03 2.1579240e-02 9.5080268e-03 ... 5.9332405e-03
  2.0314550e-02 3.1350091e-02]
 [9.9999118e-01 2.3446174e-11 3.3761452e-15 ... 4.1729034e-08
  1.1419759e-07 7.8155144e-06]
 ...
 [6.1763711e-07 6.6114805e-04 4.7359590e-05 ... 2.4479933e-08
  6.3384402e-07 3.7599170e-07]
 [5.1017798e-11 3.3610075e-05 4.9444593e-06 ... 1.2882909e-09
  8.6406056e-09 1.4537258e-07]
 [1.2121112e-05 5.1949387e-09 3.1907932e-09 ... 5.5452908e-08
  7.6236307e-07 1.1531975e-04]]
[12 11  0 ...  9 12 15]


In [14]:
# Print the ground-truth labels of the test split.
print(y_test)

[ 7  5  0 ...  9  6 15]


In [13]:
# Reduce each row of scores to its arg-max class index, then report
# per-class precision / recall / F1 against the true test labels.
pre_classes = predicted.argmax(axis=-1)
print(metrics.classification_report(y_true=y_test, y_pred=pre_classes))

              precision    recall  f1-score   support

           0       0.81      0.71      0.76       319
           1       0.74      0.73      0.73       389
           2       0.77      0.67      0.72       394
           3       0.69      0.78      0.73       392
           4       0.67      0.86      0.75       385
           5       0.77      0.77      0.77       395
           6       0.72      0.85      0.78       390
           7       0.92      0.85      0.88       396
           8       0.96      0.93      0.95       398
           9       0.93      0.92      0.93       397
          10       0.96      0.96      0.96       399
          11       0.95      0.91      0.93       396
          12       0.69      0.73      0.71       393
          13       0.90      0.74      0.81       396
          14       0.87      0.89      0.88       394
          15       0.92      0.92      0.92       398
          16       0.75      0.90      0.82       364
          17       0.97    