### download text data

In [1]:
import random
import nltk
# Fetch every corpus the notebook reads before importing its reader, so a
# fresh kernel can run top to bottom: `movie_reviews` supplies the documents
# and `stopwords` is needed by the stop-word cells that follow.
nltk.download('movie_reviews')
nltk.download('stopwords')
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\0\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [2]:
# Collect one (token list, category) pair per file, walking the corpus
# category by category.
reviews = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        tokens = list(movie_reviews.words(fileid))
        reviews.append((tokens, category))
print(len(reviews))

2000


In [3]:
# Show NLTK's English stop-word list; the next cell extends it with a few
# punctuation tokens and filters the reviews against it.
print(stopwords.words("english"))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
# Tokens to drop: NLTK's English stop words plus a handful of punctuation
# tokens. Kept as a list so the name has the same type as before.
stop_words = stopwords.words("english") + ['\'', '-', '(', ')', ':', '"', ';', '--']
# Set copy for O(1) membership tests; the list would be scanned once per token.
stop_word_set = set(stop_words)

# texts[i] is review i as one lower-cased, space-joined string with the stop
# tokens removed; cats[i] is its category label.
texts, cats = [], []
for text, cat in reviews:
    lowered = (w.lower() for w in text)  # lower-case each token exactly once
    texts.append(' '.join(w for w in lowered if w not in stop_word_set))
    cats.append(cat)
print(len(texts), len(cats))
print(texts[404])
print(cats[4:10])

2000 2000
horror movie truly called horror movie scares , suspense , even eerie elements ? think , children corn 666 issac return wants us believe . sixth installment horrible , worn series far worst date . unlike five chapters , children corn 666 confusing , brainless thriller takes psychological horror route rather slasher horror , either way , none movies least bit scary . film follows hannah natalie ramsey teen looking mother gatlin , nebraska , eve 21st birthday . starts daughter desperate search long lost mother turns story hannah first daughter children corn , roam cornfields looking adults murder . understandable film , learn much , issac john franklin led children corn previous chapter , older , strange man , looking hannah fulfill prophecy . supposed make sense . really . start film unclear going , developing characters throwing concrete plot details across table , constantly introducing new characters without personalities slightest hint individuality , sub plots nothing see

### preprocess data

In [5]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [63]:
# Vocabulary cap: passed to Tokenizer(num_words=...) and used as the input
# size of the default Embedding layers.
MAX_NB_WORDS = 2000
# Length that pad_sequences gives every review (maxlen).
MAX_SEQUENCE_LENGTH = 300
# Width of each word vector.
# NOTE(review): the GloVe cell formats this value into the file name
# 'glove.6B.{}d.txt'; the trailing values look like the sizes available for
# that path, so 32 presumably only suits the default embedding - confirm.
EMBEDDING_DIM = 32  # 50, 100, 200, 300
# Fraction of the shuffled samples held out as x_test / y_test.
VALIDATION_SPLIT = 0.15

In [64]:
# Fit the tokenizer on all reviews and turn each review into a list of
# integer word ids. Per the Keras Tokenizer docs, num_words limits which ids
# texts_to_sequences emits (most frequent words only).
# NOTE(review): the vocabulary is fitted before the train/test split, so it
# also sees the held-out reviews.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
# print(sequences[:2])

In [65]:
# word_index maps each token to its integer id. It covers the full
# vocabulary (39304 entries in the recorded run), not just MAX_NB_WORDS.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 39304 unique tokens.


In [66]:
# Bring every review to exactly MAX_SEQUENCE_LENGTH ids, giving a 2-D array
# with one row per review. In the recorded output the zeros sit on the left
# of short reviews, i.e. padding is added at the front.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print(type(data), data.shape)
print(data)

<class 'numpy.ndarray'> (2000, 300)
[[   0    0    0 ...  192  873  192]
 [   0    0    0 ...   10  406    3]
 [   0    0    0 ...  465 1391    1]
 ...
 [ 756  218 1007 ... 1609 1362  552]
 [   0    0    0 ...    1    1   66]
 [ 331 1397 1505 ...    1  904   71]]


In [67]:
# Encode the category strings as integers, then expand them to one-hot rows
# (two columns in the recorded run).
lb_encoder = LabelEncoder()
labels = to_categorical(lb_encoder.fit_transform(cats))
for caption, tensor in (('Shape of data tensor:', data),
                        ('Shape of label tensor:', labels)):
    print(caption, tensor.shape)

Shape of data tensor: (2000, 300)
Shape of label tensor: (2000, 2)


In [68]:
# split the data into a training set and a test set
indices = np.arange(data.shape[0])
np.random.seed(42)
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_test = data[-nb_validation_samples:]
y_test = labels[-nb_validation_samples:]
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(1700, 300) (1700, 2)
(300, 300) (300, 2)


### text classification

In [74]:
import os
from keras.layers import *
from keras.models import Model
from keras.models import Sequential

#### word embedding

Option 1: learn the word vectors from scratch with a Keras `Embedding` layer

In [69]:
# Embedding learned from scratch: MAX_NB_WORDS rows of EMBEDDING_DIM-wide
# vectors, updated during training.
# NOTE(review): the CNN cell calls this same layer object, so re-running only
# that cell presumably keeps already-trained weights - re-run this cell first
# for a fresh start.
embed_default = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, trainable=True)

Option 2: initialise the embedding from pre-trained GloVe vectors

In [14]:
# Parse the GloVe text file into {word: float32 vector}. Each line is a word
# followed by whitespace-separated coefficients.
embeddings_index = {}
glove_path = os.path.join('./glove.6B', 'glove.6B.{}d.txt'.format(EMBEDDING_DIM))
with open(glove_path, 'r', encoding='UTF-8') as glove_file:
    for row in glove_file:
        fields = row.split()
        embeddings_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

print('Found %s word vectors.' % len(embeddings_index))

Found 400001 word vectors.


In [15]:
# Row i holds the GloVe vector of the word whose tokenizer id is i. The
# matrix has one extra row because of the `+ 1`.
vocab_rows = len(word_index) + 1
embedding_matrix = np.zeros((vocab_rows, EMBEDDING_DIM))
for token, row in word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        # words not found in embedding index will be all-zeros.
        continue
    embedding_matrix[row] = glove_vector

In [16]:
# Embedding initialised from embedding_matrix (one row per tokenizer id plus
# one extra row). trainable=True lets training update the vectors.
embed_glove = Embedding(len(word_index) + 1,
                        EMBEDDING_DIM,
                        weights=[embedding_matrix],
                        input_length=MAX_SEQUENCE_LENGTH,
                        trainable=True)

#### build CNN model and train

In [70]:
# Choose the embedding: True -> layer learned from scratch (embed_default),
# False -> GloVe-initialised layer (embed_glove).
use_default = True

# Named `inputs` rather than `input` so the Python builtin is not shadowed
# for the rest of the kernel session.
inputs = Input(shape=(MAX_SEQUENCE_LENGTH,))
# The conditional is lazy: embed_glove is only looked up when actually chosen.
embedding_layer = embed_default if use_default else embed_glove
x = embedding_layer(inputs)

net = Dropout(0.2)(x)
net = BatchNormalization()(net)

# Four same-padded Conv1D(32) + BatchNormalization blocks: one kernel of
# width 7, then three of width 3. padding='same' keeps the sequence length.
for kernel_size in (7, 3, 3, 3):
    net = Conv1D(32, kernel_size, padding='same', activation='relu')(net)
    net = BatchNormalization()(net)

# A width-1 convolution gives 2 channels (one per class) at every position;
# they are averaged over the sequence and passed through softmax.
net = Conv1D(2, 1)(net)
net = GlobalAveragePooling1D()(net)
output = Activation('softmax')(net)
model = Model(inputs=inputs, outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 300)               0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 300, 32)           64000     
_________________________________________________________________
dropout_7 (Dropout)          (None, 300, 32)           0         
_________________________________________________________________
batch_normalization_31 (Batc (None, 300, 32)           128       
_________________________________________________________________
conv1d_31 (Conv1D)           (None, 300, 32)           7200      
_________________________________________________________________
batch_normalization_32 (Batc (None, 300, 32)           128       
_________________________________________________________________
conv1d_32 (Conv1D)           (None, 300, 32)           3104      
__________

In [71]:
# Train the CNN for 20 epochs with batches of 512. validation_split=0.1 sets
# aside a tenth of x_train for per-epoch validation (170 of 1700 samples in
# the recorded run). The returned History object is not stored; it only
# appears as the cell's output.
model.fit(x_train, y_train, batch_size=512, epochs=20, validation_split=0.1, shuffle=True)

Train on 1530 samples, validate on 170 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x43a70208>

In [72]:
# Score the CNN on the held-out test split. evaluate returns the loss
# followed by the 'acc' metric requested at compile time.
loss, acc = model.evaluate(x_test, y_test)
print('loss = {}\nacc  = {}'.format(loss, acc))

loss = 0.40925220559040704
acc  = 0.8433333333333334


#### build Conv1D + LSTM model and train

In [96]:
# Conv1D front-end, then an LSTM: embed the ids, extract local features with
# one convolution, halve the sequence length by max-pooling, summarise with
# a 100-unit LSTM and classify with a 2-way softmax.
model = Sequential([
    Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH),
    Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(100),
    Dropout(0.2),
    Dense(2, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 300, 32)           64000     
_________________________________________________________________
conv1d_38 (Conv1D)           (None, 300, 32)           3104      
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 150, 32)           0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_11 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 2)                 202       
Total params: 120,506
Trainable params: 120,506
Non-trainable params: 0
_________________________________________________________________


In [97]:
# Train the Conv1D + LSTM model with the same settings as the CNN: 20 epochs,
# batches of 512, and a tenth of x_train held back for validation (170 of
# 1700 samples in the recorded run).
model.fit(x_train, y_train, batch_size=512, epochs=20, validation_split=0.1, shuffle=True)

Train on 1530 samples, validate on 170 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x64322358>

In [98]:
# Score the Conv1D + LSTM model on the held-out test split. verbose=0
# silences the progress bar. `loss` and `acc` overwrite the CNN's values.
loss, acc = model.evaluate(x_test, y_test, verbose=0)
print('loss = {}\nacc  = {}'.format(loss, acc))

loss = 0.42293044368426
acc  = 0.8199999992052714


#### SVM

In [20]:
from sklearn.svm import SVC
import nltk
nltk.download('stopwords')
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from sklearn.model_selection import GridSearchCV
import numpy as np
import random

SVM_SEED = 42          # fixes the document shuffle so the split is reproducible
SVM_TRAIN_SIZE = 1500  # documents used for the grid search; the rest are held out

# One (token list, category) pair per review, shuffled so that the train and
# test slices below mix both categories.
documents = [(list(movie_reviews.words(fileid)),category) 
             for category in movie_reviews.categories() 
             for fileid in movie_reviews.fileids(category)]
random.seed(SVM_SEED)
random.shuffle(documents)

# Candidate features: the 2000 most frequent lower-cased corpus tokens, minus
# the English stop words.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
all_words = all_words.most_common(2000)
stop_words = stopwords.words("english")
word_features = [w for (w,f) in all_words if w not in stop_words]

# Document / word-set matrix: features[n, m] is 1 when word_features[m]
# occurs in document n, and 0 otherwise.
features = np.zeros([len(documents),len(word_features)],dtype=float)
for n, (doc_tokens, _label) in enumerate(documents):
    # Lower-cased to match how word_features was built above.
    document_words = set(w.lower() for w in doc_tokens)
    for m, feature_word in enumerate(word_features):
        if feature_word in document_words:
            features[n,m] = 1
target = [c for (d,c) in documents]
train_set = features[:SVM_TRAIN_SIZE,:]
target_train = target[:SVM_TRAIN_SIZE]
test_set = features[SVM_TRAIN_SIZE:,:]
target_test = target[SVM_TRAIN_SIZE:]

# Untuned baseline, kept for reference:
# svc= SVC()
# svc.fit(train_set,target_train)
# pred = svc.predict(test_set)
# print("SVM accuracy:"+str(sum([1 for n in range(len(target_test)) if pred[n]==target_test[n] ])/len(target_test)))

# 10-fold cross-validated search over the kernel type on the training slice.
tuned_parameters = [{'kernel': ['rbf','poly','linear','sigmoid']}]
svm_clf = GridSearchCV(SVC(gamma='auto'), tuned_parameters, cv=10)
svm_clf.fit(train_set,target_train)
print("The best parameters are %s with a score of %0.2f" % (svm_clf.best_params_,svm_clf.best_score_))
# Also score the refitted best estimator on the held-out slice, as is done
# for the neural models, so the test split is actually used.
print("Held-out test accuracy: %0.2f" % svm_clf.score(test_set, target_test))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\0\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


The best parameters are {'kernel': 'linear'} with a score of 0.81
