In [1]:
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import numpy as np
import re
import os
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation,Flatten
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


In [2]:
! wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
! tar zxf aclImdb_v1.tar.gz

--2020-06-20 08:03:06--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2020-06-20 08:03:17 (7.27 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [3]:
re_tag = re.compile(r'<[^>]+>')

def rm_tags(text):
    return re_tag.sub('', text)

In [4]:
def read_files(filetype):
    path = "/content/aclImdb/"
    file_list = []

    positive_path = path + filetype+"/pos/"
    for f in os.listdir(positive_path):
        file_list += [positive_path+f]
    
    negative_path = path + filetype+"/neg/"
    for f in os.listdir(negative_path):
        file_list += [negative_path+f]
        
    print('read', filetype, 'files:', len(file_list))
       
    all_labels = ([1] * 12500 + [0] * 12500) 
    
    all_texts  = []
    
    for fi in file_list:
        with open(fi,encoding='utf8') as file_input:
            all_texts += [rm_tags(" ".join(file_input.readlines()))]
            
    return all_labels,all_texts

In [5]:
y_train, train_text = read_files("train")

read train files: 25000


In [6]:
y_test, test_text = read_files("test")

read test files: 25000


In [7]:
token = Tokenizer(num_words = 2000)
token.fit_on_texts(train_text)

In [8]:
print(token.document_count)

25000


In [9]:
print(token.word_index)



In [10]:
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq  = token.texts_to_sequences(test_text)

In [11]:
print(train_text[0])

This film is excellently paced, you never have to wait for a belly laugh to come up for more than about a minute and there's much more going on than the initial premise of the film. Throughout it there are mockeries of the traditional schmaltzy local-boys-done-good-overcoming-adversity genre of which this parodies. Don't let anyone tell you that they're trying to get cheap laughs just by using obscenities;- sure, there's plenty of that but it's all contextual, not gratuitous. I loved this film and it only cost me £2.99 on DVD , so in terms of entertainment value for money, it has been the best film I've seen this year.


In [12]:
print(x_train_seq[0])

[10, 18, 6, 1779, 21, 111, 24, 5, 852, 14, 3, 456, 5, 212, 52, 14, 49, 70, 40, 3, 780, 2, 221, 72, 49, 166, 19, 70, 1, 858, 4, 1, 18, 465, 8, 46, 22, 4, 1, 715, 1008, 220, 48, 509, 4, 59, 10, 88, 383, 255, 371, 21, 11, 502, 265, 5, 74, 701, 915, 39, 30, 767, 248, 221, 954, 4, 11, 17, 41, 28, 20, 9, 442, 10, 18, 2, 8, 60, 68, 19, 284, 34, 7, 1297, 4, 719, 1103, 14, 274, 8, 43, 73, 1, 114, 18, 203, 106, 10, 287]


In [13]:
x_train = sequence.pad_sequences(x_train_seq, maxlen = 100)
x_test  = sequence.pad_sequences(x_test_seq,  maxlen = 100)

In [14]:
print('length before processing = ', len(x_train_seq[2]))
print(x_train_seq[2])

length before processing =  83
[47, 3, 399, 457, 4, 324, 6, 49, 60, 43, 407, 2, 138, 36, 711, 47, 3, 36, 3, 325, 2, 424, 50, 125, 740, 43, 5, 7, 1, 1, 1, 1739, 449, 1, 28, 5, 93, 3, 1483, 10, 6, 27, 4, 144, 346, 462, 98, 9, 187, 408, 205, 204, 52, 46, 15, 1, 36, 1314, 44, 21, 419, 10, 813, 42, 1164, 237, 1, 1314, 2, 195, 1187, 77, 15, 2, 33, 12, 7, 1625, 77, 1, 1187, 2, 1187]


In [15]:
print('length after processing = ', len(x_train[2]))
print(x_train[2])

length after processing =  100
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0   47    3  399  457    4  324    6   49   60   43  407
    2  138   36  711   47    3   36    3  325    2  424   50  125  740
   43    5    7    1    1    1 1739  449    1   28    5   93    3 1483
   10    6   27    4  144  346  462   98    9  187  408  205  204   52
   46   15    1   36 1314   44   21  419   10  813   42 1164  237    1
 1314    2  195 1187   77   15    2   33   12    7 1625   77    1 1187
    2 1187]


In [16]:
model = Sequential()

In [17]:
model.add(Embedding(output_dim = 32,
                    input_dim = 2000, 
                    input_length = 100))

In [18]:
model.add(Flatten())

In [19]:
model.add(Dense(units = 256,
                activation = 'relu' ))

In [20]:
model.add(Dense(units = 1,
                activation = 'sigmoid' ))

In [21]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 32)           64000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 3200)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               819456    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 883,713
Trainable params: 883,713
Non-trainable params: 0
_________________________________________________________________


In [22]:
model.compile(loss = 'binary_crossentropy', 
              optimizer = 'adam', 
              metrics = ['accuracy'])

In [23]:
train_history = model.fit(x_train, y_train, batch_size = 100, 
                         epochs = 10, verbose = 0,
                         validation_split = 0.2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [24]:
scores = model.evaluate(x_test, y_test, verbose=1)
scores[1]



0.8110399842262268

In [25]:
predict = model.predict_classes(x_test)

In [26]:
predict[:10]

array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1]], dtype=int32)

In [27]:
predict_classes = predict.reshape(-1)
predict_classes[:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [28]:
print(y_test[:10])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [29]:
SentimentDict = {1:'正面評價', 0:'負面評價'}
def display_test_Sentiment(i):
    print(test_text[i])
    print('標籤:', SentimentDict[y_test[i]],
          '預測:', SentimentDict[predict_classes[i]])

In [30]:
display_test_Sentiment(50)

Tim Burton the man behind the original Batman film and Beetlejuice, brings the world the sequel to Batman, that exceeds the original in more ways than one. Firstly Michelle Pfeiffer and Danny De Vito are a great mix of Batmans enemies. The dark, deadly and sexy Catwoman works well to rattle some heads within the story and penguin works in the same way that the joker worked in the original. The sets are stunning and immaculate. Gotham city has so many dark alleyways that you could never know what's happening at one time or another. The only thing that gives it a bad name is its script, which at times seems to lapse and then not recover for while. Tim Burtons direction bring superhero films into a new realm. Beats all the superman films and the other Batman films by a mile. Though in terms of realisation the new Batman begins has a bite where this one lacks, but Burton is a more original director than Nolan.
標籤: 正面評價 預測: 負面評價


In [31]:
display_test_Sentiment(20050)

Imagine a film the complete opposite of Lawrence of Arabia, instead of having an all male cast, it has an all female cast. Instead of being set in the barren deserts of Arabia, it is set in the bulging metropolis of New York City. And instead of it being one of the greatest films ever made, it is one of the most pointless, boring and forgettable.The film concerns Mary Haines (Meg Ryan) a perfect wife and mother, the envy of all others in her high society Manhatten social circle. She is painted as a women bearing the weight of the world on her shoulders, despite the fact she needs a live in nanny and housekeeper to cope with her one child. But I don't want to be too hard on her, Mary does all this whilst taking a liassez-faire attitude towards the fashion designing job her father has given her. This idyllic lifestyle cannot last forever though and things start to crash in a very real way.Mary's husband is cheating on her and her father fires her for not working hard enough. She is quite

In [54]:
def predict_review(input_text):
    input_seq = token.texts_to_sequences([input_text])
    pad_input_seq = sequence.pad_sequences(input_seq , maxlen = 100)
    predict_result = model.predict_classes(pad_input_seq)
    print(SentimentDict[predict_result[0][0]])

In [61]:
predict_review("I hate this movie")
predict_review("I will watch this again")

負面評價
正面評價
