In [1]:
import os 
import tarfile
import urllib.request

import tensorflow as tf
from tensorflow import keras
import numpy as np

import re 
import string
from random import randint

In [2]:
# Remote location of the aclImdb sentiment archive and the local path it is saved to.
url="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
filepath="data/aclImdb_v1.tar.gz"

In [3]:
# Fetch the dataset archive once; later runs reuse the local copy.
os.makedirs("data", exist_ok=True)  # no-op when the folder already exists
if not os.path.isfile(filepath):
    print('downloading...')
    # Download under a temporary name and rename only after success, so an
    # interrupted transfer cannot leave a truncated archive at `filepath`
    # that the isfile() check above would accept on the next run.
    partial_path = filepath + ".part"
    result = urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, filepath)
    # Keep the (local path, headers) pair shape that urlretrieve returns.
    result = (filepath, result[1])
    print('downloaded:',result)
else:
    print(filepath,'is existed!')

downloading...
downloaded: ('data/aclImdb_v1.tar.gz', <http.client.HTTPMessage object at 0x000002B4CAB05B10>)


In [4]:
# Extract the data once; skip when the extracted folder is already present.
if not os.path.exists("data/aclImdb"):
    # The context manager closes the archive handle even if extraction fails.
    with tarfile.open(filepath,"r:gz") as tfile:
        print('extracting...')
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive, and the archive is fetched over plain HTTP. Consider
        # tarfile's extraction filter (filter="data") where the running
        # Python version supports it.
        tfile.extractall("data/")
    print('extraction completed')
else:
    print("data/aclImdb is existed")

extracting...
extraction completed


In [6]:
#text clear
def remove_tags(text):
    """Return `text` with every `<...>` markup tag deleted (replaced by nothing)."""
    return re.sub(r'<[^>]+>', '', text)

In [7]:
#read file data
def read_files(filetype):
    """Load every review of one dataset split together with its label.

    Args:
        filetype: Name of the split sub-folder under ``data/aclImdb/``
            (the notebook calls this with "train" and "test").

    Returns:
        A pair ``(all_labels, all_texts)``. ``all_labels`` holds one one-hot
        list per file -- ``[1, 0]`` for files under ``pos/`` and ``[0, 1]``
        for files under ``neg/`` -- and ``all_texts`` holds the matching file
        contents after ``remove_tags``. All positive files come first,
        followed by all negative files.
    """
    path = "data/aclImdb/"
    positive_path = path + filetype + "/pos/"
    negative_path = path + filetype + "/neg/"

    # os.listdir() returns names in arbitrary order; sorting keeps the sample
    # order stable between runs and machines.
    pos_files = [positive_path + f for f in sorted(os.listdir(positive_path))]
    neg_files = [negative_path + f for f in sorted(os.listdir(negative_path))]
    file_list = pos_files + neg_files
    pos_files_num = len(pos_files)
    neg_files_num = len(neg_files)

    print('read',filetype,'files:',len(file_list))
    print(pos_files_num,'pos files in',filetype,'files')
    print(neg_files_num,'neg files in',filetype,'files')

    # Build a separate list object per sample so that mutating one label can
    # never silently change the others (list multiplication would alias them).
    all_labels = ([[1, 0] for _ in range(pos_files_num)]
                  + [[0, 1] for _ in range(neg_files_num)])

    all_texts = []
    for fi in file_list:
        with open(fi, encoding='utf8') as file_input:
            # Lines are joined with a single space; each line keeps its own
            # trailing newline as returned by readlines().
            all_texts.append(remove_tags(" ".join(file_input.readlines())))
    return all_labels, all_texts

In [9]:
# Load labels and tag-stripped review texts for the training and test splits.
train_labels, train_texts = read_files(filetype="train")
test_labels, test_texts = read_files(filetype="test")

read train files: 25000
12500 pos files in train files
12500 pos files in train files
read test files: 25000
12500 pos files in test files
12500 pos files in test files


In [10]:
# Prepare the data: fit a tokenizer on the training texts only.
# num_words=4000 limits how many of the most frequent words are kept when
# texts are later converted to index sequences.
token = keras.preprocessing.text.Tokenizer(num_words=4000)
token.fit_on_texts(train_texts)


In [11]:
# Number of texts the tokenizer was fitted on.
token.document_count

25000

In [None]:
# Print the complete word -> index mapping learned by the tokenizer (the output can be very long).
print(token.word_index)


In [13]:
# Inspect the vocabulary: word_docs maps each word to the number of documents it appears in.
token.word_docs

defaultdict(int,
            {'such': 4022,
             'inspector': 110,
             'me': 7329,
             'to': 23474,
             'burn': 125,
             'through': 3992,
             'bromwell': 4,
             'teachers': 64,
             'high': 1837,
             'same': 3323,
             'believe': 2209,
             'students': 276,
             'see': 8080,
             'pomp': 8,
             'pettiness': 2,
             'than': 7117,
             'my': 8106,
             'remind': 152,
             'survive': 239,
             'it': 21332,
             'cartoon': 367,
             'which': 7572,
             'who': 11132,
             'immediately': 434,
             'your': 4266,
             'pity': 225,
             'school': 1240,
             'classic': 1543,
             '35': 94,
             'situation': 600,
             'much': 7084,
             'and': 24161,
             'line': 1613,
             'episode': 984,
             'here': 4199,
             

In [14]:
# Show how often each word occurs in the fitted texts.
# Call print() rather than assigning to it: rebinding the name `print` would
# shadow the built-in for the rest of the kernel session.
print(token.word_counts)

In [15]:
# Convert each text into a list of integer word indices.
# texts_to_sequences(texts)
#   texts: the list of texts to convert
#   returns: a list of sequences, one sequence per input text
# NOTE: despite its name, `text_sequences` holds the encoded *test* texts.
train_sequences, text_sequences = (
    token.texts_to_sequences(corpus) for corpus in (train_texts, test_texts)
)

In [35]:
# NOTE(review): x_train is only assigned in a later cell (the pad_sequences
# cell), so this cell raises NameError on a fresh top-to-bottom run.
x_train.shape

(25000, 400)

In [37]:
# Turn the per-review one-hot label lists into NumPy arrays for Keras.
y_train, y_test = map(np.array, (train_labels, test_labels))

In [38]:
# Bring every sequence to the same length with pad_sequences:
#   keras.preprocessing.sequence.pad_sequences(sequences, maxlen=None,
#       dtype='int32', padding='pre', truncating='pre', value=0)
# Here sequences are cut or padded at the end ('post') to exactly 400 entries.
x_train, x_test = (
    keras.preprocessing.sequence.pad_sequences(
        sequences, maxlen=400, padding='post', truncating='post'
    )
    for sequences in (train_sequences, text_sequences)
)

In [26]:
# Sanity check: (number of reviews, padded sequence length).
x_train.shape

(25000, 400)

In [27]:
# Model: start from an empty Sequential container; layers are added one by one afterwards.
model=keras.models.Sequential()

In [28]:
# Embedding layer: maps word indices (vocabulary size 4000) to 32-dimensional
# vectors; input_length matches the padded sequence length of 400.
model.add(keras.layers.Embedding(output_dim=32,
                                input_dim=4000,
                                input_length=400))

In [29]:
# Bidirectional LSTM with 8 units per direction (16 outputs in the model summary).
model.add(keras.layers.Bidirectional(keras.layers.LSTM(units=8)))
# Fully connected layer
model.add(keras.layers.Dense(units=32,activation='relu'))

In [30]:
# Dropout with rate 0.3 between the dense layer and the output layer.
model.add(keras.layers.Dropout(0.3))

In [31]:
# Output layer: two softmax units, one per position of the one-hot label.
model.add(keras.layers.Dense(units=2,activation='softmax'))

In [32]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 400, 32)           128000    
                                                                 
 bidirectional (Bidirectiona  (None, 16)               2624      
 l)                                                              
                                                                 
 dense (Dense)               (None, 32)                544       
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 2)                 66        
                                                                 
Total params: 131,234
Trainable params: 131,234
Non-trainable params: 0
__________________________________________________

In [33]:
# Model training setup.
# One-hot encoded multi-class labels: use categorical_crossentropy as the loss.
# Multi-class labels that are not one-hot encoded: use sparse_categorical_crossentropy.
# Binary classification: use binary_crossentropy.
model.compile(optimizer='adam',
                loss='categorical_crossentropy',
                metrics=['accuracy'])

In [39]:
# read_files() returns all positive reviews followed by all negative ones, and
# Keras takes the validation_split fraction from the *end* of the arrays before
# any shuffling. Without reordering, the validation set would contain only
# negative reviews, so shuffle features and labels together (seeded for
# reproducibility) before fitting.
shuffle_order = np.random.default_rng(42).permutation(len(x_train))
history = model.fit(x_train[shuffle_order], y_train[shuffle_order],
                    validation_split=0.2,
                    epochs=6,
                    batch_size=128,
                    verbose=1)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [None]:
# Visualization (this cell contains no plotting code yet)
