## 数据集探索
[read_table / read_csv参数](https://www.cnblogs.com/mahailuo/p/8325288.html)

#### THUCNews数据集
先简单认识一下数据集

In [62]:
import pandas as pd

In [10]:
train = pd.read_table('./cnews/cnews.train.txt',sep='\t',encoding='utf-8',header=None,names=['label','content'])
train.head()

Unnamed: 0,label,content
0,体育,马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道 来到沈阳，国奥队依然没有...
1,体育,商瑞华首战复仇心切 中国玫瑰要用美国方式攻克瑞典多曼来了，瑞典来了，商瑞华首战求3分的信心也...
2,体育,冠军球队迎新欢乐派对 黄旭获大奖张军赢下PK赛新浪体育讯12月27日晚，“冠军高尔夫球队迎新...
3,体育,辽足签约危机引注册难关 高层威逼利诱合同笑里藏刀新浪体育讯2月24日，辽足爆发了集体拒签风波...
4,体育,揭秘谢亚龙被带走：总局电话骗局 复制南杨轨迹体坛周报特约记者张锐北京报道 谢亚龙已经被公安...


In [11]:
test = pd.read_table('./cnews/cnews.test.txt',sep='\t',encoding='utf-8',header=None,names=['label','content'])
val = pd.read_table('./cnews/cnews.val.txt',sep='\t',encoding='utf-8',header=None,names=['label','content'])

In [21]:
print('train.shape:',train.shape)
print('train.label:',train.label.unique())
print('train.label_num:',len(train.label.unique()))

print(20*'*')
print('test.shape:',test.shape)
print('test.label:',test.label.unique())
print('test.label_num:',len(test.label.unique()))

print(20*'*')
print('val.shape:',val.shape)
print('val.label:',val.label.unique())
print('val.label_num:',len(val.label.unique()))

train.shape: (50000, 2)
train.label: ['体育' '娱乐' '家居' '房产' '教育' '时尚' '时政' '游戏' '科技' '财经']
train.label_num: 10
********************
test.shape: (10000, 2)
test.label: ['体育' '娱乐' '家居' '房产' '教育' '时尚' '时政' '游戏' '科技' '财经']
test.label_num: 10
********************
val.shape: (5000, 2)
val.label: ['体育' '娱乐' '家居' '房产' '教育' '时尚' '时政' '游戏' '科技' '财经']
val.label_num: 10


#### 数据集预处理

In [63]:
import sys
from collections import Counter #counter作用就是在一个数组内，遍历所有元素，将元素出现的次数记下来
#Counter(a).most_common(2) 输出数组a中出现次数最多的前2个元素

import tensorflow.keras as kr
import numpy as np

In [44]:
#python2和python3版本不同 #py2中是unicode类型。py3中是str类型
if sys.version_info[0]>2:
    is_py3 = True
else:
    reload(sys)
    sys.setdefaultencoding("utf-8")
    is_py3 = False
    
#如果为python2版本(is_py3 = False) 转换编码格式 
def native_word(word,encoding="utf-8"):
    if not is_py3:
        return word.encode(encoding)
    else:
        return word
    
    
'''读取数据文件(兼顾python版本)'''  
def open_file(path,mode='r'):
    '''
    path:文件路径
    mode: r-读 w-写
    '''
    if is_py3:
        return open(path,mode,encoding='utf-8',errors='ignore')
    else:
        return open(path,mode)

def read_file(path):
    labels,contents, = [],[]
    with open_file(path) as f:
        for line in f:
            try:
                label,content = line.strip().split('\t')
                if content:
                    contents.append(list(native_word(content)))
                    labels.append(native_word(label))
            except:
                pass
    return labels,contents

In [69]:
labels,contents = read_file('./cnews/cnews.train.txt')

In [74]:
'''根据训练集构建词汇表，并储存'''
def build_vocab(train_path,vocab_path,vocab_size=5000):
    _,train_x = read_file(train_path)
    
    all_data = []
    for content in train_x:
        all_data.extend(content) #整合训练集所有内容为一个list[] #未分词，一个字为一个元素
    
    counter = Counter(all_data) #{'a':3,'b':4} #每个元素的个数
    count_pairs = counter.most_common(vocab_size) #前5000个出现频率最高的元素
    words,_ = list(zip(*count_pairs)) #解压
    '''添加一个 '<pad>' 来将所有文本pad为同一长度'''
    words = ['<PAD>']+ list(words)
    open_file(vocab_path,mode='w').write('\n'.join(words)+'\n')
    
    
'''读取上面储存好的词汇表'''
def read_vocab(vocab_path):
    with open_file(vocab_path) as f:
        words = [native_word(i.strip()) for i in f.readlines()]
    word_to_id = dict(zip(words,range(len(words))))
    return words,word_to_id

In [76]:
#build_vocab('./cnews/cnews.train.txt','./cnews/cnews.vocab.txt',vocab_size=5000)

In [77]:
words,word_to_id = read_vocab('./cnews/cnews.vocab.txt')

In [78]:
'''读取类别结果'''
def read_labels():
    labels = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']
    labels = [native_word(i) for i in labels]
    label_to_id = dict(zip(labels,range(len(labels))))
    return labels,label_to_id

labels,label_to_id = read_labels()

In [56]:
read_labels()

(['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐'],
 {'体育': 0,
  '娱乐': 9,
  '家居': 3,
  '房产': 2,
  '教育': 4,
  '时尚': 6,
  '时政': 7,
  '游戏': 8,
  '科技': 5,
  '财经': 1})

In [60]:
'''将id的内容转回文字'''
def to_words(contents,words):
    return ''.join(words[x] for x in content)

In [82]:
'''将文件转为id表示'''
def process_file(file_name,word_to_id,label_to_id,max_length=600):
    labels,contents = read_file(file_name)
    data_id,label_id = [] , []
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(label_to_id[labels[i]])
    
    # 使用keras提供的pad_sequences来将文本pad为固定长度
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id,max_length)
    y_pad = kr.utils.to_categorical(label_id,num_classes=len(label_to_id)) #one-hot编码
    return x_pad,y_pad

In [83]:
x_pad,y_pad = process_file('./cnews/cnews.train.txt',word_to_id,label_to_id,max_length=600)

In [84]:
x_pad

array([[1609,  659,   56, ...,    9,  311,    3],
       [   2,  101,   16, ..., 1168,    3,   24],
       [ 465,  855,  521, ...,  116,  136,   85],
       ...,
       [  49,   18,   79, ...,  836, 1928, 1072],
       [ 166,  110,  714, ...,  836, 1928, 1072],
       [   1,   80,  551, ...,   78,  192,    3]], dtype=int32)

In [85]:
x_pad.shape

(50000, 600)

In [86]:
y_pad

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

In [87]:
y_pad.shape

(50000, 10)

In [67]:
'''生成批次数据'''
def batch_iter(x,y,batch_size=64):
    data_len = len(x)
    num_batch = int((data_len-1)/batch_size)+1 #int(1.5)=1
    
    indices = np.random.permutation(np.arange(data_len))
    '''函数shuffle与permutation都是对原来的数组进行重新洗牌（即随机打乱原来的元素顺序）；
       区别在于shuffle直接在原来的数组上进行操作，改变原来数组的顺序，无返回值。
       而permutation不直接在原来的数组上进行操作，而是返回一个新的打乱顺序的数组，并不改变原来的数组
    '''
    x_shuffle = x[indices]
    y_shuffle = y[indices]
    
    for i in range(num_batch):
        start_id = i*batch_size
        end_id = min((i+1)*batch_size,data_len)
        '''
        yield和return的区别:带yield的函数是一个生成器，而不是一个函数，
        这个生成器有一个函数就是next函数，next就相当于“下一步”生成哪个数，
        这一次的next开始的地方是接着上一次的next停止的地方执行的，
        所以调用next的时候，生成器并不会从foo函数的开始执行，只是接着上一步停止的地方开始，
        然后遇到yield后，return出要生成的数，此步就结束。
        '''
        yield x_shuffle[start_id:end_id],y_shuffle[start_id:end_id]

## IMDB 数据集探索

In [2]:
import tensorflow as tf
import tensorflow.keras as kr
import numpy as np
print(tf.__version__)

1.13.1


In [26]:
imdb = kr.datasets.imdb

(train_data, train_labels), (test_data, test_labels) = imdb.load_data('./imdb.npz',num_words=10000)
#参数 num_words=10000 会保留训练数据中出现频次在前 10000 位的字词。为确保数据规模处于可管理的水平，罕见字词将被舍弃。

In [14]:
# import numpy as np
# data = np.load('imdb.npz')
# data.files
# train_data = data['x_train']
# train_labels = data['y_train']
# test_data = data['x_test']
# test_labels = data['y_test']

In [27]:
print(train_data.shape)
print(len(train_labels))
print(len(test_data))
print(len(test_labels))

(25000,)
25000
25000
25000


In [28]:
#影评文本已转换为整数，其中每个整数都表示字典中的一个特定字词。第一条影评如下所示：
print(train_data[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]


In [29]:
#每条影评长度不同。如第一、二条影评长度如下
len(train_data[0]),len(train_data[1])

(218, 189)

In [31]:
'''将整数转换回字词'''
# A dictionary mapping words to an integer index
word_index = imdb.get_word_index('./imdb.npz')

# The first indices are reserved
word_index = {k:(v+3) for k,v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3

A local file was found, but it seems to be incomplete or outdated because the auto file hash does not match the original value of bfafd718b763782e994055a2d397834f so we will re-download the data.
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [33]:
imdb.get_word_index('./imdb.npz')

{'fawn': 34701,
 'tsukino': 52006,
 'nunnery': 52007,
 'sonja': 16816,
 'vani': 63951,
 'woods': 1408,
 'spiders': 16115,
 'hanging': 2345,
 'woody': 2289,
 'trawling': 52008,
 "hold's": 52009,
 'comically': 11307,
 'localized': 40830,
 'disobeying': 30568,
 "'royale": 52010,
 "harpo's": 40831,
 'canet': 52011,
 'aileen': 19313,
 'acurately': 52012,
 "diplomat's": 52013,
 'rickman': 25242,
 'arranged': 6746,
 'rumbustious': 52014,
 'familiarness': 52015,
 "spider'": 52016,
 'hahahah': 68804,
 "wood'": 52017,
 'transvestism': 40833,
 "hangin'": 34702,
 'bringing': 2338,
 'seamier': 40834,
 'wooded': 34703,
 'bravora': 52018,
 'grueling': 16817,
 'wooden': 1636,
 'wednesday': 16818,
 "'prix": 52019,
 'altagracia': 34704,
 'circuitry': 52020,
 'crotch': 11585,
 'busybody': 57766,
 "tart'n'tangy": 52021,
 'burgade': 14129,
 'thrace': 52023,
 "tom's": 11038,
 'snuggles': 52025,
 'francesco': 29114,
 'complainers': 52027,
 'templarios': 52125,
 '272': 40835,
 '273': 52028,
 'zaniacs': 52130,

In [32]:
word_index

{'fawn': 34704,
 'tsukino': 52009,
 'nunnery': 52010,
 'sonja': 16819,
 'vani': 63954,
 'woods': 1411,
 'spiders': 16118,
 'hanging': 2348,
 'woody': 2292,
 'trawling': 52011,
 "hold's": 52012,
 'comically': 11310,
 'localized': 40833,
 'disobeying': 30571,
 "'royale": 52013,
 "harpo's": 40834,
 'canet': 52014,
 'aileen': 19316,
 'acurately': 52015,
 "diplomat's": 52016,
 'rickman': 25245,
 'arranged': 6749,
 'rumbustious': 52017,
 'familiarness': 52018,
 "spider'": 52019,
 'hahahah': 68807,
 "wood'": 52020,
 'transvestism': 40836,
 "hangin'": 34705,
 'bringing': 2341,
 'seamier': 40837,
 'wooded': 34706,
 'bravora': 52021,
 'grueling': 16820,
 'wooden': 1639,
 'wednesday': 16821,
 "'prix": 52022,
 'altagracia': 34707,
 'circuitry': 52023,
 'crotch': 11588,
 'busybody': 57769,
 "tart'n'tangy": 52024,
 'burgade': 14132,
 'thrace': 52026,
 "tom's": 11041,
 'snuggles': 52028,
 'francesco': 29117,
 'complainers': 52030,
 'templarios': 52128,
 '272': 40838,
 '273': 52031,
 'zaniacs': 52133,

In [36]:
reverse_word_index = dict([(v,k) for k,v in word_index.items()])

In [39]:
def decode_review(sample):
    return ' '.join([reverse_word_index.get(i,'?') for i in sample])
    #return ' '.join([reverse_word_index.get(i) for i in sample]) ==

In [40]:
decode_review(train_data[0])

"<START> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for <UNK> and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also <UNK> to the two little boy's that played the <UNK> of norman and paul they were just brilliant children are often left out of the <UNK> list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for wh

#### 准备数据

In [41]:
word_index ["<PAD>"]

0

In [46]:
train_data = kr.preprocessing.sequence.pad_sequences(train_data, 
                    value=word_index ["<PAD>"], #填充值 此处为填充0
                    padding='post', 
                    #padding：‘pre’或‘post’，确定当需要补0时，在序列的起始还是结尾补
                    #truncating：‘pre’或‘post’，确定当需要截断序列时，从起始还是结尾截断'''
                    maxlen=256)  #最大长度，短了填充，长了截断

test_data = kr.preprocessing.sequence.pad_sequences(test_data,
                                                       value=word_index["<PAD>"],
                                                       padding='post',
                                                       maxlen=256)

In [47]:
len(train_data[0]), len(train_data[1])

(256, 256)

In [48]:
print(train_data[0])

[   1   14   22   16   43  530  973 1622 1385   65  458 4468   66 3941
    4  173   36  256    5   25  100   43  838  112   50  670    2    9
   35  480  284    5  150    4  172  112  167    2  336  385   39    4
  172 4536 1111   17  546   38   13  447    4  192   50   16    6  147
 2025   19   14   22    4 1920 4613  469    4   22   71   87   12   16
   43  530   38   76   15   13 1247    4   22   17  515   17   12   16
  626   18    2    5   62  386   12    8  316    8  106    5    4 2223
 5244   16  480   66 3785   33    4  130   12   16   38  619    5   25
  124   51   36  135   48   25 1415   33    6   22   12  215   28   77
   52    5   14  407   16   82    2    8    4  107  117 5952   15  256
    4    2    7 3766    5  723   36   71   43  530  476   26  400  317
   46    7    4    2 1029   13  104   88    4  381   15  297   98   32
 2071   56   26  141    6  194 7486   18    4  226   22   21  134  476
   26  480    5  144   30 5535   18   51   36   28  224   92   25  104
    4 