In [1]:
#ch13_NLP_01_20171204.ipynb

In [2]:
#step1. 下載IMDb資料集

In [3]:
#step1.1 匯入所需模組
import urllib.request
import os
import tarfile

In [4]:
#step1.2 Download the IMDb dataset archive into data/ (the directory is created if missing)
url="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
filepath="data/aclImdb_v1.tar.gz"
#Create the target directory first: urlretrieve() cannot write into a directory that does not exist
os.makedirs(os.path.dirname(filepath), exist_ok=True)
#Skip the download when the archive is already present locally
if not os.path.isfile(filepath):
    result=urllib.request.urlretrieve(url,filepath)
    print('downloaded:',result)

In [5]:
#step1.3 Extract the downloaded archive (skipped when data/aclImdb already exists)
if not os.path.exists("data/aclImdb"):
    #The with-block closes the archive handle even if extraction raises
    #NOTE: extractall() trusts the member paths stored in the archive; only use it on archives from a trusted source
    with tarfile.open("data/aclImdb_v1.tar.gz", 'r:gz') as tfile:
        result=tfile.extractall('data/')  #extractall() returns None; assignment kept so the notebook global is unchanged

In [None]:
#step1.4 查看下載檔案 'aclImdb_v1.tar.gz' 及 解壓縮目錄 data/aclImdb/

In [7]:
#step2. 讀取IMDb資料
#       IMDb檔案下載解壓縮後,共有 50000筆 "文字檔",
#       我們在讀取後,將它們分為訓練資料及測試資料

In [8]:
#step 2.1 匯入所需模組,
#             Tokenizer -->Class for vectorizing texts, or/and turning texts into sequences
#             (=list of word indexes, where the word of rank i in the dataset (starting at 1) has index i).
from keras.preprocessing import sequence       #sequence module, 用於截長補短,讓所有「數字list」長度相同
from keras.preprocessing.text import Tokenizer #Tokenizer module, 用於建立 token 字典

Using TensorFlow backend.


In [12]:
#step 2.2 Define rm_tag() to remove HTML tags from text
import re
def rm_tag(text):
    """Return `text` with every HTML-style tag removed.

    A tag is any run that starts with '<', contains one or more characters
    other than '>', and ends with '>'. Each match is replaced by the empty
    string; text without such runs is returned unchanged.
    """
    tag_pattern = r'<[^>]+>'  #raw string literal, so backslashes in a pattern need no extra escaping
    return re.sub(tag_pattern, '', text)

In [13]:
#step 2.3 建立 read_files() 函數讀取IMDb檔案目錄
import os
def read_files(filetype): #filetype: 1.train, 2.test
    """Read every review file of one IMDb split and return its labels and texts.

    Args:
        filetype: name of the split sub-directory under data/aclImdb/,
            i.e. "train" or "test".

    Returns:
        A tuple (all_labels, all_texts):
        all_labels -- list of ints, 1 for each file found in pos/ followed by
                      0 for each file found in neg/.
        all_texts  -- list of str, one per file in the same order, with HTML
                      tags removed by rm_tag().

    Raises:
        OSError: if a split directory or a review file cannot be read.
    """
    path="data/aclImdb/"

    #Positive reviews: "data/aclImdb/train/pos/" or "data/aclImdb/test/pos/"
    positive_path=path+filetype+"/pos/"
    positive_files=[positive_path+f for f in os.listdir(positive_path)]

    #Negative reviews: "data/aclImdb/train/neg/" or "data/aclImdb/test/neg/"
    negative_path=path+filetype+"/neg/"
    negative_files=[negative_path+f for f in os.listdir(negative_path)]

    file_list=positive_files+negative_files
    print('read',filetype,'files:',len(file_list))

    #Derive the labels from the actual file counts so they always line up with
    #all_texts, instead of assuming exactly 12500 files per class
    all_labels=[1]*len(positive_files)+[0]*len(negative_files)

    all_texts=[]
    for fi in file_list:
        with open(fi, encoding='utf8') as file_input:
            #Join the file's lines into one string separated by spaces, then strip HTML tags
            all_texts.append(rm_tag(" ".join(file_input.readlines())))

    return all_labels,all_texts

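#Note 1 (說明1): " ".join(file_input.readlines()) joins all lines of a file into one string separated by spaces; rm_tag() then removes the HTML tags from that string.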

In [14]:
#step 2.4 Load the IMDb dataset directories
#     2.4.a Call read_files() with "train" to load the training split:
#           y_train receives the labels, train_text the review texts
y_train,train_text=read_files("train")

read train files: 25000


In [15]:
#    2.4.b Call read_files() with "test" to load the test split:
#          y_test receives the labels, test_text the review texts
y_test,test_text=read_files("test")

read test files: 25000


In [16]:
#============================================================
#step 3. 檢視IMDb資料

In [17]:
#step 3.1 Inspect the review text at index 0
train_text[0]

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [18]:
#step 3.2 Inspect the label at index 0; 1 --> positive review
y_train[0]

1

In [19]:
#step 3.3 Inspect the review text at index 12501
train_text[12501]

"Airport '77 starts as a brand new luxury 747 plane is loaded up with valuable paintings & such belonging to rich businessman Philip Stevens (James Stewart) who is flying them & a bunch of VIP's to his estate in preparation of it being opened to the public as a museum, also on board is Stevens daughter Julie (Kathleen Quinlan) & her son. The luxury jetliner takes off as planned but mid-air the plane is hi-jacked by the co-pilot Chambers (Robert Foxworth) & his two accomplice's Banker (Monte Markham) & Wilson (Michael Pataki) who knock the passengers & crew out with sleeping gas, they plan to steal the valuable cargo & land on a disused plane strip on an isolated island but while making his descent Chambers almost hits an oil rig in the Ocean & loses control of the plane sending it crashing into the sea where it sinks to the bottom right bang in the middle of the Bermuda Triangle. With air in short supply, water leaking in & having flown over 200 miles off course the problems mount for 

In [20]:
#step 3.4 Inspect the label at index 12501; 0 --> negative review
y_train[12501]

0

In [21]:
#===============================================================
#step 4. 建立 token

In [40]:
#step 4.1 Create the token dictionary with the Tokenizer class
#https://keras.io/preprocessing/text/#tokenizer
token=Tokenizer(num_words=2000) #limit the dictionary to the most frequent words (num_words=2000)

In [41]:
token.fit_on_texts(train_text) #build the dictionary from the training reviews; the top-ranked words (per num_words) are the ones included

In [42]:
#step 4.2 Check token.document_count to see how many documents the tokenizer has read
print(token.document_count)

25000


In [43]:
#step 4.3 Inspect token.word_index (a dict) to see each word's frequency rank
#         The values are the words' ranks by occurrence count, not the counts themselves.
#         Every word is ranked (beyond the 2000-word limit), and per the
#         original note the dict is not sorted when displayed.
print(token.word_index)



In [44]:
#step 4.3.a Sort the entries of token.word_index by value (rank), breaking ties by key (word)
sorted_token_index=sorted(token.word_index.items(), key=lambda x:(x[1],x[0]))

In [45]:
#Display the full rank-sorted list of (word, rank) pairs
print(sorted_token_index)



In [35]:
#====================================================================
#step5. Use token to convert the review texts into lists of integers
#     via the Tokenizer.texts_to_sequences() method
x_train_seq=token.texts_to_sequences(train_text)
x_test_seq=token.texts_to_sequences(test_text)

In [36]:
#Inspect the result of the conversion to sequences: first show the original review text at index 0
print(train_text[0])

Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!


In [37]:
#Show the integer sequence produced from the review text at index 0
print(x_train_seq[0])

[308, 6, 3, 1068, 208, 8, 29, 1, 168, 54, 13, 45, 81, 40, 391, 109, 137, 13, 57, 149, 7, 1, 481, 68, 5, 260, 11, 6, 72, 5, 631, 70, 6, 1, 5, 1, 1530, 33, 66, 63, 204, 139, 64, 1229, 1, 4, 1, 222, 899, 28, 68, 4, 1, 9, 693, 2, 64, 1530, 50, 9, 215, 1, 386, 7, 59, 3, 1470, 798, 5, 176, 1, 391, 9, 1235, 29, 308, 3, 352, 343, 142, 129, 5, 27, 4, 125, 1470, 5, 308, 9, 532, 11, 107, 1466, 4, 57, 554, 100, 11, 308, 6, 226, 47, 3, 11, 8, 214]


In [46]:
#=====================================================================
#step 6. 截長補短,讓轉換後的「數字list」長度相同, sequence.pad_sequences()

In [55]:
#step 6.1 Check the lengths of the integer lists at index 10 and 11 (236 and 77 respectively)
print('len(x_train_seq[10]):',len(x_train_seq[10]),' len(x_train_seq[11]):',len(x_train_seq[11]))

len(x_train_seq[10]): 236  len(x_train_seq[11]): 77


In [56]:
#step 6.2 Use sequence.pad_sequences() to truncate or pad so that every integer list has length 100
x_train=sequence.pad_sequences(x_train_seq,maxlen=100)
x_test=sequence.pad_sequences(x_test_seq,maxlen=100)

In [58]:
#step 6.3.a Inspect x_train[10]; the original x_train_seq[10] had length 236,
#           shown here after processing by sequence.pad_sequences()
print('len(x_train[10]):',len(x_train[10]))
print(x_train[10])

len(x_train[10]): 100
[  75   11   87   80   76  792   10   18    9   60  131   11   83    1  853
  295   76  238  100 1691    7    3  617  213   11  156  153  154  588  198
   10   16    3  576 1691    1   71   28  211  165    2  136  139    7    5
   25  124   80   36   23   16   27  530   26   43   52   13   31  280    2
   89  155  485  414    4  494  445    2  154    9  855    9  260   13  171
  132   26    6    7   26   43    1  966  128  268    2  264 1360   41   10
  648   26   96 1661   23  201   16  204  146  586]


In [59]:
#step 6.3.b Inspect x_train[11]; the original x_train_seq[11] had length 77,
#           shown here after processing by sequence.pad_sequences()
#           (the printed result starts with zeros that fill it up to length 100)
print('len(x_train[11]):',len(x_train[11]))
print(x_train[11])

len(x_train[11]): 100
[   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    9  419    1   18   45    4    1
  202  135   67   51  217    2   69  220    9  257  419    1  632  132   59
   65    3    7    8    3   51  202  132   11  464   69  220   45    4    1
  135   67  810    7  217  765  137   13   54 1374 1726   38  217   77    1
   18    6  158    6  446  527    9   77  419   85    1  441  229   12  995
   95  470  198    1   18   31  708   42    4  160]
