In [1]:
%load_ext autotime

time: 0 ns


### 載入套件

In [167]:
# Import torch and other required modules
import glob
import torch
import re
import nltk
import numpy as np
from torch.utils.data import Dataset, DataLoader, RandomSampler
from sklearn.datasets import load_svmlight_file
from nltk.corpus import stopwords

nltk.download('stopwords') # download the stopwords corpus
nltk.download('punkt') # download the corpus required by word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\perry.wei\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\perry.wei\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

time: 469 ms


### 探索資料與資料前處理
這份作業我們使用test資料中的pos與neg


In [147]:
# Load the vocabulary file: it lists every word that appears in the reviews, one per line
with open('../nlp_dl_data/aclImdb/imdb.vocab', encoding='utf-8') as f:
    vocab = set(line.lower().replace('\n','') for line in f)

# Remove NLTK's English stopwords: they carry little useful signal and too many
# of them may hurt model training

print(f"vocab length before removing stopwords: {len(vocab)}")

vocab = vocab.difference(set(stopwords.words('english')))
print(f"vocab length after removing stopwords: {len(vocab)}")

# Convert the word set into a word -> index dictionary.
# The words are sorted first: iteration order of a set of strings changes between
# interpreter runs (hash randomisation), so enumerating the raw set would give every
# word a different index after each kernel restart.
vocab = {word: idx for idx, word in enumerate(sorted(vocab))}

vocab length before removing stopwords: 89527
vocab length after removing stopwords: 89356
time: 109 ms


In [63]:
# Pack the data into (x, y) pairs, where x is the file path of a review and y is
# 1 for a positive review or 0 for a negative one.
# x is kept as a file path so that the data is not read into memory all at once
# (an exercise in lazy loading). If memory allows (the files are not large), reading
# everything up front removes I/O during training and speeds it up.

pos_path = glob.glob('../nlp_dl_data/aclImdb/test/pos/*.txt')
neg_path = glob.glob('../nlp_dl_data/aclImdb/test/neg/*.txt')

# positive reviews first, then negative ones
review_pairs = [(path, 1) for path in pos_path] + [(path, 0) for path in neg_path]

print(review_pairs[:2])
print(f"Total reviews: {len(review_pairs)}")

[('../nlp_dl_data/aclImdb/test/pos\\0_10.txt', 1), ('../nlp_dl_data/aclImdb/test/pos\\10000_7.txt', 1)]
Total reviews: 25000
time: 140 ms


### 建立Dataset, DataLoader, Sampler與Collate_fn讀取資料
這裡我們會需要兩個helper functions：其中一個是讀取資料與清洗資料的函式(load_review)，另外一個是將清洗後的字詞轉換成字典索引序列的函式
(generate_vec)。注意這裡我們用來產生「詞向量」的方法只是單純將文字tokenize後對應到字典索引(為了使每篇文本產生的長度不同，而不使用BoW)，不存在於字典中的字詞會被編碼為 -1

In [225]:
def load_review(review_path):
    """Read one review file and return its cleaned, unique words.

    Parameters
    ----------
    review_path: str
        path of the review text file (read as UTF-8)

    Returns
    -------
    set
        the distinct lower-cased tokens of the review with NLTK's English
        stopwords removed

    NOTE(review): because the result is a set, the original word order and
    repeated words are not preserved -- confirm this is intended before feeding
    the output to an order-sensitive model.
    """
    with open(review_path, encoding='utf-8') as handle:
        raw_text = handle.read()

    # replace every non-letter with a space, then lower-case the text because
    # NLTK's stopwords are all lower-case
    letters_only = re.sub(r"[^A-Za-z]",' ',raw_text)
    tokens = nltk.word_tokenize(letters_only.lower())

    stop_words = set(stopwords.words('english'))
    return set(tokens) - stop_words

def generate_vec(review, vocab_dic):
    """Translate every token of a review into its vocabulary index.

    Parameters
    ----------
    review: iterable
        tokens of one review
    vocab_dic: dict
        token -> integer index lookup table

    Returns
    -------
    list
        one index per token, in the iteration order of ``review``; a token that
        is not a key of ``vocab_dic`` is encoded as -1
    """
    indices = []
    for token in review:
        # unknown tokens fall back to the sentinel -1
        indices.append(vocab_dic.get(token, -1))
    return indices

time: 0 ns


In [253]:
#建立客製化dataset

class dataset(Dataset):
    '''Map-style dataset that loads a single review from disk on each access.

    Parameters
    ----------
    data_pairs: list
        (review file path, label) pairs
    vocab: dict
        token -> index lookup table handed to generate_vec
    '''
    def __init__(self, data_pairs, vocab):
        self.vocab = vocab
        self.data_pairs = data_pairs

    def __len__(self):
        # one sample per (path, label) pair
        return len(self.data_pairs)

    def __getitem__(self, idx):
        # pair[0] is the review path, pair[1] is its label
        pair = self.data_pairs[idx]
        tokens = load_review(pair[0])
        return generate_vec(tokens, self.vocab), pair[1]
        

# Custom collate_fn: right-pad sequences of different lengths with 0 so that
# every sequence in the batch ends up with the same length
def collate_fn(batch):
    """Pad a batch of (token_ids, label) samples to a common length.

    Parameters
    ----------
    batch: list
        (token_ids, label) tuples, one per sample

    Returns
    -------
    batch_corpus: list
        one 1-D integer numpy array per sample, right-padded with 0 up to the
        length of the longest sequence in the batch
    label: tuple
        the labels, in the same order as ``batch``
    length: list
        the unpadded length of every sequence
    """
    corpus, label = zip(*batch)

    length = [len(tokens) for tokens in corpus]
    max_length = max(length)

    batch_corpus = []
    for tokens in corpus:
        # ``np.int`` was removed in NumPy 1.24 (AttributeError); the builtin
        # ``int`` is exactly the type it used to alias
        padded = np.zeros(max_length, dtype=int)
        padded[:len(tokens)] = tokens
        batch_corpus.append(padded)

    return batch_corpus, label, length

time: 0 ns


In [254]:
# wrap the (path, label) pairs and the vocabulary in the custom dataset
custom_dst = dataset(review_pairs, vocab)

# draw 4 random indices with replacement
custom_sampler = RandomSampler(
    data_source=custom_dst,
    replacement=True,
    num_samples=4,
)

# batches are assembled by collate_fn
data_loader = DataLoader(
    custom_dst,
    batch_size=4,
    sampler=custom_sampler,
    collate_fn=collate_fn,
)

# fetch and display the first batch
next(iter(data_loader))

([array([35836, 33160, 80674, 15366,  8600,    42,  4832, 55412,  5367,
         28644, 85850, 19044,    -1, 83948, 66435, 67594, 55238, 18778,
         62423, 10341,  1185, 19755, 58948, 15634, 29091, 76665, 57092,
         27248, 73312, 50060, 16869, 33350, 68022, 72260, 27209, 34479,
         23291, 65476, 60009, 26557, 31460, 26760, 10949, 58026, 31076,
         29736, 13395, 55005, 13790, 34944, 57357,  9160, 14510, 36618,
         50773, 84415, 67662, 33802, 65030, 40923, 80107, 30953, 89109,
         33989, 47310, 83229, 36957,  1006,  4202, 29859, 29373, 55536,
         13764, 80170, 47967,  2907,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,

time: 47 ms


In [256]:
# Use PyTorch's RandomSampler to draw indices and build the dataloader
# NOTE(review): the placeholder below was left unfilled; this cell reuses the
# `data_loader` object created in an earlier cell
### <your code> ###
next(iter(data_loader))

([array([69033, 14358, 57082, 33160,  6919, 77574, 74247, 52295, 25021,
         76503, 27291,   704,  2111, 70445, 59345, 88145, 57092, 47047,
         66351, 61667,  1069, 33345, 27813, 70457, 35470, 69232,  7125,
         69567, 49944, 82476, 33660, 64039,  9245, 45507, 36902, 88882,
         55005, 62036, 80970, 68742, 66914, 51009, 60097,   216, 47386,
         35303, 39591,    -1, 79089, 37482, 78895,  5660, 85596, 55710,
         72206, 75180, 66932, 83890, 67460,  7525, 18778, 62423, 33876,
         76011, 68371, 35676, 76549, 24710, 59753, 64263, 51031, 67315,
         62072, 10658, 83034, 36558, 55731, 20959, 34479, 22340, 20770,
          9803, 56262, 47414, 68788,  2504, 26188, 82546, 11688, 49834,
         39108, 65030, 14598, 69294, 30463, 25827, 69299, 76041, 42937,
         76043, 42373,    -1, 57328, 43271, 14960, 84610, 32228, 50011,
         51899, 34106, 18820, 48963, 58948,  2188, 59797, 61048, 83952,
         83953, 49158, 81886, 45747, 13058, 87867, 68431, 51746,

time: 78 ms
