### 載入套件

In [1]:
%load_ext autotime

time: 0 ns


In [2]:
# Import torch and other required modules
import glob
import torch
import re
import nltk
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.datasets import load_svmlight_file
from nltk.corpus import stopwords

nltk.download('stopwords') #下載stopwords
nltk.download('punkt') #下載word_tokenize需要的corpus

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\perry.wei\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\perry.wei\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

time: 5.8 s


### 探索資料與資料前處理
在train資料中，有分成pos(positive)與neg(negative)，分別為正評價與負評價，此評價即為label。

In [3]:
# 讀取字典，這份字典為review內所有出現的字詞
with open('./aclImdb/imdb.vocab', mode='r', encoding='utf-8') as f:
    a = f.readlines()
    vocab = ['.'+i.replace('\n','')  for i in a]

# 以nltk stopwords移除贅字，過多的贅字無法提供有用的訊息，也可能影響模型的訓練
print(f"vocab length before removing stopwords: {len(vocab)}")

vocab = [i for i in vocab if i not in stopwords.words(fileids='english')]

print(f"vocab length after removing stopwords: {len(vocab)}")

# 將字典轉換成dictionary
vocab_dic = {}
for i, j in enumerate(vocab):
    vocab_dic[j] = i

vocab_dic

vocab length before removing stopwords: 89527
vocab length after removing stopwords: 89527


{'.the': 0,
 '.and': 1,
 '.a': 2,
 '.of': 3,
 '.to': 4,
 '.is': 5,
 '.it': 6,
 '.in': 7,
 '.i': 8,
 '.this': 9,
 '.that': 10,
 '.was': 11,
 '.as': 12,
 '.for': 13,
 '.with': 14,
 '.movie': 15,
 '.but': 16,
 '.film': 17,
 '.on': 18,
 '.not': 19,
 '.you': 20,
 '.he': 21,
 '.are': 22,
 '.his': 23,
 '.have': 24,
 '.be': 25,
 '.one': 26,
 '.!': 27,
 '.all': 28,
 '.at': 29,
 '.by': 30,
 '.an': 31,
 '.who': 32,
 '.they': 33,
 '.from': 34,
 '.so': 35,
 '.like': 36,
 '.there': 37,
 '.her': 38,
 '.or': 39,
 '.just': 40,
 '.about': 41,
 '.if': 42,
 '.has': 43,
 '.out': 44,
 '.what': 45,
 '.?': 46,
 '.some': 47,
 '.good': 48,
 '.more': 49,
 '.when': 50,
 '.she': 51,
 '.very': 52,
 '.even': 53,
 '.my': 54,
 '.no': 55,
 '.up': 56,
 '.time': 57,
 '.would': 58,
 '.which': 59,
 '.only': 60,
 '.story': 61,
 '.really': 62,
 '.their': 63,
 '.see': 64,
 '.had': 65,
 '.can': 66,
 '.were': 67,
 '.me': 68,
 '.we': 69,
 '.than': 70,
 '.well': 71,
 '.much': 72,
 '.been': 73,
 '.get': 74,
 '.people': 75,
 '.will

time: 30.2 s


In [4]:
# 將資料打包成(x, y)配對，其中x為review的檔案路徑，y為正評(1)或負評(0)
# 這裡將x以檔案路徑代表的原因是讓同學練習不一次將資料全讀取進來，若電腦記憶體夠大(所有資料檔案沒有很大)
# 可以將資料全一次讀取，可以減少在訓練時I/O時間，增加訓練速度

review_pairs = []
tmp = [(i,1) for i in glob.glob('./aclImdb/train/pos/*.txt')]
review_pairs.extend(tmp)
tmp = [(i,0) for i in glob.glob('./aclImdb/train/neg/*.txt')]
review_pairs.extend(tmp)

print(review_pairs[:2])
print(f"Total reviews: {len(review_pairs)}")

[('./aclImdb/train/pos\\0_9.txt', 1), ('./aclImdb/train/pos\\10000_8.txt', 1)]
Total reviews: 25000
time: 125 ms


### 建立Dataset與DataLoader讀取資料
這裡我們會需要兩個helper functions，其中一個是讀取資料與清洗資料的函式(load_review)，另外一個是生成詞向量BoW的函式
(generate_bow)

In [30]:
def load_review(review_path, vocab_dic):
    
    with open(review_path, mode='r') as f:
        s = f.readlines()

    tmp_s = ''
    for i in s:
        tmp_s += i

    tmp_s = tmp_s.replace('\n','')
        
    #移除non-alphabet符號、贅字與tokenize
    pattern = r"[^a-zA-Z]"
    tmp_s = re.sub(pattern, ' ', tmp_s)
    tmp_s = tmp_s.split(' ')
    
    tmp_s = [i for i in tmp_s if i not in stopwords.words(fileids='english')]
    
    review = generate_bow(tmp_s, vocab_dic)
    
    return review

time: 0 ns


In [31]:
def generate_bow(review, vocab_dic):
    bag_vector = np.zeros(len(vocab_dic))
    for word in review:
        if vocab_dic.get(word):
            bag_vector[vocab_dic.get(word)] += 1
            
    return bag_vector

time: 0 ns


In [35]:
class dataset(Dataset):
    '''custom dataset to load reviews and labels
    Parameters
    ----------
    data_pairs: list
        directory of all review-label pairs
    vocab: list
        list of vocabularies
    '''
    def __init__(self, data_dirs, vocab):
        self.review_path = data_dirs
        self.vocab = vocab

    def __len__(self):
        return len(self.review_path)

    def __getitem__(self, idx):
        ###<your code>###
        review_path = self.review_path[idx][0]
        rank = self.review_path[idx][1]
        
        return (load_review(review_path, self.vocab), rank)
        

time: 0 ns


In [42]:
# 建立客製化dataset
custom_dst = dataset(review_pairs, vocab_dic)
custom_dst[10]

(array([0., 0., 0., ..., 0., 0., 0.]), 1)

time: 125 ms


In [53]:
# 建立dataloader
custom_dataloader = DataLoader(dataset=custom_dst, batch_size=4, shuffle=True)
next(iter(custom_dataloader))

[tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64),
 tensor([1, 0, 1, 0])]

time: 625 ms
