In [56]:
import torch
from torch import nn
import pandas as pd
import numpy as np
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm, tqdm_notebook

In [57]:
class IMDBDataset:
    def __init__(self, csv_path, mode='train', valid_ratio=0.15, test_ratio=0.1):
        self.mode = mode
        self.data_info = pd.read_csv(csv_path)
        self.data_len = (int)(len(self.data_info))
        self.train_valid_len = (int)(self.data_len * (1 - test_ratio))
        self.train_len = (int)(self.train_valid_len * (1 - valid_ratio))

        if self.mode == 'train':
            self.train_text = np.asarray(self.data_info.iloc[0:self.train_len, 0])
            self.train_label = np.asarray(self.data_info.iloc[0:self.train_len, 1])
            self.text_arr = self.train_text
            self.label_arr = self.train_label
        elif self.mode == 'valid':
            self.valid_text = np.asarray(self.data_info.iloc[self.train_len:, 0])
            self.valid_label = np.asarray(self.data_info.iloc[self.train_len:, 1])
            self.text_arr = self.valid_text
            self.label_arr = self.valid_label
        elif self.mode == 'test':
            self.test_text = np.asarray(self.data_info.iloc[self.train_valid_len:, 0])
            self.text_arr = self.test_text
            # 测试集的标签最后才用到
            self.test_label = np.asarray(self.data_info.iloc[self.train_valid_len:, 1])
        else:
            raise ValueError
            
        self.real_len = len(self.text_arr)
        
        print('Finished reading the {} set of Leaves Dataset ({} samples found)'.format(mode, self.real_len))
        
    def __getitem__(self, index):
        if self.mode != 'test':
            return self.text_arr[index], class2num[self.label_arr[index]]
        else:
            return self.text_arr[index], 'None'
        
    
    def __len__(self):
        return self.real_len

In [58]:
# csv_path = './IMDB Dataset.csv'
# train_dataset = IMDBDataset(csv_path, 'train')
# valid_dataset = IMDBDataset(csv_path, 'valid')
# test_dataset = IMDBDataset(csv_path, 'test')
# test_dataset[1]

In [59]:
def get_device():
    return 'cuda' if torch.cuda.is_available() else 'cpu'

device = get_device()
device

'cpu'

In [60]:
class Sequence(Dataset):
    """ 文本预处理 """
    def __init__(self, data, max_seq_len):
        # 句子的最长长度 少于: 用<pad>填补 多余: 截断
        self.max_seq_len = max_seq_len
        df = data
        
        # BOW 以后换Bert
        vectorizer = CountVectorizer(stop_words="english", min_df=0.015)
        vectorizer.fit(df.review.tolist())
        
        # 创建词汇表
        self.token2idx = vectorizer.vocabulary_
        self.token2idx['<pad>'] = max(self.token2idx.values()) + 1
        
        # 词元化工具
        tokenizer = vectorizer.build_analyzer()
        
        # 词元转换为整数索引
        self.encode = lambda x : [self.token2idx[token] for token in tokenizer(x) if token in self.token2idx]
        # 将少于max_seq_len的地方用特殊符号 <pad> 补齐
        self.pad = lambda x : x + (max_seq_len - len(x)) * [self.token2idx['<pad>']]
        
        sequences = [self.encode(sequence)[:max_seq_len] for sequence in df.review.tolist()]
        sequences, self.labels = zip(*[(sequence, label) for sequence, label in zip(sequences, df.label.tolist()) if sequence])
        
        # 填补
        self.sequences = [self.pad(sequence) for sequence in sequences]
        
    def __getitem__(self, index):
        assert len(self.sequences[index]) == self.max_seq_len
        return self.sequences[index], self.labels[index]
    
    def __len__(self):
        return len(self.sequences)

In [61]:
data = pd.read_csv("./IMDB Dataset.csv")
data['label'] = data['sentiment']
del data['sentiment']
data

Unnamed: 0,review,label
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [62]:
labeling = {
    'positive': 1,
    'negative': 0
}

In [63]:
# 将标签转化为整数 积极对应1 消极对应0
data['label'] = data['label'].apply(lambda x : labeling[x])
data

Unnamed: 0,review,label
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [64]:
data.shape

(50000, 2)

In [65]:
# 实例化
dataset = Sequence(data, max_seq_len=128)

In [66]:
dataset.token2idx

{'mentioned': 575,
 'watching': 999,
 'just': 476,
 'episode': 279,
 'll': 531,
 'right': 755,
 'exactly': 290,
 'happened': 404,
 'br': 97,
 'thing': 917,
 'scenes': 778,
 'violence': 983,
 'set': 799,
 'word': 1019,
 'sex': 802,
 'classic': 146,
 'use': 965,
 'called': 111,
 'given': 380,
 'state': 863,
 'city': 144,
 'face': 303,
 'high': 421,
 'home': 428,
 'death': 208,
 'far': 319,
 'away': 59,
 'say': 772,
 'main': 548,
 'appeal': 40,
 'fact': 304,
 'goes': 385,
 'shows': 814,
 'wouldn': 1029,
 'forget': 356,
 'pretty': 700,
 'pictures': 665,
 'audiences': 55,
 'romance': 761,
 'doesn': 240,
 'mess': 576,
 'saw': 771,
 'couldn': 180,
 'watched': 998,
 'taste': 903,
 'got': 390,
 'kill': 481,
 'order': 635,
 'middle': 579,
 'class': 145,
 'turned': 951,
 'lack': 494,
 'street': 873,
 'experience': 298,
 'viewing': 981,
 'touch': 936,
 'wonderful': 1018,
 'little': 527,
 'production': 708,
 'old': 629,
 'time': 926,
 'gives': 381,
 'sense': 793,
 'entire': 277,
 'piece': 666,
 'ac

In [67]:
def collate(batch):
    inputs = torch.LongTensor([item[0] for item in batch])
    target = torch.FloatTensor([item[1] for item in batch])
    return inputs, target

batch_size = 2048
train_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate, num_workers=5)

In [68]:
class RNN(nn.Module):
    """ LSTM """
    def __init__(self, vocab_size, batch_size, embedding_size=100, hidden_size=128, n_layers=1, device='cpu'):
        super(RNN, self).__init__()
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.device = device
        self.encoder = nn.Embedding(vocab_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers=n_layers, batch_first=True)
        self.decoder = nn.Linear(hidden_size, 1)
        
    def init_hidden(self):
        """ 初始化隐藏状态 hidden-state cell """
        return (torch.randn(self.n_layers, self.batch_size, self.hidden_size).to(self.device),
                torch.randn(self.n_layers, self.batch_size, self.hidden_size).to(self.device))
    
    def forward(self, inputs):
        """ 传播 """
        batch_size = inputs.size(0)
        if batch_size != self.batch_size:
            self.batch_size = batch_size
        X = self.encoder(inputs)
        output, state = self.rnn(X, self.init_hidden())
        # 取最后一个输出
        output = self.decoder(output[:, :, -1]).squeeze()
        return output

In [69]:
model = RNN(
    vocab_size=len(dataset.token2idx),
    batch_size=batch_size,
    hidden_size=128,
    device=device
)
model = model.to(device)
model

RNN(
  (encoder): Embedding(1046, 100)
  (rnn): LSTM(100, 128, batch_first=True)
  (decoder): Linear(in_features=128, out_features=1, bias=True)
)

In [70]:
criterion = nn.BCEWithLogitsLoss()
lr = 0.001
optimizer = torch.optim.Adam([p for p in model.parameters() if p.requires_grad], lr=lr)

In [71]:
model.train()
train_losses = []
num_epochs = 5
model_path = "D:\\AI_Study\\Kaggle\\Sentiment Analysis\\models\\sentiment_analysis_v0.pth"
best_acc = 0.0

In [73]:
if __name__ == '__main__':
    for epoch in range(num_epochs):
        progress_bar = tqdm_notebook(train_loader, leave=False)
        losses = []
        accuracy = []
        total = 0
        for inputs, target in progress_bar:
            inputs, target = inputs.to(device), target.to(device)
            # 梯度清零 因为pytorch会保留上一次计算的梯度
            model.zero_grad()
            output = model(inputs)
            # 损失
            loss = criterion(loss, target)
            # 精确度
            acc = (output.argmax(dim=-1) == target.to(device)).float().mean()
            # 计算图反向传播
            loss.backward()
            # 梯度裁剪 以防梯度爆炸等问题
            nn.utils.clip_grad_norm_(model.parameters(), 3)
            # 更新参数
            optimizer.step()
            # 保存损失和精确度
            losses.append(loss)
            accuracy.append(acc)
            # 进度条显示loss
            progress_bar.set_description(f'Loss {loss.item():.3f}')
            # 迭代次数 +1
            total += 1
        #每个epoch的损失
        epoch_loss = sum(losses) / total
        epoch_acc = sum(accuracy) / total
        train_losses.append(epoch_loss)
        if epoch_acc > best_acc:
            best_acc = epoch_acc
            torch.save(model.state_dict(), model_path)
            print('saving model with acc {:.3f} in epoch {}'.format(best_acc, epoch + 1))

        # 进度条显示迭代信息
        tqdm.write(f'Epoch #{epoch + 1}\tTrain Loss: {epoch_loss:.3f}')
    print('Done!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n')
    

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  progress_bar = tqdm_notebook(train_loader, leave=False)


  0%|          | 0/25 [00:00<?, ?it/s]

AttributeError: Can't pickle local object 'Sequence.__init__.<locals>.<lambda>'