In [254]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset,DataLoader
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook
import torch.nn.functional as F
from argparse import Namespace
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
args = Namespace(data_path = '/Users/lijunlin/Documents/研二上/pytorch/data/6000.xlsx',
                columns = ['Sentences','Class'],
                maxlen = 100,
                embedding_dim = 2000,
                numwords = 2000,
                batch_size = 64,
                epochs = 20,
                shuffle = 2000)
df = pd.read_excel(args.data_path,columns = args.columns)

In [255]:
class word_preprocessing:
    def __init__(self,corpus):
        self.sentences = list(corpus)
    def sentences_de_comma(self,inputs):
        pattern = re.compile(r'[^\w\s]')
        return [pattern.sub('',str(x)) for x in inputs]
    def sentences_de_english(self,inputs):
        pattern = re.compile(r'[a-zA-Z]')
        return [pattern.sub('',str(x)) for x in inputs]
    def washing(self):
        inputs = self.sentences
        outputs = self.sentences_de_comma(inputs)
        outputs = self.sentences_de_english(outputs)
        return outputs
corpus = word_preprocessing(df['Sentences']).washing()

In [256]:
class MyVectorizer:
    def __init__(self,corpus,numwords):
        self.corpus = corpus
        self.model = Tokenizer(num_words = args.numwords)
        self.model.fit_on_texts(corpus)
    def vectorize(self):
        self.sequences = self.model.texts_to_sequences(corpus)
        return self.sequences
    def padding(self,maxlen):
        return pad_sequences(self.vectorize(), maxlen, padding='post')
    def get_index_word(self):
        return self.model.index_word
    def get_word_index(self):
        index_2_word_dic = self.model.index_word
        word_2_index_dic = {x:y for y,x in index_2_word_dic.items()}
        return word_2_index_dic
vectorizer = MyVectorizer(corpus,args.numwords)
X = vectorizer.padding(args.maxlen)
index_2_word = vectorizer.get_index_word()
word_2_index = vectorizer.get_word_index()
y = np.asarray(df.iloc[:,-1])

In [257]:
class X_y_preprocessing:
    def __init__(self,X,y):
        self.X = torch.LongTensor(X)
        self.y = torch.LongTensor(y)
        self.X_train,self.X_test,self.y_train,self.y_test = train_test_split(self.X,self.y,test_size= 0.3)
    def get_db_train(self):
        db_train = TensorDataset(self.X_train,self.y_train)
        db_train = DataLoader(db_train,batch_size = args.batch_size,shuffle = True, drop_last= True)
        return db_train
    def get_db_test(self):
        db_test = TensorDataset(self.X_test,self.y_test)
        db_test = DataLoader(db_test, batch_size = args.batch_size, shuffle = True, drop_last=True)
        return db_test
loader = X_y_preprocessing(X,y)
db_train = loader.get_db_train()
db_test = loader.get_db_test()
X_train = loader.X_train
y_train = loader.y_train

In [258]:
class Attention(nn.Module):
    def __init__(self,h_dim):
        super().__init__()
        self.h_dim = h_dim
        self.projection = nn.Sequential(nn.Linear(h_dim,64),
                                       nn.ReLU(True),
                                       nn.Linear(64,1))
    def forward(self,x):
        energy = self.projection(x)  ###[b,ml,h_dim（*2）] -> [b,ml,1] (消掉隐藏层维度)   
        attn_score = F.softmax(energy.squeeze(-1),dim = 1).unsqueeze(-1)   ##[b,ml,1]->[b,ml]->[b,ml,1]    
        outputs = torch.bmm(x.transpose(1,2),attn_score).squeeze()   ##[b,ml,h_dim（*2）]*[b,ml,1]   
        #outputs_2 = (x*attn_score).sum(dim=1)   #[b,ml,h_dim]*[b,ml,1]
        return outputs,attn_score
class Attention_LSTM(nn.Module):
    def __init__(self,numwords,batch_size,embedding_dim,hidden_size,bidirectional,output_dim,num_layers):
        super(Attention_LSTM,self).__init__()
        self.running_loss = 0
        self.direction_num = int(bidirectional)+1
        self.output_dim = output_dim
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(num_embeddings = numwords,embedding_dim = embedding_dim)
        self.lstm = nn.LSTM(input_size = embedding_dim,
                           hidden_size = hidden_size,
                           num_layers = num_layers,
                           batch_first = True,
                           bidirectional = bidirectional,
                           dropout = 0.1)
        self.attention = Attention(h_dim = hidden_size)
        self.fc = nn.Sequential(nn.Linear(hidden_size,output_dim),
                                nn.Softmax(dim = 1))
    def forward(self,x):
        embedded = self.embedding(x)
        lstm_output,h_state = self.lstm(embedded)
        lstm_output = lstm_output[:,:,:hidden_size]+lstm_output[:,:,hidden_size:]
        attn_output,attn_weights = self.attention(lstm_output)
        output = self.fc(attn_output.view(batch_size, -1))
        return output,attn_weights

In [259]:
def get_num_batches(db):
    y = len(list(enumerate(db)))
    return y
def get_accruacy(out,y):
    out_class = out.max(1)[1].numpy()
    acc_num = np.sum(np.array(np.equal(out_class,y)))
    acc = acc_num/len(y)
    return acc

In [260]:
def train(data_loader, model, optimizer, criterion):
    model.train()
    epoch_loss = 0
    epoch_acc= 0
    for batch in enumerate(data_loader):
        i,(X,y) = batch
        optimizer.zero_grad()
        outputs, _ = model(X)
        loss = criterion(outputs,y)
        acc = get_accruacy(outputs,y)
        loss.backward()
        optimizer.step()
        epoch_loss += (loss.item()-epoch_loss)/(i+1)
        epoch_acc  += (acc-epoch_acc)/(i+1)
        train_bar.update()
        train_bar.set_postfix(loss = epoch_loss,acc = epoch_acc)
    train_bar.n = 0
    epoch_bar.update()
    epoch_bar.set_postfix(loss = epoch_loss,
                         acc = epoch_acc)
def evaluate(data_loader,model,criterion):
    model.eval()
    eval_loss = 0
    eval_acc = 0
    for batch in enumerate(data_loader):
        i,(X,y) = batch
        outputs, _ = model(X)
        loss = criterion(outputs,y)
        acc = get_accruacy(outputs,y)
        eval_loss += (loss.item()-eval_loss)/(i+1)
        eval_acc += (acc-eval_acc)/(i+1)
        eval_bar.set_postfix(loss = eval_loss, acc = eval_acc)
        eval_bar.update()
        
        
    

In [261]:
hidden_size = 128
max_len = 100
output_dim = 3
bidirectional = True
embedding_dim = args.embedding_dim
numwords = args.numwords
batch_size = args.batch_size
num_layers = 1
epochs = args.epochs
model = Attention_LSTM(numwords=numwords,
                       batch_size=batch_size,
                       embedding_dim=embedding_dim,
                       hidden_size=hidden_size,bidirectional=bidirectional,output_dim=output_dim,num_layers=num_layers)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

In [263]:
from tqdm import tqdm_notebook
train_bar = tqdm_notebook(desc='split=train',
                         total=get_num_batches(db_train),
                         position=1,
                         leave=True)
epoch_bar = tqdm_notebook(desc='training routine',
                          total=epochs,
                          position=0)
eval_bar = tqdm_notebook(desc="split=evaluate",
                        total=get_num_batches(db_test),
                        position=2,
                        leave= True)
for epoch in range(epochs):
    train(db_train,model,optimizer,criterion)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, description='split=train', max=65.0, style=ProgressStyle(description_w…

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, description='training routine', max=20.0, style=ProgressStyle(descript…

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if sys.path[0] == '':


HBox(children=(FloatProgress(value=0.0, description='split=train', max=28.0, style=ProgressStyle(description_w…

In [266]:
evaluate(db_test,model,criterion)

In [286]:
embedding_weight=np.array(model.embedding.weight.tolist())