In [1]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [2]:
from transformers import *

I0105 02:43:51.116637 139743199024896 file_utils.py:39] PyTorch version 1.3.0 available.
I0105 02:43:51.389599 139743199024896 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [3]:
from tqdm import tqdm_notebook as tqdm
import preprocessor as p
import numpy as np
import matplotlib.pyplot as plt
import random
from collections import Counter
import spacy
from tqdm import tqdm, tqdm_notebook, tnrange
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

In [14]:
class RecArch(nn.Module):
    """Recurrent text classifier / encoder: embedding -> LSTM/GRU/RNN -> linear.

    Args:
        vocab_size: number of entries in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: hidden size of the recurrent layer, per direction.
        output_dim: number of outputs produced by the final linear layer.
        num_layers: number of stacked recurrent layers.
        bidir: if truthy, the recurrent layer is bidirectional.
        rnnType: 'lstm', 'gru' or 'rnn'.
        device: stored as ``self.device`` for callers that read it. ``forward``
            itself does not move tensors; it runs wherever the module's
            parameters and the input already live.

    Raises:
        ValueError: if ``rnnType`` is not one of the supported values.
    """

    SUPPORTED_RNN_TYPES = ('lstm', 'gru', 'rnn')

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, bidir, rnnType,device):
        super(RecArch, self).__init__()

        # Fail fast: an unknown type used to leave the module without `recNN`
        # and only blew up (AttributeError) on the first forward pass.
        if rnnType not in self.SUPPORTED_RNN_TYPES:
            raise ValueError(
                "rnnType must be one of %s, got %r" % (self.SUPPORTED_RNN_TYPES, rnnType)
            )

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers
        self.device = device
        self.rnnType = rnnType
        self.bidirectional = bidir
        self.numDirs = 2 if self.bidirectional else 1

        # Layers are created in the same order as before (emb, recNN, fc) so a
        # fixed random seed still yields the same initial weights.
        self.emb = nn.Embedding(self.vocab_size, embedding_dim)

        if rnnType == 'lstm':
            self.recNN = nn.LSTM(embedding_dim, hidden_dim, num_layers,
                                 batch_first=True, bidirectional=self.bidirectional)
        elif rnnType == 'gru':
            self.recNN = nn.GRU(embedding_dim, hidden_dim, num_layers,
                                batch_first=True, bidirectional=self.bidirectional)
        else:
            self.recNN = nn.RNN(embedding_dim, hidden_dim, num_layers, batch_first=True,
                                nonlinearity='tanh', bidirectional=self.bidirectional)

        self.fc = nn.Linear(self.numDirs * hidden_dim, output_dim)

    def forward(self,x,encMode=False):
        """Encode a batch of token ids.

        Args:
            x: integer tensor of token ids whose first dimension is the batch;
                the embedded result is reshaped to
                ``(x.size(0), -1, embedding_dim)``.
            encMode: if True, return the recurrent features of the last time
                step instead of the linear layer's output.

        Returns:
            ``(batch, output_dim)`` tensor from ``self.fc``, or
            ``(batch, numDirs * hidden_dim)`` features when ``encMode`` is True.
        """
        embs = self.emb(x).view(x.size(0), -1, self.embedding_dim)

        # When no initial state is passed, the recurrent layers start from
        # zeros, which is what the hand-built h0/c0 tensors used to provide.
        out, _ = self.recNN(embs)

        # NOTE(review): with bidir=True the reverse direction's features at the
        # last position have only processed the final token; kept as-is so the
        # meaning of existing trained weights does not change.
        last_step = out[:, -1, :]

        if encMode:
            return last_step
        return self.fc(last_step)

In [None]:
class RecAttnArch(nn.Module):
    """Recurrent encoder with an attention layer on top of the RNN outputs.

    Two attention variants are available:

    * ``'dot'``: a learned query vector scores every time step; the softmax
      weighted sum of the RNN outputs is the feature vector.
    * ``'self'``: multiplicative self-attention (``out @ W @ out^T``) whose
      attended sequence is flattened into one feature vector.

    Args:
        vocab_size: number of entries in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: hidden size of the recurrent layer, per direction.
        output_dim: number of outputs produced by the final linear layer.
        num_layers: number of stacked recurrent layers.
        bidir: if truthy, the recurrent layer is bidirectional.
        rnnType: 'lstm', 'gru' or 'rnn'.
        attnType: 'dot' or 'self'.
        device: stored as ``self.device`` for callers that read it; ``forward``
            runs wherever the module's parameters and the input already live.
        seq_len: sequence length the 'self' classifier head is sized for
            (previously hard-coded to 30, which remains the default).

    Raises:
        ValueError: if ``rnnType`` or ``attnType`` is not supported.
    """

    SUPPORTED_RNN_TYPES = ('lstm', 'gru', 'rnn')
    SUPPORTED_ATTN_TYPES = ('dot', 'self')

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, bidir, rnnType, attnType,device, seq_len=30):
        super(RecAttnArch, self).__init__()

        # Fail fast instead of leaving `recNN` / `fc` undefined until forward().
        if rnnType not in self.SUPPORTED_RNN_TYPES:
            raise ValueError(
                "rnnType must be one of %s, got %r" % (self.SUPPORTED_RNN_TYPES, rnnType)
            )
        if attnType not in self.SUPPORTED_ATTN_TYPES:
            raise ValueError(
                "attnType must be one of %s, got %r" % (self.SUPPORTED_ATTN_TYPES, attnType)
            )

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers
        self.device = device
        self.rnnType = rnnType
        self.attnType = attnType
        self.bidirectional = bidir
        self.seq_len = seq_len
        self.numDirs = 2 if self.bidirectional else 1

        # Width of one RNN output vector (both directions concatenated).
        feat_dim = self.numDirs * self.hidden_dim

        self.emb = nn.Embedding(self.vocab_size, embedding_dim)

        if rnnType == 'lstm':
            self.recNN = nn.LSTM(embedding_dim, hidden_dim, num_layers,
                                 batch_first=True, bidirectional=self.bidirectional)
        elif rnnType == 'gru':
            self.recNN = nn.GRU(embedding_dim, hidden_dim, num_layers,
                                batch_first=True, bidirectional=self.bidirectional)
        else:
            self.recNN = nn.RNN(embedding_dim, hidden_dim, num_layers, batch_first=True,
                                nonlinearity='tanh', bidirectional=self.bidirectional)

        # Cast before wrapping so the attribute is always a registered
        # nn.Parameter, whatever the default dtype is.
        self.query_vector = nn.Parameter(torch.rand(feat_dim, 1).float())

        self.attnWgtMatrixSize = [feat_dim, feat_dim]
        self.attnWgtMatrix = nn.Parameter(torch.randn(self.attnWgtMatrixSize).float()) # Multiplicative Attention

        # Normalises the 'dot' scores across dim 1 (the time axis of `out`).
        self.softmax = nn.Softmax(dim=1)

        if attnType == 'dot':
            self.fc = nn.Linear(feat_dim, output_dim)
        else:
            # 'self' attention flattens the whole attended sequence.
            self.fc = nn.Linear(feat_dim * seq_len, output_dim)

    def forward(self,x,encMode=False):
        """Encode a batch of token ids.

        Args:
            x: integer tensor of token ids whose first dimension is the batch;
                the embedded result is reshaped to
                ``(x.size(0), -1, embedding_dim)``.
            encMode: if True, return the attention features without applying
                the final linear layer.

        Returns:
            The linear layer's output, or the attention features when
            ``encMode`` is True: one vector per sample for 'dot', the flattened
            attended sequence for 'self'.
        """
        embs = self.emb(x).view(x.size(0), -1, self.embedding_dim)

        # Without an explicit initial state the recurrent layers start from
        # zeros, which is what the hand-built h0/c0 tensors used to provide.
        out, _ = self.recNN(embs)

        if self.attnType == 'dot':
            # Score every time step against the query vector, then take the
            # attention-weighted sum over time.
            attn_weights = self.softmax(out.matmul(self.query_vector))
            fc_out = torch.sum(out.mul(attn_weights), dim=1)
        else:
            # scores[b, i, j] = out[b, i] @ W @ out[b, j]
            scores = torch.bmm(torch.matmul(out, self.attnWgtMatrix), out.permute(0, 2, 1))
            scores = F.softmax(scores, dim=2)
            hidden_matrix = torch.bmm(scores, out)
            fc_out = hidden_matrix.view(-1, hidden_matrix.size(1) * hidden_matrix.size(2))

        if encMode:
            return fc_out
        return self.fc(fc_out)

In [None]:
class AttentionTextEncoder(nn.Module):
    """Builds a vocabulary from raw text and trains a ``RecAttnArch`` on it.

    The constructor tokenises ``X``, builds ``word2idx``, creates train and
    validation dataloaders and instantiates the attention model.

    Args:
        encoderType: must be 'attn'.
        params: dict with keys 'embedding_dim', 'hidden_dim', 'output_dim',
            'num_layers', 'bidir', 'rnnType' and 'attnType'.
        X: iterable of raw text strings.
        y: labels aligned with ``X``.
        device: device the model is created on and trained on.

    Raises:
        ValueError: if ``encoderType`` is not 'attn'.
    """

    def __init__(self, encoderType, params, X, y, device):
        super(AttentionTextEncoder, self).__init__()

        # Previously any other value silently produced an object without a model.
        if encoderType != 'attn':
            raise ValueError("encoderType must be 'attn', got %r" % (encoderType,))

        self.encoderType = encoderType
        self.dataProcessed = False
        # Remembered so forward()/trainModel() do not rely on a notebook global.
        self.device = device
        if params['attnType'] == 'self':
            self.seq_dim = 30
        else:
            self.seq_dim = 1

        self.preprocess(X,y)

        self.textEncoder = RecAttnArch(len(self.word2idx), params['embedding_dim'], params['hidden_dim'],
                                       params['output_dim'], params['num_layers'], params['bidir'],
                                       params['rnnType'],params['attnType'],device).to(device)

    def tokenize(self,text):
        """Run the `preprocessor` tokenizer (URL, mention, emoji and hashtag options) and split on whitespace."""
        p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI ,p.OPT.HASHTAG)
        return p.tokenize(text).split()

    def indexer(self,split_text):
        """Map tokens to vocabulary ids (case-insensitive); unknown tokens map to '_UNK'."""
        unk_id = self.word2idx['_UNK']
        return [self.word2idx.get(w.lower(), unk_id) for w in split_text]

    def pad_data(self, s, maxlen=30):
        """Return ``s`` as an int64 array of length ``maxlen``: truncated, or right-padded with 0."""
        padded = np.zeros((maxlen,), dtype=np.int64)
        clipped = s[:maxlen]
        padded[:len(clipped)] = clipped
        return padded

    def preprocess(self,X,y):
        """Build ``word2idx`` and the train/validation dataloaders from raw text.

        Sets ``self.word2idx``, ``self.batch_size``, ``self.train_dataloader``
        and ``self.validation_dataloader``.

        Returns:
            True once the data has been processed.
        """
        clean_text = [self.tokenize(x) for x in X]

        words = Counter()
        for sent in tqdm(clean_text):
            words.update(w.lower() for w in sent)

        # Most frequent words first; ids 0 and 1 are reserved for padding and
        # out-of-vocabulary tokens.
        words = ['_PAD','_UNK'] + sorted(words, key=words.get, reverse=True)
        self.word2idx = {o:i for i,o in enumerate(words)}

        # One (N, maxlen) array instead of a list of per-sentence tensors, so
        # the tensor conversion below is a single copy.
        paddedX = np.array([self.pad_data(self.indexer(x)) for x in clean_text])

        train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(paddedX, y, random_state=2018)

        train_inputs = torch.tensor(train_inputs)
        validation_inputs = torch.tensor(validation_inputs)

        train_labels = torch.tensor(train_labels)
        validation_labels = torch.tensor(validation_labels)

        self.batch_size = 100

        train_data = TensorDataset(train_inputs, train_labels)
        self.train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=self.batch_size)

        validation_data = TensorDataset(validation_inputs, validation_labels)
        self.validation_dataloader = DataLoader(validation_data, sampler=SequentialSampler(validation_data), batch_size=self.batch_size)

        self.dataProcessed = True
        return self.dataProcessed

    def forward(self,text):
        """Encode one raw text string with the wrapped model in encoder mode."""
        token_ids = self.indexer(self.tokenize(text))
        x = torch.tensor(self.pad_data(token_ids)).view(-1, 30, 1)
        # Follow the model's current device (trainModel leaves it on the CPU).
        x = x.to(next(self.textEncoder.parameters()).device)
        return self.textEncoder(x,True)

    @staticmethod
    def _snapshotState(model):
        """Return a CPU copy of ``model.state_dict()`` detached from the live weights."""
        state = model.state_dict()
        for key in list(state.keys()):
            state[key] = state[key].detach().cpu().clone()
        return state

    def _collectValidationPredictions(self, model, device, seq_dim):
        """Run ``model`` over the validation loader; return ``(labels, predictions)`` as lists."""
        allLabels = []
        allPreds = []
        was_training = model.training
        model.eval()
        try:
            # No gradients are needed for evaluation.
            with torch.no_grad():
                for val_text, val_label in self.validation_dataloader:
                    val_text = val_text.view(-1, seq_dim, 1).to(device)
                    logits = model(val_text)
                    allPreds += logits.argmax(dim=1).cpu().tolist()
                    allLabels += val_label.cpu().tolist()
        finally:
            model.train(was_training)
        return allLabels, allPreds

    def trainModel(self):
        """Train the wrapped model with Adam and print validation metrics every 50 steps.

        Training runs on the device passed to the constructor. The previous
        code only bound ``device`` when CUDA was available (and then to 'cpu'),
        so it raised UnboundLocalError on CPU-only machines.

        The weights with the best validation accuracy are stored as a copy in
        ``self.optimalParams``. The model is left on the CPU afterwards.
        """
        device = self.device

        model = self.textEncoder.to(device)
        model.device = device
        optimizer = torch.optim.Adam(model.parameters(),lr=0.01)
        criterion = torch.nn.CrossEntropyLoss()

        count = 0
        seq_dim = 30
        num_epochs = 200

        maxAcc = 0

        for epoch in tqdm(range(num_epochs)):
            for text, label in self.train_dataloader:
                text = text.view(-1, seq_dim, 1).to(device)
                label = label.to(device)

                optimizer.zero_grad()
                outputs = model(text)

                loss = criterion(outputs, label)
                loss.backward()
                optimizer.step()
                count += 1

                if count % 50 == 0:
                    allLabels, allPreds = self._collectValidationPredictions(model, device, seq_dim)

                    valacc = accuracy_score(allLabels, allPreds)
                    recscore = recall_score(allLabels, allPreds,average='macro')
                    precscore = precision_score(allLabels, allPreds,average='macro')
                    f1score = f1_score(allLabels, allPreds,average='macro')
                    cr = classification_report(allLabels, allPreds)
                    print(f'acc: {valacc} recall {recscore} prec: {precscore} f1: {f1score}')
                    print(cr)

                    if valacc > maxAcc:
                        maxAcc = valacc
                        # state_dict() shares storage with the live weights, so
                        # copy it or later steps would overwrite the "best" set.
                        self.optimalParams = self._snapshotState(model)

        model.device = 'cpu'
        self.textEncoder = model.to('cpu')

In [None]:
class TextEncoder(nn.Module):
    """Builds a vocabulary from raw text and trains a ``RecArch`` on it.

    The constructor tokenises ``X``, builds ``word2idx``, creates train and
    validation dataloaders and instantiates the recurrent model.

    Args:
        encoderType: must be 'rnn'.
        params: dict with keys 'embedding_dim', 'hidden_dim', 'output_dim',
            'num_layers', 'bidir' and 'rnnType'.
        X: iterable of raw text strings.
        y: labels aligned with ``X``.
        device: device the model is created on and trained on.

    Raises:
        ValueError: if ``encoderType`` is not 'rnn'.
    """

    def __init__(self, encoderType, params, X, y, device):
        super(TextEncoder, self).__init__()

        # Previously any other value silently produced an object without a model.
        if encoderType != 'rnn':
            raise ValueError("encoderType must be 'rnn', got %r" % (encoderType,))

        self.encoderType = encoderType
        self.dataProcessed = False
        # Remembered so forward()/trainModel() do not rely on a notebook global.
        self.device = device

        self.preprocess(X,y)

        self.textEncoder = RecArch(len(self.word2idx), params['embedding_dim'], params['hidden_dim'],
                                   params['output_dim'], params['num_layers'], params['bidir'],
                                   params['rnnType'],device).to(device)

    def tokenize(self,text):
        """Run the `preprocessor` tokenizer (URL, mention, emoji and hashtag options) and split on whitespace."""
        p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI ,p.OPT.HASHTAG)
        return p.tokenize(text).split()

    def indexer(self,split_text):
        """Map tokens to vocabulary ids (case-insensitive); unknown tokens map to '_UNK'."""
        unk_id = self.word2idx['_UNK']
        return [self.word2idx.get(w.lower(), unk_id) for w in split_text]

    def pad_data(self, s, maxlen=30):
        """Return ``s`` as an int64 array of length ``maxlen``: truncated, or right-padded with 0."""
        padded = np.zeros((maxlen,), dtype=np.int64)
        clipped = s[:maxlen]
        padded[:len(clipped)] = clipped
        return padded

    def preprocess(self,X,y):
        """Build ``word2idx`` and the train/validation dataloaders from raw text.

        Sets ``self.word2idx``, ``self.batch_size``, ``self.train_dataloader``
        and ``self.validation_dataloader``.

        Returns:
            True once the data has been processed.
        """
        clean_text = [self.tokenize(x) for x in X]

        words = Counter()
        for sent in tqdm(clean_text):
            words.update(w.lower() for w in sent)

        # Most frequent words first; ids 0 and 1 are reserved for padding and
        # out-of-vocabulary tokens.
        words = ['_PAD','_UNK'] + sorted(words, key=words.get, reverse=True)
        self.word2idx = {o:i for i,o in enumerate(words)}

        # One (N, maxlen) array instead of a list of per-sentence tensors, so
        # the tensor conversion below is a single copy.
        paddedX = np.array([self.pad_data(self.indexer(x)) for x in clean_text])

        train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(paddedX, y, random_state=2018)

        train_inputs = torch.tensor(train_inputs)
        validation_inputs = torch.tensor(validation_inputs)

        train_labels = torch.tensor(train_labels)
        validation_labels = torch.tensor(validation_labels)

        self.batch_size = 100

        train_data = TensorDataset(train_inputs, train_labels)
        self.train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=self.batch_size)

        validation_data = TensorDataset(validation_inputs, validation_labels)
        self.validation_dataloader = DataLoader(validation_data, sampler=SequentialSampler(validation_data), batch_size=self.batch_size)

        self.dataProcessed = True
        return self.dataProcessed

    def forward(self,text):
        """Encode one raw text string with the wrapped model in encoder mode."""
        token_ids = self.indexer(self.tokenize(text))
        x = torch.tensor(self.pad_data(token_ids)).view(-1, 30, 1)
        # Follow the model's current device (trainModel leaves it on the CPU).
        x = x.to(next(self.textEncoder.parameters()).device)
        return self.textEncoder(x,True)

    @staticmethod
    def _snapshotState(model):
        """Return a CPU copy of ``model.state_dict()`` detached from the live weights."""
        state = model.state_dict()
        for key in list(state.keys()):
            state[key] = state[key].detach().cpu().clone()
        return state

    def _collectValidationPredictions(self, model, device, seq_dim):
        """Run ``model`` over the validation loader; return ``(labels, predictions)`` as lists."""
        allLabels = []
        allPreds = []
        was_training = model.training
        model.eval()
        try:
            # No gradients are needed for evaluation.
            with torch.no_grad():
                for val_text, val_label in self.validation_dataloader:
                    val_text = val_text.view(-1, seq_dim, 1).to(device)
                    logits = model(val_text)
                    allPreds += logits.argmax(dim=1).cpu().tolist()
                    allLabels += val_label.cpu().tolist()
        finally:
            model.train(was_training)
        return allLabels, allPreds

    def trainModel(self):
        """Train the wrapped model with Adam and print validation metrics every 50 steps.

        Training runs on the device passed to the constructor. The previous
        code only bound ``device`` when CUDA was available (and then to 'cpu'),
        so it raised UnboundLocalError on CPU-only machines.

        The weights with the best validation accuracy are stored as a copy in
        ``self.optimalParams``. The model is left on the CPU afterwards.
        """
        device = self.device

        model = self.textEncoder.to(device)
        model.device = device
        optimizer = torch.optim.Adam(model.parameters(),lr=0.01)
        criterion = torch.nn.CrossEntropyLoss()

        count = 0
        seq_dim = 30
        num_epochs = 200

        maxAcc = 0

        for epoch in tqdm(range(num_epochs)):
            for text, label in self.train_dataloader:
                text = text.view(-1, seq_dim, 1).to(device)
                label = label.to(device)

                optimizer.zero_grad()
                outputs = model(text)

                loss = criterion(outputs, label)
                loss.backward()
                optimizer.step()
                count += 1

                if count % 50 == 0:
                    allLabels, allPreds = self._collectValidationPredictions(model, device, seq_dim)

                    valacc = accuracy_score(allLabels, allPreds)
                    recscore = recall_score(allLabels, allPreds,average='macro')
                    precscore = precision_score(allLabels, allPreds,average='macro')
                    f1score = f1_score(allLabels, allPreds,average='macro')
                    cr = classification_report(allLabels, allPreds)
                    print(f'acc: {valacc} recall {recscore} prec: {precscore} f1: {f1score}')
                    print(cr)

                    if valacc > maxAcc:
                        maxAcc = valacc
                        # state_dict() shares storage with the live weights, so
                        # copy it or later steps would overwrite the "best" set.
                        self.optimalParams = self._snapshotState(model)

        model.device = 'cpu'
        self.textEncoder = model.to('cpu')

In [7]:
class BertTextEncoder(nn.Module):
    """Fine-tunes a pretrained 'bert-base-uncased' model as a 4-class text classifier.

    Two variants are supported:

    * ``'bert'``: ``BertModel`` plus a separate ``Linear(768, 4)`` head
      (``self.fc``) applied to ``outputs[1]``.
    * ``'bertSC'``: ``BertForSequenceClassification`` with its built-in head.

    Args:
        encoderType: 'bert' or 'bertSC'.
        params: accepted for interface parity with the other encoders; unused.
        X: iterable of raw text strings.
        y: labels aligned with ``X``.
        device: device the model is created on and trained on.

    Raises:
        ValueError: if ``encoderType`` is not supported.
    """

    SUPPORTED_ENCODER_TYPES = ('bert', 'bertSC')

    def __init__(self, encoderType, params, X, y, device):
        super(BertTextEncoder, self).__init__()

        # Validate before the expensive tokenizer/model downloads; any other
        # value used to produce an object without a model.
        if encoderType not in self.SUPPORTED_ENCODER_TYPES:
            raise ValueError(
                "encoderType must be one of %s, got %r" % (self.SUPPORTED_ENCODER_TYPES, encoderType)
            )

        self.encoderType = encoderType
        self.dataProcessed = False
        # Remembered so trainModel() does not depend on CUDA being available.
        self.device = device
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.preprocess(X,y)

        if encoderType == 'bert':
            self.textEncoder = BertModel.from_pretrained("bert-base-uncased", num_labels = 4,
                                                         output_hidden_states=True).to(device)
            self.fc = torch.nn.Linear(768,4).to(device)
        else:
            self.textEncoder = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                                             num_labels = 4,).to(device)

    def pad_data(self, s, maxlen=30):
        """Return ``s`` as an int64 array of length ``maxlen``: truncated, or right-padded with 0."""
        padded = np.zeros((maxlen,), dtype=np.int64)
        clipped = s[:maxlen]
        padded[:len(clipped)] = clipped
        return padded

    def forward(self,text):
        """Encode one raw text string and return ``outputs[1]`` of the model, flattened.

        NOTE(review): ``outputs[1]`` is what the 'bert' head consumes during
        training; what index 1 holds for 'bertSC' depends on the installed
        transformers version — confirm before using forward() with 'bertSC'.
        """
        x = torch.tensor(self.tokenizer.encode(text, add_special_tokens=True)).unsqueeze(0)
        # Follow the model's current device (it starts on `device` and is left
        # on the CPU by trainModel).
        x = x.to(next(self.textEncoder.parameters()).device)
        outputs = self.textEncoder(x)
        return outputs[1].reshape(-1)

    def preprocess(self,X,y):
        """Tokenise ``X`` and build the train/validation dataloaders.

        Each text is encoded with special tokens and padded/truncated to 30
        ids; the attention mask is 1 wherever the id is greater than 0.

        Sets ``self.batch_size``, ``self.train_dataloader`` and
        ``self.validation_dataloader``.

        Returns:
            True once the data has been processed.
        """
        # Reuse the tokenizer loaded in __init__ instead of loading a second copy.
        paddedX = np.array([self.pad_data(self.tokenizer.encode(x, add_special_tokens=True)) for x in X])
        attention_masks = (paddedX > 0).astype(np.int64)

        # A single call splits inputs, masks and labels with the same indices.
        (train_inputs, validation_inputs,
         train_masks, validation_masks,
         train_labels, validation_labels) = train_test_split(paddedX, attention_masks, y, random_state=2018)

        train_inputs = torch.tensor(train_inputs)
        validation_inputs = torch.tensor(validation_inputs)

        train_labels = torch.tensor(train_labels)
        validation_labels = torch.tensor(validation_labels)

        train_masks = torch.tensor(train_masks)
        validation_masks = torch.tensor(validation_masks)

        self.batch_size = 32

        train_data = TensorDataset(train_inputs, train_masks, train_labels)
        self.train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=self.batch_size)

        validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
        self.validation_dataloader = DataLoader(validation_data, sampler=SequentialSampler(validation_data), batch_size=self.batch_size)

        self.dataProcessed = True
        return self.dataProcessed

    @staticmethod
    def _snapshotState(model):
        """Return a CPU copy of ``model.state_dict()`` detached from the live weights."""
        state = model.state_dict()
        for key in list(state.keys()):
            state[key] = state[key].detach().cpu().clone()
        return state

    def _collectValidationPredictions(self, model, device):
        """Put ``model`` in eval mode, run the validation loader, return ``(labels, predictions)``."""
        allLabels = []
        allPreds = []
        model.eval()
        with torch.no_grad():
            for batch in self.validation_dataloader:
                b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
                outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask)
                if self.encoderType == 'bert':
                    logits = self.fc(outputs[1])
                else:
                    logits = outputs[0]
                allPreds += logits.argmax(dim=1).cpu().tolist()
                allLabels += b_labels.cpu().tolist()
        return allLabels, allPreds

    def trainModel(self):
        """Fine-tune for 4 epochs with AdamW and print validation metrics after each epoch.

        Fixes over the previous version:

        * runs on the device given to the constructor (``device`` used to be
          unbound when CUDA was unavailable);
        * only touches ``self.fc`` for the 'bert' variant ('bertSC' has no such
          attribute and used to raise AttributeError);
        * for 'bert', the ``fc`` head is now part of the optimiser, so it is
          actually trained;
        * ``self.optimalParams`` is a copy of the best encoder weights rather
          than a live view that later steps overwrite. It covers
          ``self.textEncoder`` only, as before.

        The model (and the 'bert' head) are left on the CPU, in eval mode.
        """
        device = self.device

        model = self.textEncoder.to(device)
        trainable = list(model.parameters())
        if self.encoderType == 'bert':
            self.fc = self.fc.to(device)
            trainable += list(self.fc.parameters())

        optimizer = AdamW(trainable,
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )
        criterion = nn.CrossEntropyLoss()
        maxAcc = 0
        epochs = 4

        for epoch_i in range(0, epochs):
            model.train()

            for step, batch in enumerate(self.train_dataloader):
                b_input_ids = batch[0].to(device)
                b_input_mask = batch[1].to(device)
                b_labels = batch[2].to(device)

                # Clears the gradients of the encoder and, for 'bert', the head.
                optimizer.zero_grad()

                if self.encoderType == 'bertSC':
                    # With labels supplied, the first output is the loss.
                    outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask,
                                labels=b_labels)
                    loss = outputs[0]
                else:
                    outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask,)
                    loss = criterion(self.fc(outputs[1]), b_labels)

                loss.backward()
                torch.nn.utils.clip_grad_norm_(trainable, 1.0)

                optimizer.step()

            allLabels, allPreds = self._collectValidationPredictions(model, device)

            valacc = accuracy_score(allLabels, allPreds)
            recscore = recall_score(allLabels, allPreds,average='macro')
            precscore = precision_score(allLabels, allPreds,average='macro')
            f1score = f1_score(allLabels, allPreds,average='macro')
            cr = classification_report(allLabels, allPreds)

            print(f'acc: {valacc} recall {recscore} prec: {precscore} f1: {f1score}')
            print(cr)
            print('\n')

            if valacc > maxAcc:
                maxAcc = valacc
                self.optimalParams = self._snapshotState(model)

        # Keep the encoder in eval mode so forward() is deterministic
        # (dropout disabled), as it was after the last validation pass before.
        model.eval()
        if self.encoderType == 'bert':
            self.fc = self.fc.to('cpu')
        self.textEncoder = model.to('cpu')

# Build the RNN-based encoder; the constructor also runs preprocess(X, y).
# NOTE(review): `params`, `X`, `y` and `device` are not defined in any cell
# shown here — they must come from earlier notebook state; confirm they exist
# on a fresh kernel.
trainer = TextEncoder('rnn',params,X,y,device)

# Bare expression: displays the wrapped model's repr as the cell output.
trainer.textEncoder

# Train the RNN encoder.
trainer.trainModel()

# Build the BERT sequence-classification variant; `params` is passed as {}.
bertTrainer = BertTextEncoder('bertSC',{},X,y,device)

# Bare expression: displays the wrapped BERT model's repr as the cell output.
bertTrainer.textEncoder

# Fine-tune the BERT encoder.
bertTrainer.trainModel()