In [1]:
import torch
import torch.nn as nn 
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as torch_data

In [None]:
# Sanity checks on the prepared train/test data. Each triple-quoted string
# below appears to record the output seen when the prints above it were run.

# --- training split ---
print('longest train comment length',train_max_len)
print('longest train title length',train_max_title_len)
print(len(train_text))
print(len(train_title))
print(train_labels.shape)

"""
longest train comment length 244
longest train title length 13
1528
1528
(1528,)
"""

# --- test split ---
print('longest test comment length', test_max_len)
print('longest test title length', test_max_title_len)
print(len(test_text))
print(len(test_title))
print(test_labels.shape)

"""
longest test comment length 126
longest test title length 19
102
102
(102,)
"""

# --- embedded arrays: (max_length, num_examples, embed_dim) ---
print(train_data_array.shape, train_title_array.shape)
print(test_data_array.shape, test_title_array.shape)
"""
(244, 1528, 300) (13, 1528, 300)
(126, 102, 300) (19, 102, 300)
"""

# NOTE: the arrays above are sequence-first (L, N, D). The models in this
# notebook that are built with batch_first = True document their input as
# (N, L, D), so arrays in this layout need their first two axes swapped
# before being fed to those models.
"""
data_array.shape = (244, 1528, 300)
data_labels.shape = (1528,)
data is an (L,N,D) array
L = max_length of sequence
N = batch_size
D = embed_dim
"""

In [2]:
class BaseModel(nn.Module): # single direction lstm, no attention
  """
  Baseline binary classifier: single-layer, unidirectional LSTM without attention.

  The LSTM outputs of all time steps are summed into one vector per example,
  which is then mapped to a probability by
  Linear(hidden_size, 150) -> ReLU -> Linear(150, 1) -> Sigmoid.

  Args:
    hidden_size: size of the LSTM hidden state.
    embed_dim: size of each input embedding vector.
  """
  def __init__(self, hidden_size = 100, embed_dim = 300):
    super(BaseModel, self).__init__()

    self.hidden_size = hidden_size

    # NOTE: no `dropout` argument here. nn.LSTM only applies dropout *between*
    # stacked layers, so with num_layers = 1 it is a no-op that merely emits a
    # UserWarning. Leaving it out keeps the computation identical.
    self.lstm = nn.LSTM(input_size = embed_dim, hidden_size = hidden_size, num_layers = 1, batch_first = True, bidirectional = False)

    # two linear layers: summed hidden states => binary classification
    self.linear1 = nn.Linear(hidden_size, 150)
    self.linear2 = nn.Linear(150, 1)

    self.relu = nn.ReLU()

    self.sigmoid = nn.Sigmoid()

  def forward(self, data):
    """
    data is an (N, L, D) = (batch_size, max_length, embed_dim) tensor
    returns an (N,) tensor of probabilities that each comment is hateful
    """
    hidden_states, _ = self.lstm(data)
    # hidden_states: (batch_size, max_length, hidden_size)

    # Every time step contributes equally to the sum (no masking is applied).
    sentences = torch.sum(hidden_states, dim = 1) # => (batch_size, hidden_size)

    logits = self.linear2(self.relu(self.linear1(sentences))) # => (batch_size, 1)

    # Squeeze only the trailing singleton axis: a bare squeeze() would also
    # drop the batch axis when batch_size == 1 and return a 0-dim tensor.
    return self.sigmoid(logits.squeeze(-1))

In [None]:
class Full_LSTM_Model(nn.Module): # generalized lstm class
  """
  Configurable single-layer LSTM binary classifier (batch-first input).

  The per-time-step LSTM outputs are pooled into one vector per example,
  either by a plain sum (attention = False) or by a learned, softmax-normalised
  weighted sum (attention = True). The pooled vector goes through
  Linear(., 150) -> ReLU -> Linear(150, 1) -> Sigmoid.

  Args:
    hidden_size: size of the LSTM hidden state (per direction).
    embed_dim: size of each input embedding vector.
    bidi: use a bidirectional LSTM.
    attention: use attention pooling; requires bidi = True.

  Raises:
    AssertionError: if attention is requested with bidi = False.
  """
  def __init__(self, hidden_size = 100, embed_dim = 300, bidi = True, attention = True):
    super(Full_LSTM_Model, self).__init__()
    # attention only if the LSTM is bidirectional
    if attention: assert bidi, "attention = True requires bidi = True"

    self.hidden_size = hidden_size
    self.attention = attention

    # Width of each LSTM output vector; both the classifier head and the
    # attention scorer consume it, so compute it once.
    lstm_out_dim = 2*hidden_size if bidi else hidden_size

    # NOTE: no `dropout` argument here. nn.LSTM only applies dropout *between*
    # stacked layers, so with num_layers = 1 it is a no-op that merely emits a
    # UserWarning. Leaving it out keeps the computation identical.
    self.lstm = nn.LSTM(input_size = embed_dim, hidden_size = hidden_size, num_layers = 1, batch_first = True, bidirectional = bidi)

    # two linear layers: pooled LSTM outputs => binary classification
    self.linear1 = nn.Linear(lstm_out_dim, 150)
    self.linear2 = nn.Linear(150, 1)

    if self.attention:
      # small MLP that maps each hidden state vector to a scalar score
      self.attention1 = nn.Linear(lstm_out_dim, 50)
      self.attention2 = nn.Linear(50, 1)
      self.sm = nn.Softmax(dim = 1) # normalise scores over the sequence axis

    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()

  def forward(self, data):
    """
    data is an (N, L, D) = (batch_size, max_length, embed_dim) tensor
    returns an (N,) tensor of probabilities that each comment is hateful
    """
    hidden_states, _ = self.lstm(data)
    # hidden_states: (batch_size, max_length, 2*hidden_size if bidi else hidden_size)

    # Attention here is a choice of coefficients used to weight the hidden
    # states when summing them, instead of adding them up with equal weight.
    #
    # TODO: add masks so that we aren't operating on all the hidden states since the padded ones don't matter!
    # TODO: average sentences in non attention case instead of summing them
    if self.attention:
      scores = self.attention2(self.relu(self.attention1(hidden_states))) # (batch_size, max_length, 1)

      # Squeeze only the trailing axis. A bare squeeze() would also remove the
      # batch axis (batch_size == 1) or the length axis (max_length == 1) and
      # make the dim = 1 softmax fail.
      alphas = self.sm(scores.squeeze(-1)) # (batch_size, max_length)

      sentences = torch.sum(hidden_states * alphas.unsqueeze(-1), dim = 1) # (batch_size, lstm_out_dim)
    else:
      sentences = torch.sum(hidden_states, dim = 1) # (batch_size, lstm_out_dim)

    output = self.linear2(self.relu(self.linear1(sentences))) # => (batch_size, 1)

    # Keep the batch axis even when batch_size == 1.
    return self.sigmoid(output.squeeze(-1)) # => (batch_size,)

In [None]:
class FullModel(nn.Module): # bidi with attention
  """
  Bidirectional single-layer LSTM classifier with attention pooling.

  Unlike a batch-first model, this one takes sequence-first (L, N, D) input.
  Each LSTM output is scored by a small MLP, the scores are softmax-normalised
  over the sequence axis, and the weighted sum of LSTM outputs is mapped to a
  probability by Linear(2H, H) -> ReLU -> Linear(H, 1) -> Sigmoid.

  Args:
    hidden_size: size H of the LSTM hidden state (per direction).
    embed_dim: size D of each input embedding vector.
  """
  def __init__(self, hidden_size = 100, embed_dim = 300):
    super().__init__()

    self.hidden_size = hidden_size

    self.linear1 = nn.Linear(2*hidden_size, hidden_size) # map context vector to value
    self.linear2 = nn.Linear(hidden_size, 1)

    self.attention1 = nn.Linear(2*hidden_size, 50) # map hidden state vector to value
    self.attention2 = nn.Linear(50, 1)

    self.relu = nn.ReLU()

    self.sm = nn.Softmax(dim = 0) # normalise over the sequence axis (dim 0)
    self.sigmoid = nn.Sigmoid()

    # NOTE: no `dropout` argument here. nn.LSTM only applies dropout *between*
    # stacked layers, so with num_layers = 1 it is a no-op that merely emits a
    # UserWarning. Leaving it out keeps the computation identical.
    self.lstm = nn.LSTM(input_size = embed_dim, hidden_size = hidden_size, num_layers = 1, batch_first = False, bidirectional = True)

  def forward(self, data):
    """
    data is an (L,N,D) tensor
    L = max_length of sequence
    N = batch_size
    D = embed_dim
    returns an (N,) tensor of probabilities that each comment is hateful
    """
    hidden_states, _ = self.lstm(data) # (L,N,2H)
    weights = self.attention2(self.relu(self.attention1(hidden_states))) # (L,N,1)

    # Attention coefficients: for each example they sum to 1 over the L axis.
    alpha = self.sm(weights.squeeze(-1)) # (L,N)

    # Weighted sum of hidden states over the sequence axis. Broadcasting alpha
    # over the feature axis replaces the earlier moveaxis round-trip.
    sentences = torch.sum(hidden_states * alpha.unsqueeze(-1), dim = 0) # (N,2H)

    output = self.linear2(self.relu(self.linear1(sentences))) # (N,1)

    # Squeeze only the trailing singleton axis: a bare squeeze() would also
    # drop the batch axis when N == 1 and return a 0-dim tensor.
    return self.sigmoid(output.squeeze(-1))