In [53]:
from torchviz import make_dot, make_dot_from_trace
from transformers import AutoModel, AutoTokenizer 
import torch
import pickle 
import numpy as np
import pandas as pd 
import re
from tqdm import tqdm
import seaborn as sns
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score
from torch import nn
import os
import torch.nn.functional as F
import torch.optim as optim
import time
# from sklearn.metrics import classification_report
# from Attention_Augmented_Conv2d.attention_augmented_conv import AugmentedConv
# # use_cuda = torch.cuda.is_available()
# device = torch.device('cuda' if use_cuda else 'cpu')
# from ark_tweet_pos import CMUTweetTagger
# import shlex
# run_tagger_cmd = "java -XX:ParallelGCThreads=10 -Xmx500m -jar ark_tweet_pos/ark-tweet-nlp-0.3.2.jar"
# import FeaturesText
import wandb
wandb.login()
from sklearn.metrics import accuracy_score

In [54]:
class Attention(nn.Module):
    """ Applies attention mechanism on the `context` using the `query`.

    **Thank you** to IBM for their initial implementation of :class:`Attention`. Here is
    their `License
    <https://github.com/IBM/pytorch-seq2seq/blob/master/LICENSE>`__.

    Args:
        dimensions (int): Dimensionality of the query and context.
        attention_type (str, optional): How to compute the attention score:

            * dot: :math:`score(H_j,q) = H_j^T q`
            * general: :math:`score(H_j, q) = H_j^T W_a q`

        dropout (float, optional): Dropout probability applied to the output after the
            ``tanh``. Defaults to ``0.3``.

    Raises:
        ValueError: If ``attention_type`` is neither ``'dot'`` nor ``'general'``.

    Example:

         >>> attention = Attention(256)
         >>> query = torch.randn(5, 1, 256)
         >>> context = torch.randn(5, 5, 256)
         >>> output, weights = attention(query, context)
         >>> output.size()
         torch.Size([5, 1, 256])
         >>> weights.size()
         torch.Size([5, 1, 5])
    """

    def __init__(self, dimensions, attention_type='general', dropout=0.3):
        super(Attention, self).__init__()

        if attention_type not in ['dot', 'general']:
            raise ValueError('Invalid attention type selected.')

        self.attention_type = attention_type
        if self.attention_type == 'general':
            # W_a of the "general" score; "dot" attention has no input projection.
            self.linear_in = nn.Linear(dimensions, dimensions, bias=False)

        # Projects the concatenation [mix; query] back down to `dimensions`.
        self.linear_out = nn.Linear(dimensions * 2, dimensions, bias=False)
        self.softmax = nn.Softmax(dim=-1)
        self.tanh = nn.Tanh()
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, context):
        """
        Args:
            query (:class:`torch.FloatTensor` [batch size, output length, dimensions]): Sequence of
                queries to query the context.
            context (:class:`torch.FloatTensor` [batch size, query length, dimensions]): Data
                over which to apply the attention mechanism.

        Returns:
            :class:`tuple` with `output` and `weights`:
            * **output** (:class:`torch.FloatTensor` [batch size, output length, dimensions]):
              Tensor containing the attended features, i.e. ``dropout(tanh(W_out [mix; query]))``.
              With ``attention_type='general'`` the concatenated query is the projected one.
            * **weights** (:class:`torch.FloatTensor` [batch size, output length, query length]):
              Tensor containing attention weights; each row sums to 1 over the query length.
        """
        if self.attention_type == "general":
            # nn.Linear acts on the last dimension only, so the 3-D query can be
            # projected directly without flattening the batch/length axes.
            query = self.linear_in(query)

        # TODO: Include mask on PADDING_INDEX?

        # (batch_size, output_len, dimensions) * (batch_size, dimensions, query_len) ->
        # (batch_size, output_len, query_len)
        attention_scores = torch.bmm(query, context.transpose(1, 2))

        # Normalise over the context positions (softmax is taken over dim=-1).
        attention_weights = self.softmax(attention_scores)

        # (batch_size, output_len, query_len) * (batch_size, query_len, dimensions) ->
        # (batch_size, output_len, dimensions)
        mix = torch.bmm(attention_weights, context)

        # concat -> (batch_size, output_len, 2 * dimensions)
        combined = torch.cat((mix, query), dim=2)

        # output -> (batch_size, output_len, dimensions)
        output = self.linear_out(combined)
        output = self.dropout(self.tanh(output))

        return output, attention_weights

In [55]:
class baseline(nn.Module):
    """Two-class classifier built from conv1d/attention stages, a BiGRU and an MLP.

    The input is expected as ``(N, 4, 768)``: ``conv1d`` takes 4 input channels and
    the attention module is built for 768-dimensional vectors. Three stages reduce
    the channel axis 4 -> 3 -> 2 -> 1; in each stage a kernel-size-1 convolution
    produces the query and the stage's input is the attention context. The single
    ``Attention`` instance (and therefore its weights) is shared by all three stages.
    The remaining ``(N, 1, 768)`` tensor goes through a 2-layer bidirectional GRU
    (2 x 384 = 768 output features) and a 768-512-256-128-64-2 dense stack.
    """

    @staticmethod
    def _relu_init(layer):
        """Xavier-uniform initialise ``layer.weight`` with the ReLU gain; return ``layer``."""
        nn.init.xavier_uniform_(layer.weight, gain=nn.init.calculate_gain('relu'))
        return layer

    def __init__(self):
        super(baseline, self).__init__()

        # Every layer is initialised right after it is created, so the order in
        # which random numbers are drawn under a fixed seed stays deterministic.
        self.conv1d = self._relu_init(nn.Conv1d(4, 3, kernel_size=1))
        self.attention = Attention(768)
        self.conv1d2 = self._relu_init(nn.Conv1d(3, 2, kernel_size=1))
        self.conv1d3 = self._relu_init(nn.Conv1d(2, 1, kernel_size=1))
        self.bgru = nn.GRU(input_size=768, hidden_size=384, num_layers=2, batch_first=True, bidirectional=True)
        self.drop = nn.Dropout(0.2)
        self.drop2 = nn.Dropout(0.3)
        self.drop3 = nn.Dropout(0.4)
        self.dense1 = self._relu_init(nn.Linear(768, 512))
        self.dense2 = self._relu_init(nn.Linear(512, 256))
        self.dense3 = self._relu_init(nn.Linear(256, 128))
        self.dense4 = self._relu_init(nn.Linear(128, 64))
        # Output layer keeps PyTorch's default initialisation.
        self.dense5 = nn.Linear(64, 2)

    def forward(self, input1):
        """Compute class logits.

        Args:
            input1 (torch.Tensor): Input of shape ``(N, 4, 768)``.

        Returns:
            torch.Tensor: Unnormalised logits of shape ``(N, 2)``.
        """
        # Stage 1: query (N, 3, 768) attends over the raw input (N, 4, 768).
        input_sentence = F.relu(self.conv1d(input1))
        input_sentence = self.drop(input_sentence)
        attention_1, _ = self.attention(input_sentence, input1)

        # Stage 2: query (N, 2, 768) attends over the stage-1 output.
        input_sentence = F.relu(self.conv1d2(attention_1))
        input_sentence = self.drop(input_sentence)
        attention_2, _ = self.attention(input_sentence, attention_1)

        # Stage 3: query (N, 1, 768) attends over the stage-2 output.
        input_sentence = F.relu(self.conv1d3(attention_2))
        input_sentence = self.drop(input_sentence)
        attention_3, _ = self.attention(input_sentence, attention_2)  # N x 1 x 768

        # The GRU sees a length-1 sequence; its output is N x 1 x 768.
        gru, _ = self.bgru(attention_3)
        gru = self.drop2(gru)
        flattening = torch.squeeze(gru, 1)  # N x 768

        dense = F.relu(self.dense1(flattening))
        dense = self.drop2(dense)
        dense = F.relu(self.dense2(dense))
        dense = self.drop3(dense)
        # dense3 gets a ReLU like the other hidden layers; without it dense3 and
        # dense4 form a single linear map whenever dropout is inactive (eval mode).
        dense = F.relu(self.dense3(dense))
        dense = self.drop2(dense)
        dense = F.relu(self.dense4(dense))
        dense = self.drop2(dense)
        output = self.dense5(dense)

        return output

In [56]:

# Instantiate the network; its weights are freshly initialised (nothing is loaded here).
model = baseline()

In [61]:
from torch.utils.tensorboard import SummaryWriter

# No writer is opened in this cell: the graph-logging cell creates its own
# SummaryWriter, so one opened here would be rebound without ever being used
# or closed, leaving a dangling event file behind.

In [62]:
# All-zero example batch of shape (16, 4, 768), matching the (batch, 4, 768)
# input that `baseline` expects; it is passed to `add_graph` as the sample input.
x = torch.zeros(16, 4, 768)

In [63]:
# Log the model graph to TensorBoard using the default `runs/` log directory.
# The context manager closes the writer even if tracing the model raises.
with SummaryWriter() as tb:
    tb.add_graph(model, x)