<a href="https://colab.research.google.com/github/peeyushsinghal/nlp-debias/blob/main/debias_nlp_sa_da_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
#@title Installing Libraries
! pip install ekphrasis --quiet # library to pre process twitter data
! pip install emoji --upgrade --quiet #library to deal with emoji data

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.8/83.8 KB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.8/52.8 KB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 KB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.9/240.9 KB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for emoji (setup.py) ... [?25l[?25hdone


In [4]:
#@title Import Statements
import pandas as pd
import os
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.autograd import Function
import numpy as np
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import emoji
from tqdm import tqdm
import random
import torch.optim as optim
import scipy.stats as stats
from statistics import mean

import time

from copy import deepcopy


from torchsummary import summary

import matplotlib.pyplot as plt
# import EarlyStopping
# from pytorchtools import EarlyStopping

# from torch_lr_finder import LRFinder


from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau
from tqdm import tqdm_notebook # required for embeddings


In [5]:
#@title Data Loading

#Mounting google drive
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
#@title Data Configuration

BASE_PATH = '/content/drive/MyDrive/semeval-2018'

DATA_DIR = os.path.join(BASE_PATH,'datasets')
TARGET_DIR = os.path.join(BASE_PATH,'targetdataset')

MODEL_DIR = os.path.join(BASE_PATH,'models')
REF_DIR = os.path.join(BASE_PATH,'reference')
EMBEDDINGS_DIR = os.path.join(BASE_PATH,'embeddings')
if not os.path.exists(MODEL_DIR):
  os.makedirs(MODEL_DIR)
  print("The new directory is created!")


In [7]:
#@title Experiment Configuration

MAX_SIZE = 50
MAX_VOCAB_SIZE = 10000

BATCH_SIZE = 16 # based on EWC paper

# EMBEDDING_TO_BE_USED = 'glove_gn' # {'glove', 'glove_gn'}
TARGET_BATCH_SIZE = 8

NUM_EPOCHS = 150
INITIAL_LR = 0.02 
# INITIAL_LR = 0.1
dict_initial_lr = {'EI_sadness': 0.005,
                   'V' : 0.02,
                   'EI_fear': 0.02,
                   'EI_anger' : 0.05,
                   'EI_joy': 0.05  }
                   
dict_emb_file = {'glove':'glove.6B.300d.txt',
                 'glove_gn': '1b-vectors300-0.8-0.8.txt'}

DANN_NUM_EPOCHS = 50
DANN_INITIAL_LR = 0.001

DANN_EWC_NUM_EPOCHS = 50
DANN_EWC_INITIAL_LR = 0.0001
# INITIAL_LR_EWC = INITIAL_LR / NUM_EPOCHS
LR_GAMMA = 0.9
LR = 2e-4

PATIENCE = 15
EWC_LAMBDA = 0.4


BONFERRONI_CORRECTION = 5.0

In [8]:
# Seed and Cuda
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on:{}".format(DEVICE))
# setting seed
SEED = 42 # arbit seed, why 42 - because in hitch hikers guide to galaxy it is answer to everything
# torch.cuda.seed(SEED) 
torch.cuda.manual_seed_all(SEED) if DEVICE == 'cuda' else torch.manual_seed(SEED)

Running on:cpu


<torch._C.Generator at 0x7f3902153af0>

Data Configuration

In [9]:
# data configuration

class TASK1(object):
  
    EI_reg = {
        'anger': {
            'train': os.path.join(
                DATA_DIR, 'task1/EI-reg/training/EI-reg-En-anger-train.txt'),
            'dev': os.path.join(
                DATA_DIR, 'task1/EI-reg/development/2018-EI-reg-En-anger-dev.txt'),
            'gold': os.path.join(
                DATA_DIR, 'task1/EI-reg/test-gold/2018-EI-reg-En-anger-test-gold.txt')
                },
        'joy': {
                'train': os.path.join(
                    DATA_DIR, 'task1/EI-reg/training/EI-reg-En-joy-train.txt'),
                'dev': os.path.join(
                    DATA_DIR, 'task1/EI-reg/development/2018-EI-reg-En-joy-dev.txt'),
                'gold': os.path.join(
                    DATA_DIR, 'task1/EI-reg/test-gold/2018-EI-reg-En-joy-test-gold.txt')
                },
        'fear': {
            'train': os.path.join(
                DATA_DIR, 'task1/EI-reg/training/EI-reg-En-fear-train.txt'),
            'dev': os.path.join(
                DATA_DIR, 'task1/EI-reg/development/2018-EI-reg-En-fear-dev.txt'),
            'gold': os.path.join(
                DATA_DIR, 'task1/EI-reg/test-gold/2018-EI-reg-En-fear-test-gold.txt')
                },
        'sadness': {
            'train': os.path.join(
                DATA_DIR, 'task1/EI-reg/training/EI-reg-En-sadness-train.txt'),
            'dev': os.path.join(
                DATA_DIR, 'task1/EI-reg/development/2018-EI-reg-En-sadness-dev.txt'),
            'gold': os.path.join(
                DATA_DIR, 'task1/EI-reg/test-gold/2018-EI-reg-En-sadness-test-gold.txt')
                }                     
        }

    V_reg = {
        'train': os.path.join(
            DATA_DIR, 'task1/V-reg/2018-Valence-reg-En-train.txt'),
        'dev': os.path.join(
            DATA_DIR, 'task1/V-reg/2018-Valence-reg-En-dev.txt'),
        'gold': os.path.join(
            DATA_DIR, 'task1/V-reg/2018-Valence-reg-En-test-gold.txt')
             }

    EEC = {
        'eec': os.path.join(
            DATA_DIR, 'task1/Equity-Evaluation-Corpus/Equity-Evaluation-Corpus.csv')
             }

In [10]:
def parse_reg(data_file, label_format='tuple'):
    """
    This is for datasets for the EI-reg and V-reg English tasks 
    Returns:
        df: dataframe with columns in the first row of file [ID-Tweet-Affect Dimension-Intensity Score]
    """
    with open(data_file, 'r') as fd:
      data = [l.strip().split('\t') for l in fd.readlines()]
    df = pd.DataFrame (data[1:],columns=data[0])
    return df


In [11]:
def parse_csv(task, dataset, emotion='anger'):
    if task == 'EI-reg':
        data_train = TASK1.EI_reg[emotion][dataset]
        df = parse_reg(data_train)
        return df

    elif task == 'V-reg':
        data_train = TASK1.V_reg[dataset]

        df = parse_reg(data_train)
        return df

    else:
        return None

In [12]:
emotions = ['anger','joy','fear','sadness']
dict_data ={'train':'train','dev':'val','gold':'test'}
dict_file_name ={}
for emotion in emotions:
  file_name = str('EI_'+ emotion)
  dict_file_name[file_name] = {}
  for data_info, data_usage in dict_data.items():
    dict_file_name[file_name][data_usage] = parse_csv('EI-reg', data_info, emotion)

dict_file_name['V'] ={}
for data_info, data_usage in dict_data.items():
  dict_file_name['V'][data_usage] = parse_csv('V-reg', data_info, emotion)

(dict_file_name.keys())

dict_keys(['EI_anger', 'EI_joy', 'EI_fear', 'EI_sadness', 'V'])

In [13]:
#@title Pre-process Tweets

In [14]:
# dict_file_name['EI_fear']['val'].iloc[0]['Tweet']

In [15]:
# reference : https://github.com/cbaziotis/ekphrasis


text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Word statistics files not found!
Downloading... done!
Unpacking... done!
Reading twitter - 1grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_1grams.txt
Reading twitter - 2grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_2grams.txt
Reading twitter - 1grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [16]:
def emoji_treatment(tag):
  """
  Breaks down :emojia_emojib: into :, emojia , emojib, :
  """
  if len(tag) < 2:
    return [tag]
  else:
    if tag[0]==":" and tag[-1]==":" and len(tag)>1:
      new_tokens = tag[1:-1].split("_")
      new_tokens.insert(0,"<emoji>")
      new_tokens.append("</emoji>")
      return new_tokens
    else:
      return [tag]

In [17]:
def preprocess_tweet(tweet): 

  tweet_processed = text_processor.pre_process_doc(tweet)
  demojized_list =[]
  final_list =[]
  for index, tweet in enumerate(tweet_processed):
      demojized_list.append(emoji.demojize(tweet, language = 'en'))

  
  for index, tag in enumerate(demojized_list):
    for return_tag in emoji_treatment(tag):
      final_list.append(return_tag)

  return final_list

In [18]:
# T =5
# print (dict_file_name['EI_fear']['val'].iloc[T]['Tweet'])
# print( preprocess_tweet(dict_file_name['EI_fear']['val'].iloc[T]['Tweet']))

In [19]:
#@title Building Vocab Related 
def tokenize(texts):
    """Tokenize texts, build vocabulary and find maximum sentence length.
    
    Args:
        texts (List[str]): List of text data
    
    Returns:
        tokenized_texts (List[List[str]]): List of list of tokens
        word2idx (Dict): Vocabulary built from the corpus
        max_len (int): Maximum sentence length
    """

    max_len = 0
    tokenized_texts = []
    word2idx = {}

    # Add <pad> and <unk> tokens to the vocabulary
    word2idx['<pad>'] = 0
    word2idx['<unk>'] = 1

    # Building our vocab from the corpus starting from index 2
    idx = 2
    for sent in texts:
        tokenized_sent = preprocess_tweet(sent)

        # Add `tokenized_sent` to `tokenized_texts`
        tokenized_texts.append(tokenized_sent)

        # Add new token to `word2idx`
        for token in tokenized_sent:
            if token not in word2idx:
                word2idx[token] = idx
                idx += 1

        # Update `max_len`
        max_len = max(max_len, len(tokenized_sent))

    return tokenized_texts, word2idx, max_len


In [20]:
def encode(tokenized_texts, word2idx, max_len):
    """Pad each sentence to the maximum sentence length and encode tokens to
    their index in the vocabulary.

    Returns:
        input_ids (np.array): Array of token indexes in the vocabulary with
            shape (N, max_len). It will the input of our CNN model.
    """

    input_ids = []
    for tokenized_sent in tokenized_texts:
        # Pad sentences to max_len
        tokenized_sent += ['<pad>'] * (max_len - len(tokenized_sent))

        # Encode tokens to input_ids
        input_id = [word2idx.get(token) for token in tokenized_sent]
        input_ids.append(input_id)
    
    return np.array(input_ids)

In [21]:
#@title Load Pretrained Vectors - Glove / GloveGN

dict_emb_file = {'glove':'glove.6B.300d.txt',
                 'glove_gn': '1b-vectors300-0.8-0.8.txt'}
# print("EMBEDDING_TO_BE_USED:", EMBEDDING_TO_BE_USED)   
# emb_file_path = os.path.join(EMBEDDINGS_DIR ,dict_emb_file[EMBEDDING_TO_BE_USED])
# emb_file_path            

In [22]:

def load_pretrained_vectors(word2idx, fname):
    """Load pretrained vectors and create embedding layers.
    
    Args:
        word2idx (Dict): Vocabulary built from the corpus
        fname (str): Path to pretrained vector file

    Returns:
        embeddings (np.array): Embedding matrix with shape (N, d) where N is
            the size of word2idx and d is embedding dimension
    """

    print("Loading pretrained vectors...")
    fin = open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    d = len(fin.readline().split())-1
    # n, d = map(int, fin.readline().split())

    # Initilize random embeddings
    embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), d))
    embeddings[word2idx['<pad>']] = np.zeros((d,))

    # Load pretrained vectors
    count = 0
    for line in tqdm_notebook(fin):
        tokens = line.rstrip().split(' ')
        word = tokens[0]
        if word in word2idx:
            count += 1
            embeddings[word2idx[word]] = np.array(tokens[1:], dtype=np.float32)

    print(f"There are {count} / {len(word2idx)} pretrained vectors found.")

    return embeddings

In [23]:
#@title Embeddings Creation
print("Tokenizing...\n")
dict_data_embedding_properties = {}
for name, dict_df in dict_file_name.items():
  dict_data_embedding_properties[name] = {}
  tokenized_texts, word2idx, max_len = tokenize(dict_df['train']['Tweet'].to_list()) # input is list of sentences
  dict_data_embedding_properties[name]['max_len'] = max_len
  dict_data_embedding_properties[name]['word2idx'] = word2idx
  dict_data_embedding_properties[name]['tokenized_texts'] = tokenized_texts

  input_ids = encode(tokenized_texts, word2idx, max_len)
  dict_data_embedding_properties[name]['input_ids'] = input_ids

  for embedding, embedding_file_name in dict_emb_file.items():
    dict_data_embedding_properties[name][embedding] = {}
    emb_file_path = os.path.join(EMBEDDINGS_DIR ,embedding_file_name)
    embeddings = load_pretrained_vectors(word2idx, emb_file_path) # providing word to index and embedding file
    dict_data_embedding_properties[name][embedding]['embeddings'] = torch.tensor(embeddings)

  

Tokenizing...

Loading pretrained vectors...


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for line in tqdm_notebook(fin):


0it [00:00, ?it/s]

There are 4482 / 4794 pretrained vectors found.
Loading pretrained vectors...


0it [00:00, ?it/s]

There are 3898 / 4794 pretrained vectors found.
Loading pretrained vectors...


0it [00:00, ?it/s]

There are 4365 / 4752 pretrained vectors found.
Loading pretrained vectors...


0it [00:00, ?it/s]

There are 3783 / 4752 pretrained vectors found.
Loading pretrained vectors...


0it [00:00, ?it/s]

There are 5268 / 5650 pretrained vectors found.
Loading pretrained vectors...


0it [00:00, ?it/s]

There are 4510 / 5650 pretrained vectors found.
Loading pretrained vectors...


0it [00:00, ?it/s]

There are 4613 / 4957 pretrained vectors found.
Loading pretrained vectors...


0it [00:00, ?it/s]

There are 4021 / 4957 pretrained vectors found.
Loading pretrained vectors...


0it [00:00, ?it/s]

There are 4121 / 4417 pretrained vectors found.
Loading pretrained vectors...


0it [00:00, ?it/s]

There are 3562 / 4417 pretrained vectors found.


In [24]:
# print(len(dict_data_embedding_properties['EI_sadness']['tokenized_texts'][0]))
# ((dict_data_embedding_properties['EI_sadness']['max_len']))

In [25]:
#@title Dataset Class

class TwitterDataset(Dataset):
  def __init__(self, df_data, word2idx, max_len):

    list_tweet = df_data['Tweet'].to_list()
    list_tweet_preprocessed = [preprocess_tweet(tweet) for tweet in list_tweet]

    list_word_indices = []
    for tweet_preprocessed in list_tweet_preprocessed:
      if len(tweet_preprocessed) > max_len:
          tweet_preprocessed = tweet_preprocessed [: max_len]
      else:
        tweet_preprocessed += ['<pad>'] * (max_len - len(tweet_preprocessed))

      list_word_index =[]
      for token in tweet_preprocessed:
        if word2idx.get(token):
          list_word_index.append(word2idx.get(token))
        elif token == '<pad>':
          list_word_index.append(word2idx.get('<pad>'))
        else:
          list_word_index.append(word2idx.get('<unk>'))
      
      list_word_indices.append(list_word_index)

    self.x = torch.tensor(list_word_indices) #list_word_index #list_tweet_preprocessed
    self.y = torch.tensor(df_data['Intensity Score'].values.astype(float))

  def __len__(self):
    return len(self.y)

  def __getitem__(self,idx):
    return self.x[idx], self.y[idx]


In [26]:
# temp_dataset = TwitterDataset(dict_file_name['EI_anger']['val'], word2idx = dict_data_embedding_properties['EI_anger']['word2idx'], max_len = dict_data_embedding_properties['EI_anger']['max_len'])
# next(iter(temp_dataset))

In [27]:
# (dict_data_embedding_properties['V']['input_ids']) #, len(dict_file_name['V']['test']['Intensity Score'].values)

In [28]:
#@title Creating Datasets

def create_datasets (dict_file_name, dict_data, dict_data_embedding_properties ):
  dict_dataset ={}
  for name in dict_file_name.keys(): # ['EI_anger', 'EI_joy', 'EI_fear', 'EI_sadness', 'V']
    dict_dataset[name]={}
    for usage in dict_data.values(): # {'train':'train','dev':'val','gold':'test'}
      dict_dataset[name][usage] = TwitterDataset(df_data= dict_file_name[name][usage], 
                                                word2idx = dict_data_embedding_properties[name]['word2idx'], 
                                                max_len = dict_data_embedding_properties[name]['max_len'])

  return dict_dataset

In [29]:
#@title Create Dataloader
def data_loader (dict_dataset, train_batch_size = 16):
  dict_dataloader ={}
  for name in dict_file_name.keys():
    train_data = dict_dataset[name]['train']
    val_data = dict_dataset[name]['val']
    test_data = dict_dataset[name]['test']

    # Create DataLoader for training data
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=train_batch_size)

    # Create DataLoader for validation data
    # val_data = TensorDataset(val_inputs, val_labels)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=len(val_data))

    # Create DataLoader for test data
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=len(test_data))
    
    dict_dataloader[name] = {'train': train_dataloader ,'val': val_dataloader,'test' :test_dataloader}
  return dict_dataloader

In [30]:
#@title Instantiate dataset and dataloader
dict_dataset = create_datasets (dict_file_name, dict_data, dict_data_embedding_properties)
dict_dataloader = data_loader (dict_dataset, train_batch_size = 16)

In [31]:
# next(iter(dict_dataloader['V']['train']))

In [32]:
#@title Gradient Reversal Function

class GradientReversalFn(Function):
    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha
        
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        output = grad_output.neg() * ctx.alpha

        return output, None

In [33]:
#@title CNN Model

def conv_block(in_channels, out_channels, dropout_value = 0.1, *args, **kwargs):
  """Returns a conv block"""
  return nn.Sequential(
      nn.Conv1d(in_channels, out_channels, *args, **kwargs),
      nn.BatchNorm1d(out_channels),
      nn.ReLU(),
      nn.Dropout(dropout_value),
      nn.Conv1d(out_channels, out_channels, *args, **kwargs),
      nn.BatchNorm1d(out_channels),
      nn.ReLU(),
      nn.Dropout(dropout_value),
      nn.Conv1d(out_channels, out_channels, *args, **kwargs),
      )

class CNN_NLP(nn.Module):
    """An 1D Convulational Neural Network for Sentence Classification."""
    def __init__(self,
                 pretrained_embedding=None,
                 freeze_embedding=False,
                 vocab_size=None,
                 embed_dim=300,
                 filter_sizes = [2, 3, 4, 5],
                 num_filters = [100, 100, 100, 100],
                 num_classes = 1,
                 dropout=0.2):
        """
        The constructor for CNN_NLP class.

        Args:
            pretrained_embedding (torch.Tensor): Pretrained embeddings with
                shape (vocab_size, embed_dim)
            freeze_embedding (bool): Set to False to fine-tune pretraiend
                vectors. Default: False
            vocab_size (int): Need to be specified when not pretrained word
                embeddings are not used.
            embed_dim (int): Dimension of word vectors. Need to be specified
                when pretrained word embeddings are not used. Default: 300
            filter_sizes (List[int]): List of filter sizes. Default: [2, 3, 4, 5]
            num_filters (List[int]): List of number of filters, has the same
                length as `filter_sizes`. Default: [100, 100, 100, 100]
            n_classes (int): Number of classes. Default: 2
            dropout (float): Dropout rate. Default: 0.2
        """

        super(CNN_NLP, self).__init__()
        #---------------------Feature Extractor Network----------------------#
        # Embedding layer

        if pretrained_embedding is not None:
            self.vocab_size, self.embed_dim = pretrained_embedding.shape
            self.embedding = nn.Embedding.from_pretrained(pretrained_embedding,
                                                          freeze=freeze_embedding)
        else:
            self.embed_dim = embed_dim
            self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                          embedding_dim=self.embed_dim,
                                          padding_idx=0,
                                          max_norm=5.0)

        # Conv Network
        self.feature_extractor = nn.ModuleList([
            conv_block(in_channels=self.embed_dim,
                      out_channels=num_filters[i],
                      dropout_value = dropout,
                      kernel_size=filter_sizes[i]) 
            for i in range(len(filter_sizes))
        ])

        #---------------------Regression Network------------------------#
        # Fully-connected layer and Dropout
        self.regression = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(sum(num_filters), sum(num_filters) // 2),
            nn.LayerNorm(sum(num_filters) // 2),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(sum(num_filters) // 2, num_classes * 10),
            nn.LayerNorm(num_classes * 10),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(num_classes * 10, num_classes)
        )

        #---------------------Domain Classifier Network------------------------#
        # Fully-connected layer and Dropout
        self.domain_classifier = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(sum(num_filters), sum(num_filters) // 2),
            nn.LayerNorm(sum(num_filters) // 2),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(sum(num_filters) // 2, num_classes * 10),
            nn.LayerNorm(num_classes * 10),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(num_classes * 10, 2), # Number of classes would be 2
            nn.LogSoftmax(dim=1)
        )

    def forward(self, text_ids , alpha=1.0):
        """Perform a forward pass through the network.

        Args:
            text_ids (torch.Tensor): A tensor of token ids with shape
                (batch_size, max_sent_length)

        Returns:
            logits (torch.Tensor): Output logits with shape (batch_size,
                n_classes)
        """

        # Get embeddings from `input_ids`. Output shape: (b, max_len, embed_dim)
        x_embed = self.embedding(text_ids).float()

        # Permute `x_embed` to match input shape requirement of `nn.Conv1d`.
        # Output shape: (b, embed_dim, max_len)
        x_reshaped = x_embed.permute(0, 2, 1)

        # Apply CNN, BatchNorm, ReLU and Dropout. Output shape: (b, num_filters[i], L_out)
        x_conv_list = [(conv1d(x_reshaped)) for conv1d in self.feature_extractor]
        

        # Max pooling. Output shape: (b, num_filters[i], 1)
        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2])
            for x_conv in x_conv_list]
        
        # Concatenate x_pool_list to feed the fully connected layer. Output of Feature block
        # Output shape: (b, sum(num_filters))
        x_feature = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list],
                         dim=1)
        
        reverse_feature = GradientReversalFn.apply(x_feature, alpha)

        regression_output = self.regression(x_feature)
        domain_classifier_output = self.domain_classifier(x_feature)

        return regression_output, domain_classifier_output


In [34]:
#@title Initialize Model
dict_model_arch = {}
for name in dict_file_name.keys(): # ['EI_anger', 'EI_joy', 'EI_fear', 'EI_sadness', 'V']
  dict_model_arch[name] = {}
  for embedding in dict_emb_file.keys() : # 'glove', 'glove_gn'
    dict_model_arch[name] [embedding] = CNN_NLP ( pretrained_embedding=dict_data_embedding_properties[name][embedding]['embeddings'])

dict_model_arch

{'EI_anger': {'glove': CNN_NLP(
    (embedding): Embedding(4794, 300)
    (feature_extractor): ModuleList(
      (0): Sequential(
        (0): Conv1d(300, 100, kernel_size=(2,), stride=(1,))
        (1): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
        (3): Dropout(p=0.2, inplace=False)
        (4): Conv1d(100, 100, kernel_size=(2,), stride=(1,))
        (5): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (6): ReLU()
        (7): Dropout(p=0.2, inplace=False)
        (8): Conv1d(100, 100, kernel_size=(2,), stride=(1,))
      )
      (1): Sequential(
        (0): Conv1d(300, 100, kernel_size=(3,), stride=(1,))
        (1): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU()
        (3): Dropout(p=0.2, inplace=False)
        (4): Conv1d(100, 100, kernel_size=(3,), stride=(1,))
        (5): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=T

In [None]:
#@title Training - one forward pass
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(device)

for name in dict_file_name.keys(): # ['EI_anger', 'EI_joy', 'EI_fear', 'EI_sadness', 'V']
  for embedding in dict_emb_file.keys() : # 'glove', 'glove_gn'
    model = dict_model_arch[name][embedding]
    model = model.to(device)
    inputs, score = next(iter(dict_dataloader[name]['train']))
    print(inputs)
    regression_output, domain_classifier_output = model (inputs)
    print(regression_output, domain_classifier_output)
    break
  break

In [40]:
for step, batch in enumerate(tqdm(dict_dataloader['V']['train'])):
  print (batch)
  break

  0%|          | 0/74 [00:00<?, ?it/s]

[tensor([[  37,  264,  299,  ...,    0,    0,    0],
        [   2, 1048,   14,  ...,    0,    0,    0],
        [3336,   35, 3481,  ...,    0,    0,    0],
        ...,
        [3986,    2,   37,  ...,    0,    0,    0],
        [ 386,  483,   26,  ...,    0,    0,    0],
        [  22,   61,  192,  ...,    0,    0,    0]]), tensor([0.4680, 0.5650, 0.6000, 0.2420, 0.6610, 0.3230, 0.8970, 0.2330, 0.3830,
        0.6830, 0.8460, 0.5310, 0.6130, 0.7660, 0.6410, 0.4500],
       dtype=torch.float64)]





In [41]:
#@title Weight Initialization

def weights_init(m):
    if isinstance(m, nn.Conv1d):
      nn.init.xavier_normal_(m.weight)
      # m.weight.data.normal_(0.0, 0.02)
    elif isinstance(m, nn.Linear):
      nn.init.xavier_normal_(m.weight)
      nn.init.zeros_(m.bias)
      # m.weight.data.normal_(1.0, 0.02)
      # m.bias.data.fill_(0)

In [53]:
#@title Typical Training Function with Learning rate

from tqdm import tqdm # for beautiful model training updates

def train_model(model, device, train_loader, optimizer, epoch, scheduler= None):
    pbar = tqdm(train_loader) # putting the iterator in pbar

    processed =0 # for accuracy denominator
    epoch_loss = 0.0

    for batch_idx, batch in enumerate(pbar):
      # print("batch", batch)
       
      tweets, intensities = batch[0].to(device), batch[1].to(device)  # plural, we are not interested in domain
      #sending data to CPU or GPU as per device
      model.train() # setting the model in training mode
      optimizer.zero_grad() # setting gradients to zero to avoid accumulation

      y_preds,_ = model(tweets) # forward pass, result captured in y_preds (plural as there are many body in a batch)
      # we are not interested in domain prediction
      # the predictions are in one hot vector

      regression_loss = regression_loss_function(y_preds,intensities.unsqueeze(1)) # Computing loss
      # loss = F.mse_loss(y_preds,intensities.unsqueeze(1)) # Computing loss

      # train_regresion_losses.append(regression_loss.item()) # to capture loss over many epochs

      regression_loss.backward() # backpropagation, creating gradients

      optimizer.step() # updating the params

      epoch_loss += regression_loss.item()

      processed += len(tweets)

      curr_lr = optimizer.param_groups[0]['lr']

      pbar.set_description(desc= f'Loss={regression_loss.item()} Batch_id={batch_idx} Epoch Average loss={epoch_loss/processed:0.8f} LR={curr_lr:0.6f}')
    return float("{:.8f}".format(epoch_loss/processed)),  curr_lr

In [56]:
#@title Typical Test Function
def test_model(model, device, data_loader, mode= 'test'):
    model.eval() # setting the model in evaluation mode
    loss = 0
    correct = 0 # for accuracy numerator
    test_regresion_losses =[] # for overall epoch (summed over batches)
    valid_regresion_losses =[] # for overall epoch (summed over batches)

    with torch.no_grad():
        for batch in data_loader:

            tweets, intensities  = batch[0].to(device), batch[1].to(device) #sending data to CPU or GPU as per device
            # we are not interested in domains
            
            y_preds,_ = model(tweets) # forward pass, result captured in outputs (plural as there are many bodies in a batch)
            # the outputs are in batch size x one hot vector 
            # not interested in domain output

            regression_loss = regression_loss_function(y_preds,intensities.unsqueeze(1))

            if mode == 'test':
              test_regresion_losses.append(regression_loss.item())
            else:
              valid_regresion_losses.append(regression_loss.item())

        if mode == 'test':
          avg_epoch_test_loss = float("{:.6f}".format(sum(test_regresion_losses) / len(data_loader)))
          print(f'TEST LOSS (Average) : {avg_epoch_test_loss}')
          return float(avg_epoch_test_loss)
        else:
          avg_epoch_valid_loss = float("{:.6f}".format(sum(valid_regresion_losses) / len(data_loader)))
          print(f'VALIDATION LOSS (Average) : {avg_epoch_valid_loss}')
          return float(avg_epoch_valid_loss)

In [44]:
#@title Early Stopping Function

def early_stopping_difference(list_loss: list, patience = 5, difference = 0.0003):
  if len(list_loss) > patience:
    reverse_list_loss = list_loss[::-1]
    reverse_list_loss = reverse_list_loss[0:patience+1]
    for index in range(0,len(reverse_list_loss)-1): 
      if abs(reverse_list_loss[index] - reverse_list_loss[index+1]) > difference:
        return False
    return True
  else:
    return False

In [57]:
#@title EXECUTION NON DANN 
# EXECUTION (NON DANN) FOR MULTIPLE MODELS

lr = INITIAL_LR

EPOCHS = NUM_EPOCHS
EPOCHS = 2

dict_non_dann_model_saved= {}
dict_non_dann_losses_list = {}

for name in dict_file_name.keys(): # ['EI_anger', 'EI_joy', 'EI_fear', 'EI_sadness', 'V']
# for embedding in dict_emb_file.keys(): # 'glove', 'glove_gn'
  dict_non_dann_losses_list[name] = {}
  dict_non_dann_model_saved[name] = {}
  for embedding in dict_emb_file.keys(): # 'glove', 'glove_gn'
  # for name in dict_file_name.keys(): # ['EI_anger', 'EI_joy', 'EI_fear', 'EI_sadness', 'V']
    #model
    model = dict_model_arch[name][embedding]
    model = model.to(DEVICE)
    model.apply(weights_init)
    
    #learning rate
    lr = dict_initial_lr[name] # experiment learning rate configuration

    #optimizer
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01) 
    
    # scheduler
    scheduler = ReduceLROnPlateau(optimizer, factor = 0.1, patience = 5, threshold =  0.0005, verbose = True)

    # loss functions
    domain_loss_function= (nn.BCEWithLogitsLoss()).to(DEVICE)
    regression_loss_function = (nn.L1Loss()).to(DEVICE)

    # loss accumulation
    train_losses = [] # to capture train losses over training epochs
    val_losses = [] # to capture validation loss over epochs

    print(f'----------------------training started for {name}-{embedding}-----------------')
    print(f'starting LR : {lr}')
    
    epoch_converge = NUM_EPOCHS
    
    for epoch in range(EPOCHS):
      print("EPOCH:", epoch+1)

      # Train
      avg_epoch_loss, curr_lr = train_model(model, DEVICE, dict_dataloader[name]['train'] , optimizer, epoch)
      train_losses.append(avg_epoch_loss)
      scheduler.step(avg_epoch_loss) #applying scheduler on training loss

      # Validation
      avg_epoch_valid_loss = test_model(model, DEVICE, dict_dataloader[name]['val'], mode = 'val')
      val_losses.append(avg_epoch_valid_loss)

      if early_stopping_difference(val_losses, patience = PATIENCE):
        print (f'-------Early Stopping at epoch {epoch+1}---')
        epoch_converge = epoch+1
        break

    # testing the model when all epochs are finished (outsied epoch loop)

    test_loss = test_model(model, DEVICE, dict_dataloader[name]['test'], mode = 'test')

    dict_non_dann_losses_list [name][embedding] = {'train_losses' : train_losses, 'val_losses': val_losses, 'test_loss' : test_loss , 'epoch_convergence' : epoch_converge  }

    model_name = name + "_" +str(time.strftime("%d_%m"))+ "_non_dann_"+embedding+".pt"
    torch.save(model.state_dict(), os.path.join(MODEL_DIR, model_name))
    dict_non_dann_model_saved[name][embedding]= model_name

    print(f'----------------------training complete for {name}-----------------')

print (f'---NON DANN Results---')

for name in dict_file_name.keys(): # ['EI_anger', 'EI_joy', 'EI_fear', 'EI_sadness', 'V']
  for embedding in dict_emb_file.keys(): # 'glove', 'glove_gn'
    print ("test loss for ", name," ",embedding ," ", dict_non_dann_losses_list[name][embedding]['epoch_convergence']," \t:\t",  dict_non_dann_losses_list[name][embedding]['test_loss'])




# for name, model_arch in dict_model_arch.items():
#   model = model_arch
#   # optimizer = optim.Adam(model.parameters(), lr=lr)

#   lr = dict_initial_lr[name]
#   optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
#   # scheduler
#   scheduler = ReduceLROnPlateau(optimizer, factor = 0.1, patience = 5, threshold =  0.0005, verbose = True)
#   # scheduler = ReduceLROnPlateau(optimizer, factor = 0.5, patience = 5, threshold =  0.002)

#   domain_loss_function= nn.BCEWithLogitsLoss()
#   regression_loss_function = nn.L1Loss()
#   model = model.to(DEVICE)
#   model.apply(weights_init)
#   domain_loss_function = domain_loss_function.to(DEVICE)
#   regression_loss_function = regression_loss_function.to(DEVICE)


#   train_losses = [] # to capture train losses over training epochs
#   val_losses = [] # to capture validation loss over epochs


#   print(f'----------------------training started for {name}-----------------')
#   print(f'starting LR : {lr}')
#   epoch_converge = NUM_EPOCHS
#   for epoch in range(EPOCHS):
#     print("EPOCH:", epoch+1)
#     # train_model(model, DEVICE, train_iterator, optimizer, epoch) # single model
#     avg_epoch_loss, curr_lr = train_model(model, DEVICE, dict_iterator[name]['train_iterator'], optimizer, epoch)
#     train_losses.append(avg_epoch_loss)
    
#     scheduler.step(avg_epoch_loss) #applying scheduler on training loss

#     # test_model(model, DEVICE, valid_iterator, mode = 'val')# single model
#     avg_epoch_valid_loss = test_model(model, DEVICE, dict_iterator[name]['val_iterator'], mode = 'val')

        
#     # optimizer = optim.Adam(model.parameters(), lr=curr_lr*LR_GAMMA)
#     # optimizer = optim.AdamW(model.parameters(), lr=curr_lr*LR_GAMMA, weight_decay=0.01)

    
#     val_losses.append(avg_epoch_valid_loss)

#     if early_stopping_difference(val_losses, patience = PATIENCE):
#       print (f'-------Early Stopping at epoch {epoch+1}---')
#       epoch_converge = epoch+1
#       break

#   # testing the model when all epochs are finished (outsied epoch loop)
#   # test_model(model, DEVICE, test_iterator, mode = 'test')# single model
#   test_loss = test_model(model, DEVICE, dict_iterator[name]['test_iterator'], mode = 'test')

#   dict_non_dann_losses_list [name] = {'train_losses' : train_losses, 'val_losses': val_losses, 'test_loss' : test_loss , 'epoch_convergence' : epoch_converge  }

#   model_name = name + "_" +str(time.strftime("%d_%m"))+ "_non_dann_"+EMBEDDING_TO_BE_USED+".pt"
#   torch.save(model.state_dict(), os.path.join(MODEL_DIR, model_name))
#   dict_non_dann_model_saved[name]= model_name
#   print(f'----------------------training complete for {name}-----------------')
# print (f'---NON DANN @ lr = {lr}---')
# for name, values in dict_non_dann_losses_list.items():
#   print ("test loss for ", name," ", values['epoch_convergence']," \t:\t",  values['test_loss'])

----------------------training started for EI_anger-glove-----------------
starting LR : 0.05
EPOCH: 1


Loss=0.18406440920829772 Batch_id=106 Epoch Average loss=0.00926586 LR=0.050000: 100%|██████████| 107/107 [00:26<00:00,  3.98it/s]


VALIDATION LOSS (Average) : 0.164384
EPOCH: 2


Loss=0.10874477496147154 Batch_id=106 Epoch Average loss=0.00877955 LR=0.050000: 100%|██████████| 107/107 [00:26<00:00,  4.01it/s]


VALIDATION LOSS (Average) : 0.164471
TEST LOSS (Average) : 0.157472
----------------------training complete for EI_anger-----------------
----------------------training started for EI_anger-glove_gn-----------------
starting LR : 0.05
EPOCH: 1


Loss=0.1796259105205536 Batch_id=106 Epoch Average loss=0.01156230 LR=0.050000: 100%|██████████| 107/107 [00:26<00:00,  4.08it/s]


VALIDATION LOSS (Average) : 0.164952
EPOCH: 2


Loss=0.074251983833313 Batch_id=106 Epoch Average loss=0.00861223 LR=0.050000: 100%|██████████| 107/107 [00:25<00:00,  4.13it/s]


VALIDATION LOSS (Average) : 0.16902
TEST LOSS (Average) : 0.163972
----------------------training complete for EI_anger-----------------
----------------------training started for EI_joy-glove-----------------
starting LR : 0.05
EPOCH: 1


Loss=0.16291790051758293 Batch_id=100 Epoch Average loss=0.01332402 LR=0.050000: 100%|██████████| 101/101 [00:18<00:00,  5.51it/s]


VALIDATION LOSS (Average) : 0.158654
EPOCH: 2


Loss=0.18782181030511858 Batch_id=100 Epoch Average loss=0.01110366 LR=0.050000: 100%|██████████| 101/101 [00:19<00:00,  5.11it/s]


VALIDATION LOSS (Average) : 0.161025
TEST LOSS (Average) : 0.156351
----------------------training complete for EI_joy-----------------
----------------------training started for EI_joy-glove_gn-----------------
starting LR : 0.05
EPOCH: 1


Loss=0.17863800331950186 Batch_id=100 Epoch Average loss=0.01333877 LR=0.050000: 100%|██████████| 101/101 [00:18<00:00,  5.49it/s]


VALIDATION LOSS (Average) : 0.15874
EPOCH: 2


Loss=0.21973129327595234 Batch_id=100 Epoch Average loss=0.01099332 LR=0.050000: 100%|██████████| 101/101 [00:19<00:00,  5.26it/s]


VALIDATION LOSS (Average) : 0.168804
TEST LOSS (Average) : 0.176392
----------------------training complete for EI_joy-----------------
----------------------training started for EI_fear-glove-----------------
starting LR : 0.02
EPOCH: 1


Loss=0.19716322563091915 Batch_id=140 Epoch Average loss=0.01135145 LR=0.020000: 100%|██████████| 141/141 [00:24<00:00,  5.86it/s]


VALIDATION LOSS (Average) : 0.143092
EPOCH: 2


Loss=0.16342939540743828 Batch_id=114 Epoch Average loss=0.01021010 LR=0.020000:  82%|████████▏ | 115/141 [00:19<00:04,  5.82it/s]


KeyboardInterrupt: ignored