<a href="https://colab.research.google.com/github/peeyushsinghal/da/blob/main/mitigating_bias_sa_da_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mitigating bias in sentiment analysis using domain adaptation

In [189]:
! pip install ekphrasis # library to pre process twitter data
! pip install emoji --upgrade #library to deal with emoji data
! pip install NRCLex



Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting NRCLex
  Downloading NRCLex-4.0-py3-none-any.whl (4.4 kB)
  Downloading NRCLex-3.0.0.tar.gz (396 kB)
[K     |████████████████████████████████| 396 kB 4.2 MB/s 
Building wheels for collected packages: NRCLex
  Building wheel for NRCLex (setup.py) ... [?25l[?25hdone
  Created wheel for NRCLex: filename=NRCLex-3.0.0-py3-none-any.whl size=43329 sha256=0cf6f6f4660ebc1514b002c681bccad5d734fe9ab5330de55e1f5da43afc59e1
  Stored in directory: /root/.cache/pip/wheels/af/2c/9c/dfa19d1b65326c520b32850a9311f6d4eda679ac04dba26081
Successfully built NRCLex
Installing collected packages: NRCLex
Successfully installed NRCLex-3.0.0


In [208]:
## Import statements
import pandas as pd
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import GloVe
import numpy as np
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import emoji
from tqdm import tqdm
import random
import torch.optim as optim
import json

from nrclex import NRCLex

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# checking device
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on:{}".format(DEVICE))

Running on:cpu


# Data Handling

Mounting google drive for data in there

In [7]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


##Data configuration

In [193]:
BASE_PATH = '/content/drive/MyDrive/semeval-2018'
DATA_DIR = os.path.join(BASE_PATH,'datasets')
MODEL_DIR = os.path.join(BASE_PATH,'models')
REF_DIR = os.path.join(BASE_PATH,'reference')

if not os.path.exists(MODEL_DIR):
  os.makedirs(MODEL_DIR)
  print("The new directory is created!")

domain_source = 0.0
domain_target = 1.0

In [11]:
class TASK1(object):
  
    EI_reg = {
        'anger': {
            'train': os.path.join(
                DATA_DIR, 'task1/EI-reg/training/EI-reg-En-anger-train.txt'),
            'dev': os.path.join(
                DATA_DIR, 'task1/EI-reg/development/2018-EI-reg-En-anger-dev.txt'),
            'gold': os.path.join(
                DATA_DIR, 'task1/EI-reg/test-gold/2018-EI-reg-En-anger-test-gold.txt')
                }
        }

    V_reg = {
        'train': os.path.join(
            DATA_DIR, 'task1/V-reg/2018-Valence-reg-En-train.txt'),
        'dev': os.path.join(
            DATA_DIR, 'task1/V-reg/2018-Valence-reg-En-dev.txt'),
        'gold': os.path.join(
            DATA_DIR, 'task1/V-reg/2018-Valence-reg-En-test-gold.txt')
             }

    EEC = {
        'eec': os.path.join(
            DATA_DIR, 'task1/Equity-Evaluation-Corpus/Equity-Evaluation-Corpus.csv')
             }

##Dataloading and Parsing

### Source Data

Parsing Emotion and Valence regression data : `format [ID	Tweet	Affect Dimension	Intensity Score]`

In [101]:
def parse_reg(data_file, label_format='tuple')-> pd.DataFrame:
    """
    This is for datasets for the EI-reg and V-reg English tasks 
    Returns:
        df: dataframe with columns in the first row of file [ID-Tweet-Affect Dimension-Intensity Score]
    """
    with open(data_file, 'r') as fd:
      data = [l.strip().split('\t') for l in fd.readlines()]
    
    df = pd.DataFrame (data[1:],columns=data[0])
    df['domain'] = domain_source
    return df

### Target Data

parsing EEC data : `format [ID	Sentence	Template	Person	Gender	Race Emotion	Emotion word]`

In [102]:
def parse_eec()->pd.DataFrame:
  """
  This is for EEC Dataset, it is a csv file
  Returns:
        df_eec: dataframe 
  """
  data_train = TASK1.EEC['eec']
  df_eec = pd.read_csv(data_train)
  df_eec['domain'] = domain_target
  return df_eec


In [103]:
def parse(task, dataset, emotion='anger') -> pd.DataFrame:
    if task == 'EI-reg':
        data_train = TASK1.EI_reg[emotion][dataset]
        df = parse_reg(data_train)
        df[df.columns[-1]] = df[df.columns[-1]].astype(float)
        return df
    elif task == 'V-reg':
        data_train = TASK1.V_reg[dataset]
        df = parse_reg(data_train)
        df[df.columns[-1]] = df[df.columns[-1]].astype(float)
        return df
    else:
        return None, None

In [104]:
## Creating Dataframes
df_EI_reg_train = parse('EI-reg','train')
df_EI_reg_val = parse('EI-reg','dev')
df_EI_reg_test = parse('EI-reg','gold')
df_V_reg_train = parse('V-reg','train')
df_V_reg_val = parse('V-reg','dev')
df_V_reg_test = parse('V-reg','gold')

dict_df= {'df_EI_reg_train':df_EI_reg_train, 
          'df_EI_reg_val':df_EI_reg_val, 
          'df_EI_reg_test':df_EI_reg_test, 
          'df_V_reg_train': df_V_reg_train, 
          'df_V_reg_val':df_V_reg_val, 
          'df_V_reg_test': df_V_reg_test 
          }

In [172]:
dict_df['df_V_reg_test']
count = 0
for name, df in dict_df.items():
  if "train" in name:
    count = count+ len(df)
    print(len(df))

print("count", count)


1701
1181
count 2882


In [151]:
## Creating EEC Dataframe
df_EEC = parse_eec()
df_EEC.head()

Unnamed: 0,ID,Sentence,Template,Person,Gender,Race,Emotion,Emotion word,domain
0,2018-En-mystery-05498,Alonzo feels angry.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,angry,1.0
1,2018-En-mystery-11722,Alonzo feels furious.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,furious,1.0
2,2018-En-mystery-11364,Alonzo feels irritated.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,irritated,1.0
3,2018-En-mystery-14320,Alonzo feels enraged.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,enraged,1.0
4,2018-En-mystery-14114,Alonzo feels annoyed.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,annoyed,1.0


In [195]:
# unique_templates = df_EEC['Template'].unique()

# dict_templates ={}
# for index, template in enumerate(unique_templates):
#   name = "T_"+str(index)
#   dict_templates[template] = name

In [None]:
# df_EEC['Template_Number'] = df_EEC['Template']
# df_EEC['Template_Number'] = df_EEC['Template_Number'].map(dict_templates)

In [199]:
# df_EEC_subset_anger = df_EEC[df_EEC['Emotion']=='anger']

In [201]:
# len(df_EEC_subset_anger)

2100

##PreProcess Twitter Data

In [23]:
# reference : https://github.com/cbaziotis/ekphrasis


text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Word statistics files not found!
Downloading... done!
Unpacking... done!
Reading twitter - 1grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_1grams.txt
Reading twitter - 2grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_2grams.txt
Reading twitter - 1grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [54]:
# #### Example checks of pre-processing
# sentences = [
#     "CANT WAIT for the new season of #TwinPeaks ＼(^o^)／!!! #davidlynch #tvseries :)))",
#     "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies :/",
#     "@SentimentSymp:  can't wait for the Nov 9 #Sentiment talks!  YAAAAAAY !!! :-D http://sentimentsymposium.com/.",
#     "@MGBarbieri @SpalkTalk a@b.com And just saw your LinkedIn comment after I sent this! Thanks for the message :) 😀",
#     "💙💛🏆 @GeorgeePitman Young Player of The Season 🏆💛💙 #irony #actuallyseventy"
# ]

# for s in sentences:
#     print(" ".join(text_processor.pre_process_doc(s)))
# # print ([text_processor.pre_process_doc(s) for s in sentences])

In [24]:
def preprocess_tweets(df)-> pd.DataFrame:
  tweets = df.Tweet.to_list()
  # df['TweetTokens'] = [emoji.demojize(text_processor.pre_process_doc(tweet),language = 'en') for tweet in tweets] # Translates emoji in to word and preprocesss
  # df['TweetTokens'] = [text_processor.pre_process_doc(tweet) for tweet in tweets] # preprocesss
  # tweets_processed = [text_processor.pre_process_doc(tweet) for tweet in tweets] # preprocesss
  # for tweet in tweets_processed:
  #   for index, token in enumerate(tweet):
  #     if emoji.is_emoji(token):
  #       tweet[index] = emoji.demojize(token, language = 'en')

  tweets_processed = [" ".join(text_processor.pre_process_doc(tweet)) for tweet in tweets] # preprocesss
  # print (tweets_processed)
  for index, tweet in enumerate(tweets_processed):
      tweets_processed[index] = emoji.demojize(tweet, language = 'en')
  
  df['TweetTokens'] = tweets_processed
  # print(df)
  return df

In [25]:
# df_EI_reg_train = preprocess_tweets(df_EI_reg_train)
# df_V_reg_train = preprocess_tweets(df_V_reg_train)

for name, df in dict_df.items():
  df = preprocess_tweets(df)


In [174]:
#df_V_reg_test

TODO : 
* remove stop words
* stem
* lemmetize


In [26]:
df_V_reg_train.columns

Index(['ID', 'Tweet', 'Affect Dimension', 'Intensity Score', 'domain',
       'TweetTokens'],
      dtype='object')

In [27]:
def subset_df(df):
  return df[['TweetTokens','Intensity Score','domain']]

In [28]:
dict_df_subset ={name+"_subset": subset_df(df) for name, df in dict_df.items() }

In [None]:
print (dict_df_subset)

## Creating Pytorch Datasets

### Creating Vocabulary
Before we create the Dataset, we need to define a process to build our vocabulary. For this,
We’ll create a “Vocabulary” class which will create the word-to-index and index-to-word mappings using only the train dataframe we created before
Also, the “Vocabulary” class returns the numericalized version of each sentence in our dataframe. Eg: [‘i’, ‘love’, ‘apple’] -> [23, 54, 1220]. We need to convert the words to numbers as models expect each word in our vocabulary to be represented by a number

In [173]:
#######################################################
#               Define Vocabulary Class
#######################################################

class Vocabulary:
  
    '''
    __init__ method is called by default as soon as an object of this class is initiated
    we use this method to initiate our vocab dictionaries
    '''
    def __init__(self, freq_threshold = 1, max_size = 10000):
        '''
        freq_threshold : the minimum times a word must occur in corpus to be treated in vocab
        max_size : max source vocab size. Eg. if set to 10,000, we pick the top 10,000 most frequent words and discard others
        '''
        #initiate the index to token dict
        ## <PAD> -> padding, used for padding the shorter sentences in a batch to match the length of longest sentence in the batch
        ## <UNK> -> words which are not found in the vocab are replace by this token
        # self.itos = {0: '<PAD>', 1: '<UNK>', 2:'<NUMBER>', 3: '<CURRENCY>', 4: '<URL>'}
        self.itos = {0: '<PAD>', 1: '<UNK>'}
        
        
        #initiate the token to index dict
        self.stoi = {k:j for j,k in self.itos.items()}
        self.original_stoi = self.stoi.copy()
#         print(self.stoi)
        
        self.freq_threshold = freq_threshold
        self.max_size = max_size
    
    '''
    __len__ is used by dataloader later to create batches
    '''
    def __len__(self):
        return len(self.itos)
    
    '''
    a simple tokenizer to split on space and converts the sentence to list of words
    '''
    @staticmethod
    def tokenizer(text):
#         return [tok.strip() for tok in text.split(' ')]
        return [tok.lower().strip() for tok in text.split(' ')] # this is commented out to avoid <NUMBER> ,<UNK> lowering
#         return [tok.lower().strip() for tok in text.split(' ') if tok not in list(self.stoi.keys())] 
    
    '''
    build the vocab: create a dictionary mapping of index to string (itos) and string to index (stoi)
    output ex. for stoi -> {'the':6, 'a':7, 'an':8}
    '''
    def build_vocabulary(self, sentence_list):
        #calculate the frequencies of each word first to remove the words with freq < freq_threshold
#         frequencies = {}  #init the freq dict
        frequencies = {k:self.max_size+1 for _,k in self.itos.items()}  # updated so that intial ones are also part of this
        
        # idx = 5 #index from which we want our dict to start. We already used 4 indexes for pad, unk...
        idx = len(self.original_stoi)
        
        #calculate freq of words
        for sentence in sentence_list:
            list_word = [tok.lower().strip() for tok in sentence.split(' ') if tok not in list(self.stoi.keys())] 
            for word in list_word:
#             for word in self.tokenizer(sentence):
                
                if word not in frequencies.keys():
                    frequencies[word]=1
                else:
                    
                    frequencies[word]+=1
                    
#         print ("----2-----\n",frequencies)
        
        #limit vocab by removing low freq words
        frequencies = {k:v for k,v in frequencies.items() if v>self.freq_threshold} 
        
#         print ("----3-----\n",frequencies)
        
        #limit vocab to the max_size specified
        frequencies = dict(sorted(frequencies.items(), key = lambda x: -x[1])[:self.max_size-idx]) # idx =5 for pad, unk...
        
#         print ("----4-----\n",frequencies)
            
        #create vocab
        for key in set(self.stoi.keys()):
            frequencies.pop(key)
        
#         print ("----5-----\n",frequencies)
        
        for word in frequencies.keys():
            self.stoi[word] = idx
            self.itos[idx] = word
            idx+=1
        
#         print ("----6-----\n",self.stoi)
        
    '''
    convert the list of words to a list of corresponding indexes
    '''    
    def numericalize(self, text):
        #tokenize text
#         tokenized_text = self.tokenizer(text)
#         print("---------\n",self.original_stoi.keys())
        tokenized_text = []
        for tok in text.split(' '):
            if tok not in list(self.original_stoi.keys()):
                tokenized_text.append(tok.lower().strip())
            else:
                tokenized_text.append(tok.strip())
                
#         tokenized_text = [tok.lower().strip() for tok in text.split(' ') if tok not in list(self.original_stoi.keys())]
        numericalized_text = []
        for token in tokenized_text:
            if token in self.stoi.keys():
                numericalized_text.append(self.stoi[token])
            else: #out-of-vocab words are represented by UNK token index
                numericalized_text.append(self.stoi['<UNK>'])
                
        return numericalized_text

In [63]:
# # #create a vocab class with freq_threshold=0 and max_size=100
# voc = Vocabulary(0, 100)
# sentence_list = ['that is a cat CAT', 'that is not a dog']
# #build vocab
# voc.build_vocabulary(sentence_list)

# print('index to string: ',voc.itos)
# print('string to index:',voc.stoi)

# print('numericalize -> cat and a dog <URL>: ', voc.numericalize('cat and a dog <NUMBER>'))

In [231]:
# Creating sentence list of all the training dataframes to create vocabulary later, this would mean a more robust vocab
sentence_list = []
for name,df in dict_df_subset.items():
  if "train" in name or 'val' in name :
    sentence_list.extend(df.TweetTokens.to_list())
print(len(sentence_list))

3719


In [232]:
## Adding more words - especially emotions and sentiments from NRCLex

lexicon_file = os.path.join(REF_DIR,'nrc_en.json')
print(os.path.isfile(lexicon_file) )
print(lexicon_file)

with open(lexicon_file, 'r') as json_file:
  lexicon_json= json.loads(json_file.read())


True
/content/drive/MyDrive/semeval-2018/reference/nrc_en.json


In [250]:
list_lexicon_all = []
for name, value in lexicon_json.items():
  list_lexicon_all.append(name)

list_lexicon_all = list(set(list_lexicon_all))
print(len(list_lexicon_all),list_lexicon_all)



In [252]:
print(len(sentence_list))
sentence_list.extend(list_lexicon_all)
print(len(sentence_list))

3739
10207


In [None]:
sentence_list[3729:]

In [272]:
freq_threshold = 1
vocab_max_size = 50000

vocab = Vocabulary(freq_threshold, vocab_max_size)
vocab.build_vocabulary(sentence_list)

In [269]:
len(vocab.stoi)

5068

### Build Source Train Dataset
We first inherit PyTorch's Dataset class.
Then, we initialize and build the vocabs for subject in our train data frame.
Then, we use the getitem() method to numericalize the subject 1 example at a time for the data loader (a function to load data in batches).

In [278]:
#######################################################
#               Define Train_Dataset class
#######################################################

class Train_Dataset(Dataset):
    '''
    Initiating Variables
    df: the training dataframe
    subject : the name of target text column in the dataframe
    transform : If we want to add any augmentation
    freq_threshold : the minimum times a word must occur in corpus to be treated in vocab
    vocab_max_size : max  vocab size
    '''
    
    def __init__(self, df, subject, label_col, vocab , domain = domain_source, max_sentence_length = 150, transform=None, freq_threshold = 5,
                vocab_max_size = 50000):
    
        self.df = df
        self.transform = transform
        
        #get body and label
        self.subject_texts = self.df[subject]
        self.labels = self.df[label_col].astype(float)
        self.domain = domain
        self.vocab = vocab
        
        # ##VOCAB class has been created above
        # #Initialize vocab object and build vocabulary
        # self.vocab = Vocabulary(freq_threshold, vocab_max_size)
        # self.vocab.build_vocabulary(self.subject_texts.tolist())
        self.max_sentence_length = max_sentence_length
        
    def __len__(self):
        return len(self.df)
    
    '''
    __getitem__ runs on 1 example at a time. Here, we get an example at index and return its numericalize source and
    target values using the vocabulary objects we created in __init__
    '''
    def __getitem__(self, index):
        subject_text = self.subject_texts[index]
        label = self.labels[index]
        domain_value = self.domain
#         print(subject_text)
        
        if self.transform is not None:
            subject_text = self.transform(subject_text)
            
        #numericalize texts ['<SOS>','cat', 'in', 'a', 'bag','<EOS>'] -> [1,12,2,9,24,2]
        numerialized_subject =[]
        numerialized_subject += self.vocab.numericalize(subject_text)
        
        while len(numerialized_subject) < self.max_sentence_length:
            numerialized_subject.append(0)
        
        #convert the list to tensor and return
        return torch.tensor(numerialized_subject[:self.max_sentence_length]), torch.tensor(label), torch.tensor(domain_value)
#         return torch.tensor(numerialized_subject[:self.train_dataset.max_sentence_length]),label

In [256]:
dict_df_subset.keys()

dict_keys(['df_EI_reg_train_subset', 'df_EI_reg_val_subset', 'df_EI_reg_test_subset', 'df_V_reg_train_subset', 'df_V_reg_val_subset', 'df_V_reg_test_subset'])

In [279]:
df_train = dict_df_subset['df_EI_reg_train_subset']
df_train

Unnamed: 0,TweetTokens,Intensity Score,domain
0,<user> <user> shut up hashtags are cool <hasht...,0.562,0.0
1,it makes me so fucking irate jesus . nobody is...,0.750,0.0
2,lol adam the bull with his fake outrage . <rep...,0.417,0.0
3,<user> passed away early this morning in a fas...,0.354,0.0
4,<user> lol wow i was gonna say really ? ! <rep...,0.438,0.0
...,...,...,...
1696,got a <money> tip from a drunk uber passenger ...,0.708,0.0
1697,<user> <user> <user> <user> fucker blocked me ...,0.625,0.0
1698,<user> i look rabid,0.472,0.0
1699,"<user> i am not surprised , i would be fuming ...",0.479,0.0


In [280]:
dict_train_dataset ={}
for name,df in dict_df_subset.items():
  if "train" in name:
    dataset_name = name+"_dataset"
    # vars()[dataset_name] = Train_Dataset(df,'TweetTokens','Intensity Score', max_sentence_length =200) # dynamically assigning datasetname
    dict_train_dataset[dataset_name] = Train_Dataset(df,'TweetTokens','Intensity Score', vocab, domain = domain_source, max_sentence_length =200) # dynamically assigning datasetname

In [259]:
print(dict_train_dataset.keys())

dict_keys(['df_EI_reg_train_subset_dataset', 'df_V_reg_train_subset_dataset'])


In [281]:
i = random.randint(0,len(dict_train_dataset['df_EI_reg_train_subset_dataset']))
print(dict_df_subset['df_EI_reg_train_subset'].loc[i][['TweetTokens','Intensity Score','domain']])
print((dict_train_dataset['df_EI_reg_train_subset_dataset'][i][0]))
print((dict_train_dataset['df_EI_reg_train_subset_dataset'][i][1]))
print((dict_train_dataset['df_EI_reg_train_subset_dataset'][i][2]))

TweetTokens        why is it always me picking up the pieces <ann...
Intensity Score                                                0.812
domain                                                           0.0
Name: 800, dtype: object
tensor([  83,   14,   17,  124,   24, 3587,   48,    7, 1955, 1034,    2,   86,
           3,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0

In [75]:
# train_dataset_obj = Train_Dataset(df_train,'TweetTokens','Intensity Score', vocab,max_sentence_length =200)

In [76]:
# i = random.randint(0,len(train_dataset_obj))
# print(df_train.loc[i][['TweetTokens','Intensity Score']])
# print((train_dataset_obj[i][1]))
# print(len(train_dataset_obj[i][0]))
# print(train_dataset_obj[i][0])

In [77]:
# print(df_EI_reg_train_subset_dataset)

In [78]:
# i = random.randint(0,len(df_EI_reg_train_subset_dataset))
# # print(train.loc[i][['body','label']])
# print(type(df_EI_reg_train_subset_dataset[i][1]))
# len(df_EI_reg_train_subset_dataset[i][0])

### Build Source Validation and Test Dataset

In [282]:
#######################################################
#               Define Validation / Test Dataset Class
#######################################################

class Validation_Dataset(Dataset):
    def __init__(self, train_dataset, df, subject, label_col, domain = domain_source, transform = None):
        self.df = df
        self.transform = transform
        
        #train dataset will be used as lookup for vocab
        self.train_dataset = train_dataset
        
        #get body and label
        self.subject_texts = self.df[subject]
        self.labels = self.df[label_col].astype(float)
        self.domain = domain
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self,index):
        
        subject_text = self.subject_texts[index]
        label = self.labels[index]
        domain_value = self.domain
#         print(subject_text)
        
        if self.transform is not None:
            subject_text = self.transform(subject_text)
            

        #numericalize texts ['cat', 'in', 'a', 'bag'] -> [12,2,9,24]
        numerialized_subject = []
        numerialized_subject += self.train_dataset.vocab.numericalize(subject_text)
#         print("max sentence length", self.train_dataset.max_sentence_length)
        while len(numerialized_subject) < self.train_dataset.max_sentence_length:
            numerialized_subject.append(0)
            
        #convert the list to tensor and return
#         return torch.tensor(numerialized_subject),label

#         #convert the list to tensor and return
        return torch.tensor(numerialized_subject[:self.train_dataset.max_sentence_length]),torch.tensor(label), torch.tensor(domain_value)


In [283]:
dict_val_dataset = {}
for name,df in dict_df_subset.items():
  if "train" in name:
    train_dataset_name = name+"_dataset"
    val_df_name = name[:len(name)-13]+"_val_subset"
    val_dataset_name = val_df_name + "_dataset"
    dict_val_dataset[val_dataset_name] = Validation_Dataset(dict_train_dataset[train_dataset_name], dict_df_subset[val_df_name], 'TweetTokens','Intensity Score', transform = None)

In [284]:
dict_test_dataset = {}
for name,df in dict_df_subset.items():
  if "train" in name:
    train_dataset_name = name+"_dataset"
    test_df_name = name[:len(name)-13]+"_test_subset"
    test_dataset_name = test_df_name + "_dataset"
    dict_test_dataset[test_dataset_name] = Validation_Dataset(dict_train_dataset[train_dataset_name], dict_df_subset[test_df_name], 'TweetTokens','Intensity Score', transform = None)

In [71]:
# dict_val_dataset.keys()

In [72]:
# dict_test_dataset.keys()

In [78]:
(dict_test_dataset['df_EI_reg_test_subset_dataset'])

<__main__.Validation_Dataset at 0x7f4f06386f50>

In [286]:
i = random.randint(0,len(dict_val_dataset['df_EI_reg_val_subset_dataset']))
# i = 3774
print("i=",i)
print(dict_df_subset['df_EI_reg_val_subset'].loc[i][['TweetTokens','Intensity Score','domain']])

print((dict_val_dataset['df_EI_reg_val_subset_dataset'][i][0]))
print((dict_val_dataset['df_EI_reg_val_subset_dataset'][i][1]))
print((dict_val_dataset['df_EI_reg_val_subset_dataset'][i][2]))

i= 107
TweetTokens        tonight ' s run . <repeated> <hashtag> restles...
Intensity Score                                                0.484
domain                                                           0.0
Name: 107, dtype: object
tensor([ 282,   11,   22,  421,    4,   19,    2, 1075,    3,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    

### Build Target (DA) dataset
EEC dataframe does not have label column,
Therefore, we would write another dataset class

In [287]:
#######################################################
#               Define DA class
#######################################################

class DA_Dataset(Dataset):
    '''
    Initiating Variables
    df: the training dataframe
    subject : the name of target text column in the dataframe
    transform : If we want to add any augmentation
    freq_threshold : the minimum times a word must occur in corpus to be treated in vocab
    vocab_max_size : max  vocab size
    '''
    
    def __init__(self, df, subject, vocab , domain = domain_target, max_sentence_length = 150, transform=None, freq_threshold = 5,
                vocab_max_size = 50000):
    
        self.df = df
        self.transform = transform
        
        #get body and label
        self.subject_texts = self.df[subject]
        self.domain = domain
        self.vocab = vocab
        
        self.max_sentence_length = max_sentence_length
        
    def __len__(self):
        return len(self.df)
    
    '''
    __getitem__ runs on 1 example at a time. Here, we get an example at index and return its numericalize source and
    target values using the vocabulary objects we created in __init__
    '''
    def __getitem__(self, index):
        subject_text = self.subject_texts[index]
        domain_value = self.domain
#         print(subject_text)
        
        if self.transform is not None:
            subject_text = self.transform(subject_text)
            
        #numericalize texts ['<SOS>','cat', 'in', 'a', 'bag','<EOS>'] -> [1,12,2,9,24,2]
        numerialized_subject =[]
        numerialized_subject += self.vocab.numericalize(subject_text)
        
        while len(numerialized_subject) < self.max_sentence_length:
            numerialized_subject.append(0)
        
        #convert the list to tensor and return
        return torch.tensor(numerialized_subject[:self.max_sentence_length]), torch.tensor(domain_value)

In [288]:
da_dataset = DA_Dataset(df_EEC,'Sentence', vocab, domain = domain_target, max_sentence_length =200) 

In [289]:
i = random.randint(0,len(da_dataset))
print("i=",i)
print(df_EEC.loc[i][['Sentence','domain']])
print(da_dataset[i][0])
print(da_dataset[i][1])

i= 5945
Sentence    Betsy found herself in a depressing situation.
domain                                                 1.0
Name: 5945, dtype: object
tensor([   1,  362, 3101,   21,   10,  424,    1,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0

## Create Dataloader


In [290]:

#######################################################
#            Define Dataloader Functions
#######################################################

# If we run a next(iter(data_loader)) we get an output of batch_size * (num_workers+1)
def get_loader(dataset, batch_size, num_workers=1, shuffle=True, pin_memory=True): #increase num_workers according to CPU
    loader = DataLoader(dataset, batch_size = batch_size, num_workers = num_workers,
                        shuffle=shuffle,
                       pin_memory=pin_memory)
    
    return loader


In [291]:
dict_train_loader = {}
batch_size = 8
for name, dataset in dict_train_dataset.items():
  name_dataloader = name+"_dataloader"
  dict_train_loader[name_dataloader] = get_loader(dataset, batch_size)
  x = next(iter(dict_train_loader[name_dataloader]))
  print(name_dataloader, x[0].shape, x[1].shape, type(x[0]), type (x[1]))

df_EI_reg_train_subset_dataset_dataloader torch.Size([8, 200]) torch.Size([8]) <class 'torch.Tensor'> <class 'torch.Tensor'>
df_V_reg_train_subset_dataset_dataloader torch.Size([8, 200]) torch.Size([8]) <class 'torch.Tensor'> <class 'torch.Tensor'>


In [292]:
dict_val_loader = {}
# batch_size = 8 
for name, dataset in dict_val_dataset.items():
  name_dataloader = name+"_dataloader"
  # dict_val_loader[name_dataloader] = get_loader(dataset,batch_size)
  dict_val_loader[name_dataloader] = get_loader(dataset,len(dataset))
  x = next(iter(dict_val_loader[name_dataloader]))
  print(name_dataloader, x[0].shape, x[1].shape, type(x[0]), type (x[1]))


df_EI_reg_val_subset_dataset_dataloader torch.Size([388, 200]) torch.Size([388]) <class 'torch.Tensor'> <class 'torch.Tensor'>
df_V_reg_val_subset_dataset_dataloader torch.Size([449, 200]) torch.Size([449]) <class 'torch.Tensor'> <class 'torch.Tensor'>


In [293]:
dict_test_loader = {}
# batch_size = 8 
for name, dataset in dict_test_dataset.items():
  name_dataloader = name+"_dataloader"
  dict_test_loader[name_dataloader] = get_loader(dataset,len(dataset))
  x = next(iter(dict_test_loader[name_dataloader]))
  print(name_dataloader, x[0].shape, x[1].shape, type(x[0]), type (x[1]))

df_EI_reg_test_subset_dataset_dataloader torch.Size([1002, 200]) torch.Size([1002]) <class 'torch.Tensor'> <class 'torch.Tensor'>
df_V_reg_test_subset_dataset_dataloader torch.Size([937, 200]) torch.Size([937]) <class 'torch.Tensor'> <class 'torch.Tensor'>


In [294]:
da_dataset_loader = get_loader(da_dataset, len(da_dataset))

In [None]:
next(iter(dict_val_loader['df_EI_reg_val_subset_dataset_dataloader'] ))

In [295]:
next(iter(da_dataset_loader))

[tensor([[  23, 1230,  362,  ...,    0,    0,    0],
         [   1,  217,   24,  ...,    0,    0,    0],
         [  42,  339, 1012,  ...,    0,    0,    0],
         ...,
         [   1,  785,    1,  ...,    0,    0,    0],
         [   7, 1037,   38,  ...,    0,    0,    0],
         [   7,  947,  222,  ...,    0,    0,    0]]),
 tensor([1., 1., 1.,  ..., 1., 1., 1.])]

# Embeddings 

In [87]:
# # Creating sentence list of all the training dataframes to create vocabulary later, this would mean a more robust vocab
# sentence_list = []
# for name,df in dict_df_subset.items():
#   if "train" in name:
#     sentence_list.extend(df.TweetTokens.to_list())
# print(len(sentence_list))

In [88]:
# freq_threshold = 3
# vocab_max_size = 50000

# vocab = Vocabulary(freq_threshold, vocab_max_size)
# vocab.build_vocabulary(sentence_list)

In [150]:
global_vectors = GloVe(name='840B', dim=300)


.vector_cache/glove.840B.300d.zip: 2.18GB [06:53, 5.26MB/s]                            
100%|█████████▉| 2196016/2196017 [04:50<00:00, 7568.38it/s]


In [177]:
type(global_vectors)

torchtext.vocab.vectors.GloVe

In [274]:

def load_pretrained_vectors(word2idx, embedding_name = 'glove', embedding_file = global_vectors):
    """Load pretrained vectors and create embedding layers.
    
    Args:
        word2idx - vocab.stoi (Dict): Vocabulary built from the corpus
        embedding_name (str): the type of embedding - glove for GloVe or word2vec for word2vec
        embedding_file (object) :optional embedding file

    Returns:
        embeddings (np.array): Embedding matrix with shape (N, d) where N is
            the size of word2idx and d is embedding dimension
    """

    if embedding_name == 'glove':
      print("Loading pretrained vectors...")
      if embedding_file:
        global_vectors = embedding_file
      else:
        global_vectors = GloVe(name='840B', dim=300)

      print("Processing pretrained vectors...")
      d = 300
      print("\ndimension of pretained embedding: ", d)

      # Initilize random embeddings
      embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), d))
      embeddings[word2idx['<PAD>']] = np.zeros((d,))
      
      # Load pretrained vectors
      count = 0 
      for word in global_vectors.stoi:
        if word in word2idx:
            count +=1
            embeddings[word2idx[word]] = global_vectors[word]
      print(f"There are {count} / {len(word2idx)} pretrained vectors found.")
        
      print("Process Completed...")
      return embeddings

    else:
      print(" Embedding not implemented, returning zero embedding")
      return np.zeros(len(word2idx), 300)
    
    
#     # downloaded word2vec from https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz

#     word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
    
#     print("Processing pretrained vectors...")
#     d = word2vec.vector_size
#     print("\ndimension of pretained embedding: ", d)
    
#     # Initilize random embeddings
#     embeddings = np.random.uniform(-0.25, 0.25, (len(word2idx), d))
#     embeddings[word2idx['<PAD>']] = np.zeros((d,))

#     # Load pretrained vectors
#     count = 0 
#     for word in word2vec.key_to_index:
#         if word in word2idx:
#             count +=1
#             embeddings[word2idx[word]] = word2vec.get_vector(word)
    
    
# #     count = 0
# #     for line in tqdm_notebook(fin):
# #         tokens = line.rstrip().split(' ')
# #         word = tokens[0]
# #         if word in word2idx:
# #             count += 1
# #             embeddings[word2idx[word]] = np.array(tokens[1:], dtype=np.float32)

#     print(f"There are {count} / {len(word2idx)} pretrained vectors found.")
    
#     print("Process Completed...")
#     return embeddings


In [275]:
pre_trained_embeddings = load_pretrained_vectors(vocab.stoi, embedding_name = 'glove', embedding_file = global_vectors)

Loading pretrained vectors...
Processing pretrained vectors...

dimension of pretained embedding:  300
There are 4805 / 5068 pretrained vectors found.
Process Completed...


In [276]:
print(pre_trained_embeddings.shape)

(5068, 300)


In [296]:
embedding_tensor = torch.tensor(pre_trained_embeddings)
embedding_tensor.shape

torch.Size([5068, 300])

# Model Creation

## Gradient Reversal Layer Function

In [297]:
from torch.autograd import Function

class ReverseLayerF(Function):

    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha

        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        output = grad_output.neg() * ctx.alpha

        return output, None

## CNN 1-D Model

Reference: A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification, Ye Zhang, Byron Wallace 2015

Difference: 

1.   use of embedding
2.   use of sigmoid function, as we are having a regression model not a classififer as the main task



In [349]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CNN_SA(nn.Module):
    """An 1D Convulational Neural Network for Sentiment Analysis."""
    def __init__(self,
                 pretrained_embedding=None,
                 freeze_embedding=False,
                 vocab_size=None,
                 embed_dim=300,
                 filter_sizes=[1, 2, 3, 4, 5],
                 num_filters=[ 100, 100, 100, 100, 100],
                 num_classes=2,
                 dropout=0.25):
        """
        The constructor for CNN_SA class.

        Args:
            pretrained_embedding (torch.Tensor): Pretrained embeddings with
                shape (vocab_size, embed_dim)
            freeze_embedding (bool): Set to False to fine-tune pretraiend
                vectors. Default: False
            vocab_size (int): Need to be specified when not pretrained word
                embeddings are not used.
            embed_dim (int): Dimension of word vectors. Need to be specified
                when pretrained word embeddings are not used. Default: 300
            filter_sizes (List[int]): List of filter sizes. Default: [2, 3, 4, 5]
            num_filters (List[int]): List of number of filters, has the same
                length as `filter_sizes`. Default: [100, 100, 100, 100]
            n_classes (int): Number of classes (domain classification usage). 
            Default: 2
            dropout (float): Dropout rate. Default: 0.25
        """

        super(CNN_SA, self).__init__()
        
         #---------------------Feature Extractor Network----------------------#
        # Embedding layer
        if pretrained_embedding is not None:
            self.vocab_size, self.embed_dim = pretrained_embedding.shape
            self.embedding = nn.Embedding.from_pretrained(pretrained_embedding,
                                                          freeze=freeze_embedding)
        else:
            self.embed_dim = embed_dim
            self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                          embedding_dim=self.embed_dim,
                                          padding_idx=0,
                                          max_norm=5.0)
        # Conv Network
        self.feature_extractor = nn.ModuleList([
            nn.Conv1d(in_channels=self.embed_dim,
                      out_channels=num_filters[i],
                      kernel_size=filter_sizes[i])
            for i in range(len(filter_sizes))
        ])
        
        #---------------------Regression Network------------------------#
        # Fully-connected layer and Dropout
        self.regression = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(np.sum(num_filters), np.sum(num_filters)//2),
            nn.ReLU(),
#             nn.BatchNorm1d(np.sum(num_filters)//2),
            # nn.Linear(np.sum(num_filters)//2, num_classes), # for classification
            nn.Linear(np.sum(num_filters)//2, 1), # for regression
            # nn.LogSoftmax(dim=1) # for classification
            nn.Sigmoid() # for regession (values between 0 and 1)
        )
        
        #---------------------Domain Classifier Network------------------------#
        # Fully-connected layer and Dropout
        self.domain_classifier = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(np.sum(num_filters), np.sum(num_filters)//2),
            nn.ReLU(),
#             nn.BatchNorm1d(np.sum(num_filters)//2),
            nn.Linear(np.sum(num_filters)//2, num_classes),
            nn.LogSoftmax(dim=1)
        )
        
        

    def forward(self, input_ids,alpha=1):
        """Perform a forward pass through the network.

        Args:
            input_ids (torch.Tensor): A tensor of token ids with shape
                (batch_size, max_sent_length)

        Returns:
            sigmoid (torch.Tensor) : Output sigmoid 
            logits (torch.Tensor): Output logits with shape (batch_size,
                n_classes)
        """

        # Get embeddings from `input_ids`. Output shape: (b, max_len, embed_dim)
#         input_ids = torch.tensor(input_ids).to(torch.int64)
        input_ids = input_ids.clone().detach().to(torch.int64)
        # print("input_ids.shape", input_ids.shape)
        
        x_embed = self.embedding((input_ids)).float()
        # print("x_embed.shape", x_embed.shape)


        # Permute `x_embed` to match input shape requirement of `nn.Conv1d`.
        # Output shape: (b, embed_dim, max_len)
        x_reshaped = x_embed.permute(0, 2, 1)
        # print("x_reshaped.shape", x_reshaped.shape)
        

#         # Apply CNN and ReLU. Output shape: (b, num_filters[i], L_out)
        x_conv_list = [F.relu(conv1d(x_reshaped)) for conv1d in self.feature_extractor]
        # print("x_conv_list[1].shape", x_conv_list[1].shape)
        # print("x_conv_list[2].shape", x_conv_list[2].shape)

#         # Max pooling. Output shape: (b, num_filters[i], 1)
        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2])
            for x_conv in x_conv_list]
        # print( "x_pool_list[3].shape", x_pool_list[3].shape)
        
#         # Concatenate x_pool_list to feed the fully connected layer.
#         # Output shape: (b, sum(num_filters))
        x_feature = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list],
                         dim=1)
        # print("x_feature.shape", x_feature.shape)
        
# #         # Compute logits. Output shape: (b, n_classes)
#         logits = self.fc(self.dropout(x_feature))
#         print(logits)

        reverse_feature = ReverseLayerF.apply(x_feature, alpha)
        # print("reverse_feature",reverse_feature)
    
        regression_output = self.regression(x_feature)
    
        domain_classifier_output = self.domain_classifier(reverse_feature)
#         print(domain_classifier_logits.shape)

#         return logits
        return regression_output, domain_classifier_output


In [350]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = CNN_SA(pretrained_embedding=embedding_tensor,freeze_embedding=True).to(device)
print(model)


cpu
CNN_SA(
  (embedding): Embedding(5068, 300)
  (feature_extractor): ModuleList(
    (0): Conv1d(300, 100, kernel_size=(1,), stride=(1,))
    (1): Conv1d(300, 100, kernel_size=(2,), stride=(1,))
    (2): Conv1d(300, 100, kernel_size=(3,), stride=(1,))
    (3): Conv1d(300, 100, kernel_size=(4,), stride=(1,))
    (4): Conv1d(300, 100, kernel_size=(5,), stride=(1,))
  )
  (regression): Sequential(
    (0): Dropout(p=0.25, inplace=False)
    (1): Linear(in_features=500, out_features=250, bias=True)
    (2): ReLU()
    (3): Linear(in_features=250, out_features=1, bias=True)
    (4): Sigmoid()
  )
  (domain_classifier): Sequential(
    (0): Dropout(p=0.25, inplace=False)
    (1): Linear(in_features=500, out_features=250, bias=True)
    (2): ReLU()
    (3): Linear(in_features=250, out_features=2, bias=True)
    (4): LogSoftmax(dim=1)
  )
)


In [351]:


for name, train_loader in dict_train_loader.items():
  tweet, intensity, domain = next(iter(train_loader))
  print (name, "\n", tweet, intensity)
  a, b = model(tweet)
  print ("a - sigmoid output\n", a)
  print ("b - softmax output\n", b)

df_EI_reg_train_subset_dataset_dataloader 
 tensor([[   5,    5,    5,  ...,    0,    0,    0],
        [   5,  519,  115,  ...,    0,    0,    0],
        [   5,    5,   52,  ...,    0,    0,    0],
        ...,
        [   5,  134,   58,  ...,    0,    0,    0],
        [  13,   44, 1816,  ...,    0,    0,    0],
        [  49,   15,  341,  ...,    0,    0,    0]]) tensor([0.3330, 0.3330, 0.6460, 0.9000, 0.5000, 0.5420, 0.7080, 0.6250],
       dtype=torch.float64)
a - sigmoid output
 tensor([[0.4808],
        [0.4904],
        [0.4559],
        [0.4635],
        [0.4850],
        [0.4949],
        [0.4556],
        [0.4687]], grad_fn=<SigmoidBackward0>)
b - softmax output
 tensor([[-0.7309, -0.6568],
        [-0.6815, -0.7049],
        [-0.6494, -0.7389],
        [-0.6257, -0.7655],
        [-0.6589, -0.7286],
        [-0.6507, -0.7375],
        [-0.6691, -0.7178],
        [-0.6758, -0.7108]], grad_fn=<LogSoftmaxBackward0>)
df_V_reg_train_subset_dataset_dataloader 
 tensor([[519,   1

# Training and Test Function

## Typical Training, Test Function (without Domain Adaptation)

### Typical Training Function (without domain adapatation)

In [352]:
# Training Function

from tqdm import tqdm # for beautiful model training updates


def train_model(model, device, train_loader, optimizer, epoch):
    model.train() # setting the model in training mode
    pbar = tqdm(train_loader) # putting the iterator in pbara
    correct = 0 # for accuracy numerator
    processed =0 # for accuracy denominator
    epoch_loss = 0.0
    for batch_idx, batch in enumerate(pbar):

        tweets, intensities = batch[0].to(device), batch[1].float().to(device)  # plural, we are not interested in domain
        #sending data to CPU or GPU as per device

        optimizer.zero_grad() # setting gradients to zero to avoid accumulation

        y_preds, _ = model(tweets) # forward pass, result captured in y_preds (plural as there are many body in a batch)
        # we are not interested in domain prediction
        # the predictions are in one hot vector

        loss = F.mse_loss(y_preds,intensities.unsqueeze(1)) # Computing loss

        train_losses.append(loss) # to capture loss over many epochs

        loss.backward() # backpropagation
        optimizer.step() # updating the params

        # preds = y_preds.argmax(dim=1, keepdim=True)  # get the index olf the max log-probability
        # correct += preds.eq(labels.view_as(preds)).sum().item()
        epoch_loss += loss.item()

        processed += len(tweets)

        pbar.set_description(desc= f'Loss={loss.item()} Batch_id={batch_idx} Epoch Average loss={100*epoch_loss/processed:0.4f}')
    train_accuracy.append(100*epoch_loss/len(train_loader))

### Typical Test Function 

In [353]:
def test_model(model,device, test_loader, mode= 'test'):
    model.eval() # setting the model in evaluation mode
    loss = 0
    correct = 0 # for accuracy numerator

    with torch.no_grad():
        for (tweets, intensities, domain) in test_loader:

            tweets, intensities  = tweets.to(device),intensities.float().to(device), #sending data to CPU or GPU as per device
            # we are not interested in domains
            
            outputs,_ = model(tweets) # forward pass, result captured in outputs (plural as there are many bodies in a batch)
            # the outputs are in batch size x one hot vector 
            # not interested in domain output

            loss = F.mse_loss(outputs,intensities.unsqueeze(1))


        loss /= len(test_loader.dataset) # average test loss
        if mode == 'test':
          test_losses.append(loss) # to capture loss over many batches
          print('...Average test loss: {:.8f}'.format(loss))
        else:
          val_losses.append(loss) # to capture loss over many batches
          print('...Average val loss: {:.8f}'.format(loss))


### Training and Testing loop

In [354]:
# EXECUTION

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

typical_model =  CNN_SA(pretrained_embedding=embedding_tensor,
                  freeze_embedding=True,
                 ).to(device)


optimizer = optim.Adam(typical_model.parameters(), lr=0.001)

# train_losses = [] # to capture train losses over training epochs
# train_accuracy = [] # to capture train accuracy over training epochs
# val_losses = [] # to capture validation loss
# test_losses = [] # to capture test losses 
# test_accuracy = [] # to capture test accuracy 

EPOCHS = 2
# EPOCHS = 5
dict_val_loss = {}
dict_test_loss = {}

for train_name, train_loader in dict_train_loader.items():
  name = "_".join(train_name.split("_")[1:3])
  train_losses = [] # to capture train losses over training epochs
  train_accuracy = [] # to capture train accuracy over training epochs
  val_losses = [] # to capture validation loss
  test_losses = [] # to capture test losses 
  print(f'----------------------training started for {name}-----------------')
  for epoch in range(EPOCHS):
    print("EPOCH:", epoch+1)
    train_model(typical_model, device, train_loader, optimizer, epoch)

    print("\nfor validation.......")
    val_name = train_name.replace("train", "val" )
    test_model(typical_model, device, dict_val_loader[val_name], mode = 'val')
    


    print("for test  .......")
    test_name = train_name.replace("train", "test" )
    test_model(typical_model, device, dict_test_loader[test_name], mode = 'test')

  dict_val_loss[name] = val_losses
  dict_test_loss[name] = test_losses

  model_name = name+".pt"
  torch.save(typical_model.state_dict(), os.path.join(MODEL_DIR, model_name))
  print(f'----------------------training complete for {name}-----------------')
print(dict_val_loss.items())
print(dict_test_loss.items())

cpu
----------------------training started for EI_reg-----------------
EPOCH: 1


Loss=0.021468039602041245 Batch_id=212 Epoch Average loss=0.2867: 100%|██████████| 213/213 [00:25<00:00,  8.35it/s]


for validation.......





...Average val loss: 0.00007197
for test  .......
...Average test loss: 0.00002696
EPOCH: 2


Loss=0.010277757421135902 Batch_id=212 Epoch Average loss=0.1514: 100%|██████████| 213/213 [00:27<00:00,  7.81it/s]


for validation.......





...Average val loss: 0.00006732
for test  .......
...Average test loss: 0.00002614
----------------------training complete for EI_reg-----------------
----------------------training started for V_reg-----------------
EPOCH: 1


Loss=0.08481224626302719 Batch_id=147 Epoch Average loss=0.4969: 100%|██████████| 148/148 [00:16<00:00,  8.71it/s]


for validation.......





...Average val loss: 0.00004720
for test  .......
...Average test loss: 0.00002206
EPOCH: 2


Loss=0.009592588059604168 Batch_id=147 Epoch Average loss=0.1911: 100%|██████████| 148/148 [00:17<00:00,  8.64it/s]


for validation.......





...Average val loss: 0.00004452
for test  .......
...Average test loss: 0.00002076
----------------------training complete for V_reg-----------------
dict_items([('EI_reg', [tensor(7.1968e-05), tensor(6.7316e-05)]), ('V_reg', [tensor(4.7198e-05), tensor(4.4523e-05)])])
dict_items([('EI_reg', [tensor(2.6958e-05), tensor(2.6142e-05)]), ('V_reg', [tensor(2.2058e-05), tensor(2.0761e-05)])])


In [356]:
print(dict_val_loss.items())
print(dict_test_loss.items())

dict_items([('EI_reg', [tensor(7.1968e-05), tensor(6.7316e-05)]), ('V_reg', [tensor(4.7198e-05), tensor(4.4523e-05)])])
dict_items([('EI_reg', [tensor(2.6958e-05), tensor(2.6142e-05)]), ('V_reg', [tensor(2.2058e-05), tensor(2.0761e-05)])])


In [None]:
# next(iter(dict_train_loader['df_EI_reg_train_subset_dataset_dataloader']))

## Domain Adaptation Training Function (using DANN)

### DA Training Function (using DANN)

In [355]:
# DANN Training Function

def dann_train_model(model, device, train_source_loader, train_target_loader,optimizer, epoch, num_epochs):
    model.train() # setting the model in training mode
    len_dataloader = min(len(train_source_loader), len(train_target_loader)) # training for minimum of two dataloaders
    
    i = 0 # as the training progresses the alpha changes
    while i < len_dataloader -1:
        
        # implementation of alpha as per paper
        p = float(i + epoch * len_dataloader) / (num_epochs * len_dataloader)
        alpha = 2. / (1. + np.exp(-10 * p)) - 1
        alpha = torch.tensor(alpha)
        
        # training model using source data
        source_batch = next(iter(train_source_loader))
        source_bodies, source_labels, source_domains = source_batch[0].to(device),source_batch[1].float().to(device),source_batch[2].long().to(device) # plural
        source_batch_size = len(source_labels)
        
        optimizer.zero_grad() # setting gradients to zero to avoid accumulation
        
        y_preds, source_domain_outputs  = model(source_bodies, alpha = alpha) # forward pass, plural results
        
        loss_source_label =  F.mse_loss(y_preds,source_labels.unsqueeze(1)) # Computing loss, regression loss
        loss_source_domain = F.nll_loss(source_domain_outputs, source_domains) # classificaiton loss
        

    #     train_losses.append(loss) # to capture loss over many epochs

    #     loss.backward() # backpropagation
    #     optimizer.step() # updating the params

    #     # preds = y_preds.argmax(dim=1, keepdim=True)  # get the index olf the max log-probability
    #     # correct += preds.eq(labels.view_as(preds)).sum().item()
    #     epoch_loss += loss.item()

    #     processed += len(tweets)

    #     pbar.set_description(desc= f'Loss={loss.item()} Batch_id={batch_idx} Epoch Average loss={100*epoch_loss/processed:0.4f}')
    # train_accuracy.append(100*epoch_loss/len(train_loader))



        # training model using target data
        target_batch = next(iter(train_target_loader))
#         target_bodies, _ , target_domains = tuple(t.to(device) for t in target_batch) # plural, we are not interesed in  label
        target_bodies, target_domains = target_batch[0].to(device),target_batch[2].long().to(device) # plural, we are not interesed in  label

        target_batch_size = len(target_domains)
        
        _ , target_domain_outputs = model(target_bodies, alpha = alpha) # forward pass, plural results, we are not interested in label
        
        loss_target_domain = F.nll_loss(target_domain_outputs, target_domains)
        
        loss = loss_source_label + loss_source_domain + loss_target_domain
        
        train_losses.append(loss)
        
        loss.backward() # backpropagation
        optimizer.step() # updating the params
        
        if ((i + 1) % 100 == 0):
                print("Epoch [{}/{}] Step [{}/{}]: domain_loss_target={:.4f} / domain_loss_source={:.4f} / class_loss_source={:.4f}"
                      .format(epoch + 1,
                              num_epochs,
                              i + 1,
                              len_dataloader,
                              loss_target_domain.item()
                              ,loss_source_domain.item()
                              ,loss_source_label.item()))
        
        i = i+1
        
    # torch.save(model.state_dict(),"DANN-{}.pt".format(epoch + 1))

    

### DA Test Function (using DANN)

In [358]:
from logging import logProcesses
# DANN Test Function

def dann_test_model(model, device, test_loader , domain = domain_source, mode = 'test'):
    model.eval() # setting the model in evaluation mode
    alpha = 0
    loss = 0
    correct = 0  # for accuracy numerator
    if domain == domain_source:
        domain_str = "source"
    else:
        domain_str = "target"
    
    with torch.no_grad():
        for (bodies,labels,domains) in test_loader:
            
            bodies, labels, domains = bodies.to(device), labels.float().to(device), domains.long().to(device)#sending data to CPU or GPU as per device
            outputs, _  = model(bodies, alpha = alpha) # forward pass, plural results
            # the outputs are in batch size x one hot vector 
            # not interested in domain output

            loss = F.mse_loss(outputs,labels.unsqueeze(1))


            
        loss /= len(test_loader.dataset) # average test loss
            
        if mode == 'test':
          dann_test_losses.append(loss) # to capture loss over many batches
          print('...Average test loss: {:.8f},  domain: {} \n'.format(loss, domain_str))
        else:
          dann_val_losses.append(loss) # to capture loss over many batches
          print('...Average val loss: {:.8f},  domain: {} \n'.format(loss, domain_str))
                        
        

### DA (DANN) Train and Test Loop

In [359]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
dann_model =  CNN_SA(pretrained_embedding=embedding_tensor,
                  freeze_embedding=True).to(device)


optimizer = optim.Adam(dann_model.parameters(), lr=0.001)

# train_losses = [] # to capture train losses over training epochs
# train_accuracy = [] # to capture train accuracy over training epochs

# test_losses_source = [] # to capture test losses 
# test_losses_target = [] # to capture test losses
# test_accuracy_source = [] # to capture test accuracy 
# test_accuracy_target = [] # to capture test accuracy

# EPOCHS = 2
EPOCHS = 10
dict_dann_val_loss = {}
dict_dann_test_loss = {}

for train_name, train_loader in dict_train_loader.items():
  name = "_".join(train_name.split("_")[1:3])

  dann_train_losses = [] # to capture train losses over training epochs
  dann_train_accuracy = [] # to capture train accuracy over training epochs
  dann_val_losses = [] # to capture validation loss
  dann_test_losses = [] # to capture test losses 

  print(f'----------------------training started for {name}-----------------')
  for epoch in range(EPOCHS):
    print("EPOCH:", epoch+1)
    dann_train_model(dann_model, device, train_loader, da_dataset_loader, optimizer, epoch, num_epochs = EPOCHS)
    print("for val.......")
    val_name = train_name.replace("train", "val" )
    dann_test_model(dann_model, device, dict_val_loader[val_name], domain = domain_source, mode = 'val')
    print("for test......")
    test_name = train_name.replace("train", "test" )
    dann_test_model(dann_model, device, dict_test_loader[test_name], domain = domain_source, mode = 'test')
  print(f'----------------------training complete for {name}-----------------')
  
  dict_dann_val_loss[name] = dann_val_losses
  dict_dann_test_loss[name] = dann_test_losses

  model_name = name+"_dann.pt"
  torch.save(dann_model.state_dict(), os.path.join(MODEL_DIR, model_name))
  print(f'----------------------Model: {model_name } saved complete for {name}-----------------')  
print("---------------------DANN--training complete-----------------")
print(dict_dann_val_loss.items())
print(dict_dann_test_loss.items())

cpu
----------------------training started for EI_reg-----------------
EPOCH: 1
for val.......
...Average val loss: 0.00010251,  domain: source 

for test......
...Average test loss: 0.00003599,  domain: source 

EPOCH: 2
for val.......
...Average val loss: 0.00010251,  domain: source 

for test......
...Average test loss: 0.00003599,  domain: source 

EPOCH: 3
for val.......
...Average val loss: 0.00010251,  domain: source 

for test......
...Average test loss: 0.00003599,  domain: source 

EPOCH: 4
for val.......
...Average val loss: 0.00010251,  domain: source 

for test......
...Average test loss: 0.00003599,  domain: source 

EPOCH: 5
for val.......
...Average val loss: 0.00010251,  domain: source 

for test......
...Average test loss: 0.00003599,  domain: source 

EPOCH: 6
for val.......
...Average val loss: 0.00010251,  domain: source 

for test......
...Average test loss: 0.00003599,  domain: source 

EPOCH: 7
for val.......
...Average val loss: 0.00010251,  domain: source 

fo

In [327]:
print(dict_dann_val_loss.items())
print(dict_dann_test_loss.items())


dict_items([('EI_reg', [tensor(0.0001), tensor(0.0001)]), ('V_reg', [tensor(0.0001), tensor(0.0001)])])
dict_items([('EI_reg', [tensor(3.5896e-05), tensor(3.5896e-05)]), ('V_reg', [tensor(5.0222e-05), tensor(5.0222e-05)])])


In [3]:
name = 'df_EI_reg_val_subset_dataset_dataloader'
"_".join(name.split("_")[1:3])

'EI_reg'