<a href="https://colab.research.google.com/github/peeyushsinghal/da/blob/main/mitigating_bias_sa_da_v10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mitigating bias in sentiment analysis using domain adaptation

In [1]:
! pip install torchtext==0.10.0 --quiet # DOWNGRADE YOUR TORCHTEXT
! pip install ekphrasis --quiet # library to pre process twitter data
! pip install emoji --upgrade --quiet #library to deal with emoji data

In [2]:
## Import statements
import pandas as pd
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchtext.legacy.data import Dataset, Field, TabularDataset, BucketIterator
from torchtext.vocab import GloVe
# import torchtext.functional as TTF
import numpy as np
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import emoji
from torchtext.legacy.vocab import Vectors
from tqdm import tqdm
import random
import torch.optim as optim
# import json
# Importing library
import scipy.stats as stats
from statistics import mean

In [3]:
# checking device
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on:{}".format(DEVICE))

Running on:cpu


## Data loading

In [4]:
#Mounting google drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Data Configuration

In [5]:
BASE_PATH = '/content/drive/MyDrive/semeval-2018'

DATA_DIR = os.path.join(BASE_PATH,'datasets')
TARGET_DIR = os.path.join(BASE_PATH,'targetdataset')

MODEL_DIR = os.path.join(BASE_PATH,'models')
REF_DIR = os.path.join(BASE_PATH,'reference')

MAX_SIZE = 50
MAX_VOCAB_SIZE = 10000
BATCH_SIZE = 8

TARGET_BATCH_SIZE = 8

if not os.path.exists(MODEL_DIR):
  os.makedirs(MODEL_DIR)
  print("The new directory is created!")



In [6]:
# data configuration

class TASK1(object):
  
    EI_reg = {
        'anger': {
            'train': os.path.join(
                DATA_DIR, 'task1/EI-reg/training/EI-reg-En-anger-train.txt'),
            'dev': os.path.join(
                DATA_DIR, 'task1/EI-reg/development/2018-EI-reg-En-anger-dev.txt'),
            'gold': os.path.join(
                DATA_DIR, 'task1/EI-reg/test-gold/2018-EI-reg-En-anger-test-gold.txt')
                },
        'joy': {
                'train': os.path.join(
                    DATA_DIR, 'task1/EI-reg/training/EI-reg-En-joy-train.txt'),
                'dev': os.path.join(
                    DATA_DIR, 'task1/EI-reg/development/2018-EI-reg-En-joy-dev.txt'),
                'gold': os.path.join(
                    DATA_DIR, 'task1/EI-reg/test-gold/2018-EI-reg-En-joy-test-gold.txt')
                },
        'fear': {
            'train': os.path.join(
                DATA_DIR, 'task1/EI-reg/training/EI-reg-En-fear-train.txt'),
            'dev': os.path.join(
                DATA_DIR, 'task1/EI-reg/development/2018-EI-reg-En-fear-dev.txt'),
            'gold': os.path.join(
                DATA_DIR, 'task1/EI-reg/test-gold/2018-EI-reg-En-fear-test-gold.txt')
                },
        'sadness': {
            'train': os.path.join(
                DATA_DIR, 'task1/EI-reg/training/EI-reg-En-sadness-train.txt'),
            'dev': os.path.join(
                DATA_DIR, 'task1/EI-reg/development/2018-EI-reg-En-sadness-dev.txt'),
            'gold': os.path.join(
                DATA_DIR, 'task1/EI-reg/test-gold/2018-EI-reg-En-sadness-test-gold.txt')
                }                     
        }

    V_reg = {
        'train': os.path.join(
            DATA_DIR, 'task1/V-reg/2018-Valence-reg-En-train.txt'),
        'dev': os.path.join(
            DATA_DIR, 'task1/V-reg/2018-Valence-reg-En-dev.txt'),
        'gold': os.path.join(
            DATA_DIR, 'task1/V-reg/2018-Valence-reg-En-test-gold.txt')
             }

    EEC = {
        'eec': os.path.join(
            DATA_DIR, 'task1/Equity-Evaluation-Corpus/Equity-Evaluation-Corpus.csv')
             }

## Source Data
Parsing Emotion and Valence regression data : `format [ID	Tweet	Affect Dimension	Intensity Score]`

In [7]:
def parse_reg(data_file, label_format='tuple'):#->  pd.DataFrame:
    """
    This is for datasets for the EI-reg and V-reg English tasks 
    Returns:
        df: dataframe with columns in the first row of file [ID-Tweet-Affect Dimension-Intensity Score]
    """
    with open(data_file, 'r') as fd:
      data = [l.strip().split('\t') for l in fd.readlines()]
    
    # print(data)
    df = pd.DataFrame (data[1:],columns=data[0])
    csv_file_name = (data_file.split("/")[-1]).split('.')[0]+".csv"
    csv_file = df.to_csv(str(csv_file_name))
    # df['domain'] = domain_source
    return csv_file_name
    # return df

Generic Source Data Parser

In [8]:
def parse_csv(task, dataset, emotion='anger'):
    if task == 'EI-reg':
        data_train = TASK1.EI_reg[emotion][dataset]
        csv_file_name = parse_reg(data_train)
        return csv_file_name
        # df[df.columns[-1]] = df[df.columns[-1]].astype(float)
        # return df
    elif task == 'V-reg':
        data_train = TASK1.V_reg[dataset]
        # df = parse_reg(data_train)
        # print(data_train)
        csv_file_name = parse_reg(data_train)
        return csv_file_name
        # df[df.columns[-1]] = df[df.columns[-1]].astype(float)
        # return df
    else:
        return None

In [9]:
emotions = ['anger','joy','fear','sadness']
dict_data ={'train':'train','dev':'val','gold':'test'}
dict_file_name ={}
for emotion in emotions:
  for data_info, data_usage in dict_data.items():
    file_name = str('file_EI_'+ emotion + "_" + data_usage)
    dict_file_name[file_name] = parse_csv('EI-reg', data_info, emotion)

    file_name2 = str('file_V_'+ data_usage)
    dict_file_name[file_name2] = parse_csv('V-reg', data_info)

# file_EI_reg_train = parse_csv('EI-reg','train','joy')
# file_EI_reg_val = parse_csv('EI-reg','dev','joy')
# file_EI_reg_test = parse_csv('EI-reg','gold','joy')

(dict_file_name)

{'file_EI_anger_train': 'EI-reg-En-anger-train.csv',
 'file_V_train': '2018-Valence-reg-En-train.csv',
 'file_EI_anger_val': '2018-EI-reg-En-anger-dev.csv',
 'file_V_val': '2018-Valence-reg-En-dev.csv',
 'file_EI_anger_test': '2018-EI-reg-En-anger-test-gold.csv',
 'file_V_test': '2018-Valence-reg-En-test-gold.csv',
 'file_EI_joy_train': 'EI-reg-En-joy-train.csv',
 'file_EI_joy_val': '2018-EI-reg-En-joy-dev.csv',
 'file_EI_joy_test': '2018-EI-reg-En-joy-test-gold.csv',
 'file_EI_fear_train': 'EI-reg-En-fear-train.csv',
 'file_EI_fear_val': '2018-EI-reg-En-fear-dev.csv',
 'file_EI_fear_test': '2018-EI-reg-En-fear-test-gold.csv',
 'file_EI_sadness_train': 'EI-reg-En-sadness-train.csv',
 'file_EI_sadness_val': '2018-EI-reg-En-sadness-dev.csv',
 'file_EI_sadness_test': '2018-EI-reg-En-sadness-test-gold.csv'}

## Preprocess tweets

In [10]:
# reference : https://github.com/cbaziotis/ekphrasis


text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading twitter - 1grams ...
Reading twitter - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


Reading twitter - 1grams ...


In [11]:
# #### Example checks of pre-processing
# sentences = [
#     "CANT WAIT for the new season of #TwinPeaks ＼(^o^)／!!! #davidlynch #tvseries :)))",
#     "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies :/",
#     "@SentimentSymp:  can't wait for the Nov 9 #Sentiment talks!  YAAAAAAY !!! :-D http://sentimentsymposium.com/.",
#     "@MGBarbieri @SpalkTalk a@b.com And just saw your LinkedIn comment after I sent this! Thanks for the message :) 😀",
#     "💙💛🏆 @GeorgeePitman Young Player of The Season 🏆💛💙 #irony #actuallyseventy"
# ]

# for s in sentences:
#     print(" ".join(text_processor.pre_process_doc(s)))
# # print ([text_processor.pre_process_doc(s) for s in sentences])

In [12]:
def preprocess_tweet(tweet): 
  tweet_processed = text_processor.pre_process_doc(tweet)
  # print (tweet_processed)
  final_list =[]
  for index, tweet in enumerate(tweet_processed):
      final_list.append(emoji.demojize(tweet, language = 'en'))
  
  # print(df)
  return final_list

In [13]:
# #### Example checks of pre-processing
# sentences = [
#     "CANT WAIT for the new season of #TwinPeaks ＼(^o^)／!!! #davidlynch #tvseries :)))",
#     "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies :/",
#     "@SentimentSymp:  can't wait for the Nov 9 #Sentiment talks!  YAAAAAAY !!! :-D http://sentimentsymposium.com/.",
#     "@MGBarbieri @SpalkTalk a@b.com And just saw your LinkedIn comment after I sent this! Thanks for the message :) 😀",
#     "💙💛🏆 @GeorgeePitman Young Player of The Season 🏆💛💙 #irony #actuallyseventy"
# ]

# for s in sentences:
#   print(preprocess_tweet(s))
#   # print(" ".join(preprocess_tweet(s)))

## TorchText Treatment

In [22]:
# # ORIGINAL
# field_tweet = Field(sequential=True, use_vocab = True, tokenize = preprocess_tweet, fix_length = MAX_SIZE, batch_first = True)
# field_intensity = Field(sequential= False, 
#                         dtype = torch.float,
#                         use_vocab = False 
#                         )

In [23]:
# fields = {
#     'Tweet':('tweet', field_tweet ), #
#     'Intensity Score': ('intensity',field_intensity) # Intensity Score is name of the dataset column, field_intensity is how we have defined the field, intensity is the name of the variable going fwd
#     }

In [None]:
# emotions = ['anger','joy','fear','sadness']
# dict_data ={'train':'train','dev':'val','gold':'test'}

In [14]:
dict_file_name.keys()

dict_keys(['file_EI_anger_train', 'file_V_train', 'file_EI_anger_val', 'file_V_val', 'file_EI_anger_test', 'file_V_test', 'file_EI_joy_train', 'file_EI_joy_val', 'file_EI_joy_test', 'file_EI_fear_train', 'file_EI_fear_val', 'file_EI_fear_test', 'file_EI_sadness_train', 'file_EI_sadness_val', 'file_EI_sadness_test'])

In [15]:
dict_fields ={}
list_name = list(set(["_".join(key.split("_")[1:-1]) for key in list(dict_file_name.keys())]))

for name in list_name:
  field_tweet = Field(sequential=True, 
                      use_vocab = True, 
                      tokenize = preprocess_tweet, 
                      fix_length = MAX_SIZE, 
                      batch_first = True)
  field_intensity = Field(sequential= False, 
                        dtype = torch.float,
                        use_vocab = False)
  fields = {
    'Tweet':('tweet', field_tweet ), #
    'Intensity Score': ('intensity',field_intensity) # Intensity Score is name of the dataset column, field_intensity is how we have defined the field, intensity is the name of the variable going fwd
    }
  
  dict_fields[name] = fields

  # dict_fields[name]= { 'field_tweet': Field(sequential=True,
  #                                        use_vocab = True,
  #                                        tokenize = preprocess_tweet,
  #                                        fix_length = MAX_SIZE,
  #                                        batch_first = True ), 
  #                           'field_intensity': Field(sequential= False,
  #                                              dtype = torch.float,
  #                                              use_vocab = False )}

dict_fields

{'EI_fear': {'Tweet': ('tweet',
   <torchtext.legacy.data.field.Field at 0x7f457aa60750>),
  'Intensity Score': ('intensity',
   <torchtext.legacy.data.field.Field at 0x7f457aa60ad0>)},
 'EI_sadness': {'Tweet': ('tweet',
   <torchtext.legacy.data.field.Field at 0x7f457aa60d90>),
  'Intensity Score': ('intensity',
   <torchtext.legacy.data.field.Field at 0x7f457aa609d0>)},
 'EI_anger': {'Tweet': ('tweet',
   <torchtext.legacy.data.field.Field at 0x7f457aa60d50>),
  'Intensity Score': ('intensity',
   <torchtext.legacy.data.field.Field at 0x7f457aa60690>)},
 'V': {'Tweet': ('tweet',
   <torchtext.legacy.data.field.Field at 0x7f457aa60b50>),
  'Intensity Score': ('intensity',
   <torchtext.legacy.data.field.Field at 0x7f457aa60b10>)},
 'EI_joy': {'Tweet': ('tweet',
   <torchtext.legacy.data.field.Field at 0x7f457aa60a10>),
  'Intensity Score': ('intensity',
   <torchtext.legacy.data.field.Field at 0x7f457aa60bd0>)}}

In [16]:
dict_fields['EI_sadness']['Tweet'][1]

<torchtext.legacy.data.field.Field at 0x7f457aa60d90>

In [28]:
# WORKING CODE 
#dict_fields ={}
# list_name = list(set(["_".join(key.split("_")[1:-1]) for key in list(dict_file_name.keys())]))

# for name in list_name:
#   dict_fields[name]= { 'field_tweet': Field(sequential=True,
#                                          use_vocab = True,
#                                          tokenize = preprocess_tweet,
#                                          fix_length = MAX_SIZE,
#                                          batch_first = True ), 
#                             'field_intensity': Field(sequential= False,
#                                                dtype = torch.float,
#                                                use_vocab = False )}

# dict_fields

{'EI_sadness': {'field_tweet': <torchtext.legacy.data.field.Field at 0x7f7431f71dd0>,
  'field_intensity': <torchtext.legacy.data.field.Field at 0x7f7431f71c90>},
 'EI_fear': {'field_tweet': <torchtext.legacy.data.field.Field at 0x7f7431f71f10>,
  'field_intensity': <torchtext.legacy.data.field.Field at 0x7f7431f71510>},
 'EI_anger': {'field_tweet': <torchtext.legacy.data.field.Field at 0x7f743278f910>,
  'field_intensity': <torchtext.legacy.data.field.Field at 0x7f743278f350>},
 'EI_joy': {'field_tweet': <torchtext.legacy.data.field.Field at 0x7f743278f7d0>,
  'field_intensity': <torchtext.legacy.data.field.Field at 0x7f743278f250>},
 'V': {'field_tweet': <torchtext.legacy.data.field.Field at 0x7f743278f850>,
  'field_intensity': <torchtext.legacy.data.field.Field at 0x7f743278fbd0>}}

In [None]:
# dict_field_values = {}
# for name in list_name:
#   dict_field_values[name] = {
      
#   }


  # fields = {
#     'Tweet':('tweet', field_tweet ), #
#     'Intensity Score': ('intensity',field_intensity) # Intensity Score is name of the dataset column, field_intensity is how we have defined the field, intensity is the name of the variable going fwd
#     }

In [None]:
# dict_fields ={}
# for name in list(dict_dataset.keys()):
#   print(name,dict_dataset[name]["train_dataset"])
#   dict_fields[name]= { 'field_tweet': Field(sequential=True,
#                                          use_vocab = True,
#                                          tokenize = preprocess_tweet,
#                                          fix_length = MAX_SIZE,
#                                          batch_first = True ), 
#                             'field_intensity': Field(sequential= False,
#                                                dtype = torch.float,
#                                                use_vocab = False )}

In [18]:
dict_dataset ={}
for file_key, file_name in dict_file_name.items():
  # print(file_key,file_name)
  if "train" in (file_key.split("_")[-1]):
    head_name = "_".join(file_key.split("_")[0:-1])
    base_name = "_".join(file_key.split("_")[1:-1])
    # print(base_name)
    train_file = dict_file_name[head_name+"_train"]
    val_file = dict_file_name[head_name+"_val"]
    test_file =  dict_file_name[head_name+"_test"]

    train, val, test =TabularDataset.splits( path = './', 
                                            train = train_file, 
                                            validation = val_file, 
                                            test = test_file,
                                            format = 'csv', 
                                            fields = dict_fields[base_name])
    
    # print(train_file,val_file,test_file)
    
    # dict_dataset[base_name+"_train"], dict_dataset[base_name+"_val"],dict_dataset[base_name+"_test"]=TabularDataset.splits( path = './',
    #                                                                                                                        train = train_file,
    #                                                                                                                        validation = val_file,
    #                                                                                                                        test = test_file,
    #                                                                                                                        format = 'csv',
    #                                                                                                                        fields = fields)
    dict_dataset[base_name] = {"train_dataset": train, "val_dataset":val,"test_dataset":test}

In [24]:
# ## BEFORE USING DICT_FIELDS, USING GENERIC FIELDS HERE

# dict_dataset ={}
# for file_key, file_name in dict_file_name.items():
#   # print(file_key,file_name)
#   if "train" in (file_key.split("_")[-1]):
#     head_name = "_".join(file_key.split("_")[0:-1])
#     base_name = "_".join(file_key.split("_")[1:-1])
#     # print(base_name)
#     train_file = dict_file_name[head_name+"_train"]
#     val_file = dict_file_name[head_name+"_val"]
#     test_file =  dict_file_name[head_name+"_test"]

#     train, val, test =TabularDataset.splits( path = './', 
#                                             train = train_file, 
#                                             validation = val_file, 
#                                             test = test_file,
#                                             format = 'csv', 
#                                             fields = fields)
    
#     # print(train_file,val_file,test_file)
    
#     # dict_dataset[base_name+"_train"], dict_dataset[base_name+"_val"],dict_dataset[base_name+"_test"]=TabularDataset.splits( path = './',
#     #                                                                                                                        train = train_file,
#     #                                                                                                                        validation = val_file,
#     #                                                                                                                        test = test_file,
#     #                                                                                                                        format = 'csv',
#     #                                                                                                                        fields = fields)
#     dict_dataset[base_name] = {"train_dataset": train, "val_dataset":val,"test_dataset":test}

In [19]:
dict_dataset

{'EI_anger': {'train_dataset': <torchtext.legacy.data.dataset.TabularDataset at 0x7f4575d16ad0>,
  'val_dataset': <torchtext.legacy.data.dataset.TabularDataset at 0x7f4579591690>,
  'test_dataset': <torchtext.legacy.data.dataset.TabularDataset at 0x7f457761d550>},
 'V': {'train_dataset': <torchtext.legacy.data.dataset.TabularDataset at 0x7f45777c4f90>,
  'val_dataset': <torchtext.legacy.data.dataset.TabularDataset at 0x7f45773892d0>,
  'test_dataset': <torchtext.legacy.data.dataset.TabularDataset at 0x7f4576e587d0>},
 'EI_joy': {'train_dataset': <torchtext.legacy.data.dataset.TabularDataset at 0x7f4576e64390>,
  'val_dataset': <torchtext.legacy.data.dataset.TabularDataset at 0x7f4576e5c110>,
  'test_dataset': <torchtext.legacy.data.dataset.TabularDataset at 0x7f457738c950>},
 'EI_fear': {'train_dataset': <torchtext.legacy.data.dataset.TabularDataset at 0x7f457a545890>,
  'val_dataset': <torchtext.legacy.data.dataset.TabularDataset at 0x7f457827fd90>,
  'test_dataset': <torchtext.legacy

In [25]:
dict_dataset

{'EI_anger': {'train_dataset': <torchtext.legacy.data.dataset.TabularDataset at 0x7fb5edfa3390>,
  'val_dataset': <torchtext.legacy.data.dataset.TabularDataset at 0x7fb5ed75b450>,
  'test_dataset': <torchtext.legacy.data.dataset.TabularDataset at 0x7fb5eb0bb490>},
 'V': {'train_dataset': <torchtext.legacy.data.dataset.TabularDataset at 0x7fb5d12d9450>,
  'val_dataset': <torchtext.legacy.data.dataset.TabularDataset at 0x7fb5d1336c90>,
  'test_dataset': <torchtext.legacy.data.dataset.TabularDataset at 0x7fb5d2044ad0>},
 'EI_joy': {'train_dataset': <torchtext.legacy.data.dataset.TabularDataset at 0x7fb5d0a1c3d0>,
  'val_dataset': <torchtext.legacy.data.dataset.TabularDataset at 0x7fb5d2dd0750>,
  'test_dataset': <torchtext.legacy.data.dataset.TabularDataset at 0x7fb5d2969dd0>},
 'EI_fear': {'train_dataset': <torchtext.legacy.data.dataset.TabularDataset at 0x7fb5ed029950>,
  'val_dataset': <torchtext.legacy.data.dataset.TabularDataset at 0x7fb5d07dd850>,
  'test_dataset': <torchtext.legacy

In [21]:
# train_data, valid_data, test_data = TabularDataset.splits( path = './', 
#                                               train = file_EI_reg_train,
#                                               validation = file_EI_reg_val, 
#                                               test = file_EI_reg_test,
#                                               format = 'csv',
#                                               fields = fields
#                                           )

In [22]:
# print(train_data[0].__dict__.keys(),"\n",valid_data[0].__dict__.values())

dict_keys(['tweet', 'intensity']) 
 dict_values([['<user>', 'oh', '!', 'that', 'actually', 'was', 'my', 'first', 'guess', 'but', '.', '<repeated>', 'i', 'thought', 'he', 'was', 'too', 'dark', 'for', 'an', 'irish', 'man', 'from', '<allcaps>', 'bce', '</allcaps>', '.', 'thanks', 'for', 'clearing', 'it', 'up', '<laugh>'], '0.470'])


In [20]:
for key, value in dict_dataset.items():
  # count = 0
  for name, dataset in value.items():
    for example in dataset.examples:
      print(key, name, example.tweet, example.intensity)
      break


# for example in test_data.examples:
#   print(example.tweet, example.intensity)
#   count += 1
#   if count > 2:
#     break

EI_anger train_dataset ['<user>', '<user>', 'shut', 'up', 'hashtags', 'are', 'cool', '<hashtag>', 'offended', '</hashtag>'] 0.562
EI_anger val_dataset ["'", 'we', 'need', 'to', 'do', 'something', '.', 'something', 'must', 'be', 'done', '!', '<repeated>', "'", '\\', 'n', '\\', 'nyour', 'anxiety', 'is', 'amusing', '.', 'nothing', 'will', 'be', 'done', '.', 'despair', '.'] 0.517
EI_anger test_dataset ['<user>', 'i', 'know', 'you', 'mean', 'well', 'but', 'i', 'am', 'offended', '.', 'prick', '.'] 0.734
V train_dataset ['<user>', 'yeah', '!', '<happy>', 'playing', 'well'] 0.600
V val_dataset ['so', '<user>', 'site', 'crashes', 'everytime', 'i', 'try', 'to', 'book', '-', 'how', 'do', 'they', 'help', '?', 'tell', 'me', 'there', "'", 's', 'nothing', 'wrong', '&', 'hang', 'up', '<hashtag>', 'furious', '</hashtag>', '<hashtag>', 'helpless', '</hashtag>', '<user>'] 0.141
V test_dataset ['gm', 'and', 'have', 'a', '<hashtag>', 'tuesday', '</hashtag>', '!'] 0.589
EI_joy train_dataset ['<user>', 'quit

## Building iterator and Vocabulary

In [27]:
# #WORKING
# dict_fields ={}
# for name in list(dict_dataset.keys()):
#   print(name,dict_dataset[name]["train_dataset"])
#   dict_fields[name]= { 'field_tweet': Field(sequential=True,
#                                          use_vocab = True,
#                                          tokenize = preprocess_tweet,
#                                          fix_length = MAX_SIZE,
#                                          batch_first = True ), 
#                             'field_intensity': Field(sequential= False,
#                                                dtype = torch.float,
#                                                use_vocab = False )}
#   # dict_fields[name]['field_intensity'] = Field(sequential= False,
#   #                                              dtype = torch.float,
#   #                                              use_vocab = False )

# # field_tweet = Field(sequential=True,
# #                     use_vocab = True,
# #                     tokenize = preprocess_tweet, 
# #                     fix_length = MAX_SIZE, 
# #                     batch_first = True
# #                     )

# # field_intensity = Field(sequential= False, 
# #                         dtype = torch.float,
# #                         use_vocab = False 
# #                         )

EI_anger <torchtext.legacy.data.dataset.TabularDataset object at 0x7fb5edfa3390>
V <torchtext.legacy.data.dataset.TabularDataset object at 0x7fb5d12d9450>
EI_joy <torchtext.legacy.data.dataset.TabularDataset object at 0x7fb5d0a1c3d0>
EI_fear <torchtext.legacy.data.dataset.TabularDataset object at 0x7fb5ed029950>
EI_sadness <torchtext.legacy.data.dataset.TabularDataset object at 0x7fb5d29656d0>


In [41]:
for name, value in dict_fields.items():
  print(name, value['Tweet'][1])
  value['Tweet'][1].build_vocab(dict_dataset[name]['train_dataset'],
                                   max_size = MAX_VOCAB_SIZE,
                                   min_freq = 1,
                                   vectors = "glove.6B.100d",
                                   unk_init=torch.Tensor.normal_)
  value['Intensity Score'][1].build_vocab(dict_dataset[name]['train_dataset'])
  # value['field_tweet'].build_vocab(dict_dataset[name]['train_dataset'],
  #                                  max_size = MAX_VOCAB_SIZE,
  #                                  min_freq = 1,
  #                                  vectors = "glove.6B.100d",
  #                                  unk_init=torch.Tensor.normal_)
  # value['field_intensity'].build_vocab(dict_dataset[name]['train_dataset'])

EI_fear <torchtext.legacy.data.field.Field object at 0x7f457aa60750>
EI_sadness <torchtext.legacy.data.field.Field object at 0x7f457aa60d90>
EI_anger <torchtext.legacy.data.field.Field object at 0x7f457aa60d50>
V <torchtext.legacy.data.field.Field object at 0x7f457aa60b50>
EI_joy <torchtext.legacy.data.field.Field object at 0x7f457aa60a10>


In [29]:
# #WORKING
# for name, value in dict_fields.items():
#   value['field_tweet'].build_vocab(dict_dataset[name]['train_dataset'],
#                                    max_size = MAX_VOCAB_SIZE,
#                                    min_freq = 1,
#                                    vectors = "glove.6B.100d",
#                                    unk_init=torch.Tensor.normal_)
#   value['field_intensity'].build_vocab(dict_dataset[name]['train_dataset'])

.vector_cache/glove.6B.zip: 862MB [02:39, 5.41MB/s]                           
100%|█████████▉| 399999/400000 [00:16<00:00, 23604.24it/s]


In [24]:
## ORIGINAL
# field_tweet.build_vocab(train_data, 
#                   max_size = MAX_VOCAB_SIZE,
#                   min_freq = 1,
#                   vectors = "glove.6B.100d",
#                   unk_init=torch.Tensor.normal_)

# vocab = field_tweet.build_vocab(train_data, 
#                   max_size = MAX_VOCAB_SIZE,
#                   min_freq = 1,
#                   vectors = "glove.6B.100d",
#                   unk_init=torch.Tensor.normal_)
# field_intensity.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [02:46, 5.19MB/s]                           
100%|█████████▉| 399999/400000 [00:24<00:00, 16531.91it/s]


In [52]:
dict_iterator ={}
for name, value in dict_dataset.items():
  VALID_TEST_BATCH_SIZE= min(len(value['val_dataset']),len(value['test_dataset']) )
  print(name, VALID_TEST_BATCH_SIZE)
  train_iterator, val_iterator, test_iterator= BucketIterator.splits(
      (value['train_dataset'], value['val_dataset'],value['test_dataset']),
      batch_sizes= (BATCH_SIZE,VALID_TEST_BATCH_SIZE,VALID_TEST_BATCH_SIZE),
      sort_key = lambda x: len(x.tweet),
      sort_within_batch=True,
      device = DEVICE,
      shuffle= True)
  
  dict_iterator[name] = {"train_iterator": train_iterator, "val_iterator":val_iterator,"test_iterator":test_iterator}

  # dict_iterator[name]['train_iterator'], dict_iterator[name]['val_iterator'], dict_iterator[name]['test_iterator'] = BucketIterator.splits((dict_dataset[name]['train_dataset'], dict_dataset[name]['val_dataset'],dict_dataset[name]['test_dataset']), 
  #                                                     batch_sizes= (BATCH_SIZE,VALID_TEST_BATCH_SIZE,VALID_TEST_BATCH_SIZE),
  #                                                     sort_key = lambda x: len(x.tweet),
  #                                                     sort_within_batch=True,
  #                                                     device = DEVICE,
  #                                                     shuffle= True)

EI_anger 388
V 449
EI_joy 290
EI_fear 389
EI_sadness 397


In [53]:
dict_iterator.items()

dict_items([('EI_anger', {'train_iterator': <torchtext.legacy.data.iterator.BucketIterator object at 0x7f4572c517d0>, 'val_iterator': <torchtext.legacy.data.iterator.BucketIterator object at 0x7f4572c51f10>, 'test_iterator': <torchtext.legacy.data.iterator.BucketIterator object at 0x7f4572c51890>}), ('V', {'train_iterator': <torchtext.legacy.data.iterator.BucketIterator object at 0x7f4572c512d0>, 'val_iterator': <torchtext.legacy.data.iterator.BucketIterator object at 0x7f4572c51c50>, 'test_iterator': <torchtext.legacy.data.iterator.BucketIterator object at 0x7f4572c51810>}), ('EI_joy', {'train_iterator': <torchtext.legacy.data.iterator.BucketIterator object at 0x7f4572c51a10>, 'val_iterator': <torchtext.legacy.data.iterator.BucketIterator object at 0x7f4572c51090>, 'test_iterator': <torchtext.legacy.data.iterator.BucketIterator object at 0x7f4572c51d50>}), ('EI_fear', {'train_iterator': <torchtext.legacy.data.iterator.BucketIterator object at 0x7f45752d09d0>, 'val_iterator': <torchtex

In [25]:
# VALID_TEST_BATCH_SIZE = min (len(valid_data),len(test_data))
# # print(VALID_TEST_BATCH_SIZE)

# train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train_data, valid_data, test_data), 
#                                                       batch_sizes= (BATCH_SIZE,VALID_TEST_BATCH_SIZE,VALID_TEST_BATCH_SIZE),
#                                                       sort_key = lambda x: len(x.tweet),
#                                                       sort_within_batch=True,
#                                                       device = DEVICE,
#                                                       shuffle= True)

In [None]:
for key, value in dict_iterator.items():
  for name, iterator in value.items():
    for batch in iterator:
      print(key, name, batch.tweet)
      print(batch.intensity)
      break
    break
  break
    
# count = 0a
# for batch in train_iterator:
#   print (batch.tweet)
#   print (batch.intensity)
#   count += 1
#   if count > 2:
#     break

In [60]:
for name in list_name:
  print(dict_fields[name]['Tweet'][1].vocab.stoi.items())

dict_items([('<unk>', 0), ('<pad>', 1), ('</hashtag>', 2), ('<hashtag>', 3), ('.', 4), ('<user>', 5), ('i', 6), ('the', 7), ('to', 8), (',', 9), ('a', 10), ("'", 11), ('is', 12), ('of', 13), ('and', 14), ('you', 15), ('not', 16), ('it', 17), ('!', 18), ('<repeated>', 19), ('in', 20), ('my', 21), ('s', 22), ('that', 23), ('<number>', 24), ('for', 25), ('?', 26), ('on', 27), ('have', 28), ('are', 29), ('am', 30), ('do', 31), ('me', 32), ('so', 33), ('\\', 34), ('</allcaps>', 35), ('<allcaps>', 36), ('this', 37), ('be', 38), ('-', 39), ('can', 40), ('your', 41), ('at', 42), ('but', 43), ('was', 44), ('with', 45), ('just', 46), ('all', 47), ('if', 48), ('like', 49), ('what', 50), ('about', 51), ('will', 52), ('n', 53), ('we', 54), ('no', 55), ('when', 56), ('up', 57), ('they', 58), ('get', 59), ('he', 60), ('from', 61), ('&', 62), ('an', 63), ('fear', 64), ('out', 65), ('anxiety', 66), ('now', 67), ('awful', 68), ('horror', 69), ('nightmare', 70), ('terrorism', 71), ('nervous', 72), ('star

In [None]:
# print(list(field_tweet.vocab.stoi.items()))
# print(field_tweet.vocab.vectors[field_tweet.vocab.stoi['the']])

In [28]:
# for batch_number, batch in enumerate(train_iterator):
#   print(batch_number)
#   print(batch.tweet)
#   print(batch.intensity)

## TorchText treatment of Target Data

In [61]:
# Target Data
df_target = pd.read_csv(os.path.join(TARGET_DIR,'winomt_saunders_combined.csv'))
print(df_target.head())
target_file_name = 'target_csv.csv'
target_csv_file = df_target.to_csv(str(target_file_name))

                             Tweet  Intensity Score
0    the actor finished her work .                0
1    the actor finished his work .                0
2  the actuary finished her work .                0
3  the actuary finished his work .                0
4    the agent finished her work .                0


In [63]:
dict_target_dataset = {}
for name in list_name:
  # print(dict_fields[name])
  dict_target_dataset [name] = TabularDataset( path = os.path.join(TARGET_DIR,'winomt_saunders_combined.csv'), 
                                              format = 'csv',
                                              fields = fields )

print(dict_target_dataset)

{'Tweet': ('tweet', <torchtext.legacy.data.field.Field object at 0x7f457aa60750>), 'Intensity Score': ('intensity', <torchtext.legacy.data.field.Field object at 0x7f457aa60ad0>)}
{'Tweet': ('tweet', <torchtext.legacy.data.field.Field object at 0x7f457aa60d90>), 'Intensity Score': ('intensity', <torchtext.legacy.data.field.Field object at 0x7f457aa609d0>)}
{'Tweet': ('tweet', <torchtext.legacy.data.field.Field object at 0x7f457aa60d50>), 'Intensity Score': ('intensity', <torchtext.legacy.data.field.Field object at 0x7f457aa60690>)}
{'Tweet': ('tweet', <torchtext.legacy.data.field.Field object at 0x7f457aa60b50>), 'Intensity Score': ('intensity', <torchtext.legacy.data.field.Field object at 0x7f457aa60b10>)}
{'Tweet': ('tweet', <torchtext.legacy.data.field.Field object at 0x7f457aa60a10>), 'Intensity Score': ('intensity', <torchtext.legacy.data.field.Field object at 0x7f457aa60bd0>)}
{'EI_fear': <torchtext.legacy.data.dataset.TabularDataset object at 0x7f45714448d0>, 'EI_sadness': <torch

In [30]:
# target_data = TabularDataset( path = os.path.join(TARGET_DIR,'winomt_saunders_combined.csv'), 
#                                               format = 'csv',
#                                               fields = fields
#                                           )
# print(target_data[0].__dict__.keys())
# print(target_data[0].__dict__.values())

dict_keys(['tweet', 'intensity'])
dict_values([['the', 'actor', 'finished', 'her', 'work', '.'], '0'])


In [66]:
for name, dataset in dict_target_dataset.items():
  count=0
  for example in dataset:
    print(example.tweet, example.intensity)
    count += 1
    if count > 2:
      break

['the', 'actor', 'finished', 'her', 'work', '.'] 0
['the', 'actor', 'finished', 'his', 'work', '.'] 0
['the', 'actuary', 'finished', 'her', 'work', '.'] 0
['the', 'actor', 'finished', 'her', 'work', '.'] 0
['the', 'actor', 'finished', 'his', 'work', '.'] 0
['the', 'actuary', 'finished', 'her', 'work', '.'] 0
['the', 'actor', 'finished', 'her', 'work', '.'] 0
['the', 'actor', 'finished', 'his', 'work', '.'] 0
['the', 'actuary', 'finished', 'her', 'work', '.'] 0
['the', 'actor', 'finished', 'her', 'work', '.'] 0
['the', 'actor', 'finished', 'his', 'work', '.'] 0
['the', 'actuary', 'finished', 'her', 'work', '.'] 0
['the', 'actor', 'finished', 'her', 'work', '.'] 0
['the', 'actor', 'finished', 'his', 'work', '.'] 0
['the', 'actuary', 'finished', 'her', 'work', '.'] 0


In [31]:
# count = 0
# for example in target_data:
#   print(example.tweet, example.intensity)
#   count += 1
#   if count > 2:
#     break

['the', 'actor', 'finished', 'her', 'work', '.'] 0
['the', 'actor', 'finished', 'his', 'work', '.'] 0
['the', 'actuary', 'finished', 'her', 'work', '.'] 0


In [69]:
dict_target_iterator = {}
for name in list_name:
  dict_target_iterator [name] = BucketIterator(dict_target_dataset[name], # given that there is only one dataset we are not using splits
                                 batch_size= TARGET_BATCH_SIZE,
                                 sort_key = lambda x: len(x.tweet),
                                 sort_within_batch=True,
                                 device = DEVICE,
                                 repeat=True,
                                 shuffle= True)

print(dict_target_iterator)

{'EI_fear': <torchtext.legacy.data.iterator.BucketIterator object at 0x7f456fa48f90>, 'EI_sadness': <torchtext.legacy.data.iterator.BucketIterator object at 0x7f456fa480d0>, 'EI_anger': <torchtext.legacy.data.iterator.BucketIterator object at 0x7f456fa48f10>, 'V': <torchtext.legacy.data.iterator.BucketIterator object at 0x7f456fa48e10>, 'EI_joy': <torchtext.legacy.data.iterator.BucketIterator object at 0x7f456fa48f50>}


In [32]:

# target_iterator = BucketIterator(target_data, # given that there is only one dataset we are not using splits
#                                  batch_size= TARGET_BATCH_SIZE,
#                                  sort_key = lambda x: len(x.tweet),
#                                  sort_within_batch=True,
#                                  device = DEVICE,
#                                  repeat=True,
#                                  shuffle= True)

In [33]:
len(target_data)//TARGET_BATCH_SIZE

583

In [34]:
next(iter(target_iterator))


[torchtext.legacy.data.batch.Batch of size 8]
	[.tweet]:[torch.LongTensor of size 8x50]
	[.intensity]:[torch.FloatTensor of size 8]

In [71]:
for name, iterator in dict_target_iterator.items():
  count = 0
  for batch in iterator:
    print(name)
    print(batch)
    print (batch.tweet)
    print (batch.intensity)
    count += 1
    break
    if count > 2:
      break

EI_fear

[torchtext.legacy.data.batch.Batch of size 8]
	[.tweet]:[torch.LongTensor of size 8x50]
	[.intensity]:[torch.FloatTensor of size 8]
tensor([[   6,    0,    0,   10,    0,   12,  926,   83,    9,  249,  174,   46,
            4,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1],
        [   6,    0,  605,    9,    6,    0,   12,  627,  171,   71,   10,    0,
            4,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1],
        [   6,    0,    0,  468,   28,    6,    0,   41,  115,   77,   85,  235,
            4,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
     

In [None]:
# count = 0
# for batch in target_iterator:
#   print(batch)
#   print (batch.tweet)
#   print (batch.intensity)
#   count += 1
#   if count > 2:
#     break

## CNN 1d model

### Gradient Reversal layer

In [72]:
from torch.autograd import Function


class GradientReversalFn(Function):
    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha
        
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        output = grad_output.neg() * ctx.alpha

        return output, None

CNN 1 D model
Reference: A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification, Ye Zhang, Byron Wallace 2015

Difference:

use of embedding
use of sigmoid function, as we are having a regression model not a classififer as the main task

In [73]:
import torch.nn as nn
import torch.nn.functional as F

class CNN1d(nn.Module):
    def __init__(self, 
                 vocab_size, 
                 embedding_dim, 
                 n_filters, 
                 filter_sizes, 
                 output_dim, 
                 dropout, 
                 pad_idx
                 ):
        super().__init__()
        
        #---------------------Feature Extractor Network----------------------#
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)

        # Convolutional Network
        self.convs = nn.ModuleList([
                                    nn.Conv1d(in_channels = embedding_dim, 
                                              out_channels = n_filters, 
                                              kernel_size = fs)
                                    for fs in filter_sizes
                                    ])
        
        #---------------------Regression Network------------------------#
        # Fully-connected layer and Dropout
        self.regression = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(len(filter_sizes) * n_filters, len(filter_sizes) * n_filters // 2),
            nn.ReLU(),
            nn.Linear(len(filter_sizes) * n_filters // 2, output_dim * 10),
            nn.ReLU(),
            nn.Linear(output_dim * 10, output_dim)
            # ,
            # nn.Sigmoid()
        )
        # self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim * 10)
        # self.fc2 = nn.Linear(output_dim * 10, output_dim)
        # self.dropout = nn.Dropout(dropout)

        #---------------------Domain Classifier Network------------------------#
        # Fully-connected layer and Dropout
        self.domain_classifier = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(len(filter_sizes) * n_filters, len(filter_sizes) * n_filters // 2),
            nn.ReLU(),
            nn.Linear(len(filter_sizes) * n_filters // 2, output_dim * 10),
            nn.ReLU(),
            nn.Linear(output_dim * 10, 2),
            nn.LogSoftmax(dim=1),
        )
        
    def forward(self, text, alpha=1.0):
        
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.permute(0, 2, 1)
        
        #embedded = [batch size, emb dim, sent len]
        
        conved = [F.relu(conv(embedded)) for conv in self.convs]
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        x_feature = torch.cat(pooled, dim = 1)
        
        #x_feature = [batch size, n_filters * len(filter_sizes)]
        
        reverse_feature = GradientReversalFn.apply(x_feature, alpha)
        # print("reverse_feature",reverse_feature)
    
        regression_output = self.regression(x_feature)
    
        domain_classifier_output = self.domain_classifier(reverse_feature)


        return regression_output, domain_classifier_output

In [38]:
INPUT_DIM = len(field_tweet.vocab)
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [2, 3, 4, 5]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = field_tweet.vocab.stoi[field_tweet.pad_token]

In [39]:
model = CNN1d(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
model.to(DEVICE)

CNN1d(
  (embedding): Embedding(4788, 100, padding_idx=1)
  (convs): ModuleList(
    (0): Conv1d(100, 100, kernel_size=(2,), stride=(1,))
    (1): Conv1d(100, 100, kernel_size=(3,), stride=(1,))
    (2): Conv1d(100, 100, kernel_size=(4,), stride=(1,))
    (3): Conv1d(100, 100, kernel_size=(5,), stride=(1,))
  )
  (regression): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=400, out_features=200, bias=True)
    (2): ReLU()
    (3): Linear(in_features=200, out_features=10, bias=True)
    (4): ReLU()
    (5): Linear(in_features=10, out_features=1, bias=True)
  )
  (domain_classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=400, out_features=200, bias=True)
    (2): ReLU()
    (3): Linear(in_features=200, out_features=10, bias=True)
    (4): ReLU()
    (5): Linear(in_features=10, out_features=2, bias=True)
    (6): LogSoftmax(dim=1)
  )
)

### Load Pre trained embeddings
we'll load the pre-trained *embeddings*

In [40]:
pretrained_embeddings = field_tweet.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.9277, -0.7689,  0.5307,  ..., -0.7100, -0.4301, -0.2852],
        [ 0.0811, -1.0963, -0.0437,  ..., -0.5124,  0.0774, -0.3126],
        [ 0.3447,  0.7465,  0.6800,  ..., -1.8343,  1.1020,  0.7335],
        ...,
        [-0.3437, -0.1366, -0.6357,  ..., -0.4675,  1.4354, -1.3787],
        [ 2.0333, -1.2754, -0.3370,  ...,  0.6044,  1.2364, -0.1271],
        [-0.2258, -1.4275, -2.2880,  ..., -0.3692,  0.9410,  1.5251]])

In [41]:
field_tweet.vocab.vectors.shape

torch.Size([4788, 100])

In [42]:
UNK_IDX = field_tweet.vocab.stoi[field_tweet.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

## Training the model

### Without training one forward pass

In [None]:
for batch in train_iterator:
  print(batch.tweet)
  output = model(batch.tweet)
  print (output)
  break

tensor([[   8,  141,   61,    9, 2722,    6,  113,   16, 4194, 2386,   10,   31,
           12,   10,  156, 1144,   46,    4,  233,   13,   56,   36,   10,  125,
           80,   14,    3, 4767, 4766,    2,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1],
        [   5,    8,  298,  258,  231,    7,   41,  369, 1117, 4063,   20,  622,
           72,    7,    8,   11,  254,  223,    7, 3631,   34, 1282,  957,   19,
            9,  666,  959,  957,   19,    4,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1],
        [   6,  294,  602,  208,   20,    6,  871,   16,  921,   15,  573,  161,
           59,   45,  900,    7,   59,   45,  124,  112,  921,   32,  245,  161,
           24, 1873,  188,   43, 3453,   42,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,   

  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


In [None]:
# import torch.optim as optim

# optimizer = optim.Adam(model.parameters())

# criterion = nn.BCEWithLogitsLoss()

# model = model.to(DEVICE)
# criterion = criterion.to(DEVICE)

### Typical Train Model Function

In [None]:
# Typical Training Function

from tqdm import tqdm # for beautiful model training updates


def train_model(model, device, train_loader, optimizer, epoch):
    model.train() # setting the model in training mode
    pbar = tqdm(train_loader) # putting the iterator in pbara
    correct = 0 # for accuracy numerator
    processed =0 # for accuracy denominator
    epoch_loss = 0.0
    for batch_idx, batch in enumerate(pbar):

        tweets, intensities = batch.tweet.to(device), batch.intensity.to(device)  # plural, we are not interested in domain
        #sending data to CPU or GPU as per device

        optimizer.zero_grad() # setting gradients to zero to avoid accumulation

        y_preds,_ = model(tweets) # forward pass, result captured in y_preds (plural as there are many body in a batch)
        # we are not interested in domain prediction
        # the predictions are in one hot vector

        regression_loss = regression_loss_function(y_preds,intensities.unsqueeze(1)) # Computing loss
        # loss = F.mse_loss(y_preds,intensities.unsqueeze(1)) # Computing loss

        train_regresion_losses.append(regression_loss.item()) # to capture loss over many epochs

        regression_loss.backward() # backpropagation
        optimizer.step() # updating the params

        # preds = y_preds.argmax(dim=1, keepdim=True)  # get the index olf the max log-probability
        # correct += preds.eq(labels.view_as(preds)).sum().item()
        epoch_loss += regression_loss.item()

        processed += len(tweets)

        pbar.set_description(desc= f'Loss={regression_loss.item()} Batch_id={batch_idx} Epoch Average loss={100*epoch_loss/processed:0.4f}')
    train_accuracy.append(100*epoch_loss/len(train_loader))

### Typical Test Function

In [None]:
def test_model(model,device, data_loader, mode= 'test'):
    model.eval() # setting the model in evaluation mode
    loss = 0
    correct = 0 # for accuracy numerator
    test_regresion_losses =[] # for overall batches (summed over batches)
    valid_regresion_losses =[] # for overall batches (summed over batches)

    with torch.no_grad():
        for batch in data_loader:

            tweets, intensities  = batch.tweet.to(device), batch.intensity.to(device) #sending data to CPU or GPU as per device
            # we are not interested in domains
            
            y_preds,_ = model(tweets) # forward pass, result captured in outputs (plural as there are many bodies in a batch)
            # the outputs are in batch size x one hot vector 
            # not interested in domain output

            regression_loss = regression_loss_function(y_preds,intensities.unsqueeze(1))

            if mode == 'test':
              test_regresion_losses.append(regression_loss.item())
              # print(f'...in the batch...{regression_loss}')
            else:
              valid_regresion_losses.append(regression_loss.item())
              # print(f'...in the batch...{regression_loss}')

        # regression_loss.item() /= len(data_loader.dataset) # average test loss
        if mode == 'test':
          # total_test_regression_loss = sum(test_regresion_losses)
          # test_regresion_losses.append(regression_loss) # to capture loss over many batches
          # print('...Average test loss: {:.8f}'.format((total_test_regression_loss)/len(data_loader.dataset)))
          print(f'TEST LOSS (Average) : {sum(test_regresion_losses) / len(data_loader)}')
        else:
          # valid_regresion_losses.append(regression_loss) # to capture loss over many batches
          # total_valid_regression_loss = sum(valid_regresion_losses)
          # print('...Average validation loss: {:.8f}'.format((total_valid_regression_loss)/len(data_loader.dataset)))
          print(f'VALIDATION LOSS (Average) : {sum(valid_regresion_losses) / len(data_loader)}')

In [None]:
# EXECUTION

lr = 2e-5
optimizer = optim.Adam(model.parameters(), lr=lr)
domain_loss_function= nn.BCEWithLogitsLoss()
regression_loss_function = nn.L1Loss()


model = model.to(DEVICE)
domain_loss_function = domain_loss_function.to(DEVICE)
regression_loss_function = regression_loss_function.to(DEVICE)

# train_losses = [] # to capture train losses over training epochs
train_accuracy = [] # to capture train accuracy over training epochs
# val_losses = [] # to capture validation loss
# test_losses = [] # to capture test losses 
# test_accuracy = [] # to capture test accuracy 

# EPOCHS = 2
EPOCHS = 100
# dict_val_loss = {}
# dict_test_loss = {}


train_regresion_losses = [] # to capture train losses over training epochs
train_domain_losses = []
train_accuracy = [] # to capture train accuracy over training epochs
# valid_regresion_losses = [] # to capture validation loss
# test_regresion_losses = [] # to capture test losses 
total_test_regression_loss =[]
total_valid_regression_loss =[]
# print(f'----------------------training started for {name}-----------------')
for epoch in range(EPOCHS):
  print("EPOCH:", epoch+1)
  train_model(model, DEVICE, train_iterator, optimizer, epoch)
  # print("for validation.......")
  # val_name = train_name.replace("train", "val" )
  # test_model(typical_model, device, dict_val_loader[val_name], mode = 'val')
  test_model(model, DEVICE, valid_iterator, mode = 'val')


  # print("for test  .......")
  # test_name = train_name.replace("train", "test" )
  # test_model(typical_model, device, dict_test_loader[test_name], mode = 'test')
  test_model(model, DEVICE, test_iterator, mode = 'test')

# dict_val_loss[name] = val_losses
# dict_test_loss[name] = test_losses

model_name = "Non_DANN"+".pt"
torch.save(model.state_dict(), os.path.join(MODEL_DIR, model_name))
# print(f'----------------------training complete for {name}-----------------')
# print(dict_val_loss.items())
# print(dict_test_loss.items())

EPOCH: 1


Loss=0.10554364323616028 Batch_id=201 Epoch Average loss=2.1569: 100%|██████████| 202/202 [00:01<00:00, 120.95it/s]


for validation.......
...Average batch validation loss is 0.1546570360660553
for test  .......
...Average batch test loss is 0.14828822761774063
EPOCH: 2


Loss=0.13246333599090576 Batch_id=201 Epoch Average loss=1.8359: 100%|██████████| 202/202 [00:01<00:00, 128.24it/s]


for validation.......
...Average batch validation loss is 0.13858754932880402
for test  .......
...Average batch test loss is 0.13643446005880833
EPOCH: 3


Loss=0.12931421399116516 Batch_id=201 Epoch Average loss=1.6452: 100%|██████████| 202/202 [00:01<00:00, 126.35it/s]


for validation.......
...Average batch validation loss is 0.13139913976192474
for test  .......
...Average batch test loss is 0.13405136205255985
EPOCH: 4


Loss=0.06357194483280182 Batch_id=201 Epoch Average loss=1.4255: 100%|██████████| 202/202 [00:01<00:00, 126.15it/s]


for validation.......
...Average batch validation loss is 0.13594040274620056
for test  .......
...Average batch test loss is 0.1276154574006796
EPOCH: 5


Loss=0.11839717626571655 Batch_id=201 Epoch Average loss=1.2724: 100%|██████████| 202/202 [00:01<00:00, 124.77it/s]


for validation.......
...Average batch validation loss is 0.13184016942977905
for test  .......
...Average batch test loss is 0.12914092652499676
EPOCH: 6


Loss=0.04620756208896637 Batch_id=201 Epoch Average loss=1.1661: 100%|██████████| 202/202 [00:01<00:00, 127.50it/s]


for validation.......
...Average batch validation loss is 0.1411384642124176
for test  .......
...Average batch test loss is 0.12991251423954964
EPOCH: 7


Loss=0.03538079559803009 Batch_id=201 Epoch Average loss=1.0553: 100%|██████████| 202/202 [00:01<00:00, 127.88it/s]


for validation.......
...Average batch validation loss is 0.13933832943439484
for test  .......
...Average batch test loss is 0.1309744231402874
EPOCH: 8


Loss=0.06910017132759094 Batch_id=201 Epoch Average loss=0.9632: 100%|██████████| 202/202 [00:01<00:00, 126.64it/s]


for validation.......
...Average batch validation loss is 0.13338607549667358
for test  .......
...Average batch test loss is 0.12753590568900108
EPOCH: 9


Loss=0.05057922378182411 Batch_id=201 Epoch Average loss=0.8775: 100%|██████████| 202/202 [00:01<00:00, 125.20it/s]


for validation.......
...Average batch validation loss is 0.132205992937088
for test  .......
...Average batch test loss is 0.1283272672444582
EPOCH: 10


Loss=0.054236359894275665 Batch_id=201 Epoch Average loss=0.8701: 100%|██████████| 202/202 [00:01<00:00, 125.62it/s]


for validation.......
...Average batch validation loss is 0.140908345580101
for test  .......
...Average batch test loss is 0.13623837009072304
EPOCH: 11


Loss=0.08716484904289246 Batch_id=201 Epoch Average loss=0.8001: 100%|██████████| 202/202 [00:01<00:00, 124.07it/s]


for validation.......
...Average batch validation loss is 0.12894336879253387
for test  .......
...Average batch test loss is 0.12572136893868446
EPOCH: 12


Loss=0.07786528766155243 Batch_id=201 Epoch Average loss=0.7901: 100%|██████████| 202/202 [00:01<00:00, 125.95it/s]


for validation.......
...Average batch validation loss is 0.12750615179538727
for test  .......
...Average batch test loss is 0.12356251664459705
EPOCH: 13


Loss=0.06521550565958023 Batch_id=201 Epoch Average loss=0.7720: 100%|██████████| 202/202 [00:01<00:00, 127.97it/s]


for validation.......
...Average batch validation loss is 0.13068556785583496
for test  .......
...Average batch test loss is 0.12772739492356777
EPOCH: 14


Loss=0.06234344094991684 Batch_id=201 Epoch Average loss=0.7316: 100%|██████████| 202/202 [00:01<00:00, 126.02it/s]


for validation.......
...Average batch validation loss is 0.13676238059997559
for test  .......
...Average batch test loss is 0.131257776170969
EPOCH: 15


Loss=0.042277731001377106 Batch_id=201 Epoch Average loss=0.7216: 100%|██████████| 202/202 [00:01<00:00, 123.32it/s]


for validation.......
...Average batch validation loss is 0.1322467029094696
for test  .......
...Average batch test loss is 0.12706262804567814
EPOCH: 16


Loss=0.06176459789276123 Batch_id=201 Epoch Average loss=0.6714: 100%|██████████| 202/202 [00:01<00:00, 123.74it/s]


for validation.......
...Average batch validation loss is 0.12766540050506592
for test  .......
...Average batch test loss is 0.12648200429975986
EPOCH: 17


Loss=0.05011354014277458 Batch_id=201 Epoch Average loss=0.6703: 100%|██████████| 202/202 [00:01<00:00, 122.38it/s]


for validation.......
...Average batch validation loss is 0.13186532258987427
for test  .......
...Average batch test loss is 0.12834738567471504
EPOCH: 18


Loss=0.06215837225317955 Batch_id=201 Epoch Average loss=0.6344: 100%|██████████| 202/202 [00:02<00:00, 84.87it/s]


for validation.......
...Average batch validation loss is 0.13549530506134033
for test  .......
...Average batch test loss is 0.13161098212003708
EPOCH: 19


Loss=0.02174104005098343 Batch_id=201 Epoch Average loss=0.6443: 100%|██████████| 202/202 [00:03<00:00, 55.47it/s]


for validation.......
...Average batch validation loss is 0.12771378457546234
for test  .......
...Average batch test loss is 0.12410835921764374
EPOCH: 20


Loss=0.04394527152180672 Batch_id=201 Epoch Average loss=0.5965: 100%|██████████| 202/202 [00:03<00:00, 55.20it/s]


for validation.......
...Average batch validation loss is 0.13153859972953796
for test  .......
...Average batch test loss is 0.12683058716356754
EPOCH: 21


Loss=0.06996851414442062 Batch_id=201 Epoch Average loss=0.6116: 100%|██████████| 202/202 [00:01<00:00, 124.64it/s]


for validation.......
...Average batch validation loss is 0.13636897504329681
for test  .......
...Average batch test loss is 0.1314900740981102
EPOCH: 22


Loss=0.06134910508990288 Batch_id=201 Epoch Average loss=0.5955: 100%|██████████| 202/202 [00:01<00:00, 124.44it/s]


for validation.......
...Average batch validation loss is 0.13175445795059204
for test  .......
...Average batch test loss is 0.12896525487303734
EPOCH: 23


Loss=0.027020052075386047 Batch_id=201 Epoch Average loss=0.5814: 100%|██████████| 202/202 [00:01<00:00, 123.59it/s]


for validation.......
...Average batch validation loss is 0.13601252436637878
for test  .......
...Average batch test loss is 0.13085199519991875
EPOCH: 24


Loss=0.05558527633547783 Batch_id=201 Epoch Average loss=0.5900: 100%|██████████| 202/202 [00:01<00:00, 122.50it/s]


for validation.......
...Average batch validation loss is 0.1347963511943817
for test  .......
...Average batch test loss is 0.12818704545497894
EPOCH: 25


Loss=0.04930763691663742 Batch_id=201 Epoch Average loss=0.5569: 100%|██████████| 202/202 [00:02<00:00, 94.28it/s] 


for validation.......
...Average batch validation loss is 0.13221853971481323
for test  .......
...Average batch test loss is 0.1274647768586874
EPOCH: 26


Loss=0.05863160640001297 Batch_id=201 Epoch Average loss=0.5554: 100%|██████████| 202/202 [00:01<00:00, 109.60it/s]


for validation.......
...Average batch validation loss is 0.12807680666446686
for test  .......
...Average batch test loss is 0.12547456100583076
EPOCH: 27


Loss=0.04324007034301758 Batch_id=201 Epoch Average loss=0.5386: 100%|██████████| 202/202 [00:01<00:00, 123.46it/s]


for validation.......
...Average batch validation loss is 0.13172370195388794
for test  .......
...Average batch test loss is 0.12871100939810276
EPOCH: 28


Loss=0.03890589252114296 Batch_id=201 Epoch Average loss=0.5369: 100%|██████████| 202/202 [00:01<00:00, 122.80it/s]


for validation.......
...Average batch validation loss is 0.12980709969997406
for test  .......
...Average batch test loss is 0.12457997910678387
EPOCH: 29


Loss=0.049299560487270355 Batch_id=201 Epoch Average loss=0.5169: 100%|██████████| 202/202 [00:01<00:00, 123.53it/s]


for validation.......
...Average batch validation loss is 0.1316518783569336
for test  .......
...Average batch test loss is 0.1284820344299078
EPOCH: 30


Loss=0.03082924336194992 Batch_id=201 Epoch Average loss=0.5287: 100%|██████████| 202/202 [00:01<00:00, 121.53it/s]


for validation.......
...Average batch validation loss is 0.12867684662342072
for test  .......
...Average batch test loss is 0.12435084022581577
EPOCH: 31


Loss=0.04806767404079437 Batch_id=201 Epoch Average loss=0.5365: 100%|██████████| 202/202 [00:01<00:00, 118.16it/s]


for validation.......
...Average batch validation loss is 0.14218708872795105
for test  .......
...Average batch test loss is 0.13697534427046776
EPOCH: 32


Loss=0.016657870262861252 Batch_id=201 Epoch Average loss=0.5132: 100%|██████████| 202/202 [00:01<00:00, 119.38it/s]


for validation.......
...Average batch validation loss is 0.13552993535995483
for test  .......
...Average batch test loss is 0.13059332221746445
EPOCH: 33


Loss=0.02173195779323578 Batch_id=201 Epoch Average loss=0.5073: 100%|██████████| 202/202 [00:01<00:00, 124.51it/s]


for validation.......
...Average batch validation loss is 0.13649891316890717
for test  .......
...Average batch test loss is 0.1294838022440672
EPOCH: 34


Loss=0.021699029952287674 Batch_id=201 Epoch Average loss=0.5021: 100%|██████████| 202/202 [00:01<00:00, 121.74it/s]


for validation.......
...Average batch validation loss is 0.13365936279296875
for test  .......
...Average batch test loss is 0.12677447497844696
EPOCH: 35


Loss=0.05339553952217102 Batch_id=201 Epoch Average loss=0.4854: 100%|██████████| 202/202 [00:01<00:00, 120.81it/s]


for validation.......
...Average batch validation loss is 0.13201741874217987
for test  .......
...Average batch test loss is 0.12638082914054394
EPOCH: 36


Loss=0.03849225491285324 Batch_id=201 Epoch Average loss=0.4816: 100%|██████████| 202/202 [00:01<00:00, 122.67it/s]


for validation.......
...Average batch validation loss is 0.1403142511844635
for test  .......
...Average batch test loss is 0.1324592437595129
EPOCH: 37


Loss=0.02999785915017128 Batch_id=201 Epoch Average loss=0.4694: 100%|██████████| 202/202 [00:01<00:00, 120.50it/s]


for validation.......
...Average batch validation loss is 0.13557584583759308
for test  .......
...Average batch test loss is 0.13163823820650578
EPOCH: 38


Loss=0.04553510248661041 Batch_id=201 Epoch Average loss=0.4697: 100%|██████████| 202/202 [00:01<00:00, 117.78it/s]


for validation.......
...Average batch validation loss is 0.12783291935920715
for test  .......
...Average batch test loss is 0.1255461573600769
EPOCH: 39


Loss=0.046661440283060074 Batch_id=201 Epoch Average loss=0.4614: 100%|██████████| 202/202 [00:01<00:00, 121.12it/s]


for validation.......
...Average batch validation loss is 0.12796440720558167
for test  .......
...Average batch test loss is 0.12446867488324642
EPOCH: 40


Loss=0.06711533665657043 Batch_id=201 Epoch Average loss=0.4737: 100%|██████████| 202/202 [00:01<00:00, 121.82it/s]


for validation.......
...Average batch validation loss is 0.13793979585170746
for test  .......
...Average batch test loss is 0.13429422304034233
EPOCH: 41


Loss=0.026664264500141144 Batch_id=201 Epoch Average loss=0.4677: 100%|██████████| 202/202 [00:01<00:00, 121.20it/s]


for validation.......
...Average batch validation loss is 0.13352647423744202
for test  .......
...Average batch test loss is 0.1289748027920723
EPOCH: 42


Loss=0.03413467854261398 Batch_id=201 Epoch Average loss=0.4567: 100%|██████████| 202/202 [00:01<00:00, 119.83it/s]


for validation.......
...Average batch validation loss is 0.134358748793602
for test  .......
...Average batch test loss is 0.12983370758593082
EPOCH: 43


Loss=0.07799477875232697 Batch_id=201 Epoch Average loss=0.4575: 100%|██████████| 202/202 [00:01<00:00, 121.41it/s]


for validation.......
...Average batch validation loss is 0.1305139660835266
for test  .......
...Average batch test loss is 0.12536022625863552
EPOCH: 44


Loss=0.046476393938064575 Batch_id=201 Epoch Average loss=0.4483: 100%|██████████| 202/202 [00:01<00:00, 121.19it/s]


for validation.......
...Average batch validation loss is 0.13982422649860382
for test  .......
...Average batch test loss is 0.13536377623677254
EPOCH: 45


Loss=0.03571247681975365 Batch_id=201 Epoch Average loss=0.4422: 100%|██████████| 202/202 [00:01<00:00, 120.61it/s]


for validation.......
...Average batch validation loss is 0.1330312192440033
for test  .......
...Average batch test loss is 0.12842734716832638
EPOCH: 46


Loss=0.04630563408136368 Batch_id=201 Epoch Average loss=0.4176: 100%|██████████| 202/202 [00:01<00:00, 114.11it/s]


for validation.......
...Average batch validation loss is 0.13666434586048126
for test  .......
...Average batch test loss is 0.13179250806570053
EPOCH: 47


Loss=0.038598574697971344 Batch_id=201 Epoch Average loss=0.4297: 100%|██████████| 202/202 [00:02<00:00, 70.66it/s]


for validation.......
...Average batch validation loss is 0.13561131060123444
for test  .......
...Average batch test loss is 0.13068492524325848
EPOCH: 48


Loss=0.02004585787653923 Batch_id=201 Epoch Average loss=0.4384: 100%|██████████| 202/202 [00:02<00:00, 69.27it/s]


for validation.......
...Average batch validation loss is 0.13161298632621765
for test  .......
...Average batch test loss is 0.12636073678731918
EPOCH: 49


Loss=0.035819485783576965 Batch_id=201 Epoch Average loss=0.4279: 100%|██████████| 202/202 [00:01<00:00, 118.60it/s]


for validation.......
...Average batch validation loss is 0.1308516412973404
for test  .......
...Average batch test loss is 0.12552635930478573
EPOCH: 50


Loss=0.044381748884916306 Batch_id=201 Epoch Average loss=0.4109: 100%|██████████| 202/202 [00:01<00:00, 117.35it/s]


for validation.......
...Average batch validation loss is 0.1322745978832245
for test  .......
...Average batch test loss is 0.12826436012983322
EPOCH: 51


Loss=0.0419221967458725 Batch_id=201 Epoch Average loss=0.4228: 100%|██████████| 202/202 [00:01<00:00, 114.93it/s]


for validation.......
...Average batch validation loss is 0.13253355026245117
for test  .......
...Average batch test loss is 0.12882138043642044
EPOCH: 52


Loss=0.03905608505010605 Batch_id=201 Epoch Average loss=0.4198: 100%|██████████| 202/202 [00:01<00:00, 119.66it/s]


for validation.......
...Average batch validation loss is 0.13674801588058472
for test  .......
...Average batch test loss is 0.13257529959082603
EPOCH: 53


Loss=0.022340692579746246 Batch_id=201 Epoch Average loss=0.4180: 100%|██████████| 202/202 [00:01<00:00, 114.59it/s]


for validation.......
...Average batch validation loss is 0.13419392704963684
for test  .......
...Average batch test loss is 0.128416009247303
EPOCH: 54


Loss=0.03182633966207504 Batch_id=201 Epoch Average loss=0.4135: 100%|██████████| 202/202 [00:01<00:00, 114.15it/s]


for validation.......
...Average batch validation loss is 0.13291604816913605
for test  .......
...Average batch test loss is 0.1263848077505827
EPOCH: 55


Loss=0.02077210135757923 Batch_id=201 Epoch Average loss=0.4172: 100%|██████████| 202/202 [00:01<00:00, 113.81it/s]


for validation.......
...Average batch validation loss is 0.13781416416168213
for test  .......
...Average batch test loss is 0.13159687630832195
EPOCH: 56


Loss=0.01934702694416046 Batch_id=201 Epoch Average loss=0.4133: 100%|██████████| 202/202 [00:01<00:00, 120.10it/s]


for validation.......
...Average batch validation loss is 0.13788928091526031
for test  .......
...Average batch test loss is 0.13394920155405998
EPOCH: 57


Loss=0.03140709921717644 Batch_id=201 Epoch Average loss=0.4081: 100%|██████████| 202/202 [00:01<00:00, 115.33it/s]


for validation.......
...Average batch validation loss is 0.13389521837234497
for test  .......
...Average batch test loss is 0.12871688231825829
EPOCH: 58


Loss=0.032605450600385666 Batch_id=201 Epoch Average loss=0.3986: 100%|██████████| 202/202 [00:01<00:00, 108.99it/s]


for validation.......
...Average batch validation loss is 0.13344062864780426
for test  .......
...Average batch test loss is 0.12839036993682384
EPOCH: 59


Loss=0.037125322967767715 Batch_id=201 Epoch Average loss=0.3967: 100%|██████████| 202/202 [00:01<00:00, 105.51it/s]


for validation.......
...Average batch validation loss is 0.13259971141815186
for test  .......
...Average batch test loss is 0.12741093523800373
EPOCH: 60


Loss=0.02369632199406624 Batch_id=201 Epoch Average loss=0.4039: 100%|██████████| 202/202 [00:02<00:00, 100.86it/s]


for validation.......
...Average batch validation loss is 0.13155314326286316
for test  .......
...Average batch test loss is 0.12635568901896477
EPOCH: 61


Loss=0.05605871602892876 Batch_id=201 Epoch Average loss=0.4062: 100%|██████████| 202/202 [00:02<00:00, 99.99it/s]


for validation.......
...Average batch validation loss is 0.1301645189523697
for test  .......
...Average batch test loss is 0.12526201829314232
EPOCH: 62


Loss=0.031675659120082855 Batch_id=201 Epoch Average loss=0.4031: 100%|██████████| 202/202 [00:01<00:00, 117.43it/s]


for validation.......
...Average batch validation loss is 0.13701464235782623
for test  .......
...Average batch test loss is 0.13170950300991535
EPOCH: 63


Loss=0.02270331420004368 Batch_id=201 Epoch Average loss=0.4018: 100%|██████████| 202/202 [00:01<00:00, 117.71it/s]


for validation.......
...Average batch validation loss is 0.13627618551254272
for test  .......
...Average batch test loss is 0.13192420452833176
EPOCH: 64


Loss=0.045204486697912216 Batch_id=201 Epoch Average loss=0.3815: 100%|██████████| 202/202 [00:01<00:00, 114.07it/s]


for validation.......
...Average batch validation loss is 0.12988951802253723
for test  .......
...Average batch test loss is 0.12678853422403336
EPOCH: 65


Loss=0.02785581722855568 Batch_id=201 Epoch Average loss=0.3920: 100%|██████████| 202/202 [00:01<00:00, 114.73it/s]


for validation.......
...Average batch validation loss is 0.13326555490493774
for test  .......
...Average batch test loss is 0.12899748794734478
EPOCH: 66


Loss=0.02943645417690277 Batch_id=201 Epoch Average loss=0.3789: 100%|██████████| 202/202 [00:01<00:00, 118.39it/s]


for validation.......
...Average batch validation loss is 0.13415314257144928
for test  .......
...Average batch test loss is 0.13007808104157448
EPOCH: 67


Loss=0.03671221062541008 Batch_id=201 Epoch Average loss=0.3910: 100%|██████████| 202/202 [00:01<00:00, 115.62it/s]


for validation.......
...Average batch validation loss is 0.1319780945777893
for test  .......
...Average batch test loss is 0.126828171312809
EPOCH: 68


Loss=0.05100781470537186 Batch_id=201 Epoch Average loss=0.3739: 100%|██████████| 202/202 [00:01<00:00, 119.42it/s]


for validation.......
...Average batch validation loss is 0.133079394698143
for test  .......
...Average batch test loss is 0.12760156020522118
EPOCH: 69


Loss=0.012210870161652565 Batch_id=201 Epoch Average loss=0.3667: 100%|██████████| 202/202 [00:01<00:00, 113.00it/s]


for validation.......
...Average batch validation loss is 0.13634225726127625
for test  .......
...Average batch test loss is 0.1300933826714754
EPOCH: 70


Loss=0.03364355117082596 Batch_id=201 Epoch Average loss=0.3726: 100%|██████████| 202/202 [00:01<00:00, 114.30it/s]


for validation.......
...Average batch validation loss is 0.13654083013534546
for test  .......
...Average batch test loss is 0.13038698956370354
EPOCH: 71


Loss=0.021776989102363586 Batch_id=201 Epoch Average loss=0.3671: 100%|██████████| 202/202 [00:01<00:00, 115.22it/s]


for validation.......
...Average batch validation loss is 0.13420963287353516
for test  .......
...Average batch test loss is 0.12814662605524063
EPOCH: 72


Loss=0.02545623481273651 Batch_id=201 Epoch Average loss=0.3735: 100%|██████████| 202/202 [00:01<00:00, 115.60it/s]


for validation.......
...Average batch validation loss is 0.13082151114940643
for test  .......
...Average batch test loss is 0.12667951546609402
EPOCH: 73


Loss=0.03128120303153992 Batch_id=201 Epoch Average loss=0.3753: 100%|██████████| 202/202 [00:01<00:00, 113.27it/s]


for validation.......
...Average batch validation loss is 0.136500746011734
for test  .......
...Average batch test loss is 0.131771894171834
EPOCH: 74


Loss=0.035865433514118195 Batch_id=201 Epoch Average loss=0.3558: 100%|██████████| 202/202 [00:02<00:00, 82.18it/s]


for validation.......
...Average batch validation loss is 0.13644228875637054
for test  .......
...Average batch test loss is 0.13252459280192852
EPOCH: 75


Loss=0.030982187017798424 Batch_id=201 Epoch Average loss=0.3581: 100%|██████████| 202/202 [00:01<00:00, 116.30it/s]


for validation.......
...Average batch validation loss is 0.13605841994285583
for test  .......
...Average batch test loss is 0.12983496859669685
EPOCH: 76


Loss=0.03170362114906311 Batch_id=201 Epoch Average loss=0.3485: 100%|██████████| 202/202 [00:01<00:00, 116.56it/s]


for validation.......
...Average batch validation loss is 0.13357289135456085
for test  .......
...Average batch test loss is 0.12883618474006653
EPOCH: 77


Loss=0.0198851078748703 Batch_id=201 Epoch Average loss=0.3731: 100%|██████████| 202/202 [00:01<00:00, 115.31it/s]


for validation.......
...Average batch validation loss is 0.13347110152244568
for test  .......
...Average batch test loss is 0.12947623059153557
EPOCH: 78


Loss=0.029047034680843353 Batch_id=201 Epoch Average loss=0.3573: 100%|██████████| 202/202 [00:01<00:00, 116.20it/s]


for validation.......
...Average batch validation loss is 0.13687194883823395
for test  .......
...Average batch test loss is 0.13229431957006454
EPOCH: 79


Loss=0.021452169865369797 Batch_id=201 Epoch Average loss=0.3577: 100%|██████████| 202/202 [00:01<00:00, 116.38it/s]


for validation.......
...Average batch validation loss is 0.13266870379447937
for test  .......
...Average batch test loss is 0.12913409806787968
EPOCH: 80


Loss=0.020022042095661163 Batch_id=201 Epoch Average loss=0.3640: 100%|██████████| 202/202 [00:01<00:00, 116.66it/s]


for validation.......
...Average batch validation loss is 0.13226421177387238
for test  .......
...Average batch test loss is 0.12984644249081612
EPOCH: 81


Loss=0.033639855682849884 Batch_id=201 Epoch Average loss=0.3453: 100%|██████████| 202/202 [00:01<00:00, 115.40it/s]


for validation.......
...Average batch validation loss is 0.13752661645412445
for test  .......
...Average batch test loss is 0.1337168663740158
EPOCH: 82


Loss=0.02775963768362999 Batch_id=201 Epoch Average loss=0.3623: 100%|██████████| 202/202 [00:01<00:00, 116.77it/s]


for validation.......
...Average batch validation loss is 0.13501079380512238
for test  .......
...Average batch test loss is 0.13128499872982502
EPOCH: 83


Loss=0.018618447706103325 Batch_id=201 Epoch Average loss=0.3544: 100%|██████████| 202/202 [00:01<00:00, 117.37it/s]


for validation.......
...Average batch validation loss is 0.13563695549964905
for test  .......
...Average batch test loss is 0.12942098453640938
EPOCH: 84


Loss=0.04367909952998161 Batch_id=201 Epoch Average loss=0.3485: 100%|██████████| 202/202 [00:01<00:00, 117.44it/s]


for validation.......
...Average batch validation loss is 0.14074459671974182
for test  .......
...Average batch test loss is 0.13559205830097198
EPOCH: 85


Loss=0.029138870537281036 Batch_id=201 Epoch Average loss=0.3502: 100%|██████████| 202/202 [00:01<00:00, 111.84it/s]


for validation.......
...Average batch validation loss is 0.13475298881530762
for test  .......
...Average batch test loss is 0.12884563766419888
EPOCH: 86


Loss=0.01732359454035759 Batch_id=201 Epoch Average loss=0.3398: 100%|██████████| 202/202 [00:01<00:00, 108.63it/s]


for validation.......
...Average batch validation loss is 0.13545869290828705
for test  .......
...Average batch test loss is 0.12971454299986362
EPOCH: 87


Loss=0.022718720138072968 Batch_id=201 Epoch Average loss=0.3461: 100%|██████████| 202/202 [00:01<00:00, 112.56it/s]


for validation.......
...Average batch validation loss is 0.13326452672481537
for test  .......
...Average batch test loss is 0.12748775258660316
EPOCH: 88


Loss=0.04711003229022026 Batch_id=201 Epoch Average loss=0.3382: 100%|██████████| 202/202 [00:01<00:00, 113.13it/s]


for validation.......
...Average batch validation loss is 0.1349315494298935
for test  .......
...Average batch test loss is 0.12923004105687141
EPOCH: 89


Loss=0.026468005031347275 Batch_id=201 Epoch Average loss=0.3433: 100%|██████████| 202/202 [00:01<00:00, 112.78it/s]


for validation.......
...Average batch validation loss is 0.13734254240989685
for test  .......
...Average batch test loss is 0.13153482973575592
EPOCH: 90


Loss=0.02268502488732338 Batch_id=201 Epoch Average loss=0.3253: 100%|██████████| 202/202 [00:01<00:00, 116.74it/s]


for validation.......
...Average batch validation loss is 0.13245223462581635
for test  .......
...Average batch test loss is 0.1268784087151289
EPOCH: 91


Loss=0.0343233123421669 Batch_id=201 Epoch Average loss=0.3532: 100%|██████████| 202/202 [00:01<00:00, 116.33it/s]


for validation.......
...Average batch validation loss is 0.13144728541374207
for test  .......
...Average batch test loss is 0.12550003826618195
EPOCH: 92


Loss=0.03230878338217735 Batch_id=201 Epoch Average loss=0.3422: 100%|██████████| 202/202 [00:01<00:00, 117.01it/s]


for validation.......
...Average batch validation loss is 0.13576167821884155
for test  .......
...Average batch test loss is 0.13129601068794727
EPOCH: 93


Loss=0.021436970680952072 Batch_id=201 Epoch Average loss=0.3341: 100%|██████████| 202/202 [00:01<00:00, 116.26it/s]


for validation.......
...Average batch validation loss is 0.13305513560771942
for test  .......
...Average batch test loss is 0.12691534496843815
EPOCH: 94


Loss=0.03098316304385662 Batch_id=201 Epoch Average loss=0.3407: 100%|██████████| 202/202 [00:01<00:00, 116.58it/s]


for validation.......
...Average batch validation loss is 0.1351473182439804
for test  .......
...Average batch test loss is 0.12922946177423
EPOCH: 95


Loss=0.034376129508018494 Batch_id=201 Epoch Average loss=0.3366: 100%|██████████| 202/202 [00:01<00:00, 115.69it/s]


for validation.......
...Average batch validation loss is 0.13939055800437927
for test  .......
...Average batch test loss is 0.13433926552534103
EPOCH: 96


Loss=0.024936247617006302 Batch_id=201 Epoch Average loss=0.3388: 100%|██████████| 202/202 [00:01<00:00, 111.75it/s]


for validation.......
...Average batch validation loss is 0.13759692013263702
for test  .......
...Average batch test loss is 0.131194069981575
EPOCH: 97


Loss=0.04351605847477913 Batch_id=201 Epoch Average loss=0.3343: 100%|██████████| 202/202 [00:02<00:00, 93.49it/s]


for validation.......
...Average batch validation loss is 0.13699564337730408
for test  .......
...Average batch test loss is 0.12833493389189243
EPOCH: 98


Loss=0.035173676908016205 Batch_id=201 Epoch Average loss=0.3192: 100%|██████████| 202/202 [00:01<00:00, 110.54it/s]


for validation.......
...Average batch validation loss is 0.13651412725448608
for test  .......
...Average batch test loss is 0.12829741649329662
EPOCH: 99


Loss=0.033625997602939606 Batch_id=201 Epoch Average loss=0.3309: 100%|██████████| 202/202 [00:02<00:00, 100.74it/s]


for validation.......
...Average batch validation loss is 0.13567854464054108
for test  .......
...Average batch test loss is 0.12754332087934017
EPOCH: 100


Loss=0.03215210884809494 Batch_id=201 Epoch Average loss=0.3229: 100%|██████████| 202/202 [00:01<00:00, 102.61it/s]


for validation.......
...Average batch validation loss is 0.13678020238876343
for test  .......
...Average batch test loss is 0.128689618781209


## DANN Model - Training and Testing

### Training

In [None]:
# b = next(iter(train_iterator))
# print(b.tweet, b.intensity)

tensor([[   5,  488,    7,  122,    7,  199, 4073, 1832,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1],
        [ 111,  306, 2665,    9,  811,  128,  264,  529,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1]]) tensor([0.4400, 0.2080])


In [None]:
def compute_accuracy(logits, labels):
    
    predicted_labels_dict = {
      0: 0,
      1: 0,
    }
    
    predicted_label = logits.max(dim = 1)[1]
    
    for pred in predicted_label:
        predicted_labels_dict[pred.item()] += 1
    acc = (predicted_label == labels).float().mean()
    
    return acc, predicted_labels_dict

In [None]:
# def binary_acc(y_pred, y_test):
#     y_pred_tag = torch.round(torch.sigmoid(y_pred))

#     correct_results_sum = (y_pred_tag == y_test).sum().float()
#     acc = correct_results_sum/y_test.shape[0]
#     acc = torch.round(acc * 100)
    
#     return acc

In [None]:
# def evaluate(model, dataloader, mode = 'test', percentage = 5):
#     with torch.no_grad():
#         predicted_labels_dict = {                                                   
#           0: 0,                                                                     
#           1: 0,                                                                     
#         }
        
#         mean_accuracy = 0.0
#         # total_batches = len(dataloader)
#         # print("total_batches: ",total_batches )

        



#             sentiment_pred, _ = model(**inputs)
#             accuracy, predicted_labels = compute_accuracy(sentiment_pred, inputs["labels"])
#             mean_accuracy += accuracy
#             predicted_labels_dict[0] += predicted_labels[0]
#             predicted_labels_dict[1] += predicted_labels[1]  
#         print(predicted_labels_dict)
#     return mean_accuracy/total_batches

In [None]:
 ## DANN training function attempt 2 with NLLLoss
n_epochs = 100 # number of epochs
lr = 2e-5
optimizer = optim.Adam(model.parameters(), lr=lr)

# loss_fn_sentiment_regression = torch.nn.NLLLoss()
# loss_fn_domain_classifier = torch.nn.NLLLoss()

model = model.to(DEVICE)
domain_loss_function= nn.NLLLoss()
regression_loss_function = nn.L1Loss()
domain_loss_function = domain_loss_function.to(DEVICE)
regression_loss_function = regression_loss_function.to(DEVICE)

max_batches = min(len(train_iterator), len(target_data)//TARGET_BATCH_SIZE)
# max_batches = min(len(train_iterator), len(target_iterator))

print(max_batches)

for epoch_idx in range(n_epochs):
    
    source_iterator = iter(train_iterator)
    target_iterator = iter(target_iterator)

    for batch_idx in range(max_batches):
        
        p = float(batch_idx + epoch_idx * max_batches) / (n_epochs * max_batches)
        alpha = 2. / (1. + np.exp(-10 * p)) - 1
        alpha = torch.tensor(alpha)
        
        model.train()
        
        # if(batch_idx%100 == 0 ):
        #     print("Training Step:", batch_idx)
        
        optimizer.zero_grad()

        ## SOURCE DATASET TRAINING UPDATE
        
        source_batch = next(source_iterator)
        source_tweets, source_intensities = source_batch.tweet.to(DEVICE), source_batch.intensity.to(DEVICE)  # plural, we are not interested in domain
        
        source_intensity_outputs, source_domain_outputs = model(source_tweets, alpha = alpha)

        loss_source_regression= regression_loss_function(source_intensity_outputs,source_intensities.unsqueeze(1)) # Computing regression loss

        source_domain_inputs = torch.zeros(len(source_batch), dtype=torch.long).to(DEVICE) # source domain has 0 id
        loss_source_domain = domain_loss_function(source_domain_outputs,source_domain_inputs)


        ## TARGET DATASET TRAINING UPDATE
        target_batch = next(iter(target_iterator))
        target_tweets= target_batch.tweet.to(DEVICE) # plural

        _, target_domain_outputs = model(target_tweets, alpha = alpha)

        target_domain_inputs = torch.ones(len(target_batch), dtype=torch.long).to(DEVICE) # target domain has 1 id
        loss_target_domain = domain_loss_function(target_domain_outputs,target_domain_inputs)

        # COMBINING LOSS
        loss = loss_source_regression + loss_source_domain + loss_target_domain
        loss.backward()
        optimizer.step()

        if (batch_idx % 100 == 0):
          print("Epoch [{}/{}] Step [{}/{}]: domain_loss_target={:.4f} / domain_loss_source={:.4f} / regression_loss_source={:.4f} / alpha={:.4f}"
              .format(epoch_idx + 1,
                      n_epochs,
                      batch_idx + 1,
                      max_batches,
                      loss_target_domain.item()
                      ,loss_source_domain.item()
                      ,loss_source_regression.item(),alpha))


    # Evaluate the model after every epoch


    print("for validation.......")
    test_model(model, DEVICE, valid_iterator, mode = 'val')
    print("for test  .......")
    test_model(model, DEVICE, test_iterator, mode = 'test')

  
torch.save(model.state_dict(), os.path.join(MODEL_DIR, "epoch_" + str(epoch_idx)  +  ".pt" ))


202
Epoch [1/100] Step [1/202]: domain_loss_target=0.9360 / domain_loss_source=0.4043 / regression_loss_source=0.0326 / alpha=0.0000
for validation.......
...Average batch validation loss is 0.1373431384563446
for test  .......
...Average batch test loss is 0.12931600026786327
Epoch [2/100] Step [1/202]: domain_loss_target=1.0782 / domain_loss_source=0.3466 / regression_loss_source=0.0205 / alpha=0.0500
for validation.......
...Average batch validation loss is 0.13746392726898193
for test  .......
...Average batch test loss is 0.12958327867090702
Epoch [3/100] Step [1/202]: domain_loss_target=0.7410 / domain_loss_source=0.2396 / regression_loss_source=0.0179 / alpha=0.0997
for validation.......
...Average batch validation loss is 0.13802608847618103
for test  .......
...Average batch test loss is 0.13033027946949005
Epoch [4/100] Step [1/202]: domain_loss_target=0.7228 / domain_loss_source=0.3157 / regression_loss_source=0.0229 / alpha=0.1489
for validation.......
...Average batch vali

[link text](https://)# Equality Evaluation using Equity Evaluation Corpus

## Loading Evaluation Data
format `[ID	Sentence	Template	Person	Gender	Race Emotion	Emotion word]`

In [47]:
data_EEC = TASK1.EEC['eec']
df_EEC = pd.read_csv(data_EEC)
df_EEC.head()

Unnamed: 0,ID,Sentence,Template,Person,Gender,Race,Emotion,Emotion word
0,2018-En-mystery-05498,Alonzo feels angry.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,angry
1,2018-En-mystery-11722,Alonzo feels furious.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,furious
2,2018-En-mystery-11364,Alonzo feels irritated.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,irritated
3,2018-En-mystery-14320,Alonzo feels enraged.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,enraged
4,2018-En-mystery-14114,Alonzo feels annoyed.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,annoyed


## Creating evalution function (includes pre-processing)

In [48]:
## padding function : adds padding / truncates to max size
def pad_or_truncate(some_list, target_len = MAX_SIZE ,pad_idx = PAD_IDX):
    return some_list[:target_len] + [pad_idx]*(target_len - len(some_list))

## preprocessing function, takes in a tweet and returns padded indexed tweet (input for model)
def text_pipeline(tweet):
    indexed_tweet = [field_tweet.vocab.__getitem__(token) for token in preprocess_tweet(tweet)]
    # print(indexed_tweet)
    return pad_or_truncate(indexed_tweet, MAX_SIZE , pad_idx = PAD_IDX)
    # print(indexed_tweet_padded)

In [49]:
# i = random.randint(0,len(df_EEC))
# tweet_example = df_EEC['Sentence'][i]
# print(tweet_example, text_pipeline(tweet_example))

## Loading model

In [51]:
### Loading Model

dict_model_name = {'non_dann':'Non_DANN.pt','dann':'epoch_99.pt'}
dict_loaded_model ={}
for model_type, model_name in dict_model_name.items():
  loaded_model = CNN1d(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
  loaded_model.load_state_dict(torch.load(os.path.join(MODEL_DIR, model_name),map_location=torch.device(DEVICE)))
  loaded_model.eval()
  dict_loaded_model[model_type] = loaded_model
print(dict_loaded_model)

{'non_dann': CNN1d(
  (embedding): Embedding(4788, 100, padding_idx=1)
  (convs): ModuleList(
    (0): Conv1d(100, 100, kernel_size=(2,), stride=(1,))
    (1): Conv1d(100, 100, kernel_size=(3,), stride=(1,))
    (2): Conv1d(100, 100, kernel_size=(4,), stride=(1,))
    (3): Conv1d(100, 100, kernel_size=(5,), stride=(1,))
  )
  (regression): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=400, out_features=200, bias=True)
    (2): ReLU()
    (3): Linear(in_features=200, out_features=10, bias=True)
    (4): ReLU()
    (5): Linear(in_features=10, out_features=1, bias=True)
  )
  (domain_classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=400, out_features=200, bias=True)
    (2): ReLU()
    (3): Linear(in_features=200, out_features=10, bias=True)
    (4): ReLU()
    (5): Linear(in_features=10, out_features=2, bias=True)
    (6): LogSoftmax(dim=1)
  )
), 'dann': CNN1d(
  (embedding): Embedding(4788, 100, padding_idx=1)
 

In [52]:
from torch.cuda import Device
def predict(tweet, model, text_pipeline,device = DEVICE):
  with torch.no_grad():
    tweet_tensor = torch.tensor(text_pipeline(tweet)).unsqueeze(0).to(device)
    # output = None
    # print(tweet_tensor.shape)
    output = model(tweet_tensor)
    return output[0].item()

In [53]:
i = random.randint(0,len(df_EEC))
tweet_example = df_EEC['Sentence'][i]
loaded_model_device = 'cpu'
model = loaded_model.to(loaded_model_device)
print(predict(tweet_example, loaded_model,text_pipeline, device= loaded_model_device))

0.3635256588459015


## Creating Sentence pairs (as per SEMVAL18 paper)

In [54]:
dict_f_m_noun_phrase = {'she':'he', 
            'her':'him',
            'this woman':'this man',
            'this girl':'this boy',
            'my sister' : 'my brother',
            'my daughter' : 'my son',
            'my wife': 'my husband',
            'my girlfriend':'my boyfriend',
            'my mother':'my father',
            'my aunt':'my uncle',
            'my mom': 'my dad'
            }

name_male = ['Alonzo','Jamel','Alphonse','Jerome','Leroy','Torrance','Darnell','Lamar','Malik','Terrence','Adam','Harry','Josh','Roger','Alan','Frank','Justin','Ryan','Andrew','Jack'] 
name_female = ['Nichelle','Shereen','Ebony','Latisha','Shaniqua','Jasmine','Tanisha','Tia','Lakisha','Latoya','Amanda','Courtney','Heather','Melanie','Katie','Betsy','Kristin','Nancy','Stephanie','Ellen']


In [55]:
list_unique_template = list(df_EEC['Template'].dropna().unique())
# print(list_unique_template)
list_emotion_word = list(df_EEC['Emotion word'].unique())
# print(list_emotion_word)
list_gender = list(df_EEC['Gender'].dropna().unique())
# print(list_gender)
list_person = list(df_EEC['Person'].unique())   
# print(list_person)

In [56]:
# list_f_m_noun_phrase =[]
# list_f_m_noun_phrase.extend(name_male)
# list_f_m_noun_phrase.extend(name_female)
# [list_f_m_noun_phrase.extend([f,m]) for f,m in dict_f_m_noun_phrase.items()]
# print(list_f_m_noun_phrase)
# assert set(list_f_m_noun_phrase)<= set(list_person), "The noun phrases are not subset of overall person list"

In [57]:
print(list_emotion_word)
# list_emotion_word= list_emotion_word.append('')
# print(list_emotion_word)

['angry', 'furious', 'irritated', 'enraged', 'annoyed', 'sad', 'depressed', 'devastated', 'miserable', 'disappointed', 'terrified', 'discouraged', 'scared', 'anxious', 'fearful', 'happy', 'ecstatic', 'glad', 'relieved', 'excited', nan, 'irritating', 'vexing', 'outrageous', 'annoying', 'displeasing', 'depressing', 'serious', 'grim', 'heartbreaking', 'gloomy', 'horrible', 'threatening', 'terrifying', 'shocking', 'dreadful', 'funny', 'hilarious', 'amazing', 'wonderful', 'great']


In [58]:
# Template - F - M Noun Phrases chunks
dict_noun_phrase_sentence_pair = {}
## take a subset where Race field is not populated
df_noun_phrase_subset=  df_EEC[df_EEC['Race'].isna()] ## includes values which do not have Race 
count = 0
# print(len(df_noun_phrase_subset),df_noun_phrase_subset.head())
for template in list_unique_template:
  for emotion_word in list_emotion_word:
    condition_1 = (df_noun_phrase_subset['Template']== template)
    condition_2 = (df_noun_phrase_subset['Emotion word'] == emotion_word)
    condition_3 = (df_noun_phrase_subset['Gender'] == 'female')
    condition_4 = (df_noun_phrase_subset['Gender'] == 'male')
    list_female = df_noun_phrase_subset[(condition_1) & (condition_2) & (condition_3)]['Sentence'].to_list()
    list_male = df_noun_phrase_subset[(condition_1) & (condition_2) & (condition_4)]['Sentence'].to_list()
    if ((len(list_female) >0) & (len(list_male) >0)):
      dict_noun_phrase_sentence_pair[count]=(list_female,list_male)
      # print(emotion_word, "emotion word")
      count = count + 1
    
    if pd.isnull(emotion_word):
      condition_5 = (df_noun_phrase_subset['Emotion word'].isna())
      list_female_2 = df_noun_phrase_subset[(condition_5) & (condition_1) & (condition_3)]['Sentence'].to_list()
      list_male_2 = df_noun_phrase_subset[(condition_5) & (condition_1) & (condition_4)]['Sentence'].to_list()
      if ((len(list_female_2) >0) & (len(list_male_2) >0)):
        dict_noun_phrase_sentence_pair[count]=(list_female_2,list_male_2)
        # print("no emotion word")
        count = count + 1

print(count)
print(len(dict_noun_phrase_sentence_pair),"\ndict_sentence_pair............ \n",dict_noun_phrase_sentence_pair)


144
144 
dict_sentence_pair............ 
 {0: (['She feels angry.', 'This woman feels angry.', 'This girl feels angry.', 'My sister feels angry.', 'My daughter feels angry.', 'My wife feels angry.', 'My girlfriend feels angry.', 'My mother feels angry.', 'My aunt feels angry.', 'My mom feels angry.'], ['He feels angry.', 'This man feels angry.', 'This boy feels angry.', 'My brother feels angry.', 'My son feels angry.', 'My husband feels angry.', 'My boyfriend feels angry.', 'My father feels angry.', 'My uncle feels angry.', 'My dad feels angry.']), 1: (['She feels furious.', 'This woman feels furious.', 'This girl feels furious.', 'My sister feels furious.', 'My daughter feels furious.', 'My wife feels furious.', 'My girlfriend feels furious.', 'My mother feels furious.', 'My aunt feels furious.', 'My mom feels furious.'], ['He feels furious.', 'This man feels furious.', 'This boy feels furious.', 'My brother feels furious.', 'My son feels furious.', 'My husband feels furious.', 'My bo

In [59]:
# Template - F - M Noun Phrases chunks
dict_sentence_pair = {}
count = 0

for template in list_unique_template:
  for f, m in dict_f_m_noun_phrase.items():
    condition_1 = df_EEC['Template']== template
    condition_2 = df_EEC['Person']== f
    condition_3 = df_EEC['Person']== m
    df_temp_f = df_EEC[(condition_1 & condition_2 )] 
    df_temp_m = df_EEC[(condition_1 & condition_3 )]
    for emotion_word in list_emotion_word:
      
      condition_4 = df_EEC['Emotion word'] == emotion_word
      
      k = df_temp_f[condition_4]['Sentence']
      v = df_temp_m[condition_4]['Sentence']
      assert len(k)==len(v), "Problem is in Noun Phase Chunks where emotion_word is not null"
      if len(k) > 0 and len (v) > 0:
        dict_sentence_pair[count] = (k.values[0],v.values[0])
        count = count + 1
      
      ## Checking for column values where emotion word value blank
      if pd.isnull(emotion_word):
        k_null = df_temp_f[df_temp_f['Emotion word'].isna()]['Sentence']
        v_null = df_temp_m[df_temp_m['Emotion word'].isna()]['Sentence']
        assert len(k_null)==len(v_null), "Problem is in Noun Phase Chunks where emotion_word is  null"
        if len(k_null) > 0 and len (v_null) > 0:
          dict_sentence_pair[count] = (k_null.values[0],v_null.values[0])
          count = count + 1
      
print(len(dict_sentence_pair),"\ndict_sentence_pair............ \n",dict_sentence_pair)

  app.launch_new_instance()


1440 
dict_sentence_pair............ 
 {0: ('She feels angry.', 'He feels angry.'), 1: ('She feels furious.', 'He feels furious.'), 2: ('She feels irritated.', 'He feels irritated.'), 3: ('She feels enraged.', 'He feels enraged.'), 4: ('She feels annoyed.', 'He feels annoyed.'), 5: ('She feels sad.', 'He feels sad.'), 6: ('She feels depressed.', 'He feels depressed.'), 7: ('She feels devastated.', 'He feels devastated.'), 8: ('She feels miserable.', 'He feels miserable.'), 9: ('She feels disappointed.', 'He feels disappointed.'), 10: ('She feels terrified.', 'He feels terrified.'), 11: ('She feels discouraged.', 'He feels discouraged.'), 12: ('She feels scared.', 'He feels scared.'), 13: ('She feels anxious.', 'He feels anxious.'), 14: ('She feels fearful.', 'He feels fearful.'), 15: ('She feels happy.', 'He feels happy.'), 16: ('She feels ecstatic.', 'He feels ecstatic.'), 17: ('She feels glad.', 'He feels glad.'), 18: ('She feels relieved.', 'He feels relieved.'), 19: ('She feels exc

In [60]:
# for Named people

dict_list_named_sentence_pairs ={}
df_EEC_subset = df_EEC.dropna(subset = ['Race']) ## removes values which do not have Race 
print(len(df_EEC_subset))

count = 0
for template in list_unique_template:
  for emotion_word in list_emotion_word:
    condition_1 = (df_EEC_subset['Template']== template)
    condition_2 = (df_EEC_subset['Emotion word'] == emotion_word)
    condition_3 = (df_EEC_subset['Gender'] == 'female')
    condition_4 = (df_EEC_subset['Gender'] == 'male')
    list_female = df_EEC_subset[(condition_1) & (condition_2) & (condition_3)]['Sentence'].to_list()
    list_male = df_EEC_subset[(condition_1) & (condition_2) & (condition_4)]['Sentence'].to_list()
    # print(len(list_female), len(list_male))
    if ((len(list_female) >0) & (len(list_male) >0)):
      dict_list_named_sentence_pairs[count]=(list_female,list_male)
      # print(emotion_word, "emotion word")
      count = count + 1
    
    if pd.isnull(emotion_word):
      condition_5 = (df_EEC_subset['Emotion word'].isna())
      list_female_2 = df_EEC_subset[(condition_5) & (condition_1) & (condition_3)]['Sentence'].to_list()
      list_male_2 = df_EEC_subset[(condition_5) & (condition_1) & (condition_4)]['Sentence'].to_list()
      if ((len(list_female_2) >0) & (len(list_male_2) >0)):
        dict_list_named_sentence_pairs[count]=(list_female_2,list_male_2)
        # print("no emotion word")
        count = count + 1
        
    

print (count)
print(len(dict_list_named_sentence_pairs))
print(dict_list_named_sentence_pairs)

5760
144
144
{0: (['Nichelle feels angry.', 'Shereen feels angry.', 'Ebony feels angry.', 'Latisha feels angry.', 'Shaniqua feels angry.', 'Jasmine feels angry.', 'Tanisha feels angry.', 'Tia feels angry.', 'Lakisha feels angry.', 'Latoya feels angry.', 'Amanda feels angry.', 'Courtney feels angry.', 'Heather feels angry.', 'Melanie feels angry.', 'Katie feels angry.', 'Betsy feels angry.', 'Kristin feels angry.', 'Nancy feels angry.', 'Stephanie feels angry.', 'Ellen feels angry.'], ['Alonzo feels angry.', 'Jamel feels angry.', 'Alphonse feels angry.', 'Jerome feels angry.', 'Leroy feels angry.', 'Torrance feels angry.', 'Darnell feels angry.', 'Lamar feels angry.', 'Malik feels angry.', 'Terrence feels angry.', 'Adam feels angry.', 'Harry feels angry.', 'Josh feels angry.', 'Roger feels angry.', 'Alan feels angry.', 'Frank feels angry.', 'Justin feels angry.', 'Ryan feels angry.', 'Andrew feels angry.', 'Jack feels angry.']), 1: (['Nichelle feels furious.', 'Shereen feels furious.', 

In [61]:
print(dict_list_named_sentence_pairs[0][0],"\n",dict_list_named_sentence_pairs[0][1])

['Nichelle feels angry.', 'Shereen feels angry.', 'Ebony feels angry.', 'Latisha feels angry.', 'Shaniqua feels angry.', 'Jasmine feels angry.', 'Tanisha feels angry.', 'Tia feels angry.', 'Lakisha feels angry.', 'Latoya feels angry.', 'Amanda feels angry.', 'Courtney feels angry.', 'Heather feels angry.', 'Melanie feels angry.', 'Katie feels angry.', 'Betsy feels angry.', 'Kristin feels angry.', 'Nancy feels angry.', 'Stephanie feels angry.', 'Ellen feels angry.'] 
 ['Alonzo feels angry.', 'Jamel feels angry.', 'Alphonse feels angry.', 'Jerome feels angry.', 'Leroy feels angry.', 'Torrance feels angry.', 'Darnell feels angry.', 'Lamar feels angry.', 'Malik feels angry.', 'Terrence feels angry.', 'Adam feels angry.', 'Harry feels angry.', 'Josh feels angry.', 'Roger feels angry.', 'Alan feels angry.', 'Frank feels angry.', 'Justin feels angry.', 'Ryan feels angry.', 'Andrew feels angry.', 'Jack feels angry.']


## Two Sample t- test

In [79]:
# f ='She feels angry.'
# m ='He feels angry.'
# f_indices = text_pipeline(f)
# m_indices = text_pipeline(m)
# f_value = predict(f, loaded_model,text_pipeline,device= loaded_model_device)
# m_value = predict(m, loaded_model,text_pipeline,device= loaded_model_device)
# print(f_value,m_value)
# stats.ttest_rel(f_value, m_value)

0.32102489471435547 0.33261770009994507


Ttest_relResult(statistic=nan, pvalue=nan)

In [62]:
# Function for t-test processing

def two_sample_test(dict_sentence_pairs ={},text_pipeline = text_pipeline, loaded_model = loaded_model, loaded_model_device = 'cpu')-> dict:
  dict_t_test_result_sentence_pair ={}
  for key, value in dict_sentence_pairs.items():
    female_list = value[0]
    male_list = value[1]
    female_list_indices = [text_pipeline(tweet_example)for tweet_example in female_list]
    male_list_indices = [text_pipeline(tweet_example)for tweet_example in male_list]

    female_list_output = [predict(sentence, loaded_model,text_pipeline,device= loaded_model_device) for sentence in female_list]
    male_list_output = [predict(sentence, loaded_model,text_pipeline,device= loaded_model_device) for sentence in male_list]

    t_test_result = stats.ttest_rel(female_list_output, male_list_output)
    dict_t_test_result_sentence_pair[key] = (t_test_result.statistic, t_test_result.pvalue,mean(female_list_output)-mean(male_list_output))
  # print(dict_t_test_result_sentence_pair)
  return dict_t_test_result_sentence_pair



In [63]:
list_sentence_pairs = ['named','noun_phrase']
dict_t_test ={}
for model_type, loaded_model in dict_loaded_model.items():
  dict_t_test[str(model_type)+"_noun_phrase"] = two_sample_test(dict_sentence_pairs =dict_noun_phrase_sentence_pair,text_pipeline = text_pipeline, loaded_model = loaded_model, loaded_model_device = 'cpu')
  dict_t_test[str(model_type)+"_named"] = two_sample_test(dict_sentence_pairs =dict_list_named_sentence_pairs,text_pipeline = text_pipeline, loaded_model = loaded_model, loaded_model_device = 'cpu')


In [64]:
# dict_t_test.items()

In [65]:
# dict_t_test_noun_phrase_sentence_pair = two_sample_test(dict_sentence_pairs =dict_noun_phrase_sentence_pair,text_pipeline = text_pipeline, loaded_model = loaded_model, loaded_model_device = 'cpu')
# dict_t_test_named_sentence_pairs = two_sample_test(dict_sentence_pairs =dict_list_named_sentence_pairs,text_pipeline = text_pipeline, loaded_model = loaded_model, loaded_model_device = 'cpu')

In [66]:
# dict_result_named_sentence_pair ={}

# for key, value in dict_list_named_sentence_pairs.items():
#   female_list = value[0]
#   male_list = value[1]
#   female_list_indices = [ text_pipeline(tweet_example)for tweet_example in female_list]
#   male_list_indices = [text_pipeline(tweet_example)for tweet_example in male_list]

#   female_list_output = [predict(sentence, loaded_model,text_pipeline,device= loaded_model_device) for sentence in female_list]
#   male_list_output = [predict(sentence, loaded_model,text_pipeline,device= loaded_model_device) for sentence in male_list]
#   # for sentence in female_list:
#   #   female_list_output.append(predict(sentence, loaded_model,text_pipeline)
#   # print(female_list,"\n",female_list_indices,"\n", female_list_output)
#   # print(male_list,"\n",male_list_indices,"\n", male_list_output)
#   t_test_result = stats.ttest_rel(female_list_output, male_list_output)
#   dict_result_named_sentence_pair[key] = (t_test_result.statistic, t_test_result.pvalue,mean(female_list_output)-mean(male_list_output))
#   # print(type(stats.ttest_rel(female_list_output, male_list_output)))

#   # break

# print((dict_result_named_sentence_pair))

In [67]:
# #without named people
# dict_result_sentence_pair ={}
# # for key, value in dict_sentence_pair:
# #   if len(value[0])
# print(len(dict_sentence_pair))

# for key, value in dict_sentence_pair.items():
#   female_list = [value[0]]
#   male_list = [value[1]]
#   # if len(female_list)!=len(male_list):
#   #   print("key:", key)
#   #   print(female_list,"\n",male_list)
#   #   print(len(female_list),"-",len(male_list))
#   #   print(text_pipeline(female_list[0]),"\n",text_pipeline(male_list[0]))
#   #   break

#   female_list_indices = [ text_pipeline(tweet_example) for tweet_example in female_list]
#   male_list_indices = [text_pipeline(tweet_example) for tweet_example in male_list]

#   female_list_output = [predict(sentence, loaded_model,text_pipeline,device= loaded_model_device) for sentence in female_list]
#   male_list_output = [predict(sentence, loaded_model,text_pipeline,device= loaded_model_device) for sentence in male_list]
#   # for sentence in female_list:
#   #   female_list_output.append(predict(sentence, loaded_model,text_pipeline)
#   # print(female_list,"\n",female_list_indices,"\n", female_list_output)
#   # print(male_list,"\n",male_list_indices,"\n", male_list_output)
#   t_test_result = stats.ttest_rel(female_list_output, male_list_output)
#   dict_result_sentence_pair[key] = (t_test_result.statistic, t_test_result.pvalue,mean(female_list_output)-mean(male_list_output))
#   # print(type(stats.ttest_rel(female_list_output, male_list_output)))

#   # break

# print(dict_result_sentence_pair)

# Analysis of results (based on semval paper)

In [68]:
# dict_t_test_noun_phrase_sentence_pair
# dict_t_test_named_sentence_pairs

In [69]:
# len(dict_t_test_noun_phrase_sentence_pair),len(dict_t_test_named_sentence_pairs)

In [70]:
def analysis_t_test(dict_t_test_sentence_pairs, threshold = 0.05):
  list_output =[]
  for key, test_output in dict_t_test_sentence_pairs.items():
    significant=True
    t_statistic = test_output[0]
    p_value = test_output[1]
    f_m_diff = test_output[2]
    if (float(p_value) > float(threshold) or float(p_value) == float(threshold)):
      significant=False
      category = 'f_equals_m'
    else:
      significant=True
      
      if f_m_diff > 0:
        category='f_high_m_low'
      else:
        category = 'f_low_m_high' 
    list_output.append([key,t_statistic,p_value,significant,f_m_diff,category])
    
  df_columns = ['key','t_statistic','p_value', 'significant','delta','category']
  df_output = pd.DataFrame(list_output, columns = df_columns)

  list_category = list(df_output['category'].unique())
  list_statistics =[]
  for category in list_category:
    df_temp = df_output[df_output['category']==category]
    average = df_temp['delta'].mean()
    # print(category,len(df_temp), average)
    list_statistics.append([category,len(df_temp), average])
  df_statistics = pd.DataFrame(list_statistics, columns = ['category', 'num_pairs','average_difference'])
  return df_statistics


# print(analysis_t_test(dict_t_test_noun_phrase_sentence_pair))
# print(analysis_t_test(dict_t_test_named_sentence_pairs))


In [71]:
dict_statistics={}
for model_type_sentence_pair_name, t_test_dict in dict_t_test.items():
  df_statistics = analysis_t_test(t_test_dict, threshold = 0.05)
  dict_statistics[model_type_sentence_pair_name] = df_statistics


In [72]:
for model_type_sentence_pair_name, df_statistics in dict_statistics.items():
  print(model_type_sentence_pair_name,"\n",df_statistics)
  print(50*"=")

non_dann_noun_phrase 
        category  num_pairs  average_difference
0    f_equals_m        126            0.005233
1  f_high_m_low         18            0.026457
non_dann_named 
        category  num_pairs  average_difference
0    f_equals_m        124            0.008112
1  f_high_m_low         20            0.023590
dann_noun_phrase 
        category  num_pairs  average_difference
0    f_equals_m        124            0.006045
1  f_high_m_low         20            0.021322
dann_named 
        category  num_pairs  average_difference
0    f_equals_m        121            0.008845
1  f_high_m_low         23            0.020125
