<a href="https://colab.research.google.com/github/peeyushsinghal/da/blob/main/mitigating_bias_sa_da_v6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Mitigating bias in sentiment analysis using domain adaptation

In [1]:
! pip install torchtext==0.10.0 --quiet # DOWNGRADE YOUR TORCHTEXT
! pip install ekphrasis --quiet # library to pre process twitter data
! pip install emoji --upgrade --quiet #library to deal with emoji data


In [2]:
## Import statements
import pandas as pd
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchtext.legacy.data import Dataset, Field, TabularDataset, BucketIterator
from torchtext.vocab import GloVe
# import torchtext.functional as TTF
import numpy as np
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import emoji
from torchtext.legacy.vocab import Vectors
from tqdm import tqdm
import random
import torch.optim as optim
# import json
# Importing library
import scipy.stats as stats
from statistics import mean

In [26]:
# checking device
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on:{}".format(DEVICE))

Running on:cpu


## Data loading

In [4]:
#Mounting google drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Data Configuration

In [5]:
BASE_PATH = '/content/drive/MyDrive/semeval-2018'

DATA_DIR = os.path.join(BASE_PATH,'datasets')
TARGET_DIR = os.path.join(BASE_PATH,'targetdataset')

MODEL_DIR = os.path.join(BASE_PATH,'models')
REF_DIR = os.path.join(BASE_PATH,'reference')

if not os.path.exists(MODEL_DIR):
  os.makedirs(MODEL_DIR)
  print("The new directory is created!")

domain_source = 0.0
domain_target = 1.0

In [6]:
class TASK1(object):
  
    EI_reg = {
        'anger': {
            'train': os.path.join(
                DATA_DIR, 'task1/EI-reg/training/EI-reg-En-anger-train.txt'),
            'dev': os.path.join(
                DATA_DIR, 'task1/EI-reg/development/2018-EI-reg-En-anger-dev.txt'),
            'gold': os.path.join(
                DATA_DIR, 'task1/EI-reg/test-gold/2018-EI-reg-En-anger-test-gold.txt')
                },
        'joy': {
                'train': os.path.join(
                    DATA_DIR, 'task1/EI-reg/training/EI-reg-En-joy-train.txt'),
                'dev': os.path.join(
                    DATA_DIR, 'task1/EI-reg/development/2018-EI-reg-En-joy-dev.txt'),
                'gold': os.path.join(
                    DATA_DIR, 'task1/EI-reg/test-gold/2018-EI-reg-En-joy-test-gold.txt')
                    }
        }

    # V_reg = {
    #     'train': os.path.join(
    #         DATA_DIR, 'task1/V-reg/2018-Valence-reg-En-train.txt'),
    #     'dev': os.path.join(
    #         DATA_DIR, 'task1/V-reg/2018-Valence-reg-En-dev.txt'),
    #     'gold': os.path.join(
    #         DATA_DIR, 'task1/V-reg/2018-Valence-reg-En-test-gold.txt')
    #          }

    EEC = {
        'eec': os.path.join(
            DATA_DIR, 'task1/Equity-Evaluation-Corpus/Equity-Evaluation-Corpus.csv')
             }

## Source Data
Parsing Emotion and Valence regression data : `format [ID	Tweet	Affect Dimension	Intensity Score]`

In [7]:
def parse_reg(data_file, label_format='tuple'):#->  pd.DataFrame:
    """
    This is for datasets for the EI-reg and V-reg English tasks 
    Returns:
        df: dataframe with columns in the first row of file [ID-Tweet-Affect Dimension-Intensity Score]
    """
    with open(data_file, 'r') as fd:
      data = [l.strip().split('\t') for l in fd.readlines()]
    
    # print(data)
    df = pd.DataFrame (data[1:],columns=data[0])
    csv_file_name = (data_file.split("/")[-1]).split('.')[0]+".csv"
    csv_file = df.to_csv(str(csv_file_name))
    # df['domain'] = domain_source
    return csv_file_name
    # return df

In [8]:
parse_reg(os.path.join(DATA_DIR, 'task1/EI-reg/training/EI-reg-En-joy-train.txt'))

'EI-reg-En-joy-train.csv'

## Source Parser
Generic Source Data Parser

In [9]:
def parse_csv(task, dataset, emotion='anger'):
    if task == 'EI-reg':
        data_train = TASK1.EI_reg[emotion][dataset]
        csv_file_name = parse_reg(data_train)
        return csv_file_name
        # df[df.columns[-1]] = df[df.columns[-1]].astype(float)
        # return df
    # elif task == 'V-reg':
    #     data_train = TASK1.V_reg[dataset]
    #     df = parse_reg(data_train)
    #     df[df.columns[-1]] = df[df.columns[-1]].astype(float)
    #     return df
    else:
        return None

In [10]:
file_EI_reg_train = parse_csv('EI-reg','train','joy')
file_EI_reg_train

file_EI_reg_val = parse_csv('EI-reg','dev','joy')
print(file_EI_reg_val)

file_EI_reg_test = parse_csv('EI-reg','gold','joy')
(file_EI_reg_test)

2018-EI-reg-En-joy-dev.csv


'2018-EI-reg-En-joy-test-gold.csv'

## Preprocess tweets

In [11]:
# reference : https://github.com/cbaziotis/ekphrasis


text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Word statistics files not found!
Downloading... done!
Unpacking... done!
Reading twitter - 1grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_1grams.txt
Reading twitter - 2grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_2grams.txt
Reading twitter - 1grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [12]:
# #### Example checks of pre-processing
# sentences = [
#     "CANT WAIT for the new season of #TwinPeaks ＼(^o^)／!!! #davidlynch #tvseries :)))",
#     "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies :/",
#     "@SentimentSymp:  can't wait for the Nov 9 #Sentiment talks!  YAAAAAAY !!! :-D http://sentimentsymposium.com/.",
#     "@MGBarbieri @SpalkTalk a@b.com And just saw your LinkedIn comment after I sent this! Thanks for the message :) 😀",
#     "💙💛🏆 @GeorgeePitman Young Player of The Season 🏆💛💙 #irony #actuallyseventy"
# ]

# for s in sentences:
#     print(" ".join(text_processor.pre_process_doc(s)))
# # print ([text_processor.pre_process_doc(s) for s in sentences])

In [12]:
def preprocess_tweet(tweet): 
  tweet_processed = text_processor.pre_process_doc(tweet)
  # print (tweet_processed)
  final_list =[]
  for index, tweet in enumerate(tweet_processed):
      final_list.append(emoji.demojize(tweet, language = 'en'))
  
  # print(df)
  return final_list

In [14]:
# #### Example checks of pre-processing
# sentences = [
#     "CANT WAIT for the new season of #TwinPeaks ＼(^o^)／!!! #davidlynch #tvseries :)))",
#     "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies :/",
#     "@SentimentSymp:  can't wait for the Nov 9 #Sentiment talks!  YAAAAAAY !!! :-D http://sentimentsymposium.com/.",
#     "@MGBarbieri @SpalkTalk a@b.com And just saw your LinkedIn comment after I sent this! Thanks for the message :) 😀",
#     "💙💛🏆 @GeorgeePitman Young Player of The Season 🏆💛💙 #irony #actuallyseventy"
# ]

# for s in sentences:
#   print(preprocess_tweet(s))
#   # print(" ".join(preprocess_tweet(s)))

## TorchText Treatment

In [20]:
MAX_SIZE = 50
field_tweet = Field(sequential=True, use_vocab = True, tokenize = preprocess_tweet, fix_length = MAX_SIZE, batch_first = True)
# field_tweet = Field(sequential=True, use_vocab = False, tokenize = preprocess_tweet)

field_intensity = Field(sequential= False, 
                        dtype = torch.float,
                        use_vocab = False 
                        )

In [21]:
fields = {
    'Tweet':('tweet', field_tweet ), #
    'Intensity Score': ('intensity',field_intensity) # Intensity Score is name of the dataset column, field_intensity is how we have defined the field, intensity is the name of the variable going fwd
    }

In [22]:
train_data, valid_data, test_data = TabularDataset.splits( path = './', 
                                              train = file_EI_reg_train,
                                              validation = file_EI_reg_val, 
                                              test = file_EI_reg_test,
                                              format = 'csv',
                                              fields = fields
                                          )

In [18]:
print(type(train_data))

<class 'torchtext.legacy.data.dataset.TabularDataset'>


In [19]:
print(train_data[0].__dict__.keys())

dict_keys(['tweet', 'intensity'])


In [20]:
print(valid_data[0].__dict__.values())

dict_values([['<user>', 'oh', '!', 'that', 'actually', 'was', 'my', 'first', 'guess', 'but', '.', '<repeated>', 'i', 'thought', 'he', 'was', 'too', 'dark', 'for', 'an', 'irish', 'man', 'from', '<allcaps>', 'bce', '</allcaps>', '.', 'thanks', 'for', 'clearing', 'it', 'up', '<laugh>'], '0.470'])


In [21]:
print((train_data[2].__dict__.values()))

dict_values([['positive', '<hashtag>', 'psychology', '</hashtag>', 'research', 'shows', 'salespeople', 'who', 'score', 'in', 'the', 'top', '<percent>', 'for', '<hashtag>', 'optimism', '</hashtag>', 'have', '<percent>', '>', 'sales', 'than', 'those', 'in', 'top', '<percent>', 'for', 'pessimism', '.'], '0.274'])


In [23]:
count = 0
for example in test_data.examples:
  print(example.tweet, example.intensity)
  count += 1
  if count > 10:
    break

['people', 'are', 'truly', '<hashtag>', 'amazing', '</hashtag>', '.', '<hashtag>', 'inspiring', '</hashtag>', 'day'] 0.712
['what', 'are', 'some', 'good', '<hashtag>', 'funny', '</hashtag>', '<hashtag>', 'entertaining', '</hashtag>', '<hashtag>', 'interesting', '</hashtag>', 'accounts', 'i', 'should', 'follow', '?', 'my', 'twitter', 'is', 'dry'] 0.339
['<user>', 'a', 'review', 'of', 'my', 'book', 'faulted', 'me', 'for', 'spending', 'so', 'much', 'time', 'on', 'webster', "'", 's', 'introductory', 'front', 'matter', '.', 'but', 'i', 'had', 'to', ',', '<hashtag>', 'brilliant', '</hashtag>', '!'] 0.533
['<hashtag>', 'good', '</hashtag>', 'to', 'learning', '<hashtag>', 'wisdom', '</hashtag>', '<', '<', 'reform', '(', 'v', ')', ':', 'make', 'in', 'order', 'to', 'improve', 'something', '>', '>'] 0.281
['really', 'excited', 'to', 'see', 'our', 'team', 'this', 'year', ',', 'and', 'especially', 'moving', 'in', 'the', 'next', '<number>', '-', '<number>', 'years', 'when', 'his', 'guys', 'start', '

## Building iterator and Vocabulary

In [24]:
MAX_VOCAB_SIZE = 10000

field_tweet.build_vocab(train_data, 
                  max_size = MAX_VOCAB_SIZE,
                  min_freq = 1,
                  vectors = "glove.6B.100d",
                  unk_init=torch.Tensor.normal_)

vocab = field_tweet.build_vocab(train_data, 
                  max_size = MAX_VOCAB_SIZE,
                  min_freq = 1,
                  vectors = "glove.6B.100d",
                  unk_init=torch.Tensor.normal_)
field_intensity.build_vocab(train_data)

In [24]:
type(vocab)

NoneType

In [27]:
BATCH_SIZE = 2
train_iterator, valid_iterator, test_iterator = BucketIterator.splits((train_data, valid_data, test_data), 
                                                      batch_size= BATCH_SIZE,
                                                      sort_key = lambda x: len(x.tweet),
                                                      sort_within_batch=True,
                                                      device = DEVICE,
                                                      shuffle= True)

In [28]:
count = 0
for batch in train_iterator:
  print (batch.tweet)
  print (batch.intensity)
  count += 1
  if count > 2:
    break

tensor([[   5,   57,   22,   99,   28,   10, 3412,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1],
        [   5,   31,  477,  540,   14,  961,  374,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1]])
tensor([0.1460, 0.7080])
tensor([[   5, 2419,   50, 4687, 2817,   30, 3326,   21,   43, 2892,   42, 4384,
            9,  176,   45,   23,  443,   50, 2847, 3793, 4136,    4,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1, 

In [None]:
print(list(field_tweet.vocab.stoi.items()))

In [29]:
print(field_tweet.vocab.vectors[field_tweet.vocab.stoi['the']])

tensor([-0.0382, -0.2449,  0.7281, -0.3996,  0.0832,  0.0440, -0.3914,  0.3344,
        -0.5755,  0.0875,  0.2879, -0.0673,  0.3091, -0.2638, -0.1323, -0.2076,
         0.3340, -0.3385, -0.3174, -0.4834,  0.1464, -0.3730,  0.3458,  0.0520,
         0.4495, -0.4697,  0.0263, -0.5415, -0.1552, -0.1411, -0.0397,  0.2828,
         0.1439,  0.2346, -0.3102,  0.0862,  0.2040,  0.5262,  0.1716, -0.0824,
        -0.7179, -0.4153,  0.2033, -0.1276,  0.4137,  0.5519,  0.5791, -0.3348,
        -0.3656, -0.5486, -0.0629,  0.2658,  0.3020,  0.9977, -0.8048, -3.0243,
         0.0125, -0.3694,  2.2167,  0.7220, -0.2498,  0.9214,  0.0345,  0.4674,
         1.1079, -0.1936, -0.0746,  0.2335, -0.0521, -0.2204,  0.0572, -0.1581,
        -0.3080, -0.4162,  0.3797,  0.1501, -0.5321, -0.2055, -1.2526,  0.0716,
         0.7056,  0.4974, -0.4206,  0.2615, -1.5380, -0.3022, -0.0734, -0.2831,
         0.3710, -0.2522,  0.0162, -0.0171, -0.3898,  0.8742, -0.7257, -0.5106,
        -0.5203, -0.1459,  0.8278,  0.27

In [29]:
field_tweet.vocab.vectors[9]

tensor([-1.8970e-01,  5.0024e-02,  1.9084e-01, -4.9184e-02, -8.9737e-02,
         2.1006e-01, -5.4952e-01,  9.8377e-02, -2.0135e-01,  3.4241e-01,
        -9.2677e-02,  1.6100e-01, -1.3268e-01, -2.8160e-01,  1.8737e-01,
        -4.2959e-01,  9.6039e-01,  1.3972e-01, -1.0781e+00,  4.0518e-01,
         5.0539e-01, -5.5064e-01,  4.8440e-01,  3.8044e-01, -2.9055e-03,
        -3.4942e-01, -9.9696e-02, -7.8368e-01,  1.0363e+00, -2.3140e-01,
        -4.7121e-01,  5.7126e-01, -2.1454e-01,  3.5958e-01, -4.8319e-01,
         1.0875e+00,  2.8524e-01,  1.2447e-01, -3.9248e-02, -7.6732e-02,
        -7.6343e-01, -3.2409e-01, -5.7490e-01, -1.0893e+00, -4.1811e-01,
         4.5120e-01,  1.2112e-01, -5.1367e-01, -1.3349e-01, -1.1378e+00,
        -2.8768e-01,  1.6774e-01,  5.5804e-01,  1.5387e+00,  1.8859e-02,
        -2.9721e+00, -2.4216e-01, -9.2495e-01,  2.1992e+00,  2.8234e-01,
        -3.4780e-01,  5.1621e-01, -4.3387e-01,  3.6852e-01,  7.4573e-01,
         7.2102e-02,  2.7931e-01,  9.2569e-01, -5.0

In [None]:
# for batch_number, batch in enumerate(train_iterator):
#   print(batch_number)
#   print(batch.tweet)
#   print(batch.intensity)

## TorchText treatment of Target Data

In [31]:
# Target Data
df_target = pd.read_csv(os.path.join(TARGET_DIR,'winomt_saunders_combined.csv'))
# df_target.columns = ['Sentence']
df_target.columns = ['Tweet']
df_target['Intensity Score'] = 0
print(df_target.head())

target_file_name = 'target_csv.csv'
target_csv_file = df_target.to_csv(str(target_file_name))

                             Tweet  Intensity Score
0    the actor finished his work .                0
1  the actuary finished her work .                0
2  the actuary finished his work .                0
3    the agent finished her work .                0
4    the agent finished his work .                0


In [32]:
target_data, _ = TabularDataset.splits( path = './', 
                                              train = target_file_name,
                                              test = file_EI_reg_test, # arbitary file
                                              format = 'csv',
                                              fields = fields
                                          )
print(target_data[0].__dict__.keys())
print(target_data[0].__dict__.values())

dict_keys(['tweet', 'intensity'])
dict_values([['the', 'actor', 'finished', 'his', 'work', '.'], '0'])


In [33]:
count = 0
for example in target_data:
  print(example.tweet, example.intensity)
  count += 1
  if count > 2:
    break

['the', 'actor', 'finished', 'his', 'work', '.'] 0
['the', 'actuary', 'finished', 'her', 'work', '.'] 0
['the', 'actuary', 'finished', 'his', 'work', '.'] 0


In [34]:
TARGET_BATCH_SIZE = 2
target_iterator = BucketIterator(target_data, # given that there is only one dataset we are not using splits
                                        batch_size= TARGET_BATCH_SIZE,
                                        sort_key = lambda x: len(x.tweet),
                                        sort_within_batch=True,
                                        device = DEVICE,
                                        shuffle= True)

In [35]:
count = 0
for batch in target_iterator:
  print(batch)
  print (batch.tweet)
  print (batch.intensity)
  count += 1
  if count > 2:
    break


[torchtext.legacy.data.batch.Batch of size 2]
	[.tweet]:[torch.LongTensor of size 2x50]
	[.intensity]:[torch.FloatTensor of size 2]
tensor([[   6,    0, 4385,    6,    0,   12, 1220,   83,   93,    9,  792,    6,
         3060,    4,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1],
        [   6,    0, 4027,    6,  800,   32,    0,    9, 4137,    9,   83,  937,
            0,    4,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1]])
tensor([0., 0.])

[torchtext.legacy.data.batch.Batch of size 2]
	[.tweet]:[torch.LongTensor of size 2x50]
	[.intensity]:[torch.FloatTensor of size 2]
tensor([[   6,    0,  62

## CNN 1d model

### Gradient Reversal layer

In [36]:
from torch.autograd import Function

class ReverseLayerF(Function):

    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha

        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        output = grad_output.neg() * ctx.alpha

        return output, None

CNN 1 D model
Reference: A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification, Ye Zhang, Byron Wallace 2015

Difference:

use of embedding
use of sigmoid function, as we are having a regression model not a classififer as the main task

In [37]:
import torch.nn as nn
import torch.nn.functional as F

class CNN1d(nn.Module):
    def __init__(self, 
                 vocab_size, 
                 embedding_dim, 
                 n_filters, 
                 filter_sizes, 
                 output_dim, 
                 dropout, 
                 pad_idx
                 ):
        super().__init__()
        
        #---------------------Feature Extractor Network----------------------#
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)

        # Convolutional Network
        self.convs = nn.ModuleList([
                                    nn.Conv1d(in_channels = embedding_dim, 
                                              out_channels = n_filters, 
                                              kernel_size = fs)
                                    for fs in filter_sizes
                                    ])
        
        #---------------------Regression Network------------------------#
        # Fully-connected layer and Dropout
        self.regression = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(len(filter_sizes) * n_filters, len(filter_sizes) * n_filters // 2),
            nn.ReLU(),
            nn.Linear(len(filter_sizes) * n_filters // 2, output_dim * 10),
            nn.ReLU(),
            nn.Linear(output_dim * 10, output_dim)
            # ,
            # nn.Sigmoid()
        )
        # self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim * 10)
        # self.fc2 = nn.Linear(output_dim * 10, output_dim)
        # self.dropout = nn.Dropout(dropout)

        #---------------------Domain Classifier Network------------------------#
        # Fully-connected layer and Dropout
        self.domain_classifier = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(len(filter_sizes) * n_filters, len(filter_sizes) * n_filters // 2),
            nn.ReLU(),
            nn.Linear(len(filter_sizes) * n_filters // 2, output_dim * 10),
            nn.ReLU(),
            nn.Linear(output_dim * 10, output_dim)
        )
        
    def forward(self, text, alpha=1):
        
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.permute(0, 2, 1)
        
        #embedded = [batch size, emb dim, sent len]
        
        conved = [F.relu(conv(embedded)) for conv in self.convs]
            
        #conved_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        x_feature = torch.cat(pooled, dim = 1)
        
        #x_feature = [batch size, n_filters * len(filter_sizes)]
        
        reverse_feature = ReverseLayerF.apply(x_feature, alpha)
        # print("reverse_feature",reverse_feature)
    
        regression_output = self.regression(x_feature)
    
        domain_classifier_output = self.domain_classifier(reverse_feature)


        return regression_output, domain_classifier_output

In [38]:
INPUT_DIM = len(field_tweet.vocab)
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [2, 3, 4, 5]
OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = field_tweet.vocab.stoi[field_tweet.pad_token]

In [39]:
model = CNN1d(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
model.to(DEVICE)

CNN1d(
  (embedding): Embedding(4788, 100, padding_idx=1)
  (convs): ModuleList(
    (0): Conv1d(100, 100, kernel_size=(2,), stride=(1,))
    (1): Conv1d(100, 100, kernel_size=(3,), stride=(1,))
    (2): Conv1d(100, 100, kernel_size=(4,), stride=(1,))
    (3): Conv1d(100, 100, kernel_size=(5,), stride=(1,))
  )
  (regression): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=400, out_features=200, bias=True)
    (2): ReLU()
    (3): Linear(in_features=200, out_features=10, bias=True)
    (4): ReLU()
    (5): Linear(in_features=10, out_features=1, bias=True)
  )
  (domain_classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=400, out_features=200, bias=True)
    (2): ReLU()
    (3): Linear(in_features=200, out_features=10, bias=True)
    (4): ReLU()
    (5): Linear(in_features=10, out_features=1, bias=True)
  )
)

### Load Pre trained embeddings
we'll load the pre-trained *embeddings*

In [43]:
pretrained_embeddings = field_tweet.vocab.vectors

model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.0660, -1.4252,  0.0830,  ...,  0.8138, -0.3719,  1.1007],
        [-0.9660,  0.5005,  0.9372,  ...,  0.1022, -0.3530, -0.1004],
        [-1.1255, -0.5884, -0.3867,  ..., -0.2800,  1.5338, -0.2280],
        ...,
        [-1.0397,  0.2452,  0.9258,  ...,  0.3180,  0.1356,  0.5064],
        [ 0.9789, -1.5869, -1.5187,  ...,  0.6894,  0.8391, -0.6396],
        [ 0.5113,  0.8151, -0.9798,  ...,  0.0531,  0.5839,  0.7307]])

In [44]:
field_tweet.vocab.vectors.shape

torch.Size([4788, 100])

In [45]:
UNK_IDX = field_tweet.vocab.stoi[field_tweet.unk_token]

model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

## Training the model

### Without training one forward pass

In [46]:
for batch in train_iterator:
  print(batch.tweet)
  output = model(batch.tweet)
  print (output)
  break

tensor([[   5,  272,  132,   33, 1199,  932,  401,    7,   27,   84,  201,  220,
           15,  125,    3, 1900,    2,    3, 1226,    2,    3, 1511,    2,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1],
        [   5,   36,  108, 1730,   32,   25, 1423,  538,    7,    8,   35, 1446,
           28,    6,  878,  267,    8,  906,   49,   32, 1344,    4,   18,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1]])
(tensor([[-0.1135],
        [-0.0542]], grad_fn=<AddmmBackward>), tensor([[-0.3355],
        [-0.3775]], grad_fn=<AddmmBackward>))


  return torch.max_pool1d(input, kernel_size, stride, padding, dilation, ceil_mode)


In [None]:
# import torch.optim as optim

# optimizer = optim.Adam(model.parameters())

# criterion = nn.BCEWithLogitsLoss()

# model = model.to(DEVICE)
# criterion = criterion.to(DEVICE)

### Typical Train Model Function

In [None]:
# Typical Training Function

from tqdm import tqdm # for beautiful model training updates


def train_model(model, device, train_loader, optimizer, epoch):
    model.train() # setting the model in training mode
    pbar = tqdm(train_loader) # putting the iterator in pbara
    correct = 0 # for accuracy numerator
    processed =0 # for accuracy denominator
    epoch_loss = 0.0
    for batch_idx, batch in enumerate(pbar):

        tweets, intensities = batch.tweet.to(device), batch.intensity.to(device)  # plural, we are not interested in domain
        #sending data to CPU or GPU as per device

        optimizer.zero_grad() # setting gradients to zero to avoid accumulation

        y_preds,_ = model(tweets) # forward pass, result captured in y_preds (plural as there are many body in a batch)
        # we are not interested in domain prediction
        # the predictions are in one hot vector

        regression_loss = regression_loss_function(y_preds,intensities.unsqueeze(1)) # Computing loss
        # loss = F.mse_loss(y_preds,intensities.unsqueeze(1)) # Computing loss

        train_regresion_losses.append(regression_loss.item()) # to capture loss over many epochs

        regression_loss.backward() # backpropagation
        optimizer.step() # updating the params

        # preds = y_preds.argmax(dim=1, keepdim=True)  # get the index olf the max log-probability
        # correct += preds.eq(labels.view_as(preds)).sum().item()
        epoch_loss += regression_loss.item()

        processed += len(tweets)

        pbar.set_description(desc= f'Loss={regression_loss.item()} Batch_id={batch_idx} Epoch Average loss={100*epoch_loss/processed:0.4f}')
    train_accuracy.append(100*epoch_loss/len(train_loader))

### Typical Test Function

In [None]:
def test_model(model,device, data_loader, mode= 'test'):
    model.eval() # setting the model in evaluation mode
    loss = 0
    correct = 0 # for accuracy numerator
    test_regresion_losses =[] # for overall batches (summed over batches)
    valid_regresion_losses =[] # for overall batches (summed over batches)

    with torch.no_grad():
        for batch in data_loader:

            tweets, intensities  = batch.tweet.to(device), batch.intensity.to(device) #sending data to CPU or GPU as per device
            # we are not interested in domains
            
            y_preds,_ = model(tweets) # forward pass, result captured in outputs (plural as there are many bodies in a batch)
            # the outputs are in batch size x one hot vector 
            # not interested in domain output

            regression_loss = regression_loss_function(y_preds,intensities.unsqueeze(1))

            if mode == 'test':
              test_regresion_losses.append(regression_loss.item())
              # print(f'...in the batch...{regression_loss}')
            else:
              valid_regresion_losses.append(regression_loss.item())
              # print(f'...in the batch...{regression_loss}')

        # regression_loss.item() /= len(data_loader.dataset) # average test loss
        if mode == 'test':
          # total_test_regression_loss = sum(test_regresion_losses)
          # test_regresion_losses.append(regression_loss) # to capture loss over many batches
          # print('...Average test loss: {:.8f}'.format((total_test_regression_loss)/len(data_loader.dataset)))
          print(f'...Average batch test loss is {sum(test_regresion_losses) / len(data_loader)}')
        else:
          # valid_regresion_losses.append(regression_loss) # to capture loss over many batches
          # total_valid_regression_loss = sum(valid_regresion_losses)
          # print('...Average validation loss: {:.8f}'.format((total_valid_regression_loss)/len(data_loader.dataset)))
          print(f'...Average batch validation loss is {sum(valid_regresion_losses) / len(data_loader)}')

In [None]:
# EXECUTION


optimizer = optim.Adam(model.parameters(), lr=0.001)
domain_loss_function= nn.BCEWithLogitsLoss()
regression_loss_function = nn.L1Loss()


model = model.to(DEVICE)
domain_loss_function = domain_loss_function.to(DEVICE)
regression_loss_function = regression_loss_function.to(DEVICE)

# train_losses = [] # to capture train losses over training epochs
train_accuracy = [] # to capture train accuracy over training epochs
# val_losses = [] # to capture validation loss
# test_losses = [] # to capture test losses 
# test_accuracy = [] # to capture test accuracy 

# EPOCHS = 2
EPOCHS = 100
# dict_val_loss = {}
# dict_test_loss = {}


train_regresion_losses = [] # to capture train losses over training epochs
train_domain_losses = []
train_accuracy = [] # to capture train accuracy over training epochs
# valid_regresion_losses = [] # to capture validation loss
# test_regresion_losses = [] # to capture test losses 
total_test_regression_loss =[]
total_valid_regression_loss =[]
# print(f'----------------------training started for {name}-----------------')
for epoch in range(EPOCHS):
  print("EPOCH:", epoch+1)
  train_model(model, DEVICE, train_iterator, optimizer, epoch)
  print("for validation.......")
  # val_name = train_name.replace("train", "val" )
  # test_model(typical_model, device, dict_val_loader[val_name], mode = 'val')
  test_model(model, DEVICE, valid_iterator, mode = 'val')


  print("for test  .......")
  # test_name = train_name.replace("train", "test" )
  # test_model(typical_model, device, dict_test_loader[test_name], mode = 'test')
  test_model(model, DEVICE, test_iterator, mode = 'test')

# dict_val_loss[name] = val_losses
# dict_test_loss[name] = test_losses

model_name = "Non_DANN"+".pt"
torch.save(model.state_dict(), os.path.join(MODEL_DIR, model_name))
# print(f'----------------------training complete for {name}-----------------')
# print(dict_val_loss.items())
# print(dict_test_loss.items())

## DANN Model - Training and Testing

### Training

In [50]:
# b = next(iter(train_iterator))
# print(b.tweet, b.intensity)

tensor([[   5,  488,    7,  122,    7,  199, 4073, 1832,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1],
        [ 111,  306, 2665,    9,  811,  128,  264,  529,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1]]) tensor([0.4400, 0.2080])


In [107]:
def binary_acc(y_pred, y_test):
    y_pred_tag = torch.round(torch.sigmoid(y_pred))

    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum/y_test.shape[0]
    acc = torch.round(acc * 100)
    
    return acc

In [111]:
## DANN training function

from tqdm import tqdm # for beautiful model training updates


def train_dann_model(model, 
                     device, 
                     train_loader, 
                     target_loader, # this is the target data source
                     optimizer, 
                     epoch, 
                     num_epochs # this is done for alpha parameter tuning
                     ):
    model.train() # setting the model in training mode
    len_dataloader = min(len(train_loader), len(target_loader)) # training for minimum of two dataloaders
    
    m = nn.LogSoftmax(dim=1) ## to be used with NLLLoss

    epoch_loss=0
    epoch_acc=0
    i = 0 # as the training progresses the alpha changes
    while i < len_dataloader -1:
        # implementation of alpha as per paper
        p = float(i + epoch * len_dataloader) / (num_epochs * len_dataloader)
        alpha = 2. / (1. + np.exp(-10 * p)) - 1
        alpha = torch.tensor(alpha)

        optimizer.zero_grad() # setting gradients to zero to avoid accumulation

        # training model using source data
        source_batch = next(iter(train_loader))
        source_tweets, source_intensities = source_batch.tweet.to(device), source_batch.intensity.to(device)  # plural, we are not interested in domain
        #sending data to CPU or GPU as per device

        y_preds, source_domain_outputs = model(source_tweets, alpha = alpha) # forward pass, regresion result captured in y_preds, classificaiton in source_domain_outputs (plural as there are many tweets in a source batch)
        # we are interested in domain prediction
        
        loss_source_regression= regression_loss_function(y_preds,source_intensities.unsqueeze(1)) # Computing loss
        # loss = F.mse_loss(y_preds,intensities.unsqueeze(1)) # Computing loss

        source_domain_inputs = torch.zeros(len(source_batch), dtype=torch.float).to(device)

        # print(f'source_domain_outputs: {source_domain_outputs} with {source_domain_outputs.shape}  , source_domain_inputs: {source_domain_inputs.unsqueeze(1)} with {source_domain_inputs.unsqueeze(1).shape}')
        loss_source_domain = domain_loss_function(source_domain_outputs,source_domain_inputs.unsqueeze(1)) # classification loss
        # loss_source_domain = domain_loss_function(m(source_domain_outputs).argmax(),source_domain_inputs.unsqueeze(1)) # classification loss to work with NLLLoss, we need log softmax

        acc = binary_acc(source_domain_outputs, source_domain_inputs.unsqueeze(1))
        epoch_acc += acc.item()
      
        ###--------------------------###
        # training model using target data
        target_batch = next(iter(target_loader))
        target_tweets = target_batch.tweet.to(device)  # plural, we are not interested in tweet only

        _, target_domain_outputs = model(target_tweets, alpha = alpha) # forward pass, regresion result captured in y_preds, classificaiton in source_domain_outputs (plural as there are many tweets in a source batch)

        target_domain_inputs = torch.ones(len(target_batch), dtype=torch.float).to(device)
        # print(f'target_domain_inputs: {target_domain_inputs}')
        loss_target_domain = domain_loss_function(target_domain_outputs, target_domain_inputs.unsqueeze(1)) # classification loss
        # loss_target_domain = domain_loss_function(m(target_domain_outputs).argmax(), target_domain_inputs.unsqueeze(1)) # classification loss to work with NLLLoss, we need log softmax

        # print(f' loss_source_regression: {loss_source_regression}, loss_source_domain: {loss_source_domain} , loss_target_domain: {loss_target_domain}')
        total_loss = loss_source_regression + loss_source_domain + loss_target_domain

        total_loss.backward() # backpropagation
        optimizer.step() # updating the params

        if ((i + 1) % 50 == 0):
                print("Epoch [{}/{}] Step [{}/{}]: domain_loss_target={:.4f} / domain_loss_source={:.4f} / regression_loss_source={:.4f}"
                      .format(epoch + 1,
                              num_epochs,
                              i + 1,
                              len_dataloader,
                              loss_target_domain.item()
                              ,loss_source_domain.item()
                              ,loss_source_regression.item()))
        
        i = i + 1
    print (epoch_loss, epoch_acc)

### Evaluation

In [112]:
# EXECUTION DAANN


optimizer = optim.Adam(model.parameters(), lr=0.001)
domain_loss_function= nn.BCEWithLogitsLoss()
# domain_loss_function= nn.NLLLoss()
regression_loss_function = nn.L1Loss()


model = model.to(DEVICE)
domain_loss_function = domain_loss_function.to(DEVICE)
regression_loss_function = regression_loss_function.to(DEVICE)

# train_losses = [] # to capture train losses over training epochs
train_accuracy = [] # to capture train accuracy over training epochs
# val_losses = [] # to capture validation loss
# test_losses = [] # to capture test losses 
# test_accuracy = [] # to capture test accuracy 

EPOCHS = 2
# EPOCHS = 100


# train_regresion_losses = [] # to capture train losses over training epochs
# train_domain_losses = []
# train_accuracy = [] # to capture train accuracy over training epochs
# # valid_regresion_losses = [] # to capture validation loss
# # test_regresion_losses = [] # to capture test losses 
# total_test_regression_loss =[]
# total_valid_regression_loss =[]
# print(f'----------------------training started for {name}-----------------')
for epoch in range(EPOCHS):
  print("EPOCH:", epoch+1)
  train_dann_model(model, DEVICE, train_iterator, target_iterator, optimizer, epoch, num_epochs = EPOCHS)

  print("for validation.......")
  # val_name = train_name.replace("train", "val" )
  # test_model(typical_model, device, dict_val_loader[val_name], mode = 'val')
  # test_model(model, DEVICE, valid_iterator, mode = 'val')


  print("for test  .......")
  # test_name = train_name.replace("train", "test" )
  # test_model(typical_model, device, dict_test_loader[test_name], mode = 'test')
  # test_model(model, DEVICE, test_iterator, mode = 'test')

# dict_val_loss[name] = val_losses
# dict_test_loss[name] = test_losses

model_name = "DANN"+".pt"
torch.save(model.state_dict(), os.path.join(MODEL_DIR, model_name))

EPOCH: 1
Epoch [1/2] Step [50/808]: domain_loss_target=0.6931 / domain_loss_source=0.6931 / regression_loss_source=0.0378
Epoch [1/2] Step [100/808]: domain_loss_target=0.6931 / domain_loss_source=0.6931 / regression_loss_source=0.0252
Epoch [1/2] Step [150/808]: domain_loss_target=0.6931 / domain_loss_source=0.6931 / regression_loss_source=0.0383
Epoch [1/2] Step [200/808]: domain_loss_target=0.6931 / domain_loss_source=0.6931 / regression_loss_source=0.0514
Epoch [1/2] Step [250/808]: domain_loss_target=0.6931 / domain_loss_source=0.6931 / regression_loss_source=0.0705
Epoch [1/2] Step [300/808]: domain_loss_target=0.6931 / domain_loss_source=0.6931 / regression_loss_source=0.0618
Epoch [1/2] Step [350/808]: domain_loss_target=0.6931 / domain_loss_source=0.6931 / regression_loss_source=0.0480
Epoch [1/2] Step [400/808]: domain_loss_target=0.6931 / domain_loss_source=0.6931 / regression_loss_source=0.0519
Epoch [1/2] Step [450/808]: domain_loss_target=0.6931 / domain_loss_source=0.693

In [55]:
m = nn.LogSoftmax(dim=1)
input = torch.randn(2, 3)
output = m(input)
input, output

(tensor([[-0.2370, -1.1551,  1.6109],
         [-0.4073,  1.1949, -0.6669]]), tensor([[-2.0471, -2.9653, -0.1992],
         [-1.9074, -0.3052, -2.1670]]))

In [57]:
loss = nn.BCEWithLogitsLoss()
input = torch.randn(3, requires_grad=True)
target = torch.empty(3).random_(2)
output = loss(input, target)
output.backward()
print( input, "\n", target,"\n",output.item())

tensor([0.3200, 1.3016, 0.9202], requires_grad=True) 
 tensor([0., 1., 1.]) 
 0.48063743114471436


# Equality Evaluation using Equity Evaluation Corpus

## Loading Evaluation Data
format `[ID	Sentence	Template	Person	Gender	Race Emotion	Emotion word]`

In [44]:
data_EEC = TASK1.EEC['eec']
df_EEC = pd.read_csv(data_EEC)

In [45]:
df_EEC.head()

Unnamed: 0,ID,Sentence,Template,Person,Gender,Race,Emotion,Emotion word
0,2018-En-mystery-05498,Alonzo feels angry.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,angry
1,2018-En-mystery-11722,Alonzo feels furious.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,furious
2,2018-En-mystery-11364,Alonzo feels irritated.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,irritated
3,2018-En-mystery-14320,Alonzo feels enraged.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,enraged
4,2018-En-mystery-14114,Alonzo feels annoyed.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,annoyed


## Creating evalution function (includes pre-processing)

In [46]:
## padding function : adds padding / truncates to max size
def pad_or_truncate(some_list, target_len = MAX_SIZE ,pad_idx = PAD_IDX):
    return some_list[:target_len] + [pad_idx]*(target_len - len(some_list))

## preprocessing function, takes in a tweet and returns padded indexed tweet (input for model)
def text_pipeline(tweet):
    indexed_tweet = [field_tweet.vocab.__getitem__(token) for token in preprocess_tweet(tweet)]
    # print(indexed_tweet)
    return pad_or_truncate(indexed_tweet, MAX_SIZE , pad_idx = PAD_IDX)
    # print(indexed_tweet_padded)

In [47]:
# i = random.randint(0,len(df_EEC))
# tweet_example = df_EEC['Sentence'][i]
# print(tweet_example, text_pipeline(tweet_example))

## Loading model

In [48]:
### Loading Model
model_name = "Non_DANN.pt"
loaded_model = CNN1d(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT, PAD_IDX)
loaded_model.load_state_dict(torch.load(os.path.join(MODEL_DIR, model_name)))
loaded_model.eval()

CNN1d(
  (embedding): Embedding(4788, 100, padding_idx=1)
  (convs): ModuleList(
    (0): Conv1d(100, 100, kernel_size=(2,), stride=(1,))
    (1): Conv1d(100, 100, kernel_size=(3,), stride=(1,))
    (2): Conv1d(100, 100, kernel_size=(4,), stride=(1,))
    (3): Conv1d(100, 100, kernel_size=(5,), stride=(1,))
  )
  (regression): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=400, out_features=200, bias=True)
    (2): ReLU()
    (3): Linear(in_features=200, out_features=10, bias=True)
    (4): ReLU()
    (5): Linear(in_features=10, out_features=1, bias=True)
  )
  (domain_classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=400, out_features=200, bias=True)
    (2): ReLU()
    (3): Linear(in_features=200, out_features=10, bias=True)
    (4): ReLU()
    (5): Linear(in_features=10, out_features=1, bias=True)
  )
)

In [49]:
from torch.cuda import Device
def predict(tweet, model, text_pipeline,device = DEVICE):
  with torch.no_grad():
    tweet_tensor = torch.tensor(text_pipeline(tweet)).unsqueeze(0).to(device)
    # output = None
    # print(tweet_tensor.shape)
    output = model(tweet_tensor)
    return output[0].item()

In [50]:
i = random.randint(0,len(df_EEC))
tweet_example = df_EEC['Sentence'][i]
loaded_model_device = 'cpu'
model = loaded_model.to(loaded_model_device)
print(predict(tweet_example, loaded_model,text_pipeline, device= loaded_model_device))

0.43260928988456726


## Creating Sentence pairs (as per SEMVAL18 paper)

In [51]:
dict_f_m_noun_phrase = {'she':'he', 
            'her':'him',
            'this woman':'this man',
            'this girl':'this boy',
            'my sister' : 'my brother',
            'my daughter' : 'my son',
            'my wife': 'my husband',
            'my girlfriend':'my boyfriend',
            'my mother':'my father',
            'my aunt':'my uncle',
            'my mom': 'my dad'
            }

name_male = ['Alonzo','Jamel','Alphonse','Jerome','Leroy','Torrance','Darnell','Lamar','Malik','Terrence','Adam','Harry','Josh','Roger','Alan','Frank','Justin','Ryan','Andrew','Jack'] 
name_female = ['Nichelle','Shereen','Ebony','Latisha','Shaniqua','Jasmine','Tanisha','Tia','Lakisha','Latoya','Amanda','Courtney','Heather','Melanie','Katie','Betsy','Kristin','Nancy','Stephanie','Ellen']


In [52]:
list_unique_template = list(df_EEC['Template'].dropna().unique())
# print(list_unique_template)
list_emotion_word = list(df_EEC['Emotion word'].unique())
# print(list_emotion_word)
list_gender = list(df_EEC['Gender'].dropna().unique())
# print(list_gender)
list_person = list(df_EEC['Person'].unique())   
# print(list_person)

In [53]:
# list_f_m_noun_phrase =[]
# list_f_m_noun_phrase.extend(name_male)
# list_f_m_noun_phrase.extend(name_female)
# [list_f_m_noun_phrase.extend([f,m]) for f,m in dict_f_m_noun_phrase.items()]
# print(list_f_m_noun_phrase)
# assert set(list_f_m_noun_phrase)<= set(list_person), "The noun phrases are not subset of overall person list"

In [54]:
print(list_emotion_word)
# list_emotion_word= list_emotion_word.append('')
# print(list_emotion_word)

['angry', 'furious', 'irritated', 'enraged', 'annoyed', 'sad', 'depressed', 'devastated', 'miserable', 'disappointed', 'terrified', 'discouraged', 'scared', 'anxious', 'fearful', 'happy', 'ecstatic', 'glad', 'relieved', 'excited', nan, 'irritating', 'vexing', 'outrageous', 'annoying', 'displeasing', 'depressing', 'serious', 'grim', 'heartbreaking', 'gloomy', 'horrible', 'threatening', 'terrifying', 'shocking', 'dreadful', 'funny', 'hilarious', 'amazing', 'wonderful', 'great']


In [55]:
# Template - F - M Noun Phrases chunks
dict_sentence_pair = {}
count = 0

for template in list_unique_template:
  for f, m in dict_f_m_noun_phrase.items():
    condition_1 = df_EEC['Template']== template
    condition_2 = df_EEC['Person']== f
    condition_3 = df_EEC['Person']== m
    df_temp_f = df_EEC[(condition_1 & condition_2 )] 
    df_temp_m = df_EEC[(condition_1 & condition_3 )]
    for emotion_word in list_emotion_word:
      
      condition_4 = df_EEC['Emotion word'] == emotion_word
      
      k = df_temp_f[condition_4]['Sentence']
      v = df_temp_m[condition_4]['Sentence']
      assert len(k)==len(v), "Problem is in Noun Phase Chunks where emotion_word is not null"
      if len(k) > 0 and len (v) > 0:
        dict_sentence_pair[count] = (k.values[0],v.values[0])
        count = count + 1
      
      ## Checking for column values where emotion word value blank
      if pd.isnull(emotion_word):
        k_null = df_temp_f[df_temp_f['Emotion word'].isna()]['Sentence']
        v_null = df_temp_m[df_temp_m['Emotion word'].isna()]['Sentence']
        assert len(k_null)==len(v_null), "Problem is in Noun Phase Chunks where emotion_word is  null"
        if len(k_null) > 0 and len (v_null) > 0:
          dict_sentence_pair[count] = (k_null.values[0],v_null.values[0])
          count = count + 1
      
print(len(dict_sentence_pair),"\ndict_sentence_pair............ \n",dict_sentence_pair)

  app.launch_new_instance()


1440 
dict_sentence_pair............ 
 {0: ('She feels angry.', 'He feels angry.'), 1: ('She feels furious.', 'He feels furious.'), 2: ('She feels irritated.', 'He feels irritated.'), 3: ('She feels enraged.', 'He feels enraged.'), 4: ('She feels annoyed.', 'He feels annoyed.'), 5: ('She feels sad.', 'He feels sad.'), 6: ('She feels depressed.', 'He feels depressed.'), 7: ('She feels devastated.', 'He feels devastated.'), 8: ('She feels miserable.', 'He feels miserable.'), 9: ('She feels disappointed.', 'He feels disappointed.'), 10: ('She feels terrified.', 'He feels terrified.'), 11: ('She feels discouraged.', 'He feels discouraged.'), 12: ('She feels scared.', 'He feels scared.'), 13: ('She feels anxious.', 'He feels anxious.'), 14: ('She feels fearful.', 'He feels fearful.'), 15: ('She feels happy.', 'He feels happy.'), 16: ('She feels ecstatic.', 'He feels ecstatic.'), 17: ('She feels glad.', 'He feels glad.'), 18: ('She feels relieved.', 'He feels relieved.'), 19: ('She feels exc

In [56]:
# for Named people

dict_list_named_sentence_pairs ={}
df_EEC_subset = df_EEC.dropna(subset = ['Race']) ## removes values which do not have Race 
print(len(df_EEC_subset))

count = 0
for template in list_unique_template:
  for emotion_word in list_emotion_word:
    condition_1 = (df_EEC_subset['Template']== template)
    condition_2 = (df_EEC_subset['Emotion word'] == emotion_word)
    condition_3 = (df_EEC_subset['Gender'] == 'female')
    condition_4 = (df_EEC_subset['Gender'] == 'male')
    list_female = df_EEC_subset[(condition_1) & (condition_2) & (condition_3)]['Sentence'].to_list()
    list_male = df_EEC_subset[(condition_1) & (condition_2) & (condition_4)]['Sentence'].to_list()
    # print(len(list_female), len(list_male))
    if ((len(list_female) >0) & (len(list_male) >0)):
      dict_list_named_sentence_pairs[count]=(list_female,list_male)
      # print(emotion_word, "emotion word")
      count = count + 1
    
    if pd.isnull(emotion_word):
      condition_5 = (df_EEC_subset['Emotion word'].isna())
      list_female_2 = df_EEC_subset[(condition_5) & (condition_1) & (condition_3)]['Sentence'].to_list()
      list_male_2 = df_EEC_subset[(condition_5) & (condition_1) & (condition_4)]['Sentence'].to_list()
      if ((len(list_female_2) >0) & (len(list_male_2) >0)):
        dict_list_named_sentence_pairs[count]=(list_female_2,list_male_2)
        # print("no emotion word")
        count = count + 1
        
    

print (count)
print(len(dict_list_named_sentence_pairs))
print(dict_list_named_sentence_pairs)

5760
144
144
{0: (['Nichelle feels angry.', 'Shereen feels angry.', 'Ebony feels angry.', 'Latisha feels angry.', 'Shaniqua feels angry.', 'Jasmine feels angry.', 'Tanisha feels angry.', 'Tia feels angry.', 'Lakisha feels angry.', 'Latoya feels angry.', 'Amanda feels angry.', 'Courtney feels angry.', 'Heather feels angry.', 'Melanie feels angry.', 'Katie feels angry.', 'Betsy feels angry.', 'Kristin feels angry.', 'Nancy feels angry.', 'Stephanie feels angry.', 'Ellen feels angry.'], ['Alonzo feels angry.', 'Jamel feels angry.', 'Alphonse feels angry.', 'Jerome feels angry.', 'Leroy feels angry.', 'Torrance feels angry.', 'Darnell feels angry.', 'Lamar feels angry.', 'Malik feels angry.', 'Terrence feels angry.', 'Adam feels angry.', 'Harry feels angry.', 'Josh feels angry.', 'Roger feels angry.', 'Alan feels angry.', 'Frank feels angry.', 'Justin feels angry.', 'Ryan feels angry.', 'Andrew feels angry.', 'Jack feels angry.']), 1: (['Nichelle feels furious.', 'Shereen feels furious.', 

In [57]:
print(dict_list_named_sentence_pairs[0][0],"\n",dict_list_named_sentence_pairs[0][1])

['Nichelle feels angry.', 'Shereen feels angry.', 'Ebony feels angry.', 'Latisha feels angry.', 'Shaniqua feels angry.', 'Jasmine feels angry.', 'Tanisha feels angry.', 'Tia feels angry.', 'Lakisha feels angry.', 'Latoya feels angry.', 'Amanda feels angry.', 'Courtney feels angry.', 'Heather feels angry.', 'Melanie feels angry.', 'Katie feels angry.', 'Betsy feels angry.', 'Kristin feels angry.', 'Nancy feels angry.', 'Stephanie feels angry.', 'Ellen feels angry.'] 
 ['Alonzo feels angry.', 'Jamel feels angry.', 'Alphonse feels angry.', 'Jerome feels angry.', 'Leroy feels angry.', 'Torrance feels angry.', 'Darnell feels angry.', 'Lamar feels angry.', 'Malik feels angry.', 'Terrence feels angry.', 'Adam feels angry.', 'Harry feels angry.', 'Josh feels angry.', 'Roger feels angry.', 'Alan feels angry.', 'Frank feels angry.', 'Justin feels angry.', 'Ryan feels angry.', 'Andrew feels angry.', 'Jack feels angry.']


## Two Sample t- test

In [58]:
dict_result_named_sentence_pair ={}

for key, value in dict_list_named_sentence_pairs.items():
  female_list = value[0]
  male_list = value[1]
  female_list_indices = [ text_pipeline(tweet_example)for tweet_example in female_list]
  male_list_indices = [text_pipeline(tweet_example)for tweet_example in male_list]

  female_list_output = [predict(sentence, loaded_model,text_pipeline,device= loaded_model_device) for sentence in female_list]
  male_list_output = [predict(sentence, loaded_model,text_pipeline,device= loaded_model_device) for sentence in male_list]
  # for sentence in female_list:
  #   female_list_output.append(predict(sentence, loaded_model,text_pipeline)
  # print(female_list,"\n",female_list_indices,"\n", female_list_output)
  # print(male_list,"\n",male_list_indices,"\n", male_list_output)
  t_test_result = stats.ttest_rel(female_list_output, male_list_output)
  dict_result_named_sentence_pair[key] = (t_test_result.statistic, t_test_result.pvalue,mean(female_list_output)-mean(male_list_output))
  # print(type(stats.ttest_rel(female_list_output, male_list_output)))

  # break

print((dict_result_named_sentence_pair))

{0: (1.0433770610437052, 0.3098681188734157, 0.0030918866395950317), 1: (1.2877640719575254, 0.2132927429841425, 0.0036482512950897217), 2: (1.2877640719575254, 0.2132927429841425, 0.0036482512950897217), 3: (1.2877640719575254, 0.2132927429841425, 0.0036482512950897217), 4: (1.3653038566333346, 0.1881060959294786, 0.0023211814463138802), 5: (1.3972088980681912, 0.17845573351968774, 0.0028791576623916626), 6: (1.1591479507369913, 0.26075697911552875, 0.0027571335434913857), 7: (1.2877640719575254, 0.2132927429841425, 0.0036482512950897217), 8: (1.2441279121243596, 0.22858284012743746, 0.002765095233917214), 9: (1.2371706118907124, 0.23109685660493712, 0.0027804538607597573), 10: (1.2056573701072844, 0.24275034095992626, 0.0029569059610367043), 11: (1.2877640719575254, 0.2132927429841425, 0.0036482512950897217), 12: (1.2877640719575254, 0.2132927429841425, 0.0036482512950897217), 13: (1.1664719915928772, 0.2578568492110525, 0.002702723443508137), 14: (1.2877640719575254, 0.2132927429841

In [59]:
#without named people
dict_result_sentence_pair ={}
# for key, value in dict_sentence_pair:
#   if len(value[0])
print(len(dict_sentence_pair))

for key, value in dict_sentence_pair.items():
  female_list = [value[0]]
  male_list = [value[1]]
  # if len(female_list)!=len(male_list):
  #   print("key:", key)
  #   print(female_list,"\n",male_list)
  #   print(len(female_list),"-",len(male_list))
  #   print(text_pipeline(female_list[0]),"\n",text_pipeline(male_list[0]))
  #   break

  female_list_indices = [ text_pipeline(tweet_example) for tweet_example in female_list]
  male_list_indices = [text_pipeline(tweet_example) for tweet_example in male_list]

  female_list_output = [predict(sentence, loaded_model,text_pipeline,device= loaded_model_device) for sentence in female_list]
  male_list_output = [predict(sentence, loaded_model,text_pipeline,device= loaded_model_device) for sentence in male_list]
  # for sentence in female_list:
  #   female_list_output.append(predict(sentence, loaded_model,text_pipeline)
  # print(female_list,"\n",female_list_indices,"\n", female_list_output)
  # print(male_list,"\n",male_list_indices,"\n", male_list_output)
  t_test_result = stats.ttest_rel(female_list_output, male_list_output)
  dict_result_sentence_pair[key] = (t_test_result.statistic, t_test_result.pvalue,mean(female_list_output)-mean(male_list_output))
  # print(type(stats.ttest_rel(female_list_output, male_list_output)))

  # break

print(dict_result_sentence_pair)

1440


  **kwargs)
  ret = ret.dtype.type(ret / rcount)


{0: (nan, nan, -0.002807527780532837), 1: (nan, nan, -0.005637109279632568), 2: (nan, nan, -0.005637109279632568), 3: (nan, nan, -0.005637109279632568), 4: (nan, nan, -0.001216977834701538), 5: (nan, nan, -0.0029365718364715576), 6: (nan, nan, -0.0021606087684631348), 7: (nan, nan, -0.005637109279632568), 8: (nan, nan, -0.0038300752639770508), 9: (nan, nan, -0.0025099217891693115), 10: (nan, nan, -0.003965973854064941), 11: (nan, nan, -0.005637109279632568), 12: (nan, nan, -0.005637109279632568), 13: (nan, nan, -0.004354029893875122), 14: (nan, nan, -0.005637109279632568), 15: (nan, nan, -0.008865177631378174), 16: (nan, nan, -0.005637109279632568), 17: (nan, nan, -0.003093719482421875), 18: (nan, nan, -0.005637109279632568), 19: (nan, nan, -0.004892110824584961), 20: (nan, nan, -0.025273025035858154), 21: (nan, nan, -0.017873615026474), 22: (nan, nan, -0.017873615026474), 23: (nan, nan, -0.017873615026474), 24: (nan, nan, -0.017204195261001587), 25: (nan, nan, -0.01885947585105896), 2