<a href="https://colab.research.google.com/github/peeyushsinghal/da/blob/main/mitigating_bias_sa_da.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [42]:
! pip install ekphrasis # library to pre process twitter data
! pip install emoji --upgrade #library to deal with emoji data

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [43]:
## Import statements
import pandas as pd
import os
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import emoji
from tqdm import tqdm
import random

In [44]:
# checking device
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Running on:{}".format(DEVICE))

Running on:cpu


# Data Handling

Mounting google drive for data in there

In [45]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


##Data configuration

In [46]:
BASE_PATH = '/content/drive/MyDrive/semeval-2018'
DATA_DIR = os.path.join(BASE_PATH,'datasets')

In [47]:
class TASK1(object):
  
    EI_reg = {
        'anger': {
            'train': os.path.join(
                DATA_DIR, 'task1/EI-reg/training/EI-reg-En-anger-train.txt'),
            'dev': os.path.join(
                DATA_DIR, 'task1/EI-reg/development/2018-EI-reg-En-anger-dev.txt'),
            'gold': os.path.join(
                DATA_DIR, 'task1/EI-reg/test-gold/2018-EI-reg-En-anger-test-gold.txt')
                }
        }

    V_reg = {
        'train': os.path.join(
            DATA_DIR, 'task1/V-reg/2018-Valence-reg-En-train.txt'),
        'dev': os.path.join(
            DATA_DIR, 'task1/V-reg/2018-Valence-reg-En-dev.txt'),
        'gold': os.path.join(
            DATA_DIR, 'task1/V-reg/2018-Valence-reg-En-test-gold.txt')
             }

    EEC = {
        'eec': os.path.join(
            DATA_DIR, 'task1/Equity-Evaluation-Corpus/Equity-Evaluation-Corpus.csv')
             }

##Dataloaders

Parsing regression data : `format [ID	Tweet	Affect Dimension	Intensity Score]`

In [48]:
def parse_reg(data_file, label_format='tuple')-> pd.DataFrame:
    """
    This is for datasets for the EI-reg and V-reg English tasks 
    Returns:
        df: dataframe with columns in the first row of file [ID-Tweet-Affect Dimension-Intensity Score]
    """
    with open(data_file, 'r') as fd:
      data = [l.strip().split('\t') for l in fd.readlines()]
    
    df = pd.DataFrame (data[1:],columns=data[0])
    return df

In [49]:
# def parse_reg(data_file, label_format='tuple')-> (list, list):
#     """
#     This is for datasets for the EI-reg and V-reg English tasks 
#     Returns:
#         X: a list of tweets
#         y: a list of (affect dimension, v) tuples corresponding to
#          the regression targets of the tweets
#     """
#     with open(data_file, 'r') as fd:
#         data = [l.strip().split('\t') for l in fd.readlines()][1:]
#     X = [d[1] for d in data]
#     y = [(d[2], float(d[3])) for d in data]
#     if label_format == 'list':
#         y = [l[1] for l in y]
#     return X, y

parsing EEC data : `format [ID	Sentence	Template	Person	Gender	Race Emotion	Emotion word]`

In [50]:
def parse_eec()->pd.DataFrame:
  """
  This is for EEC Dataset, it is a csv file
  Returns:
        df_eec: dataframe 
  """
  data_train = TASK1.EEC['eec']
  df_eec = pd.read_csv(data_train)
  return df_eec


In [51]:
def parse(task, dataset, emotion='anger') -> pd.DataFrame:
    if task == 'EI-reg':
        data_train = TASK1.EI_reg[emotion][dataset]
        df = parse_reg(data_train)
        df[df.columns[-1]] = df[df.columns[-1]].astype(float)
        return df
    elif task == 'V-reg':
        data_train = TASK1.V_reg[dataset]
        df = parse_reg(data_train)
        df[df.columns[-1]] = df[df.columns[-1]].astype(float)
        return df
    else:
        return None, None

In [52]:
## Creating Dataframes
df_EI_reg_train = parse('EI-reg','train')
df_EI_reg_val = parse('EI-reg','dev')
df_EI_reg_test = parse('EI-reg','gold')
df_V_reg_train = parse('V-reg','train')
df_V_reg_val = parse('V-reg','dev')
df_V_reg_test = parse('V-reg','gold')

dict_df= {'df_EI_reg_train':df_EI_reg_train, 
          'df_EI_reg_val':df_EI_reg_val, 
          'df_EI_reg_test':df_EI_reg_test, 
          'df_V_reg_train': df_V_reg_train, 
          'df_V_reg_val':df_V_reg_val, 
          'df_V_reg_test': df_V_reg_test 
          }

##PreProcess Twitter Data

In [53]:
# reference : https://github.com/cbaziotis/ekphrasis


text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [54]:
# #### Example checks of pre-processing
# sentences = [
#     "CANT WAIT for the new season of #TwinPeaks ＼(^o^)／!!! #davidlynch #tvseries :)))",
#     "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies :/",
#     "@SentimentSymp:  can't wait for the Nov 9 #Sentiment talks!  YAAAAAAY !!! :-D http://sentimentsymposium.com/.",
#     "@MGBarbieri @SpalkTalk a@b.com And just saw your LinkedIn comment after I sent this! Thanks for the message :) 😀",
#     "💙💛🏆 @GeorgeePitman Young Player of The Season 🏆💛💙 #irony #actuallyseventy"
# ]

# for s in sentences:
#     print(" ".join(text_processor.pre_process_doc(s)))
# # print ([text_processor.pre_process_doc(s) for s in sentences])

In [55]:
def preprocess_tweets(df)-> pd.DataFrame:
  tweets = df.Tweet.to_list()
  # df['TweetTokens'] = [emoji.demojize(text_processor.pre_process_doc(tweet),language = 'en') for tweet in tweets] # Translates emoji in to word and preprocesss
  # df['TweetTokens'] = [text_processor.pre_process_doc(tweet) for tweet in tweets] # preprocesss
  # tweets_processed = [text_processor.pre_process_doc(tweet) for tweet in tweets] # preprocesss
  # for tweet in tweets_processed:
  #   for index, token in enumerate(tweet):
  #     if emoji.is_emoji(token):
  #       tweet[index] = emoji.demojize(token, language = 'en')

  tweets_processed = [" ".join(text_processor.pre_process_doc(tweet)) for tweet in tweets] # preprocesss
  # print (tweets_processed)
  for index, tweet in enumerate(tweets_processed):
      tweets_processed[index] = emoji.demojize(tweet, language = 'en')
  
  df['TweetTokens'] = tweets_processed
  # print(df)
  return df

In [56]:
# df_EI_reg_train = preprocess_tweets(df_EI_reg_train)
# df_V_reg_train = preprocess_tweets(df_V_reg_train)

for name, df in dict_df.items():
  df = preprocess_tweets(df)


In [57]:
# df_V_reg_train

TODO : 
* remove stop words
* stem
* lemmetize


In [58]:
df_V_reg_train.columns

Index(['ID', 'Tweet', 'Affect Dimension', 'Intensity Score', 'TweetTokens'], dtype='object')

In [59]:
def subset_df(df):
  return df[['TweetTokens','Intensity Score']]

In [60]:
dict_df_subset ={name+"_subset": subset_df(df) for name, df in dict_df.items() }

In [61]:
print (dict_df_subset)

{'df_EI_reg_train_subset':                                             TweetTokens  Intensity Score
0     <user> <user> shut up hashtags are cool <hasht...            0.562
1     it makes me so fucking irate jesus . nobody is...            0.750
2     lol adam the bull with his fake outrage . <rep...            0.417
3     <user> passed away early this morning in a fas...            0.354
4     <user> lol wow i was gonna say really ? ! <rep...            0.438
...                                                 ...              ...
1696  got a <money> tip from a drunk uber passenger ...            0.708
1697  <user> <user> <user> <user> fucker blocked me ...            0.625
1698                                <user> i look rabid            0.472
1699  <user> i am not surprised , i would be fuming ...            0.479
1700  <user> the pout tips me over the edge . i am m...            0.490

[1701 rows x 2 columns], 'df_EI_reg_val_subset':                                            Twee

## Creating Pytorch Datasets

### Creating Vocabulary
Before we create the [link text](https://)Dataset, we need to define a process to build our vocabulary. For this,
We’ll create a “Vocabulary” class which will create the word-to-index and index-to-word mappings using only the train dataframe we created before
Also, the “Vocabulary” class returns the numericalized version of each sentence in our dataframe. Eg: [‘i’, ‘love’, ‘apple’] -> [23, 54, 1220]. We need to convert the words to numbers as models expect each word in our vocabulary to be represented by a number

In [62]:
#######################################################
#               Define Vocabulary Class
#######################################################

class Vocabulary:
  
    '''
    __init__ method is called by default as soon as an object of this class is initiated
    we use this method to initiate our vocab dictionaries
    '''
    def __init__(self, freq_threshold = 3, max_size = 10000):
        '''
        freq_threshold : the minimum times a word must occur in corpus to be treated in vocab
        max_size : max source vocab size. Eg. if set to 10,000, we pick the top 10,000 most frequent words and discard others
        '''
        #initiate the index to token dict
        ## <PAD> -> padding, used for padding the shorter sentences in a batch to match the length of longest sentence in the batch
        ## <UNK> -> words which are not found in the vocab are replace by this token
        # self.itos = {0: '<PAD>', 1: '<UNK>', 2:'<NUMBER>', 3: '<CURRENCY>', 4: '<URL>'}
        self.itos = {0: '<PAD>', 1: '<UNK>'}
        
        
        #initiate the token to index dict
        self.stoi = {k:j for j,k in self.itos.items()}
        self.original_stoi = self.stoi.copy()
#         print(self.stoi)
        
        self.freq_threshold = freq_threshold
        self.max_size = max_size
    
    '''
    __len__ is used by dataloader later to create batches
    '''
    def __len__(self):
        return len(self.itos)
    
    '''
    a simple tokenizer to split on space and converts the sentence to list of words
    '''
    @staticmethod
    def tokenizer(text):
#         return [tok.strip() for tok in text.split(' ')]
        return [tok.lower().strip() for tok in text.split(' ')] # this is commented out to avoid <NUMBER> ,<UNK> lowering
#         return [tok.lower().strip() for tok in text.split(' ') if tok not in list(self.stoi.keys())] 
    
    '''
    build the vocab: create a dictionary mapping of index to string (itos) and string to index (stoi)
    output ex. for stoi -> {'the':6, 'a':7, 'an':8}
    '''
    def build_vocabulary(self, sentence_list):
        #calculate the frequencies of each word first to remove the words with freq < freq_threshold
#         frequencies = {}  #init the freq dict
        frequencies = {k:self.max_size+1 for _,k in self.itos.items()}  # updated so that intial ones are also part of this
        
        # idx = 5 #index from which we want our dict to start. We already used 4 indexes for pad, unk...
        idx = len(self.original_stoi)
        
        #calculate freq of words
        for sentence in sentence_list:
            list_word = [tok.lower().strip() for tok in sentence.split(' ') if tok not in list(self.stoi.keys())] 
            for word in list_word:
#             for word in self.tokenizer(sentence):
                
                if word not in frequencies.keys():
                    frequencies[word]=1
                else:
                    
                    frequencies[word]+=1
                    
#         print ("----2-----\n",frequencies)
        
        #limit vocab by removing low freq words
        frequencies = {k:v for k,v in frequencies.items() if v>self.freq_threshold} 
        
#         print ("----3-----\n",frequencies)
        
        #limit vocab to the max_size specified
        frequencies = dict(sorted(frequencies.items(), key = lambda x: -x[1])[:self.max_size-idx]) # idx =5 for pad, unk...
        
#         print ("----4-----\n",frequencies)
            
        #create vocab
        for key in set(self.stoi.keys()):
            frequencies.pop(key)
        
#         print ("----5-----\n",frequencies)
        
        for word in frequencies.keys():
            self.stoi[word] = idx
            self.itos[idx] = word
            idx+=1
        
#         print ("----6-----\n",self.stoi)
        
    '''
    convert the list of words to a list of corresponding indexes
    '''    
    def numericalize(self, text):
        #tokenize text
#         tokenized_text = self.tokenizer(text)
#         print("---------\n",self.original_stoi.keys())
        tokenized_text = []
        for tok in text.split(' '):
            if tok not in list(self.original_stoi.keys()):
                tokenized_text.append(tok.lower().strip())
            else:
                tokenized_text.append(tok.strip())
                
#         tokenized_text = [tok.lower().strip() for tok in text.split(' ') if tok not in list(self.original_stoi.keys())]
        numericalized_text = []
        for token in tokenized_text:
            if token in self.stoi.keys():
                numericalized_text.append(self.stoi[token])
            else: #out-of-vocab words are represented by UNK token index
                numericalized_text.append(self.stoi['<UNK>'])
                
        return numericalized_text

In [63]:
# # #create a vocab class with freq_threshold=0 and max_size=100
# voc = Vocabulary(0, 100)
# sentence_list = ['that is a cat CAT', 'that is not a dog']
# #build vocab
# voc.build_vocabulary(sentence_list)

# print('index to string: ',voc.itos)
# print('string to index:',voc.stoi)

# print('numericalize -> cat and a dog <URL>: ', voc.numericalize('cat and a dog <NUMBER>'))

### Build Train_Dataset
We first inherit PyTorch's Dataset class.
Then, we initialize and build the vocabs for subject in our train data frame.
Then, we use the getitem() method to numericalize the subject 1 example at a time for the data loader (a function to load data in batches).

In [64]:
#######################################################
#               Define Train_Dataset class
#######################################################

class Train_Dataset(Dataset):
    '''
    Initiating Variables
    df: the training dataframe
    subject : the name of target text column in the dataframe
    transform : If we want to add any augmentation
    freq_threshold : the minimum times a word must occur in corpus to be treated in vocab
    vocab_max_size : max  vocab size
    '''
    
    def __init__(self, df, subject, label_col, max_sentence_length = 150, transform=None, freq_threshold = 5,
                vocab_max_size = 50000):
    
        self.df = df
        self.transform = transform
        
        #get body and label
        self.subject_texts = self.df[subject]
        self.labels = self.df[label_col]
        
        
        ##VOCAB class has been created above
        #Initialize vocab object and build vocabulary
        self.vocab = Vocabulary(freq_threshold, vocab_max_size)
        self.vocab.build_vocabulary(self.subject_texts.tolist())
        self.max_sentence_length = max_sentence_length
        
    def __len__(self):
        return len(self.df)
    
    '''
    __getitem__ runs on 1 example at a time. Here, we get an example at index and return its numericalize source and
    target values using the vocabulary objects we created in __init__
    '''
    def __getitem__(self, index):
        subject_text = self.subject_texts[index]
        label = self.labels[index]
#         print(subject_text)
        
        if self.transform is not None:
            subject_text = self.transform(subject_text)
            
        #numericalize texts ['<SOS>','cat', 'in', 'a', 'bag','<EOS>'] -> [1,12,2,9,24,2]
        numerialized_subject =[]
        numerialized_subject += self.vocab.numericalize(subject_text)
        
        while len(numerialized_subject) < self.max_sentence_length:
            numerialized_subject.append(0)
        
        #convert the list to tensor and return
        return torch.tensor(numerialized_subject[:self.max_sentence_length]),torch.tensor(label)
#         return torch.tensor(numerialized_subject[:self.train_dataset.max_sentence_length]),label

In [65]:
df_train = dict_df_subset['df_EI_reg_train_subset']
df_train

Unnamed: 0,TweetTokens,Intensity Score
0,<user> <user> shut up hashtags are cool <hasht...,0.562
1,it makes me so fucking irate jesus . nobody is...,0.750
2,lol adam the bull with his fake outrage . <rep...,0.417
3,<user> passed away early this morning in a fas...,0.354
4,<user> lol wow i was gonna say really ? ! <rep...,0.438
...,...,...
1696,got a <money> tip from a drunk uber passenger ...,0.708
1697,<user> <user> <user> <user> fucker blocked me ...,0.625
1698,<user> i look rabid,0.472
1699,"<user> i am not surprised , i would be fuming ...",0.479


In [66]:
for name,df in dict_df_subset.items():
  if "train" in name:
    dataset_name = name+"_dataset"
    vars()[dataset_name] = Train_Dataset(df,'TweetTokens','Intensity Score', max_sentence_length =200) # dynamically assigning datasetname

In [73]:
i = random.randint(0,len(df_EI_reg_train_subset_dataset))
print(dict_df_subset['df_EI_reg_train_subset'].loc[i][['TweetTokens','Intensity Score']])
print((df_EI_reg_train_subset_dataset[i][0]))
print((df_EI_reg_train_subset_dataset[i][1]))

TweetTokens        <user> * she could sense the anger stirring wi...
Intensity Score                                                0.375
Name: 1185, dtype: object
tensor([  5, 267,  80, 253, 659,   7,  60,   1, 465, 130,  11, 444,  74,   1,
         21,   1,   4, 267,  62, 325,  62,  65, 514,   1,  10,  65,  50, 337,
         39,   1,   4,  34,  34,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0

In [68]:
train_dataset_obj = Train_Dataset(df_train,'TweetTokens','Intensity Score', max_sentence_length =200)

In [69]:
i = random.randint(0,len(train_dataset_obj))
print(df_train.loc[i][['TweetTokens','Intensity Score']])
print((train_dataset_obj[i][1]))
print(len(train_dataset_obj[i][0]))
print(train_dataset_obj[i][0])

TweetTokens        <user> <user> <user> my snap is andriaprebles ...
Intensity Score                                                0.271
Name: 300, dtype: object
tensor(0.2710, dtype=torch.float64)
200
tensor([ 5,  5,  5, 20, 82, 15,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  

In [70]:
# print(df_EI_reg_train_subset_dataset)

In [71]:
# i = random.randint(0,len(df_EI_reg_train_subset_dataset))
# # print(train.loc[i][['body','label']])
# print(type(df_EI_reg_train_subset_dataset[i][1]))
# len(df_EI_reg_train_subset_dataset[i][0])

### Build Validation Dataset

In [None]:
#######################################################
#               Define Dataset Class
#######################################################

class Validation_Dataset(Dataset):
    def __init__(self, train_dataset, df, subject, label_col, transform = None):
        self.df = df
        self.transform = transform
        
        #train dataset will be used as lookup for vocab
        self.train_dataset = train_dataset
        
        #get body and label
        self.subject_texts = self.df[subject]
        self.labels = self.df[label_col]
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self,index):
        
        subject_text = self.subject_texts[index]
        label = self.labels[index]
#         print(subject_text)
        
        if self.transform is not None:
            subject_text = self.transform(subject_text)
            
            

        #numericalize texts ['cat', 'in', 'a', 'bag'] -> [12,2,9,24]
        numerialized_subject = []
        numerialized_subject += self.train_dataset.vocab.numericalize(subject_text)
#         print("max sentence length", self.train_dataset.max_sentence_length)
        while len(numerialized_subject) < self.train_dataset.max_sentence_length:
            numerialized_subject.append(0)
            

        
        #convert the list to tensor and return
#         return torch.tensor(numerialized_subject),label

#         #convert the list to tensor and return
        return torch.tensor(numerialized_subject[:self.train_dataset.max_sentence_length]),torch.tensor(label)

