<a href="https://colab.research.google.com/github/peeyushsinghal/da/blob/main/mitigating_bias_sa_da.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
! pip install ekphrasis # library to pre process twitter data
! pip install emoji --upgrade #library to deal with emoji data

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-2.1.0.tar.gz (216 kB)
[K     |████████████████████████████████| 216 kB 5.4 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.1.0-py3-none-any.whl size=212392 sha256=19788328f3ea4287b1a563d0870d2a74935d6224058bf92ea6e1e97cefcc57c2
  Stored in directory: /root/.cache/pip/wheels/77/75/99/51c2a119f4cfd3af7b49cc57e4f737bed7e40b348a85d82804
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-2.1.0


In [2]:
## Import statements
import pandas as pd
import os
import torch
import numpy as np
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
import emoji
from tqdm import tqdm

In [3]:
# Pick the compute device: use CUDA when torch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print("Running on:{}".format(DEVICE))

Running on:cpu


# Data Handling

Mount Google Drive, where the dataset files are stored.

In [4]:
# Colab-only step: mount Google Drive at /content/drive so that the dataset
# paths configured below (under /content/drive/MyDrive) can be read.
from google.colab import drive
# NOTE(review): force_remount=True presumably re-mounts even when the drive is
# already mounted — confirm against the Colab documentation.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Data configuration

In [10]:
# Root folder of the SemEval-2018 material on the mounted Google Drive.
BASE_PATH = '/content/drive/MyDrive/semeval-2018'
# Every dataset path used in this notebook is resolved against this folder.
DATA_DIR = os.path.join(BASE_PATH, 'datasets')

In [11]:
class TASK1:
    """Registry of the data-file paths used by this notebook.

    Every path is built by joining the module-level DATA_DIR with a path
    relative to it. The class only holds constants; it is never instantiated
    in the code shown here.
    """

    # EI-reg files: emotion -> split ('train' / 'dev' / 'gold') -> file path.
    EI_reg = {
        'anger': {
            'train': os.path.join(DATA_DIR, 'task1/EI-reg/training/EI-reg-En-anger-train.txt'),
            'dev': os.path.join(DATA_DIR, 'task1/EI-reg/development/2018-EI-reg-En-anger-dev.txt'),
            'gold': os.path.join(DATA_DIR, 'task1/EI-reg/test-gold/2018-EI-reg-En-anger-test-gold.txt'),
        },
    }

    # V-reg files: split ('train' / 'dev' / 'gold') -> file path.
    V_reg = {
        'train': os.path.join(DATA_DIR, 'task1/V-reg/2018-Valence-reg-En-train.txt'),
        'dev': os.path.join(DATA_DIR, 'task1/V-reg/2018-Valence-reg-En-dev.txt'),
        'gold': os.path.join(DATA_DIR, 'task1/V-reg/2018-Valence-reg-En-test-gold.txt'),
    }

    # Equity Evaluation Corpus: a single CSV file stored under the key 'eec'.
    EEC = {
        'eec': os.path.join(DATA_DIR, 'task1/Equity-Evaluation-Corpus/Equity-Evaluation-Corpus.csv'),
    }

## Dataloaders

Parsing regression data : `format [ID	Tweet	Affect Dimension	Intensity Score]`

In [12]:
def parse_reg(data_file, label_format='tuple')-> pd.DataFrame:
    """
    Load a tab-separated EI-reg / V-reg English data file into a dataframe.

    Args:
        data_file: path of the file to read. Its first non-blank line is the
            header row [ID-Tweet-Affect Dimension-Intensity Score].
        label_format: not used by this implementation; kept only so that
            existing calls that pass it keep working.
    Returns:
        df: dataframe whose column names come from the header row. Every cell
            holds the raw string read from the file (no numeric conversion).
    Raises:
        ValueError: if the file contains no non-blank line (no header row).
    """
    # Decode explicitly instead of relying on the platform default encoding.
    # NOTE(review): assumes the data files are UTF-8 encoded — confirm.
    with open(data_file, 'r', encoding='utf-8') as fd:
        # Blank lines (e.g. trailing newlines at the end of the file) are
        # skipped so they do not turn into rows of empty values.
        rows = [line.strip().split('\t') for line in fd if line.strip()]

    if not rows:
        raise ValueError('no header row found in {!r}'.format(data_file))

    header, records = rows[0], rows[1:]
    return pd.DataFrame(records, columns=header)

In [13]:
# def parse_reg(data_file, label_format='tuple')-> (list, list):
#     """
#     This is for datasets for the EI-reg and V-reg English tasks 
#     Returns:
#         X: a list of tweets
#         y: a list of (affect dimension, v) tuples corresponding to
#          the regression targets of the tweets
#     """
#     with open(data_file, 'r') as fd:
#         data = [l.strip().split('\t') for l in fd.readlines()][1:]
#     X = [d[1] for d in data]
#     y = [(d[2], float(d[3])) for d in data]
#     if label_format == 'list':
#         y = [l[1] for l in y]
#     return X, y

Parsing EEC data : `format [ID	Sentence	Template	Person	Gender	Race	Emotion	Emotion word]`

In [14]:
def parse_eec()->pd.DataFrame:
    """
    Load the EEC dataset, which is stored as a csv file.

    The file location is taken from TASK1.EEC['eec'].

    Returns:
        df_eec: dataframe produced by pd.read_csv for that file
    """
    eec_csv_path = TASK1.EEC['eec']
    return pd.read_csv(eec_csv_path)


In [15]:
def parse(task, dataset, emotion='anger') -> pd.DataFrame:
    """
    Load one split of a regression task as a dataframe.

    Args:
        task: 'EI-reg' or 'V-reg'.
        dataset: split key looked up in the TASK1 registry
            ('train', 'dev' or 'gold').
        emotion: emotion key, used for 'EI-reg' only (ignored for 'V-reg').
    Returns:
        The dataframe produced by parse_reg for the selected file.
    Raises:
        ValueError: if `task` is not one of the supported tasks. (Previously a
            `(None, None)` tuple was returned here, which is truthy and does
            not match the declared dataframe return type.)
        KeyError: if `emotion` or `dataset` is not present in the registry.
    """
    if task == 'EI-reg':
        data_file = TASK1.EI_reg[emotion][dataset]
    elif task == 'V-reg':
        data_file = TASK1.V_reg[dataset]
    else:
        raise ValueError(
            "unsupported task {!r}; expected 'EI-reg' or 'V-reg'".format(task))
    return parse_reg(data_file)

In [71]:
## Creating Dataframes
# Load the three splits of each regression task, in the order train, dev, gold.
df_EI_reg_train, df_EI_reg_val, df_EI_reg_test = [
    parse('EI-reg', split) for split in ('train', 'dev', 'gold')
]
df_V_reg_train, df_V_reg_val, df_V_reg_test = [
    parse('V-reg', split) for split in ('train', 'dev', 'gold')
]

# Lookup table from variable name to dataframe, so later cells can apply the
# same step to every split in one loop.
dict_df = {
    'df_EI_reg_train': df_EI_reg_train,
    'df_EI_reg_val': df_EI_reg_val,
    'df_EI_reg_test': df_EI_reg_test,
    'df_V_reg_train': df_V_reg_train,
    'df_V_reg_val': df_V_reg_val,
    'df_V_reg_test': df_V_reg_test,
}

## Preprocess Twitter Data

In [64]:
# reference : https://github.com/cbaziotis/ekphrasis

# Shared tweet pre-processor used by the cells below.
text_processor = TextPreProcessor(
    # terms that will be normalized
    # ('url' was listed twice in this list; the duplicate entry is removed)
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",

    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",

    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words

    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [84]:
# Example checks of pre-processing: run a few hand-picked tweets (hashtags,
# all-caps, elongated words, emoticons, mentions, URLs, e-mail addresses and
# emoji) through text_processor and print the resulting token lists.
sentences = [
    "CANT WAIT for the new season of #TwinPeaks ＼(^o^)／!!! #davidlynch #tvseries :)))",
    "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies :/",
    "@SentimentSymp:  can't wait for the Nov 9 #Sentiment talks!  YAAAAAAY !!! :-D http://sentimentsymposium.com/.",
    "@MGBarbieri @SpalkTalk a@b.com And just saw your LinkedIn comment after I sent this! Thanks for the message :) 😀",
    "💙💛🏆 @GeorgeePitman Young Player of The Season 🏆💛💙 #irony #actuallyseventy"
]

# One token list is printed per input sentence.
for s in sentences:
    print(text_processor.pre_process_doc(s))

['<allcaps>', 'cant', 'wait', '</allcaps>', 'for', 'the', 'new', 'season', 'of', '<hashtag>', 'twin', 'peaks', '</hashtag>', '＼(^o^)／', '!', '<repeated>', '<hashtag>', 'david', 'lynch', '</hashtag>', '<hashtag>', 'tv', 'series', '</hashtag>', '<happy>']
['i', 'saw', 'the', 'new', '<hashtag>', 'john', 'doe', '</hashtag>', 'movie', 'and', 'it', 'sucks', '<elongated>', '!', '<repeated>', '<allcaps>', 'waisted', '</allcaps>', '<money>', '.', '<repeated>', '<hashtag>', 'bad', 'movies', '</hashtag>', '<annoyed>']
['<user>', ':', 'can', 'not', 'wait', 'for', 'the', '<date>', '<hashtag>', 'sentiment', '</hashtag>', 'talks', '!', '<allcaps>', 'yay', '<elongated>', '</allcaps>', '!', '<repeated>', '<laugh>', '<url>']
['<user>', '<user>', '<email>', 'and', 'just', 'saw', 'your', 'linkedin', 'comment', 'after', 'i', 'sent', 'this', '!', 'thanks', 'for', 'the', 'message', '<happy>', '😀']
['💙', '💛', '🏆', '<user>', 'young', 'player', 'of', 'the', 'season', '🏆', '💛', '💙', '<hashtag>', 'irony', '</hash

In [69]:
def preprocess_tweets(df)-> pd.DataFrame:
    """
    Tokenise the 'Tweet' column of `df` and store the result in 'TweetTokens'.

    Each tweet is passed through text_processor.pre_process_doc. Every
    resulting token for which emoji.is_emoji is true is then replaced by
    emoji.demojize(token, language='en'); all other tokens are kept as they are.

    Args:
        df: dataframe with a 'Tweet' column.
    Returns:
        The same dataframe object that was passed in: the 'TweetTokens' column
        is added to (or overwritten on) `df` itself, no copy is made.
    """
    token_lists = []
    for raw_tweet in df['Tweet'].to_list():
        tokens = text_processor.pre_process_doc(raw_tweet)
        token_lists.append([
            emoji.demojize(tok, language = 'en') if emoji.is_emoji(tok) else tok
            for tok in tokens
        ])

    df['TweetTokens'] = token_lists
    return df

In [72]:
# Tokenise every split. The returned frame is stored back under its key:
# rebinding the loop variable (`df = ...`) would leave dict_df untouched and
# only worked as long as preprocess_tweets mutated its argument in place.
# Replacing the value of an existing key while iterating is safe because the
# dictionary does not change size.
for name, df in dict_df.items():
  dict_df[name] = preprocess_tweets(df)


TODO : 
* remove stop words
* stem
* lemmatize


In [50]:
# Sanity check: after preprocessing, the V-reg training frame should list a
# 'TweetTokens' column next to the columns read from the file.
df_V_reg_train.columns

Index(['ID', 'Tweet', 'Affect Dimension', 'Intensity Score', 'TweetTokens'], dtype='object')

In [75]:
def subset_df(df):
  """
  Reduce `df` to the two columns 'TweetTokens' and 'Intensity Score'.

  Args:
      df: dataframe containing at least these two columns.
  Returns:
      A dataframe with exactly these two columns, in this order.
  """
  wanted_columns = ['TweetTokens', 'Intensity Score']
  return df[wanted_columns]

In [60]:
# subset_df_list = [subset_df(df) for df in df_list]

In [76]:
# Reduced copy of every split: each key of dict_df gets the suffix "_subset"
# and maps to the output of subset_df for that frame.
dict_df_subset = dict(
    (name + "_subset", subset_df(frame)) for name, frame in dict_df.items()
)

In [80]:
# print (dict_df_subset)

## Creating Pytorch Datasets

### Creating Vocabulary
Before we create the Dataset, we need to define a process to build our vocabulary.
For this, we'll create a `Vocabulary` class that builds the word-to-index and index-to-word mappings using only the train dataframe we created before.
The `Vocabulary` class also returns the numericalized version of each sentence in our dataframe, e.g. ['i', 'love', 'apple'] -> [23, 54, 1220]. We need to convert words to numbers because models expect each word in the vocabulary to be represented by a number.

In [89]:
#######################################################
#               Define Vocabulary Class
#######################################################

class Vocabulary:
    '''
    Word <-> index vocabulary built from sentences that are split on single spaces.

    Two reserved tokens are always present:
      <PAD> (index 0) -> padding, used for padding the shorter sentences in a
                         batch to match the length of the longest sentence
      <UNK> (index 1) -> replaces words which are not found in the vocab
    '''

    def __init__(self, freq_threshold = 3, max_size = 10000):
        '''
        Initiate the vocab dictionaries with the reserved tokens only.

        freq_threshold : a word is kept only if it occurs strictly more than
                         this many times in the corpus (0 keeps every word)
        max_size : upper bound on the total vocab size, reserved tokens
                   included. Eg. if set to 10,000 the vocab holds the reserved
                   tokens plus the most frequent words up to 10,000 entries
        '''
        # index to token dict
        self.itos = {0: '<PAD>', 1: '<UNK>'}

        # token to index dict
        self.stoi = {token: index for index, token in self.itos.items()}
        # snapshot of the reserved tokens; never modified afterwards
        self.original_stoi = self.stoi.copy()

        self.freq_threshold = freq_threshold
        self.max_size = max_size

    def __len__(self):
        '''
        Number of entries in the vocab, reserved tokens included.
        '''
        return len(self.itos)

    @staticmethod
    def tokenizer(text):
        '''
        A simple tokenizer: split on single spaces, then lower-case and strip
        every piece. Reserved tokens get lower-cased as well, which is why
        build_vocabulary and numericalize do their own splitting instead.
        '''
        return [tok.lower().strip() for tok in text.split(' ')]

    def build_vocabulary(self, sentence_list):
        '''
        Build the vocab: create a dictionary mapping of index to string (itos)
        and string to index (stoi) from the given sentences.
        output ex. for stoi -> {'the':6, 'a':7, 'an':8}

        Words are lower-cased and stripped before counting. Words are indexed
        by decreasing frequency; ties keep the order of first appearance.
        Calling this method again rebuilds the vocab from scratch.
        '''
        # start from the reserved tokens only, so that a second call does not
        # mix new indexes with entries left over from a previous build
        self.stoi = self.original_stoi.copy()
        self.itos = {index: token for token, index in self.stoi.items()}

        # first free index, right after the reserved tokens
        idx = len(self.original_stoi)

        # calculate freq of words; reserved tokens are never counted
        frequencies = {}
        for sentence in sentence_list:
            for tok in sentence.split(' '):
                if tok in self.original_stoi:
                    continue
                word = tok.lower().strip()
                if word in self.original_stoi:
                    # would otherwise overwrite the index of a reserved token
                    continue
                frequencies[word] = frequencies.get(word, 0) + 1

        # limit vocab by removing low freq words
        kept = [(word, count) for word, count in frequencies.items()
                if count > self.freq_threshold]

        # most frequent first; the sort is stable, so ties keep insertion order
        kept.sort(key = lambda item: -item[1])

        # limit vocab to the max_size specified; the reserved tokens already
        # use `idx` slots, and a negative slice bound must not be produced
        capacity = max(0, self.max_size - idx)

        # create vocab
        for word, _ in kept[:capacity]:
            self.stoi[word] = idx
            self.itos[idx] = word
            idx += 1

    def numericalize(self, text):
        '''
        Convert the text to a list of corresponding indexes.

        The text is split on single spaces. Reserved tokens are looked up as
        they are, every other token is lower-cased and stripped first.
        Out-of-vocab words are represented by the <UNK> token index.
        '''
        unk_index = self.stoi['<UNK>']
        numericalized_text = []
        for tok in text.split(' '):
            if tok in self.original_stoi:
                token = tok.strip()
            else:
                token = tok.lower().strip()
            numericalized_text.append(self.stoi.get(token, unk_index))

        return numericalized_text

In [90]:
# Small demo of the Vocabulary class: freq_threshold=0 keeps every word and
# max_size=100 is far above the number of distinct words used here.
voc = Vocabulary(0, 100)
sentence_list = ['that is a cat CAT', 'that is not a dog']
#build vocab
voc.build_vocabulary(sentence_list)

print('index to string: ',voc.itos)
print('string to index:',voc.stoi)

# The label now names the text that is actually numericalized (it used to say
# <URL> while the input contained <NUMBER>).
print('numericalize -> cat and a dog <NUMBER>: ', voc.numericalize('cat and a dog <NUMBER>'))

index to string:  {0: '<PAD>', 1: '<UNK>', 2: 'that', 3: 'is', 4: 'a', 5: 'cat', 6: 'not', 7: 'dog'}
string to index: {'<PAD>': 0, '<UNK>': 1, 'that': 2, 'is': 3, 'a': 4, 'cat': 5, 'not': 6, 'dog': 7}
numericalize -> cat and a dog <URL>:  [5, 1, 4, 7, 1]
