<a href="https://colab.research.google.com/github/peeyushsinghal/da/blob/main/mitigating_bias_sa_da.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
! pip install ekphrasis # library to pre process twitter data

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ekphrasis
  Downloading ekphrasis-0.5.4-py3-none-any.whl (83 kB)
[K     |████████████████████████████████| 83 kB 884 kB/s 
Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[K     |████████████████████████████████| 53 kB 1.4 MB/s 
Collecting colorama
  Downloading colorama-0.4.5-py2.py3-none-any.whl (16 kB)
Installing collected packages: ftfy, colorama, ekphrasis
Successfully installed colorama-0.4.5 ekphrasis-0.5.4 ftfy-6.1.1


In [4]:
## Import statements
import pandas as pd
import os
import torch
import numpy as np
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from tqdm import tqdm

In [5]:
# Pick the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print("Running on:{}".format(DEVICE))

Running on:cpu


# Data Handling

Mount Google Drive, where the datasets are stored.

In [6]:
# Mount Google Drive at /content/drive; the data paths configured later in
# this notebook are rooted under this mount point.
# NOTE(review): force_remount=True presumably re-mounts even when the drive is
# already mounted -- confirm against the google.colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Data configuration

In [7]:
# Root folder of the project on the mounted Google Drive.
BASE_PATH = '/content/drive/MyDrive/semeval-2018'
# Sub-folder of BASE_PATH that the dataset file paths are built from.
DATA_DIR = os.path.join(BASE_PATH,'datasets')

In [8]:
class TASK1:
    """Lookup tables of dataset file paths for Task 1.

    Every path is built by joining the module-level ``DATA_DIR`` with a
    relative location under ``task1/``. The class is used purely as a
    namespace; it holds no instance state.
    """

    # EI-reg files, keyed by emotion and then by split.
    # Only the 'anger' emotion is registered here.
    EI_reg = {
        'anger': {
            'train': os.path.join(DATA_DIR, 'task1/EI-reg/training/EI-reg-En-anger-train.txt'),
            'dev': os.path.join(DATA_DIR, 'task1/EI-reg/development/2018-EI-reg-En-anger-dev.txt'),
            'gold': os.path.join(DATA_DIR, 'task1/EI-reg/test-gold/2018-EI-reg-En-anger-test-gold.txt'),
        },
    }

    # V-reg files, keyed by split.
    V_reg = {
        'train': os.path.join(DATA_DIR, 'task1/V-reg/2018-Valence-reg-En-train.txt'),
        'dev': os.path.join(DATA_DIR, 'task1/V-reg/2018-Valence-reg-En-dev.txt'),
        'gold': os.path.join(DATA_DIR, 'task1/V-reg/2018-Valence-reg-En-test-gold.txt'),
    }

    # Equity Evaluation Corpus: a single CSV file under the 'eec' key.
    EEC = {
        'eec': os.path.join(DATA_DIR, 'task1/Equity-Evaluation-Corpus/Equity-Evaluation-Corpus.csv'),
    }

## Dataloaders

Parsing regression data : `format [ID	Tweet	Affect Dimension	Intensity Score]`

In [9]:
def parse_reg(data_file, label_format='tuple')-> pd.DataFrame:
    """
    Read a tab-separated EI-reg / V-reg English data file into a DataFrame.

    The first non-blank line of the file supplies the column names
    [ID-Tweet-Affect Dimension-Intensity Score]; every following non-blank
    line becomes one row. All values are kept as strings, exactly as read.

    Args:
        data_file: path of the tab-separated text file to read.
        label_format: unused; kept so existing callers that pass it still work.

    Returns:
        df: dataframe with the columns named in the first row of the file,
            or an empty dataframe when the file holds no non-blank lines.
    """
    # 'utf-8-sig' decodes plain UTF-8 and also drops a leading byte-order
    # mark, which otherwise ends up glued to the first column name.
    with open(data_file, 'r', encoding='utf-8-sig') as fd:
        stripped_lines = (line.strip() for line in fd)
        # Blank lines carry no record; skipping them avoids junk rows.
        rows = [line.split('\t') for line in stripped_lines if line]

    if not rows:
        return pd.DataFrame()

    header, records = rows[0], rows[1:]
    return pd.DataFrame(records, columns=header)

In [10]:
# def parse_reg(data_file, label_format='tuple')-> (list, list):
#     """
#     This is for datasets for the EI-reg and V-reg English tasks 
#     Returns:
#         X: a list of tweets
#         y: a list of (affect dimension, v) tuples corresponding to
#          the regression targets of the tweets
#     """
#     with open(data_file, 'r') as fd:
#         data = [l.strip().split('\t') for l in fd.readlines()][1:]
#     X = [d[1] for d in data]
#     y = [(d[2], float(d[3])) for d in data]
#     if label_format == 'list':
#         y = [l[1] for l in y]
#     return X, y

Parsing EEC data (CSV file) : `format [ID, Sentence, Template, Person, Gender, Race, Emotion, Emotion word]`

In [11]:
def parse_eec() -> pd.DataFrame:
    """
    Load the EEC dataset, which is stored as a csv file.

    The file location is taken from ``TASK1.EEC['eec']``.

    Returns:
        df_eec: dataframe produced by ``pd.read_csv`` for that file
    """
    return pd.read_csv(TASK1.EEC['eec'])


In [12]:
def parse(task, dataset, emotion='anger') -> pd.DataFrame:
    """
    Load one split of a Task 1 regression dataset.

    Args:
        task: 'EI-reg' or 'V-reg'.
        dataset: split key looked up in the TASK1 path tables
            (the tables register 'train', 'dev' and 'gold').
        emotion: emotion key used for 'EI-reg' only; ignored for 'V-reg'.

    Returns:
        The dataframe produced by ``parse_reg`` for the selected file.

    Raises:
        ValueError: if ``task`` is neither 'EI-reg' nor 'V-reg'.
        KeyError: if ``dataset`` (or ``emotion``) is not registered in TASK1.
    """
    if task == 'EI-reg':
        data_file = TASK1.EI_reg[emotion][dataset]
    elif task == 'V-reg':
        data_file = TASK1.V_reg[dataset]
    else:
        # Fail loudly: a silent (None, None) tuple does not match the
        # DataFrame every other path returns and only breaks later, far
        # from the actual mistake.
        raise ValueError(
            "Unknown task {!r}; expected 'EI-reg' or 'V-reg'".format(task))
    return parse_reg(data_file)

In [18]:
## Checking if the parsing is working 
# parse('EI-reg','train').head()
# parse('V-reg','gold').head()
# parse_eec().head()

Unnamed: 0,﻿ID,Tweet,Affect Dimension,Intensity Score
0,2018-En-01964,Gm and have a #Tuesday!,valence,0.589
1,2018-En-01539,@realDonaldTrump But you have a lot of time fo...,valence,0.5
2,2018-En-04235,I graduated yesterday and already had 8 family...,valence,0.55
3,2018-En-03711,@jaimitoelcrack7 Seriously...I've been sitting...,valence,0.633
4,2018-En-01177,Whether my glass is half empty or its half ful...,valence,0.75


## Preprocess Twitter Data

In [14]:
def tokenize(text, lowercase=True):
    """
    Split a string into whitespace-separated tokens.

    Args:
        text: the string to tokenize.
        lowercase: when true, the text is lower-cased before splitting.

    Return:
        list of tokens obtained with ``str.split()``
    """
    normalized = text.lower() if lowercase else text
    return normalized.split()

In [21]:
def twitter_preprocess():
    """
    Build a dataset pre-processing function backed by an ekphrasis
    ``TextPreProcessor``.

    The processor is configured once, when this factory is called; the
    returned function reuses it for every dataset it is given.

    NOTE(review): the segmenter/corrector corpus here is "twitter_2018",
    while the module-level ``text_processor`` in this notebook uses
    "twitter" -- confirm that the "twitter_2018" statistics are available
    in this environment.

    Returns:
        preprocess: function ``(name, dataset) -> list`` that runs every
            item of ``dataset`` through the processor.
    """
    pipeline = TextPreProcessor(
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
                   'time',
                   'date', 'number'],
        annotate={"hashtag", "elongated", "allcaps", "repeated", 'emphasis',
                  'censored'},
        all_caps_tag="wrap",
        fix_text=True,
        segmenter="twitter_2018",
        corrector="twitter_2018",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=False,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons]
    )
    process_one = pipeline.pre_process_doc

    def preprocess(name, dataset):
        """Pre-process each item of ``dataset``; ``name`` labels the progress bar."""
        desc = "PreProcessing dataset {}...".format(name)

        processed = []
        for item in tqdm(dataset, desc=desc):
            processed.append(process_one(item))
        return processed

    return preprocess

In [15]:
# def twitter_preprocess(name, dataset):
#   """
#   reference : https://github.com/FengJiaChunFromSYSU/ntua-slp-semeval2018
#   https://github.com/cbaziotis/ekphrasis
#   """
#   preprocessor = TextPreProcessor(
#     normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
#                 'time',
#                 'date', 'number'],
#     annotate={"hashtag", "elongated", "allcaps", "repeated", 'emphasis',
#                 'censored'},
#     all_caps_tag="wrap",
#     fix_text=True,
#     segmenter="twitter_2018",
#     corrector="twitter_2018",
#     unpack_hashtags=True,
#     unpack_contractions=True,
#     spell_correct_elong=False,
#     tokenizer=SocialTokenizer(lowercase=True).tokenize,
#     dicts=[emoticons]
#     ).pre_process_doc
    
#   desc = "PreProcessing dataset {}...".format(name)

#   data = [preprocessor(x) for x in tqdm(dataset, desc=desc)]

#   return data

In [30]:

# reference : https://github.com/cbaziotis/ekphrasis

# Module-level ekphrasis pipeline used by the demo cell that follows.
text_processor = TextPreProcessor(
    # terms that will be normalized (each term listed once; the earlier
    # list repeated 'url')
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",

    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",

    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words

    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

Reading twitter - 1grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_1grams.txt
Reading twitter - 2grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_2grams.txt
Reading twitter - 1grams ...


In [35]:
# Sample tweets used to eyeball what the pre-processing pipeline produces.
sentences = [
    "CANT WAIT for the new season of #TwinPeaks ＼(^o^)／!!! #davidlynch #tvseries :)))",
    "I saw the new #johndoe movie and it suuuuucks!!! WAISTED $10... #badmovies :/",
    "@SentimentSymp:  can't wait for the Nov 9 #Sentiment talks!  YAAAAAAY !!! :-D http://sentimentsymposium.com/.",
    "@MGBarbieri @SpalkTalk a@b.com And just saw your LinkedIn comment after I sent this! Thanks for the message :) 😀",
    "💙💛🏆 @GeorgeePitman Young Player of The Season 🏆💛💙 #irony #actuallyseventy"
]

# Print the token list produced for each sample, one list per line.
for sentence in sentences:
    tokens = text_processor.pre_process_doc(sentence)
    print(tokens)

['<allcaps>', 'cant', 'wait', '</allcaps>', 'for', 'the', 'new', 'season', 'of', '<hashtag>', 'twin', 'peaks', '</hashtag>', '＼(^o^)／', '!', '<repeated>', '<hashtag>', 'david', 'lynch', '</hashtag>', '<hashtag>', 'tv', 'series', '</hashtag>', '<happy>']
['i', 'saw', 'the', 'new', '<hashtag>', 'john', 'doe', '</hashtag>', 'movie', 'and', 'it', 'sucks', '<elongated>', '!', '<repeated>', '<allcaps>', 'waisted', '</allcaps>', '<money>', '.', '<repeated>', '<hashtag>', 'bad', 'movies', '</hashtag>', '<annoyed>']
['<user>', ':', 'can', 'not', 'wait', 'for', 'the', '<date>', '<hashtag>', 'sentiment', '</hashtag>', 'talks', '!', '<allcaps>', 'yay', '<elongated>', '</allcaps>', '!', '<repeated>', '<laugh>', '<url>']
['<user>', '<user>', '<email>', 'and', 'just', 'saw', 'your', 'linkedin', 'comment', 'after', 'i', 'sent', 'this', '!', 'thanks', 'for', 'the', 'message', '<happy>', '😀']
['💙', '💛', '🏆', '<user>', 'young', 'player', 'of', 'the', 'season', '🏆', '💛', '💙', '<hashtag>', 'irony', '</hash