In [1]:
import pandas as pd
from deep_translator import GoogleTranslator
from spellchecker import SpellChecker
import numpy as np

# Forced-choice 

In [2]:
# Forced-choice answers (rows are treated as participants further down) and the
# table of per-photo labels that is merged in on 'photoId' later.
df_esp = pd.read_csv('../clean_data_mturk_espanol/forced_choice_emotion_mturk_espanol.csv')
df_labels_esp = pd.read_csv('../data_mturk_espanol/emotion_labels_espanol.csv')

## Manual translation

**The VADER sentiment analyzer only works in English**. I had two options:

- Using a bilingual sentiment analyzer
- Translating to English and then using VADER

Since I couldn't find a "widely used" bilingual sentiment analyzer, and translating was straightforward, I opted for the latter.

In [3]:
def translation_emotions(df):
    """Translate the Spanish forced-choice emotion labels in ``df`` to English.

    Any cell equal to one of the Spanish labels listed below is replaced,
    whichever column it sits in. The frame is edited in place and the very
    same object is returned.
    """
    spanish_to_english = (
        ('Enfado', 'Anger'),
        ('Felicidad', 'Happiness'),
        ('Sorpresa', 'Surprise'),
        ('Tristeza', 'Sadness'),
        ('Asco', 'Disgust'),
        ('Incertidumbre', 'Uncertain'),
        ('Miedo', 'Fear'),
        ('Otra', 'Other'),
    )
    # One in-place pass per label, in the order listed above.
    for spanish, english in spanish_to_english:
        df.replace([spanish], english, inplace=True)

    return df

In [4]:
# translation_emotions edits its argument in place, so hand it a copy:
# the raw frame loaded from disk (df_esp) stays in Spanish.
df = translation_emotions(df_esp.copy(deep=True))
df_labels = df_labels_esp.copy(deep=True)

In [5]:
# Label-based column slice; .loc slices include both end labels.
df_emo_answers_wide = df.loc[:, 'Q2.1':'Q195.1'] # subset photos

## Long format DF

In [6]:
# Number the rows 0..n-1 and expose that number as the first column, 'participantId'.
df_emo_answers_long = (
    df_emo_answers_wide
    .reset_index(drop=True)
    .rename_axis('participantId')
    .reset_index()
)
df_emo_answers_long.head(2)

Unnamed: 0,participantId,Q2.1,Q3.1,Q4.1,Q5.1,Q6.1,Q7.1,Q8.1,Q9.1,Q10.1,...,Q186.1,Q187.1,Q188.1,Q189.1,Q190.1,Q191.1,Q192.1,Q193.1,Q194.1,Q195.1
0,0,Other,Uncertain,Anger,Other,Anger,Anger,Other,Anger,Disgust,...,Surprise,Neutral,Other,Uncertain,Disgust,Neutral,Other,Other,Uncertain,Neutral
1,1,Neutral,Anger,Anger,Uncertain,Anger,Anger,Neutral,Other,Uncertain,...,Surprise,Neutral,Other,Happiness,Uncertain,Neutral,Uncertain,Other,Other,Neutral


In [7]:
# Strip surrounding whitespace from the photo ids, then collect them as a list;
# the list is passed to pd.melt as value_vars.
df_labels['photoId'] =  df_labels['photoId'].str.strip()
photoIds = df_labels['photoId'].tolist()

In [8]:
# Wide -> long: one row per (participantId, photo column).
# The name is reused for the long frame, so this cell cannot be re-run on its own:
# the photo columns it melts are gone after the first run.
df_emo_answers_long = (
    df_emo_answers_long
    .melt(id_vars=['participantId'], value_vars=photoIds)
    .reset_index(drop=True)
    .sort_values(by=['participantId'])
)

In [9]:
# pd.melt names its output columns 'variable' and 'value'; give them meaningful names.
df_emo_answers_long = df_emo_answers_long.rename(columns={'variable':'photoId', 'value':'emotion'})

In [10]:
# Attach the photo metadata to every answer; validate= raises if a photoId
# appears more than once in df_labels.
df_emo_answers_long_m = df_emo_answers_long.merge(
    df_labels,
    how="left",
    on=["photoId"],
    validate="many_to_one",
)

In [11]:
# Preview the merged long-format frame.
df_emo_answers_long_m.head(2)

Unnamed: 0,participantId,photoId,emotion,ethnicity,sex,age,label,url
0,0,Q2.1,Other,bipoc,female,adult,enfado,https://uwmadison.co1.qualtrics.com/ControlPan...
1,0,Q99.1,Neutral,bipoc,female,adult,neutral,https://uwmadison.co1.qualtrics.com/ControlPan...


## Sentiment score

In [12]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [13]:
# Single analyzer instance, reused for the forced-choice and the free-labeling scores.
# NOTE(review): presumably needs the NLTK 'vader_lexicon' resource to be installed — confirm.
sia = SentimentIntensityAnalyzer()

In [14]:
# Compound polarity score of each chosen emotion word, in row order.
df_emo_answers_long_m['sentimentScore'] = [
    sia.polarity_scores(chosen_label)['compound']
    for chosen_label in df_emo_answers_long_m['emotion']
]

In [15]:
# Preview with the new sentimentScore column.
df_emo_answers_long_m.head(2)

Unnamed: 0,participantId,photoId,emotion,ethnicity,sex,age,label,url,sentimentScore
0,0,Q2.1,Other,bipoc,female,adult,enfado,https://uwmadison.co1.qualtrics.com/ControlPan...,0.0
1,0,Q99.1,Neutral,bipoc,female,adult,neutral,https://uwmadison.co1.qualtrics.com/ControlPan...,0.0


## Center predictors

In [16]:
# Code each two-level predictor as -0.5 / +0.5.
# Series.map builds the numeric column directly; Series.replace on a string column
# depends on object-dtype downcasting, which pandas 2.2 deprecates.
# Values that are not in a mapping (missing ones included) come out as NaN.
df_emo_answers_long_m['sexC'] = df_emo_answers_long_m['sex'].map({'female': -0.5, 'male': 0.5})
df_emo_answers_long_m['ageC'] = df_emo_answers_long_m['age'].map({'child': -0.5, 'adult': 0.5})
df_emo_answers_long_m['ethnicityC'] = df_emo_answers_long_m['ethnicity'].map({'bipoc': -0.5, 'white': 0.5})

In [17]:
# Preview with the centered predictor columns.
df_emo_answers_long_m.head(2)

Unnamed: 0,participantId,photoId,emotion,ethnicity,sex,age,label,url,sentimentScore,sexC,ageC,ethnicityC
0,0,Q2.1,Other,bipoc,female,adult,enfado,https://uwmadison.co1.qualtrics.com/ControlPan...,0.0,-0.5,0.5,-0.5
1,0,Q99.1,Neutral,bipoc,female,adult,neutral,https://uwmadison.co1.qualtrics.com/ControlPan...,0.0,-0.5,0.5,-0.5


In [18]:
# Keep every row whose photo label is not 'attention', then renumber the rows.
df_emo_answers_long_m_fil = (
    df_emo_answers_long_m
    .loc[df_emo_answers_long_m['label'] != 'attention']
    .reset_index(drop=True)
)

In [19]:
# Save the filtered long-format forced-choice frame; the row index is not written.
df_emo_answers_long_m_fil.to_csv('../clean_data_mturk_espanol/forced_choice_emotion_mturk_long_format_lmer_espanol.csv', index=False)

In [20]:
# Preview the frame that was just saved.
df_emo_answers_long_m_fil.head()

Unnamed: 0,participantId,photoId,emotion,ethnicity,sex,age,label,url,sentimentScore,sexC,ageC,ethnicityC
0,0,Q2.1,Other,bipoc,female,adult,enfado,https://uwmadison.co1.qualtrics.com/ControlPan...,0.0,-0.5,0.5,-0.5
1,0,Q99.1,Neutral,bipoc,female,adult,neutral,https://uwmadison.co1.qualtrics.com/ControlPan...,0.0,-0.5,0.5,-0.5
2,0,Q98.1,Happiness,white,male,child,felicidad,https://uwmadison.co1.qualtrics.com/ControlPan...,0.5574,0.5,-0.5,0.5
3,0,Q97.1,Neutral,white,male,adult,felicidad,https://uwmadison.co1.qualtrics.com/ControlPan...,0.0,0.5,0.5,0.5
4,0,Q96.1,Happiness,white,female,child,felicidad,https://uwmadison.co1.qualtrics.com/ControlPan...,0.5574,-0.5,-0.5,0.5


# Free-labeling

In [21]:
# Free-labeling answers and the matching label table (it gets a 'photoId' column
# assigned further down).
df_free_esp = pd.read_csv('../clean_data_mturk_espanol/free_choice_emotion_mturk_espanol.csv')
df_labels_free = pd.read_csv('../data_mturk_espanol/emotion_labels_free_choice_mturk_espanol.csv')

In [22]:
# Label-based column slice, both end labels included.
df_emo_answers_free = df_free_esp.loc[:, 'Q2.1_1':'Q195.1_4'] # get cols with words only

## Manual word clean up

In [23]:
# Row-wise clean-up: trim whitespace, lowercase, turn the answer 'na' into 'none',
# and finally mark empty cells with the placeholder 'NAN'.
df_emo_answers_free = (
    df_emo_answers_free
    .transform(lambda row: row.str.strip(), axis=1)
    .transform(lambda row: row.str.lower(), axis=1)
    .transform(lambda row: row.replace({'na': 'none'}), axis=1)
    .fillna('NAN')
)

In [24]:
# Preview the cleaned answers ('NAN' marks cells that were empty).
df_emo_answers_free.head(2)

Unnamed: 0,Q2.1_1,Q2.1_2,Q2.1_3,Q2.1_4,Q3.1_1,Q3.1_2,Q3.1_3,Q3.1_4,Q4.1_1,Q4.1_2,...,Q193.1_3,Q193.1_4,Q194.1_1,Q194.1_2,Q194.1_3,Q194.1_4,Q195.1_1,Q195.1_2,Q195.1_3,Q195.1_4
0,molesta,NAN,NAN,NAN,molesta,NAN,NAN,NAN,molesto,NAN,...,NAN,NAN,molesto,NAN,NAN,NAN,molesto,NAN,NAN,NAN
1,enfado,NAN,NAN,NAN,enfado,NAN,NAN,NAN,enfado,NAN,...,NAN,NAN,NAN,NAN,NAN,NAN,neutral,NAN,NAN,NAN


## English translation

In [25]:
# All answer cells stacked into a single 'emotion' column
df_stack = (
    df_emo_answers_free
    .stack()
    .reset_index(drop=True)
    .to_frame(name='emotion')
)
# Distinct answers; these are the words sent to the translator
emotion_set = list(set(df_stack['emotion']))

In [26]:
## dic for translation: Spanish answer -> English translation
# The translator is built once and reused; it used to be rebuilt for every word.
# NOTE(review): emotion_set also contains the 'NAN' placeholder written by the
# clean-up cell, so that token is sent to the translator as well.
translator = GoogleTranslator(source='spanish', target='english')
emotion_dic = {}
for word in emotion_set:
    emotion_dic[word] = translator.translate(word)
    print(emotion_dic[word])


disgusting
force
naughty
attentiveness
funny
angry
sarcasm
sleep
notice
neutral
anxiety
disgusted
disappointed
concern
attentive
tasty
violent
happiness
mischief
restless
sensual
fascination
irony
betrayed
complicity
relief
satisfied
discomfort
distracted
challenging
disinterest
sensuality
shock
insolent
expectant
tiredness
arguing
terror
irritated
resignation
laughs
euphoric
Surprised
liking
imitation
happiness
shocked
repellency
rogue
coldness
trust
weeping
disease
doubt
cautious
scared
intrigue
terrible
choleric
ugly
naughty
hysteria
malicious
uncomfortable
attention
excited
confrontation
quiet
treason
Print
repulsion
scare
tired
scaredy
shocking
I laughed
pleasure
joke
challenging
fascinated
concentration
performance
desire
depressed
powerful
angry
surprised
stress
disgust
stinky
effusiveness
spoiled
notch
anger
overwhelmed
remembering
discouragement
believed
rude
intrigued
happiness
disgust
irritation
incredulous
claim
spoiled
picardia
nice
astonishment
bold
would be
sulky
surpris

In [27]:
# Swap every Spanish answer for its translation; a cell whose value is not a key
# of emotion_dic is left as it is.
df_emo_answers_free_en = df_emo_answers_free.replace(emotion_dic)

In [28]:
# photoId is assigned by position: row k of the label table receives the name of
# the k-th answer column.
# NOTE(review): this assumes the label CSV is ordered exactly like the answer
# columns — confirm; a different order would mislabel photos without any error.
df_labels_free['photoId'] = df_emo_answers_free.columns.tolist()
df_labels_free.tail(2)

Unnamed: 0,ethnicity,sex,age,label,url,photoId
774,white,male,child,incertidumbre,https://uwmadison.co1.qualtrics.com/ControlPan...,Q195.1_3
775,white,male,child,incertidumbre,https://uwmadison.co1.qualtrics.com/ControlPan...,Q195.1_4


## Long-form DF

In [29]:
# Number the rows 0..n-1 and expose that number as the first column, 'participantId'.
df_emo_answers_free_long = (
    df_emo_answers_free_en
    .reset_index(drop=True)
    .rename_axis('participantId')
    .reset_index()
)
df_emo_answers_free_long.head(2)

Unnamed: 0,participantId,Q2.1_1,Q2.1_2,Q2.1_3,Q2.1_4,Q3.1_1,Q3.1_2,Q3.1_3,Q3.1_4,Q4.1_1,...,Q193.1_3,Q193.1_4,Q194.1_1,Q194.1_2,Q194.1_3,Q194.1_4,Q195.1_1,Q195.1_2,Q195.1_3,Q195.1_4
0,0,annoying,,,,annoying,,,,upset,...,,,upset,,,,upset,,,
1,1,anger,,,,anger,,,,anger,...,,,,,,,neutral,,,


In [30]:
# The photo ids were taken from the answer column names, so no whitespace stripping here.
# This overwrites the photoIds list built in the forced-choice section.
photoIds = df_labels_free['photoId'].tolist()

In [31]:
# Wide -> long: one row per (participantId, answer column).
# The name is reused for the long frame, so this cell cannot be re-run on its own:
# the answer columns it melts are gone after the first run.
df_emo_answers_free_long = (
    df_emo_answers_free_long
    .melt(id_vars=['participantId'], value_vars=photoIds)
    .reset_index(drop=True)
    .sort_values(by=['participantId'])
)

In [32]:
# pd.melt names its output columns 'variable' and 'value'; give them meaningful names.
df_emo_answers_free_long = df_emo_answers_free_long.rename(columns={'variable':'photoId', 'value':'emotion'})

In [33]:
## replace placeholder tokens with np.nan so they can be dropped
## The clean-up cell writes the placeholder as 'NAN', so the comparison ignores case
## and surrounding whitespace instead of depending on how the translator rewrote it.
dic_nan = {'nan': np.nan, 'none': np.nan}

emotion_col = df_emo_answers_free_long['emotion']
is_placeholder = emotion_col.astype(str).str.strip().str.lower().isin(list(dic_nan))
# Cells that are already missing (None / NaN) are kept missing as well.
df_emo_answers_free_long['emotion'] = emotion_col.mask(is_placeholder | emotion_col.isna())

In [34]:
# Preview the long frame; placeholder answers now show as missing.
df_emo_answers_free_long.head()

Unnamed: 0,participantId,photoId,emotion
0,0,Q2.1_1,annoying
15279,0,Q117.1_4,
15312,0,Q118.1_1,afraid
15345,0,Q118.1_2,upset
15378,0,Q118.1_3,


## Check and drop NA

In [35]:
# Missing values per column, followed by the frame's (rows, columns).
df_emo_answers_free_long.isna().sum(), df_emo_answers_free_long.shape

(participantId        0
 photoId              0
 emotion          16159
 dtype: int64,
 (25608, 3))

In [36]:
## drop np.nan: removes every row that has a missing value in any column
df_emo_answers_free_long_drop = df_emo_answers_free_long.dropna()

# Re-check: missing values per column and the new shape.
df_emo_answers_free_long_drop.isna().sum(), df_emo_answers_free_long_drop.shape

(participantId    0
 photoId          0
 emotion          0
 dtype: int64,
 (9449, 3))

In [37]:
# Attach the photo metadata to every answer; validate= raises if a photoId
# appears more than once in df_labels_free.
df_emo_answers_free_long_m = df_emo_answers_free_long_drop.merge(
    df_labels_free,
    how="left",
    on=["photoId"],
    validate="many_to_one",
)

In [38]:
# Preview the merged free-labeling frame.
df_emo_answers_free_long_m.head(2)

Unnamed: 0,participantId,photoId,emotion,ethnicity,sex,age,label,url
0,0,Q2.1_1,annoying,bipoc,female,adult,enfado,https://uwmadison.co1.qualtrics.com/ControlPan...
1,0,Q118.1_1,afraid,bipoc,male,child,sorpresa,https://uwmadison.co1.qualtrics.com/ControlPan...


In [39]:
# Missing values per column after the merge, followed by the frame's (rows, columns).
df_emo_answers_free_long_m.isna().sum(), df_emo_answers_free_long_m.shape

(participantId      0
 photoId            0
 emotion            0
 ethnicity        112
 sex              112
 age              112
 label              0
 url                0
 dtype: int64,
 (9449, 8))

## Manual English correction and removing sentences

In [40]:
## goal: keep single-word answers only, so multi-word answers must be found first

## number of whitespace-separated words in each answer
df_emo_answers_free_long_m['len_words'] = df_emo_answers_free_long_m['emotion'].str.split().map(len)

## distinct answers with two or more words, most frequent first
has_many_words = df_emo_answers_free_long_m['len_words'] >= 2
list_sentences = df_emo_answers_free_long_m[has_many_words].value_counts('emotion').index.tolist()

list_sentences

['go to',
 'in tears',
 'Calm down',
 'I laughed',
 'would be',
 'sassy Girl',
 'I long',
 'calmed down',
 'I disapprove',
 'something sucks',
 'silly faces',
 'poor appetite',
 'nice to meet you',
 'man of steel in disgrace',
 'in love',
 'Bad smell',
 'fed up',
 'bad move',
 'bad gal',
 'bad attitude',
 'Waiting answer',
 'I knew',
 "I didn't expect it from you",
 'He shouted',
 'Clark Kent',
 'bowled over']

In [41]:
## dic for word replacement: every multi-word answer starts out mapped to None

dic_sentences = dict.fromkeys(list_sentences)

dic_sentences

{'go to': None,
 'in tears': None,
 'Calm down': None,
 'I laughed': None,
 'would be': None,
 'sassy Girl': None,
 'I long': None,
 'calmed down': None,
 'I disapprove': None,
 'something sucks': None,
 'silly faces': None,
 'poor appetite': None,
 'nice to meet you': None,
 'man of steel in disgrace': None,
 'in love': None,
 'Bad smell': None,
 'fed up': None,
 'bad move': None,
 'bad gal': None,
 'bad attitude': None,
 'Waiting answer': None,
 'I knew': None,
 "I didn't expect it from you": None,
 'He shouted': None,
 'Clark Kent': None,
 'bowled over': None}

In [42]:
## there is no choice but manual translation/fixing for these ones
## Each multi-word answer gets exactly one entry ('poor appetite' and 'in love'
## used to be assigned twice); np.nan marks answers that the later dropna removes.

dic_sentences.update({
    'go to': 'wrath',  ## Google Translate is splitting 'ira' (wrath) into 'ir a'
    'in tears': 'crying',
    'Calm down': 'calm',
    'calmed down': 'calm',
    'I laughed': 'laughing',
    'would be': 'serious',  ## 'serio = serious' 'seria = would be'
    'sassy Girl': 'sassy',
    'I long': 'longing',
    'I disapprove': 'disapprove',
    'something sucks': 'sucks',
    'silly faces': 'silly',
    'poor appetite': 'satiated',
    'nice to meet you': 'nice',
    'in love': 'love',
    'Bad smell': 'smelly',
    'fed up': 'disgusted',
    'bad gal': 'bad',
    'Waiting answer': 'expectant',
    'bowled over': 'surprised',
    'Clark Kent': np.nan,
    'He shouted': 'shouting',
    'I knew': 'know',
    'bad attitude': 'gloomy',
    'bad move': np.nan,
    'man of steel in disgrace': 'disgraced',
    "I didn't expect it from you": np.nan,
})

In [43]:
## to search for 'keys' based on 'value'
## Prints the first Spanish answer whose translation equals 'bad move';
## list.index raises ValueError when no translation matches.
print(list(emotion_dic.keys())[list(emotion_dic.values()).index('bad move')]) 

mala jugada


In [44]:
# Show the completed replacement table.
dic_sentences

{'go to': 'wrath',
 'in tears': 'crying',
 'Calm down': 'calm',
 'I laughed': 'laughing',
 'would be': 'serious',
 'sassy Girl': 'sassy',
 'I long': 'longing',
 'calmed down': 'calm',
 'I disapprove': 'disapprove',
 'something sucks': 'sucks',
 'silly faces': 'silly',
 'poor appetite': 'satiated',
 'nice to meet you': 'nice',
 'man of steel in disgrace': 'disgraced',
 'in love': 'love',
 'Bad smell': 'smelly',
 'fed up': 'disgusted',
 'bad move': nan,
 'bad gal': 'bad',
 'bad attitude': 'gloomy',
 'Waiting answer': 'expectant',
 'I knew': 'know',
 "I didn't expect it from you": nan,
 'He shouted': 'shouting',
 'Clark Kent': nan,
 'bowled over': 'surprised'}

In [45]:
## replace with new translation
## Only the 'emotion' column holds translated answers, so the fixes are applied to
## that column instead of being matched against every column of the merged frame.

df_emo_answers_free_long_m = df_emo_answers_free_long_m.assign(
    emotion=df_emo_answers_free_long_m['emotion'].replace(dic_sentences)
)

In [46]:
## recheck NAN values: missing values per column, followed by the frame's (rows, columns)
df_emo_answers_free_long_m.isna().sum(), df_emo_answers_free_long_m.shape

(participantId      0
 photoId            0
 emotion            3
 ethnicity        112
 sex              112
 age              112
 label              0
 url                0
 len_words          0
 dtype: int64,
 (9449, 9))

In [47]:
## drop np.nan
## dropna() without a subset removes rows with a missing value in ANY column, so rows
## lacking ethnicity/sex/age are removed here too, not only rows with a missing emotion.
df_emo_answers_free_long_m = df_emo_answers_free_long_m.dropna()

# Re-check: missing values per column and the new shape.
df_emo_answers_free_long_m.isna().sum(), df_emo_answers_free_long_m.shape

(participantId    0
 photoId          0
 emotion          0
 ethnicity        0
 sex              0
 age              0
 label            0
 url              0
 len_words        0
 dtype: int64,
 (9334, 9))

In [48]:
#### recheck sentences

## recompute the number of whitespace-separated words in each answer
df_emo_answers_free_long_m['len_words'] = df_emo_answers_free_long_m['emotion'].str.split().map(len)

## distinct answers that still have two or more words
still_many_words = df_emo_answers_free_long_m['len_words'] >= 2
list_sentences_ = df_emo_answers_free_long_m[still_many_words].value_counts('emotion').index.tolist()

list_sentences_

[]

In [49]:
## remove len_words column (it was only needed to find multi-word answers)
df_emo_answers_free_long_m = df_emo_answers_free_long_m.drop(['len_words'], axis=1)
df_emo_answers_free_long_m.head()

Unnamed: 0,participantId,photoId,emotion,ethnicity,sex,age,label,url
0,0,Q2.1_1,annoying,bipoc,female,adult,enfado,https://uwmadison.co1.qualtrics.com/ControlPan...
1,0,Q118.1_1,afraid,bipoc,male,child,sorpresa,https://uwmadison.co1.qualtrics.com/ControlPan...
2,0,Q118.1_2,upset,bipoc,male,child,sorpresa,https://uwmadison.co1.qualtrics.com/ControlPan...
3,0,Q119.1_1,surprised,white,female,adult,sorpresa,https://uwmadison.co1.qualtrics.com/ControlPan...
4,0,Q120.1_1,neutral,white,female,child,sorpresa,https://uwmadison.co1.qualtrics.com/ControlPan...


## Sentiment score

In [50]:
## compound polarity score of each answer word, in row order
df_emo_answers_free_long_m['sentimentScore'] = [
    sia.polarity_scores(answer_word)['compound']
    for answer_word in df_emo_answers_free_long_m['emotion']
]

In [51]:
# Preview with the new sentimentScore column.
df_emo_answers_free_long_m.head(2)

Unnamed: 0,participantId,photoId,emotion,ethnicity,sex,age,label,url,sentimentScore
0,0,Q2.1_1,annoying,bipoc,female,adult,enfado,https://uwmadison.co1.qualtrics.com/ControlPan...,-0.4019
1,0,Q118.1_1,afraid,bipoc,male,child,sorpresa,https://uwmadison.co1.qualtrics.com/ControlPan...,0.0


## Center predictors

In [52]:
# Code each two-level predictor as -0.5 / +0.5.
# Series.map builds the numeric column directly; Series.replace on a string column
# depends on object-dtype downcasting, which pandas 2.2 deprecates.
# Values that are not in a mapping (missing ones included) come out as NaN.
df_emo_answers_free_long_m['sexC'] = df_emo_answers_free_long_m['sex'].map({'female': -0.5, 'male': 0.5})
df_emo_answers_free_long_m['ageC'] = df_emo_answers_free_long_m['age'].map({'child': -0.5, 'adult': 0.5})
df_emo_answers_free_long_m['ethnicityC'] = df_emo_answers_free_long_m['ethnicity'].map({'bipoc': -0.5, 'white': 0.5})

In [53]:
# Preview with the centered predictor columns.
df_emo_answers_free_long_m.head(2)

Unnamed: 0,participantId,photoId,emotion,ethnicity,sex,age,label,url,sentimentScore,sexC,ageC,ethnicityC
0,0,Q2.1_1,annoying,bipoc,female,adult,enfado,https://uwmadison.co1.qualtrics.com/ControlPan...,-0.4019,-0.5,0.5,-0.5
1,0,Q118.1_1,afraid,bipoc,male,child,sorpresa,https://uwmadison.co1.qualtrics.com/ControlPan...,0.0,0.5,-0.5,-0.5


In [54]:
# Keep every row whose photo label is not 'attention', then renumber the rows.
df_emo_answers_free_long_m_fil = (
    df_emo_answers_free_long_m
    .loc[df_emo_answers_free_long_m['label'] != 'attention']
    .reset_index(drop=True)
)

In [55]:
# Preview the filtered frame before it is saved.
df_emo_answers_free_long_m_fil.head(2)

Unnamed: 0,participantId,photoId,emotion,ethnicity,sex,age,label,url,sentimentScore,sexC,ageC,ethnicityC
0,0,Q2.1_1,annoying,bipoc,female,adult,enfado,https://uwmadison.co1.qualtrics.com/ControlPan...,-0.4019,-0.5,0.5,-0.5
1,0,Q118.1_1,afraid,bipoc,male,child,sorpresa,https://uwmadison.co1.qualtrics.com/ControlPan...,0.0,0.5,-0.5,-0.5


In [56]:
# Save the filtered long-format free-labeling frame; the row index is not written.
df_emo_answers_free_long_m_fil.to_csv('../clean_data_mturk_espanol/free_labeling_emotion_mturk_long_format_lmer_espanol.csv', index=False)