In [1]:
import pandas as pd
from deep_translator import GoogleTranslator
from spellchecker import SpellChecker
import numpy as np

# Forced-choice 

In [2]:
df_esp = pd.read_csv('../clean_data_mturk_espanol/forced_choice_emotion_mturk_espanol.csv')
df_labels_esp = pd.read_csv('../data_mturk_espanol/emotion_labels_espanol.csv')

## Manual translation

**The VADER sentiment analyzer only works in English**. I had two options:

- Using a bilingual sentiment analyzer
- Translating to English and then use VADER 

Since I couldn't find a "widely used" bilingual sentiment analyzer, and translating was straightforward, I opted fot the latter 

In [3]:
def translation_emotions(df):
    df.replace(['Enfado'], 'Anger', inplace=True)
    df.replace(['Felicidad'], 'Happiness', inplace=True)
    df.replace(['Sorpresa'], 'Surprise', inplace=True)
    df.replace(['Tristeza'], 'Sadness', inplace=True)
    df.replace(['Asco'], 'Disgust', inplace=True)
    df.replace(['Incertidumbre'], 'Uncertain', inplace=True)
    df.replace(['Miedo'], 'Fear', inplace=True)
    df.replace(['Otra'], 'Other', inplace=True)
    
    return df

In [4]:
df = translation_emotions(df_esp).copy(deep=True)
df_labels = df_labels_esp.copy(deep=True)

In [5]:
df_emo_answers_wide = df.loc[:, 'Q2.1':'Q195.1'] # subset photos

## Long format DF

In [6]:
df_emo_answers_long = df_emo_answers_wide.reset_index()
df_emo_answers_long['index'] = df_emo_answers_long.index
df_emo_answers_long = df_emo_answers_long.rename(columns={'index':'participantId'})
df_emo_answers_long.head(2)

Unnamed: 0,participantId,Q2.1,Q3.1,Q4.1,Q5.1,Q6.1,Q7.1,Q8.1,Q9.1,Q10.1,...,Q186.1,Q187.1,Q188.1,Q189.1,Q190.1,Q191.1,Q192.1,Q193.1,Q194.1,Q195.1
0,0,Other,Uncertain,Anger,Other,Anger,Anger,Other,Anger,Disgust,...,Surprise,Neutral,Other,Uncertain,Disgust,Neutral,Other,Other,Uncertain,Neutral
1,1,Neutral,Anger,Anger,Uncertain,Anger,Anger,Neutral,Other,Uncertain,...,Surprise,Neutral,Other,Happiness,Uncertain,Neutral,Uncertain,Other,Other,Neutral


In [7]:
df_labels['photoId'] =  df_labels['photoId'].str.strip()
photoIds = df_labels['photoId'].tolist()

In [8]:
df_emo_answers_long = pd.melt(df_emo_answers_long, id_vars=['participantId'], value_vars=photoIds).reset_index(drop=True).sort_values(by=['participantId'])

In [9]:
df_emo_answers_long = df_emo_answers_long.rename(columns={'variable':'photoId', 'value':'emotion'})

In [10]:
df_emo_answers_long_m = pd.merge(df_emo_answers_long, df_labels, how="left", on=["photoId"], validate="many_to_one")

In [11]:
df_emo_answers_long_m.head(2)

Unnamed: 0,participantId,photoId,emotion,ethnicity,sex,age,label,url
0,0,Q2.1,Other,bipoc,female,adult,enfado,https://uwmadison.co1.qualtrics.com/ControlPan...
1,0,Q99.1,Neutral,bipoc,female,adult,neutral,https://uwmadison.co1.qualtrics.com/ControlPan...


## Sentiment score

In [12]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [13]:
sia = SentimentIntensityAnalyzer()

In [14]:
df_emo_answers_long_m['sentimentScore'] = df_emo_answers_long_m['emotion'].apply(lambda x: sia.polarity_scores(x)['compound'])

In [15]:
df_emo_answers_long_m.head(2)

Unnamed: 0,participantId,photoId,emotion,ethnicity,sex,age,label,url,sentimentScore
0,0,Q2.1,Other,bipoc,female,adult,enfado,https://uwmadison.co1.qualtrics.com/ControlPan...,0.0
1,0,Q99.1,Neutral,bipoc,female,adult,neutral,https://uwmadison.co1.qualtrics.com/ControlPan...,0.0


## Center predictors

In [16]:
df_emo_answers_long_m['sexC'] = df_emo_answers_long_m['sex'].replace({'female': -0.5, 'male': 0.5})
df_emo_answers_long_m['ageC'] = df_emo_answers_long_m['age'].replace({'child': -0.5, 'adult': 0.5})
df_emo_answers_long_m['ethnicityC'] = df_emo_answers_long_m['ethnicity'].replace({'bipoc': -0.5, 'white': 0.5})

In [17]:
df_emo_answers_long_m.head(2)

Unnamed: 0,participantId,photoId,emotion,ethnicity,sex,age,label,url,sentimentScore,sexC,ageC,ethnicityC
0,0,Q2.1,Other,bipoc,female,adult,enfado,https://uwmadison.co1.qualtrics.com/ControlPan...,0.0,-0.5,0.5,-0.5
1,0,Q99.1,Neutral,bipoc,female,adult,neutral,https://uwmadison.co1.qualtrics.com/ControlPan...,0.0,-0.5,0.5,-0.5


In [18]:
df_emo_answers_long_m_fil = df_emo_answers_long_m[df_emo_answers_long_m['label'] != 'attention']
df_emo_answers_long_m_fil = df_emo_answers_long_m_fil.reset_index(drop=True)

In [19]:
df_emo_answers_long_m_fil.to_csv('../clean_data_mturk/forced_choice_emotion_mturk_long_format_lmer.csv', index=False)

In [20]:
df_emo_answers_long_m_fil.head()

Unnamed: 0,participantId,photoId,emotion,ethnicity,sex,age,label,url,sentimentScore,sexC,ageC,ethnicityC
0,0,Q2.1,Other,bipoc,female,adult,enfado,https://uwmadison.co1.qualtrics.com/ControlPan...,0.0,-0.5,0.5,-0.5
1,0,Q99.1,Neutral,bipoc,female,adult,neutral,https://uwmadison.co1.qualtrics.com/ControlPan...,0.0,-0.5,0.5,-0.5
2,0,Q98.1,Happiness,white,male,child,felicidad,https://uwmadison.co1.qualtrics.com/ControlPan...,0.5574,0.5,-0.5,0.5
3,0,Q97.1,Neutral,white,male,adult,felicidad,https://uwmadison.co1.qualtrics.com/ControlPan...,0.0,0.5,0.5,0.5
4,0,Q96.1,Happiness,white,female,child,felicidad,https://uwmadison.co1.qualtrics.com/ControlPan...,0.5574,-0.5,-0.5,0.5


# Free-labeling

In [21]:
df_free_esp = pd.read_csv('../clean_data_mturk_espanol/free_choice_emotion_mturk_espanol.csv')
df_labels_free = pd.read_csv('../data_mturk_espanol/emotion_labels_free_choice_mturk_espanol.csv')

In [22]:
df_emo_answers_free = df_free_esp.loc[:, 'Q2.1_1':'Q195.1_4'] # get cols with words only

## Manual word clean up

In [23]:
df_emo_answers_free = df_emo_answers_free.transform(lambda x: x.str.strip(), axis = 1)
df_emo_answers_free = df_emo_answers_free.transform(lambda x: x.str.lower(), axis = 1)
df_emo_answers_free = df_emo_answers_free.transform(lambda x: x.replace({'na':'none'}) , axis = 1)
df_emo_answers_free = df_emo_answers_free.fillna('NAN')

In [24]:
df_emo_answers_free.head(2)

Unnamed: 0,Q2.1_1,Q2.1_2,Q2.1_3,Q2.1_4,Q3.1_1,Q3.1_2,Q3.1_3,Q3.1_4,Q4.1_1,Q4.1_2,...,Q193.1_3,Q193.1_4,Q194.1_1,Q194.1_2,Q194.1_3,Q194.1_4,Q195.1_1,Q195.1_2,Q195.1_3,Q195.1_4
0,molesta,NAN,NAN,NAN,molesta,NAN,NAN,NAN,molesto,NAN,...,NAN,NAN,molesto,NAN,NAN,NAN,molesto,NAN,NAN,NAN
1,enfado,NAN,NAN,NAN,enfado,NAN,NAN,NAN,enfado,NAN,...,NAN,NAN,NAN,NAN,NAN,NAN,neutral,NAN,NAN,NAN


## English translation

In [25]:
df_stack = df_emo_answers_free.stack().reset_index(drop=True) # stack as series
df_stack = df_stack.to_frame(name='emotion') # as DF
emotion_set = list(set(df_stack['emotion'])) # get distinc words

In [None]:
## dic for translation
emotion_dic = {}
for i in emotion_set:
    emotion_dic[i] = GoogleTranslator(source='spanish', target='english').translate(i)

In [None]:
df_emo_answers_free_en = df_emo_answers_free.replace(emotion_dic)

In [None]:
df_labels_free['photoId'] = df_emo_answers_free.columns.tolist()
df_labels_free.tail(2)

## Long-form DF

In [None]:
df_emo_answers_free_long = df_emo_answers_free_en.reset_index()
df_emo_answers_free_long['index'] = df_emo_answers_free_long.index
df_emo_answers_free_long = df_emo_answers_free_long.rename(columns={'index':'participantId'})
df_emo_answers_free_long.head(2)

In [None]:
# df_labels['photoId'] =  df_labels['photoId'].str.strip()
photoIds = df_labels_free['photoId'].tolist()

In [None]:
df_emo_answers_free_long = pd.melt(df_emo_answers_free_long, id_vars=['participantId'], value_vars=photoIds).reset_index(drop=True).sort_values(by=['participantId'])

In [None]:
df_emo_answers_free_long = df_emo_answers_free_long.rename(columns={'variable':'photoId', 'value':'emotion'})

In [None]:
## replace with np.nan to drop it 
dic_nan = {'nan': np.nan, 'none': np.nan}

df_emo_answers_free_long['emotion'] = df_emo_answers_free_long['emotion'].replace(dic_nan)

In [None]:
df_emo_answers_free_long.head()

In [None]:
df_emo_answers_free_long.isna().sum(), df_emo_answers_free_long.shape

In [None]:
## drop np.nan
df_emo_answers_free_long_drop = df_emo_answers_free_long.dropna()

df_emo_answers_free_long_drop.isna().sum(), df_emo_answers_free_long_drop.shape

In [None]:
df_emo_answers_free_long_m = pd.merge(df_emo_answers_free_long_drop, df_labels_free, how="left", on=["photoId"], validate="many_to_one")

In [None]:
df_emo_answers_free_long_m.head(2)

## Sentiment score

In [None]:
df_emo_answers_free_long_m['sentimentScore'] = df_emo_answers_free_long_m['emotion'].apply(lambda x: sia.polarity_scores(x)['compound'])

In [None]:
df_emo_answers_free_long_m.head(2)

## Center predictors

In [None]:
df_emo_answers_free_long_m['sexC'] = df_emo_answers_free_long_m['sex'].replace({'female': -0.5, 'male': 0.5})
df_emo_answers_free_long_m['ageC'] = df_emo_answers_free_long_m['age'].replace({'child': -0.5, 'adult': 0.5})
df_emo_answers_free_long_m['ethnicityC'] = df_emo_answers_free_long_m['ethnicity'].replace({'bipoc': -0.5, 'white': 0.5})

In [None]:
df_emo_answers_free_long_m.head(2)

In [None]:
df_emo_answers_free_long_m_fil = df_emo_answers_free_long_m[df_emo_answers_free_long_m['label'] != 'attention']
df_emo_answers_free_long_m_fil = df_emo_answers_free_long_m_fil.reset_index(drop=True)

In [None]:
df_emo_answers_free_long_m_fil.head(2)

In [None]:
df_emo_answers_free_long_m_fil.to_csv('../clean_data_mturk_espanol/free_labeling_emotion_mturk_long_format_lmer_espanol.csv', index=False)