In [1]:
import pandas as pd

# Forced-choice 

In [2]:
# Load the forced-choice responses and the photo label table.
# Both paths are relative to the notebook's working directory.
df = pd.read_csv('../clean_data_mturk/forced_choice_emotion_mturk.csv')
df_labels = pd.read_csv('../data_mturk/emotion_labels.csv')

In [3]:
# Keep the participant demographics aside; they are attached to the
# participant-indexed answer frame further down.
sex = df['sex']
ethnicity = df['ethnicity']

In [4]:
# Label-based column slice: .loc includes both endpoints, so this keeps every
# column from 'Q2.1' through 'Q195.1' in the frame's column order.
df_emo_answers_wide = df.loc[:, 'Q2.1':'Q195.1'] # subset photos

## Long format DF

In [5]:
# Move the old index into an 'index' column, overwrite that column with the
# fresh 0..n-1 row positions, and expose it as the participant identifier.
df_emo_answers_long = df_emo_answers_wide.reset_index()
df_emo_answers_long = (
    df_emo_answers_long
    .assign(index=df_emo_answers_long.index)
    .rename(columns={'index': 'participantId'})
)
df_emo_answers_long.head(2)

Unnamed: 0,participantId,Q2.1,Q3.1,Q4.1,Q5.1,Q6.1,Q7.1,Q8.1,Q9.1,Q10.1,...,Q186.1,Q187.1,Q188.1,Q189.1,Q190.1,Q191.1,Q192.1,Q193.1,Q194.1,Q195.1
0,0,Anger,Anger,Anger,Sadness,Anger,Anger,Neutral,Anger,Other,...,Surprise,Fear,Happiness,Happiness,Fear,Uncertain,Uncertain,Anger,Uncertain,Uncertain
1,1,Anger,Anger,Anger,Fear,Anger,Anger,Anger,Anger,Disgust,...,Surprise,Fear,Happiness,Happiness,Other,Other,Anger,Anger,Other,Other


In [6]:
# Attach the participant demographics; the Series are aligned on the row index.
df_emo_answers_long = df_emo_answers_long.assign(
    sex_participant=sex,
    ethnicity_participant=ethnicity,
)

In [7]:
# Trim surrounding whitespace from the photo IDs, then collect them as a plain
# list (used as the melt value columns below).
df_labels = df_labels.assign(photoId=df_labels['photoId'].str.strip())
photoIds = df_labels['photoId'].to_list()

In [8]:
# Reshape wide -> long: one row per participant/photo answer, ordered by participant.
df_emo_answers_long_m = (
    df_emo_answers_long
    .melt(
        id_vars=['participantId', 'sex_participant', 'ethnicity_participant'],
        value_vars=photoIds,
    )
    .reset_index(drop=True)
    .sort_values(by=['participantId'])
)

In [9]:
# df_emo_answers_long_m = pd.melt(df_emo_answers_long_m, id_vars=['participantId'], value_vars=photoIds).reset_index(drop=True).sort_values(by=['participantId'])

In [10]:
# melt names its output columns 'variable' and 'value' by default; rename them
# to what they hold here: the photo ID and the chosen emotion.
df_emo_answers_long_m = df_emo_answers_long_m.rename(columns={'variable':'photoId', 'value':'emotion'})

In [11]:
# Attach the photo metadata to every answer. validate="many_to_one" makes
# pandas raise if a photoId appears more than once in df_labels.
df_emo_answers_long_m = df_emo_answers_long_m.merge(
    df_labels,
    how="left",
    on=["photoId"],
    validate="many_to_one",
)

In [12]:
df_emo_answers_long_m.head(2)

Unnamed: 0,participantId,sex_participant,ethnicity_participant,photoId,emotion,ethnicity,sex,age,label,url
0,0,Male,White or Caucasian,Q2.1,Anger,bipoc,female,adult,anger,https://uwmadison.co1.qualtrics.com/ControlPan...
1,0,Male,White or Caucasian,Q126.1,Neutral,bipoc,male,child,uncertain,https://uwmadison.co1.qualtrics.com/ControlPan...


## Sentiment score

In [13]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [14]:
# Single analyzer instance, reused for both the forced-choice and the
# free-labeling answers.
# NOTE(review): presumably requires the NLTK 'vader_lexicon' resource to be
# installed beforehand — confirm the environment setup.
sia = SentimentIntensityAnalyzer()

In [15]:
# Score every answer and keep only the 'compound' entry of the polarity scores.
df_emo_answers_long_m['sentimentScore'] = df_emo_answers_long_m['emotion'].map(
    lambda label: sia.polarity_scores(label)['compound']
)

In [16]:
df_emo_answers_long_m.head(2)

Unnamed: 0,participantId,sex_participant,ethnicity_participant,photoId,emotion,ethnicity,sex,age,label,url,sentimentScore
0,0,Male,White or Caucasian,Q2.1,Anger,bipoc,female,adult,anger,https://uwmadison.co1.qualtrics.com/ControlPan...,-0.5719
1,0,Male,White or Caucasian,Q126.1,Neutral,bipoc,male,child,uncertain,https://uwmadison.co1.qualtrics.com/ControlPan...,0.0


## Center predictors

In [17]:
# Recode the two-level photo attributes as -0.5 / +0.5 in new '<name>C' columns.
contrast_codes = {
    'sex': {'female': -0.5, 'male': 0.5},
    'age': {'child': -0.5, 'adult': 0.5},
    'ethnicity': {'bipoc': -0.5, 'white': 0.5},
}
for attribute, coding in contrast_codes.items():
    df_emo_answers_long_m[attribute + 'C'] = df_emo_answers_long_m[attribute].replace(coding)


In [18]:
df_emo_answers_long_m.head(2)

Unnamed: 0,participantId,sex_participant,ethnicity_participant,photoId,emotion,ethnicity,sex,age,label,url,sentimentScore,sexC,ageC,ethnicityC
0,0,Male,White or Caucasian,Q2.1,Anger,bipoc,female,adult,anger,https://uwmadison.co1.qualtrics.com/ControlPan...,-0.5719,-0.5,0.5,-0.5
1,0,Male,White or Caucasian,Q126.1,Neutral,bipoc,male,child,uncertain,https://uwmadison.co1.qualtrics.com/ControlPan...,0.0,0.5,-0.5,-0.5


In [19]:
# Keep every row whose label is not 'attention', then renumber the rows.
not_attention = df_emo_answers_long_m['label'].ne('attention')
df_emo_answers_long_m_fil = df_emo_answers_long_m.loc[not_attention].reset_index(drop=True)

In [20]:
# Write the filtered long-format forced-choice table to disk, without the row index.
df_emo_answers_long_m_fil.to_csv('../clean_data_mturk/forced_choice_emotion_mturk_long_format_lmer.csv', index=False)

In [21]:
df_emo_answers_long_m_fil.head()

Unnamed: 0,participantId,sex_participant,ethnicity_participant,photoId,emotion,ethnicity,sex,age,label,url,sentimentScore,sexC,ageC,ethnicityC
0,0,Male,White or Caucasian,Q2.1,Anger,bipoc,female,adult,anger,https://uwmadison.co1.qualtrics.com/ControlPan...,-0.5719,-0.5,0.5,-0.5
1,0,Male,White or Caucasian,Q126.1,Neutral,bipoc,male,child,uncertain,https://uwmadison.co1.qualtrics.com/ControlPan...,0.0,0.5,-0.5,-0.5
2,0,Male,White or Caucasian,Q127.1,Surprise,white,female,adult,uncertain,https://uwmadison.co1.qualtrics.com/ControlPan...,0.2732,-0.5,0.5,0.5
3,0,Male,White or Caucasian,Q128.1,Fear,white,female,child,uncertain,https://uwmadison.co1.qualtrics.com/ControlPan...,-0.4939,-0.5,-0.5,0.5
4,0,Male,White or Caucasian,Q129.1,Other,white,male,adult,uncertain,https://uwmadison.co1.qualtrics.com/ControlPan...,0.0,0.5,0.5,0.5


# Free-labeling

In [22]:
# Load the free-labeling responses and the matching photo label table.
# Both paths are relative to the notebook's working directory.
df_free = pd.read_csv('../clean_data_mturk/free_choice_emotion_mturk.csv')
df_labels_free = pd.read_csv('../data_mturk/emotion_labels_free_choice.csv')

In [23]:
# Keep the participant demographics aside; they are attached to the answer
# columns further down.
sex_f = df_free['sex']
ethnicity_f = df_free['ethnicity']

In [24]:
def manual_spell_check(df):
    """Collapse hand-picked misspellings and word variants onto one canonical label.

    Matching is on exact cell values (case-sensitive) and is applied to every
    column of ``df``. The frame is modified in place and the same object is
    returned.
    """
    corrections = (
        (['happy', 'happu', 'hapy', 'happyy', 'happyb', 'happt', 'haapy', 'hapoy'], 'happiness'),
        (['angry', 'angr'], 'anger'),
        (['sad', 'sadd', 'sadness'], 'sadness'),
        (['disgusted', 'disgust', 'disgusting', 'disguetsed', 'disugested'], 'disgust'),
        (['surprise', 'surprised', 'surpsied', 'surpise', 'surprising', 'surprisef'], 'surprise'),
        (['fear', 'fearful', 'fearfulness'], 'fear'),
        (["frusturated"], "frustrated"),
    )
    # Apply the groups one after another, in the order listed.
    for variants, canonical in corrections:
        df.replace(variants, canonical, inplace=True)

    return df

# Normalise known misspellings in the raw answers. manual_spell_check edits
# df_free in place and returns the same frame.
df_free = manual_spell_check(df_free)

In [25]:
# Label-based slice (both endpoints included) over the free-text answer columns.
# .copy() makes this an independent frame, so adding the participant columns to
# it later does not write into a slice of df_free (SettingWithCopyWarning).
df_emo_answers_free = df_free.loc[:, 'Q2.1_1':'Q195.1_4'].copy() # get cols with words only

In [26]:
df_emo_answers_free.head(2)

Unnamed: 0,Q2.1_1,Q2.1_2,Q2.1_3,Q2.1_4,Q3.1_1,Q3.1_2,Q3.1_3,Q3.1_4,Q4.1_1,Q4.1_2,...,Q193.1_3,Q193.1_4,Q194.1_1,Q194.1_2,Q194.1_3,Q194.1_4,Q195.1_1,Q195.1_2,Q195.1_3,Q195.1_4
0,boredom,,,,anger,,,,anger,annoyance,...,,,disgust,boredom,exasperation,,embarrassment,,,
1,mad,,,,mad,shouting,,,anger,yelling,...,,,playing,,,,,,,


In [27]:
# Photo IDs are assigned by position: row i of df_labels_free receives the
# name of the i-th answer column.
# NOTE(review): this assumes the label file is ordered exactly like the answer
# columns — confirm against the source files.
# Run this before the participant columns are added to df_emo_answers_free;
# afterwards the column list is longer than the label table.
df_labels_free['photoId'] = df_emo_answers_free.columns.tolist()
df_labels_free.tail(2)

Unnamed: 0,ethnicity,sex,age,label,url,photoId
774,white,male,child,uncertain,https://uwmadison.co1.qualtrics.com/ControlPan...,Q195.1_3
775,white,male,child,uncertain,https://uwmadison.co1.qualtrics.com/ControlPan...,Q195.1_4


In [28]:
# Attach the participant demographics; the Series are aligned on the row index.
df_emo_answers_free = df_emo_answers_free.assign(
    sex_participant=sex_f,
    ethnicity_participant=ethnicity_f,
)

## Long-form DF

In [29]:
# Move the old index into an 'index' column, overwrite that column with the
# fresh 0..n-1 row positions, and expose it as the participant identifier.
df_emo_answers_free_long = df_emo_answers_free.reset_index()
df_emo_answers_free_long = (
    df_emo_answers_free_long
    .assign(index=df_emo_answers_free_long.index)
    .rename(columns={'index': 'participantId'})
)
df_emo_answers_free_long.head(2)

Unnamed: 0,participantId,Q2.1_1,Q2.1_2,Q2.1_3,Q2.1_4,Q3.1_1,Q3.1_2,Q3.1_3,Q3.1_4,Q4.1_1,...,Q194.1_1,Q194.1_2,Q194.1_3,Q194.1_4,Q195.1_1,Q195.1_2,Q195.1_3,Q195.1_4,sex_participant,ethnicity_participant
0,0,boredom,,,,anger,,,,anger,...,disgust,boredom,exasperation,,embarrassment,,,,Male,White or Caucasian
1,1,mad,,,,mad,shouting,,,anger,...,playing,,,,,,,,Male,White or Caucasian


In [30]:
# df_labels['photoId'] =  df_labels['photoId'].str.strip()
# Rebinds `photoIds` (previously the forced-choice IDs) to the free-labeling
# answer column names stored in df_labels_free.
photoIds = df_labels_free['photoId'].tolist()

In [31]:
# Reshape wide -> long: one row per participant/answer-slot, ordered by participant.
df_emo_answers_free_long = (
    df_emo_answers_free_long
    .melt(
        id_vars=['participantId', 'sex_participant', 'ethnicity_participant'],
        value_vars=photoIds,
    )
    .reset_index(drop=True)
    .sort_values(by=['participantId'])
)

In [32]:
# df_emo_answers_free_long = pd.melt(df_emo_answers_free_long, id_vars=['participantId'], value_vars=photoIds).reset_index(drop=True).sort_values(by=['participantId'])

In [33]:
# melt names its output columns 'variable' and 'value' by default; rename them
# to what they hold here: the answer-slot ID and the typed emotion word.
df_emo_answers_free_long = df_emo_answers_free_long.rename(columns={'variable':'photoId', 'value':'emotion'})

In [34]:
# Clean up the free-text answers by removing:
# 1. phrases (answers with more than one word)
# 2. words with numbers
# 3. words with non-alphabetic symbols
# 4. words shorter than three characters
# 5. words that appear just once

In [35]:
# Drop rows with any missing value (empty answer slots included). The .copy()
# calls below keep every column assignment on an independent frame instead of
# on a filtered slice, which avoids SettingWithCopyWarning.
df_emo_answers_free_long = df_emo_answers_free_long.dropna().copy()
df_emo_answers_free_long['emotion_txt'] = df_emo_answers_free_long['emotion'].str.lower() # as lower case
df_emo_answers_free_long['emotion_txt'] = df_emo_answers_free_long['emotion_txt'].replace({'na':'none'})
df_emo_answers_free_long['emotion_txt'] = df_emo_answers_free_long['emotion_txt'].astype(str) # as str
df_emo_answers_free_long['len_words'] = df_emo_answers_free_long['emotion_txt'].str.split().apply(len) # count words per answer
df_emo_answers_free_long = df_emo_answers_free_long[df_emo_answers_free_long['len_words'] <= 1].copy() # keep single word answers
df_emo_answers_free_long['len_letters'] = df_emo_answers_free_long['emotion_txt'].apply(len) # count number of letters
df_emo_answers_free_long = df_emo_answers_free_long[df_emo_answers_free_long['len_letters'] >= 3].copy() # keep words with 3 or more letters
# regex=True must be explicit: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would silently leave every
# answer unchanged.
df_emo_answers_free_long['emotion_txt'] = df_emo_answers_free_long['emotion_txt'].str.replace('[^a-zA-Z]', '', regex=True) # remove non-alphabetic characters
# NOTE(review): digits were already stripped by the line above, so this filter
# cannot match anything. If answers containing numbers should be dropped rather
# than stripped, it has to run before the replacement — confirm the intent.
df_emo_answers_free_long = df_emo_answers_free_long[~df_emo_answers_free_long['emotion_txt'].str.contains(r'[0-9]')] # drop words containing numbers
df_emo_answers_free_long = df_emo_answers_free_long[df_emo_answers_free_long.groupby('emotion_txt').emotion_txt.transform(len) > 1] # keep words that occur > 1

  df_emo_answers_free_long['emotion_txt'] = df_emo_answers_free_long['emotion_txt'].str.replace('[^a-zA-Z]', '') # remove non-alphabetic characters


In [36]:
df_emo_answers_free_long.isna().sum(), df_emo_answers_free_long.shape

(participantId            0
 sex_participant          0
 ethnicity_participant    0
 photoId                  0
 emotion                  0
 emotion_txt              0
 len_words                0
 len_letters              0
 dtype: int64,
 (14586, 8))

In [37]:
# Rows with missing values were already dropped at the start of the cleaning
# cell, so this second dropna() only acts as a safety net.
df_emo_answers_free_long_drop = df_emo_answers_free_long.dropna()

# Show the per-column NaN counts together with the frame's shape.
df_emo_answers_free_long_drop.isna().sum(), df_emo_answers_free_long_drop.shape

(participantId            0
 sex_participant          0
 ethnicity_participant    0
 photoId                  0
 emotion                  0
 emotion_txt              0
 len_words                0
 len_letters              0
 dtype: int64,
 (14586, 8))

In [38]:
# Attach the photo metadata to every answer. validate="many_to_one" makes
# pandas raise if a photoId appears more than once in df_labels_free.
df_emo_answers_free_long_m = df_emo_answers_free_long_drop.merge(
    df_labels_free,
    how="left",
    on=["photoId"],
    validate="many_to_one",
)

In [39]:
df_emo_answers_free_long_m.head(2)

Unnamed: 0,participantId,sex_participant,ethnicity_participant,photoId,emotion,emotion_txt,len_words,len_letters,ethnicity,sex,age,label,url
0,0,Male,White or Caucasian,Q2.1_1,boredom,boredom,1,7,bipoc,female,adult,anger,https://uwmadison.co1.qualtrics.com/ControlPan...
1,0,Male,White or Caucasian,Q117.1_3,annoyance,annoyance,1,9,bipoc,male,adult,surprise,https://uwmadison.co1.qualtrics.com/ControlPan...


In [40]:
# Discard the helper columns and the raw answer, then expose the cleaned text
# under the name 'emotion'.
df_emo_answers_free_long_m = (
    df_emo_answers_free_long_m
    .drop(columns=['len_words', 'len_letters', 'emotion'])
    .rename(columns={"emotion_txt": "emotion"})
)

## Sentiment score

In [41]:
# Score every cleaned answer and keep only the 'compound' entry of the polarity scores.
df_emo_answers_free_long_m['sentimentScore'] = df_emo_answers_free_long_m['emotion'].map(
    lambda word: sia.polarity_scores(word)['compound']
)

In [42]:
df_emo_answers_free_long_m.head(2)

Unnamed: 0,participantId,sex_participant,ethnicity_participant,photoId,emotion,ethnicity,sex,age,label,url,sentimentScore
0,0,Male,White or Caucasian,Q2.1_1,boredom,bipoc,female,adult,anger,https://uwmadison.co1.qualtrics.com/ControlPan...,-0.3182
1,0,Male,White or Caucasian,Q117.1_3,annoyance,bipoc,male,adult,surprise,https://uwmadison.co1.qualtrics.com/ControlPan...,-0.3182


## Center predictors

In [43]:
# Recode the two-level photo attributes as -0.5 / +0.5 in new '<name>C' columns.
contrast_codes = {
    'sex': {'female': -0.5, 'male': 0.5},
    'age': {'child': -0.5, 'adult': 0.5},
    'ethnicity': {'bipoc': -0.5, 'white': 0.5},
}
for attribute, coding in contrast_codes.items():
    df_emo_answers_free_long_m[attribute + 'C'] = df_emo_answers_free_long_m[attribute].replace(coding)

In [44]:
df_emo_answers_free_long_m.head(2)

Unnamed: 0,participantId,sex_participant,ethnicity_participant,photoId,emotion,ethnicity,sex,age,label,url,sentimentScore,sexC,ageC,ethnicityC
0,0,Male,White or Caucasian,Q2.1_1,boredom,bipoc,female,adult,anger,https://uwmadison.co1.qualtrics.com/ControlPan...,-0.3182,-0.5,0.5,-0.5
1,0,Male,White or Caucasian,Q117.1_3,annoyance,bipoc,male,adult,surprise,https://uwmadison.co1.qualtrics.com/ControlPan...,-0.3182,0.5,0.5,-0.5


In [45]:
# Keep every row whose label is not 'attention', then renumber the rows.
not_attention = df_emo_answers_free_long_m['label'].ne('attention')
df_emo_answers_free_long_m_fil = df_emo_answers_free_long_m.loc[not_attention].reset_index(drop=True)

In [46]:
df_emo_answers_free_long_m_fil.head(2)

Unnamed: 0,participantId,sex_participant,ethnicity_participant,photoId,emotion,ethnicity,sex,age,label,url,sentimentScore,sexC,ageC,ethnicityC
0,0,Male,White or Caucasian,Q2.1_1,boredom,bipoc,female,adult,anger,https://uwmadison.co1.qualtrics.com/ControlPan...,-0.3182,-0.5,0.5,-0.5
1,0,Male,White or Caucasian,Q117.1_3,annoyance,bipoc,male,adult,surprise,https://uwmadison.co1.qualtrics.com/ControlPan...,-0.3182,0.5,0.5,-0.5


In [47]:
# Write the filtered long-format free-labeling table to disk, without the row index.
df_emo_answers_free_long_m_fil.to_csv('../clean_data_mturk/free_labeling_emotion_mturk_long_format_lmer.csv', index=False)