# Analyse survey results

In [1]:
# This notebook performs statistical tests on the survey responses to evaluate soft prompts against control generated text. 
# Mainly: Chi-square tests of independence and Student's t-tests are performed to verify emotional consistency 
# and grammatical correctness respectively.
# Means are also calculated for the observed and control groups.

import pandas as pd 
import numpy as np

In [2]:
# Paths to survey questions and responses. 
questions = pd.read_csv("questions.csv")
responses = pd.read_csv("responses.csv", encoding="utf-8")

In [3]:
emotions = questions["EmotionSoftPrompt"].unique().tolist()
emotions.remove("none")
topics = questions["TopicPrompt"].unique().tolist()
responses_likert = responses.select_dtypes(include=['int', 'float'])

In [4]:
# Measures emotion classification accuracy in responses, for given emotion soft prompt. 
# If control set to True, instead calculates accuracy on the control responses.
def calculate_emotion_acc(emotion, control=False):
    relevant_questions = questions[["EmotionSoftPrompt", "TopicPrompt", "Text"]]
    
    #Filter for emotion or control entries
    filter = relevant_questions["EmotionSoftPrompt"]==emotion
    if control:
        filter = relevant_questions["EmotionSoftPrompt"]=="none"

    relevant_questions = relevant_questions.where(filter).dropna()
        
    texts = relevant_questions["Text"].tolist()
    responses_for_emotion_soft_prompt = responses[texts]
    
    correct = responses_for_emotion_soft_prompt[responses_for_emotion_soft_prompt==emotion].count(axis=1).sum()
    total = responses_for_emotion_soft_prompt.count(axis=1).sum()
    acc = correct/total
    return acc

print("#### Emotion Classification: Accuracy of each soft prompt for observed vs control ####")
for emotion in emotions:
    acc = calculate_emotion_acc(emotion)
    control_acc = calculate_emotion_acc(emotion, control=True)
    print(emotion,":",acc," (control:",control_acc, ")")

#### Emotion Classification: Accuracy of each soft prompt for observed vs control ####
amusement : 0.35526315789473684  (control: 0.2631578947368421 )
gratitude : 0.4605263157894737  (control: 0.2236842105263158 )
disgust : 0.4342105263157895  (control: 0.013157894736842105 )
fear : 0.32894736842105265  (control: 0.039473684210526314 )
surprise : 0.039473684210526314  (control: 0.09210526315789473 )
curiosity : 0.5131578947368421  (control: 0.17105263157894737 )


In [5]:
from scipy.stats import chi2_contingency

def get_emotion_responses_series(emotion, control=False):
    relevant_questions = questions[["EmotionSoftPrompt", "TopicPrompt", "Text"]]
    
    # Filter for emotion and control emotion classification responses
    filter = relevant_questions["EmotionSoftPrompt"]==emotion
    if control:
        filter = relevant_questions["EmotionSoftPrompt"]=="none"
    emotion_questions = relevant_questions.where(filter).dropna()     
    texts = emotion_questions["Text"].tolist()
    relevant_responses = responses[texts]

    responses_series = pd.Series(relevant_responses.values.ravel())
    responses_series = responses_series.mask(responses_series!=emotion, False)
    responses_series = responses_series.mask(responses_series==emotion, True)
    
    return responses_series

def prepare_emotion_responses_observed_control(emotion):
    responses_observed = get_emotion_responses_series(emotion)
    df_observed = pd.DataFrame({"is_"+emotion: responses_observed})
    df_observed['is_control'] = df_observed.apply(lambda row: False, axis=1)

    
    responses_control = get_emotion_responses_series(emotion, control=True)
    df_control = pd.DataFrame({"is_"+emotion: responses_control})
    df_control['is_control'] = df_control.apply(lambda row: True, axis=1)
                                      
    df = pd.concat([df_observed, df_control])

    return df 
                                                
def get_contigency(emotion):
    df = prepare_emotion_responses_observed_control(emotion)
    contigency= pd.crosstab(df["is_"+emotion], df["is_control"])
    
    return contigency

# Perform Chi Squared Test for emotion classification of the given emotion against the control responses. 
def calc_chi_sqrd(emotion):
    contigency = get_contigency(emotion)
    c, p, dof, expected = chi2_contingency(contigency)
    
    return c, p, dof, expected

print("#### Emotion Classification: Chi Squared Tests ####")
for emotion in emotions:
    c, p, dof, expected = calc_chi_sqrd(emotion)
    print(emotion, ": p=", p, ", c=", c)

#### Emotion Classification: Chi Squared Tests ####
amusement : p= 0.2923397756363371 , c= 1.1088145896656534
gratitude : p= 0.0036550693609837238 , c= 8.447692307692307
disgust : p= 1.5998144185176544e-09 , c= 36.408773678963115
fear : p= 1.1132954059689901e-05 , c= 19.306451612903224
surprise : p= 0.3263366851462598 , c= 0.9633802816901409
curiosity : p= 1.917799723259888e-05 , c= 18.26923076923077


In [6]:
import scipy.stats as stats

def get_likert_responses_series(emotion, control=False):
    relevant_questions = questions[["EmotionSoftPrompt", "TopicPrompt", "Text"]]
    
    # Filter for emotion and control grammatical correctness Likert score responses
    filter = relevant_questions["EmotionSoftPrompt"]==emotion
    if control:
        filter = relevant_questions["EmotionSoftPrompt"]=="none"
    emotion_questions = relevant_questions.where(filter).dropna()     
    texts = emotion_questions["Text"].tolist()
    texts = [e + ".1" for e in texts]
    relevant_responses = responses_likert[texts]

    responses_series = pd.Series(relevant_responses.values.ravel())
    
    return responses_series

def prepare_likert_responses_observed_control(emotion):
    responses_observed = get_likert_responses_series(emotion)
    df_observed = pd.DataFrame({"likert_val": responses_observed})
    df_observed['is_control'] = df_observed.apply(lambda row: False, axis=1)

    
    responses_control = get_likert_responses_series(emotion, control=True)
    df_control = pd.DataFrame({"likert_val": responses_control})
    df_control['is_control'] = df_control.apply(lambda row: True, axis=1)
                                      
    df = pd.concat([df_observed, df_control])

    return df 

# Perform t-test for Likert scale grammatical correctness responses for the given emotion, against the control responses. 
def calc_t_test(emotion):
    df = prepare_likert_responses_observed_control(emotion)
    return stats.ttest_ind(df['likert_val'][df['is_control'] == True], df['likert_val'][df['is_control'] == False])


print("#### Grammatical Scores: Student's T-Tests + Means for observed and control ####")
calc_control_acc = True
for emotion in emotions:
    if calc_control_acc:
        control = get_likert_responses_series(emotion, control=True)
        print("(control mean=", control.mean(), ")")
        calc_control_acc=False
    observed = get_likert_responses_series(emotion)
    t_test_result = calc_t_test(emotion)
    print(emotion, ": p=",t_test_result.pvalue, ", f=", t_test_result.statistic, ", mean=", observed.mean())

#### Grammatical Scores: Student's T-Tests + Means for observed and control ####
(control mean= 3.5526315789473686 )
amusement : p= 0.8988177932240786 , f= -0.12737045052816623 , mean= 3.5789473684210527
gratitude : p= 0.009865156924062627 , f= 2.6138603751408107 , mean= 3.0
disgust : p= 0.000748435766972596 , f= 3.441981590562693 , mean= 2.8421052631578947
fear : p= 0.008500301152973247 , f= 2.666715938769339 , mean= 3.0
surprise : p= 0.009691690823023498 , f= 2.6201982526352903 , mean= 2.986842105263158
curiosity : p= 0.06493258193294145 , f= 1.8593771386077484 , mean= 3.1578947368421053


In [7]:
print("#### Normality assumption check ####")
# Performs Shapiro test to verify assumption of normality for data distribution. 
def calc_normality_test():
    all_likert_scores = np.concatenate(responses_likert.to_numpy())
    all_likert_scores = all_likert_scores[~np.isnan(all_likert_scores)]
    return stats.shapiro(all_likert_scores)

print(calc_normality_test())

#### Normality assumption check ####
ShapiroResult(statistic=0.9055864810943604, pvalue=5.610817793735471e-18)


In [8]:
print("#### Homogenity of variance assumption check ####")
def get_all_likert_responses(control=False):
    relevant_questions = questions[["EmotionSoftPrompt", "TopicPrompt", "Text"]]
    if control:
        filter = relevant_questions["EmotionSoftPrompt"]=="none"
        relevant_questions = relevant_questions.where(filter)
    relevant_questions = relevant_questions.dropna()
    texts = relevant_questions["Text"].tolist()
    texts = [e + ".1" for e in texts]
    relevant_responses = responses_likert[texts]

    responses_series = pd.Series(relevant_responses.values.ravel())
    
    return responses_series

def prepare_all_likert_responses_observed_control():
    responses_observed = get_all_likert_responses()
    df_observed = pd.DataFrame({"likert_val": responses_observed})
    df_observed['is_control'] = df_observed.apply(lambda row: False, axis=1)

    
    responses_control = get_all_likert_responses(control=True)
    df_control = pd.DataFrame({"likert_val": responses_control})
    df_control['is_control'] = df_control.apply(lambda row: True, axis=1)
                                      
    df = pd.concat([df_observed, df_control])

    return df 

# Perform Levene test to verify assumption of data homogenity. 
def calc_var_homogenity_test():
    df = prepare_all_likert_responses_observed_control()
    return stats.levene(df['likert_val'][df['is_control'] == True], df['likert_val'][df['is_control'] == False])
print(calc_var_homogenity_test())

#### Homogenity of variance assumption check ####
LeveneResult(statistic=0.5079606851362205, pvalue=0.4762973243973677)
