# Human Evaluation of Model Generated Responses

## Import Libraries

In [1]:
#Import libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
import scipy.stats as stats
import plotly.graph_objects as go

from statsmodels.stats.inter_rater import fleiss_kappa, aggregate_raters

## Import CSV Files 

In [2]:
# Load the questionnaire export for one evaluation form.
# Exports form-1.csv ... form-7.csv are analysed one at a time; set FORM_NUMBER
# to pick the form instead of commenting/uncommenting read_csv lines.
# NOTE(review): the output file name written later in this notebook is
# hard-coded separately and must be kept in sync by hand.
FORM_NUMBER = 7
df = pd.read_csv(f"form-{FORM_NUMBER}.csv", encoding="ISO-8859-1")

## Data Cleaning 

In [3]:
# Remove form metadata, personally identifying fields and the consent
# checkbox; the remaining columns are used by the rest of the analysis.
columns_to_drop = [
    'Start time',
    'Completion time',
    'Email',
    'Name',
    'Id',
    'I confirm that I have read and understood the above statements',
]
df = df.drop(columns=columns_to_drop)
df

Unnamed: 0,How old are you?,What is your gender,Rate Model 1 Response for Corrective Feedback 1.The response is grammatically correct,Rate Model 1 Response for Corrective Feedback 1.The response make sense given the context,Rate Model 1 Response for Corrective Feedback 1.The response addresses the grammatical error made by the child,Rate Model 1 Response for Corrective Feedback 1.The response is appropriate for a child,Rate Model 2 Response for Corrective Feedback 1.The response is grammatically correct,Rate Model 2 Response for Corrective Feedback 1.The response makes sense given the context,Rate Model 2 Response for Corrective Feedback 1.The response addresses the grammatical error made by the child,Rate Model 2 Response for Corrective Feedback 1.The response is appropriate for a child,...,Rate Model 1 Response for Confirmatory Feedback 2.Does the response provide confirmation for the child's statement ?,Rate Model 1 Response for Confirmatory Feedback 2.Is the response appropriate for a child ?,Rate Model 2 Response for Confirmatory Feedback 2.Is the response grammatically correct ?,Rate Model 2 Response for Confirmatory Feedback 2.Does the response make sense given the context ?,Rate Model 2 Response for Confirmatory Feedback 2.Does the response provide confirmation for the child's statement ?,Rate Model 2 Response for Confirmatory Feedback 2.Is the response appropriate for a child ?,Rate Model 3 Response for Confirmatory Feedback 2.Is the response grammatically correct ?,Rate Model 3 Response for Confirmatory Feedback 2.Does the response make sense given the context ?,Rate Model 3 Response for Confirmatory Feedback 2.Does the response provide confirmation for the child's statement ?,Rate Model 3 Response for Confirmatory Feedback 2.Is the response appropriate for a child ?
0,26 - 35,Male,Neutral,Agree,Neutral,Agree,Agree,Agree,Disagree,Agree,...,Strongly Disagree,Disagree,Agree,Neutral,Neutral,Agree,Strongly Disagree,Strongly Disagree,Strongly Disagree,Agree
1,26 - 35,Female,Strongly Disagree,Agree,Neutral,Neutral,Strongly Agree,Strongly Agree,Disagree,Strongly Agree,...,Disagree,Neutral,Strongly Agree,Disagree,Disagree,Neutral,Neutral,Neutral,Disagree,Neutral
2,26 - 35,Male,Agree,Strongly Disagree,Disagree,Strongly Disagree,Disagree,Agree,Disagree,Disagree,...,Agree,Neutral,Agree,Strongly Disagree,Strongly Disagree,Strongly Disagree,Disagree,Disagree,Disagree,Disagree
3,26 - 35,Male,Agree,Disagree,Disagree,Neutral,Strongly Agree,Neutral,Neutral,Agree,...,Disagree,Disagree,Strongly Agree,Strongly Disagree,Strongly Disagree,Neutral,Agree,Agree,Agree,Agree
4,26 - 35,Male,Disagree,Strongly Disagree,Strongly Disagree,Agree,Agree,Disagree,Disagree,Strongly Agree,...,Strongly Disagree,Agree,Strongly Disagree,Strongly Disagree,Strongly Disagree,Agree,Strongly Disagree,Strongly Disagree,Strongly Disagree,Agree
5,36 - 45,Female,Disagree,Strongly Agree,Disagree,Strongly Agree,Strongly Agree,Agree,Strongly Disagree,Strongly Agree,...,Neutral,Agree,Strongly Agree,Agree,Strongly Disagree,Strongly Agree,Strongly Agree,Agree,Disagree,Agree
6,26 - 35,Female,Agree,Strongly Agree,Disagree,Agree,Agree,Disagree,Disagree,Agree,...,Disagree,Neutral,Agree,Disagree,Disagree,Neutral,Disagree,Neutral,Neutral,Neutral


## Change Ratings to Numbers 

In [4]:
# Likert label -> numeric score used throughout the analysis.
rating_dict = {"Strongly Disagree": 1, "Disagree": 2, "Neutral": 3, "Agree": 4, "Strongly Agree":5}
def rating2num(rating):
    """Convert one Likert answer to its numeric score.

    Args:
        rating: A label found in ``rating_dict`` (e.g. ``"Agree"``), or a
            score that has already been converted (one of the dict's values).

    Returns:
        The numeric score for a label. A value that is already a valid score
        is returned unchanged, so re-running the conversion cell on converted
        data is harmless instead of raising.

    Raises:
        KeyError: If ``rating`` is neither a known label nor a valid score
            (this includes missing answers such as NaN).
    """
    if isinstance(rating, str):
        return rating_dict[rating]
    # Not a label: accept it only if it is already one of the known scores.
    if rating in rating_dict.values():
        return rating
    raise KeyError(rating)

# Every column after the first two (age and gender) holds a Likert rating.
rating_list = df.columns[2:].tolist()

# Replace the text labels with their numeric scores, one column at a time.
for column_name in rating_list:
    df[column_name] = df[column_name].apply(rating2num)

## Average Performance for Corrective Feedback

### Model 1 Performance - Corrective Feedback

In [5]:
# Model 1, corrective feedback.
# Rating columns are located by the wording of the survey statement; for each
# criterion the first two matching columns are averaged into one score per row.

# Fluency: "grammatically correct" statements
fluency_model1_keys_cf = [
    column for column in rating_list
    if all(token in column for token in ("Model 1", "grammatically", "Corrective"))
]
# Coherence: "... given the context" statements
coherence_model1_keys_cf = [
    column for column in rating_list
    if all(token in column for token in ("Model 1", "context", "Corrective"))
]
# Correctness: "addresses the grammatical error" statements
correctness_model1_keys_cf = [
    column for column in rating_list
    if all(token in column for token in ("Model 1", "grammatical error", "Corrective"))
]
# Appropriateness: "appropriate for a child" statements
appropriateness_model1_keys_cf = [
    column for column in rating_list
    if all(token in column for token in ("Model 1", "appropriate", "Corrective"))
]

for criterion, matched_columns in (
    ("fluency", fluency_model1_keys_cf),
    ("coherence", coherence_model1_keys_cf),
    ("correctness", correctness_model1_keys_cf),
    ("appropriateness", appropriateness_model1_keys_cf),
):
    first_rating = df[matched_columns[0]]
    second_rating = df[matched_columns[1]]
    df[f"avg_model1_{criterion}_cf"] = (first_rating + second_rating) / 2

### Model 2 Performance - Corrective Feedback

In [6]:
# Model 2, corrective feedback.
# Rating columns are located by the wording of the survey statement; for each
# criterion the first two matching columns are averaged into one score per row.

# Fluency: "grammatically correct" statements
fluency_model2_keys_cf = [
    column for column in rating_list
    if all(token in column for token in ("Model 2", "grammatically", "Corrective"))
]
# Coherence: "... given the context" statements
coherence_model2_keys_cf = [
    column for column in rating_list
    if all(token in column for token in ("Model 2", "context", "Corrective"))
]
# Correctness: "addresses the grammatical error" statements
correctness_model2_keys_cf = [
    column for column in rating_list
    if all(token in column for token in ("Model 2", "grammatical error", "Corrective"))
]
# Appropriateness: "appropriate for a child" statements
appropriateness_model2_keys_cf = [
    column for column in rating_list
    if all(token in column for token in ("Model 2", "appropriate", "Corrective"))
]

for criterion, matched_columns in (
    ("fluency", fluency_model2_keys_cf),
    ("coherence", coherence_model2_keys_cf),
    ("correctness", correctness_model2_keys_cf),
    ("appropriateness", appropriateness_model2_keys_cf),
):
    first_rating = df[matched_columns[0]]
    second_rating = df[matched_columns[1]]
    df[f"avg_model2_{criterion}_cf"] = (first_rating + second_rating) / 2


### Model 3 Performance - Corrective Feedback

In [7]:
# Model 3, corrective feedback.
# Rating columns are located by the wording of the survey statement; for each
# criterion the first two matching columns are averaged into one score per row.

# Fluency: "grammatically correct" statements
fluency_model3_keys_cf = [
    column for column in rating_list
    if all(token in column for token in ("Model 3", "grammatically", "Corrective"))
]
# Coherence: "... given the context" statements
coherence_model3_keys_cf = [
    column for column in rating_list
    if all(token in column for token in ("Model 3", "context", "Corrective"))
]
# Correctness: "addresses the grammatical error" statements
correctness_model3_keys_cf = [
    column for column in rating_list
    if all(token in column for token in ("Model 3", "grammatical error", "Corrective"))
]
# Appropriateness: "appropriate for a child" statements
appropriateness_model3_keys_cf = [
    column for column in rating_list
    if all(token in column for token in ("Model 3", "appropriate", "Corrective"))
]

for criterion, matched_columns in (
    ("fluency", fluency_model3_keys_cf),
    ("coherence", coherence_model3_keys_cf),
    ("correctness", correctness_model3_keys_cf),
    ("appropriateness", appropriateness_model3_keys_cf),
):
    first_rating = df[matched_columns[0]]
    second_rating = df[matched_columns[1]]
    df[f"avg_model3_{criterion}_cf"] = (first_rating + second_rating) / 2
#df.to_csv("context-1.csv")

## Average Performance for Confirmatory Feedback

### Model 1 Performance - Confirmatory Feedback

In [8]:
# Model 1, confirmatory feedback.
# Rating columns are located by the wording of the survey question; for each
# criterion the first two matching columns are averaged into one score per row.

# Fluency: "grammatically correct" questions
fluency_model1_keys_f = [
    column for column in rating_list
    if all(token in column for token in ("Model 1", "grammatically", "Confirmatory"))
]
# Coherence: "make sense given the context" questions
coherence_model1_keys_f = [
    column for column in rating_list
    if all(token in column for token in ("Model 1", "context", "Confirmatory"))
]
# Correctness: "provide confirmation for the child's statement" questions
correctness_model1_keys_f = [
    column for column in rating_list
    if all(token in column for token in ("Model 1", "confirmation", "Confirmatory"))
]
# Appropriateness: "appropriate for a child" questions
appropriateness_model1_keys_f = [
    column for column in rating_list
    if all(token in column for token in ("Model 1", "appropriate", "Confirmatory"))
]

for criterion, matched_columns in (
    ("fluency", fluency_model1_keys_f),
    ("coherence", coherence_model1_keys_f),
    ("correctness", correctness_model1_keys_f),
    ("appropriateness", appropriateness_model1_keys_f),
):
    first_rating = df[matched_columns[0]]
    second_rating = df[matched_columns[1]]
    df[f"avg_model1_{criterion}_f"] = (first_rating + second_rating) / 2

### Model 2 Performance - Confirmatory Feedback

In [9]:
# Model 2, confirmatory feedback.
# Rating columns are located by the wording of the survey question; for each
# criterion the first two matching columns are averaged into one score per row.

# Fluency: "grammatically correct" questions
fluency_model2_keys_f = [
    column for column in rating_list
    if all(token in column for token in ("Model 2", "grammatically", "Confirmatory"))
]
# Coherence: "make sense given the context" questions
coherence_model2_keys_f = [
    column for column in rating_list
    if all(token in column for token in ("Model 2", "context", "Confirmatory"))
]
# Correctness: "provide confirmation for the child's statement" questions
correctness_model2_keys_f = [
    column for column in rating_list
    if all(token in column for token in ("Model 2", "confirmation", "Confirmatory"))
]
# Appropriateness: "appropriate for a child" questions
appropriateness_model2_keys_f = [
    column for column in rating_list
    if all(token in column for token in ("Model 2", "appropriate", "Confirmatory"))
]

for criterion, matched_columns in (
    ("fluency", fluency_model2_keys_f),
    ("coherence", coherence_model2_keys_f),
    ("correctness", correctness_model2_keys_f),
    ("appropriateness", appropriateness_model2_keys_f),
):
    first_rating = df[matched_columns[0]]
    second_rating = df[matched_columns[1]]
    df[f"avg_model2_{criterion}_f"] = (first_rating + second_rating) / 2

In [10]:

# Diagnostic output: the full list of rating column names and the columns
# matched for Model 2 fluency under confirmatory feedback.
for label, value in (
    ("Rating list:", rating_list),
    ("Model 2 keys for fluency:", fluency_model2_keys_f),
):
    print(label, value)

Rating list: ['Rate Model 1 Response for\xa0Corrective Feedback 1.The response is grammatically correct\xa0', 'Rate Model 1 Response for\xa0Corrective Feedback 1.The response make sense given the context', 'Rate Model 1 Response for\xa0Corrective Feedback 1.The response addresses the grammatical error made by the child', 'Rate Model 1 Response for\xa0Corrective Feedback 1.The response is appropriate for a child', 'Rate Model 2 Response\xa0for\xa0Corrective Feedback 1.The response is grammatically correct', 'Rate Model 2 Response\xa0for\xa0Corrective Feedback 1.The response makes sense given the context', 'Rate Model 2 Response\xa0for\xa0Corrective Feedback 1.The response addresses the grammatical error made by the child', 'Rate Model 2 Response\xa0for\xa0Corrective Feedback 1.The response is appropriate for a child', 'Rate Model 3 Response\xa0for\xa0Corrective Feedback 1.The response is grammatically correct', 'Rate Model 3 Response\xa0for\xa0Corrective Feedback 1.The response makes se

### Model 3 Performance - Confirmatory Feedback

In [11]:
# Model 3, confirmatory feedback.
# Rating columns are located by the wording of the survey question; for each
# criterion the first two matching columns are averaged into one score per row.

# Fluency: "grammatically correct" questions
fluency_model3_keys_f = [
    column for column in rating_list
    if all(token in column for token in ("Model 3", "grammatically", "Confirmatory"))
]
# Coherence: "make sense given the context" questions
coherence_model3_keys_f = [
    column for column in rating_list
    if all(token in column for token in ("Model 3", "context", "Confirmatory"))
]
# Correctness: "provide confirmation for the child's statement" questions
correctness_model3_keys_f = [
    column for column in rating_list
    if all(token in column for token in ("Model 3", "confirmation", "Confirmatory"))
]
# Appropriateness: "appropriate for a child" questions
appropriateness_model3_keys_f = [
    column for column in rating_list
    if all(token in column for token in ("Model 3", "appropriate", "Confirmatory"))
]

for criterion, matched_columns in (
    ("fluency", fluency_model3_keys_f),
    ("coherence", coherence_model3_keys_f),
    ("correctness", correctness_model3_keys_f),
    ("appropriateness", appropriateness_model3_keys_f),
):
    first_rating = df[matched_columns[0]]
    second_rating = df[matched_columns[1]]
    df[f"avg_model3_{criterion}_f"] = (first_rating + second_rating) / 2

In [12]:
# Persist the frame, including the averaged score columns added above.
output_csv_path = "context-7.csv"
df.to_csv(output_csv_path)

## Calculate Statistics 

### Inter-Annotator Agreement for Corrective Feedback

In [13]:
# One frame per model holding its four averaged corrective-feedback scores.
model1 = df.loc[:, [
    "avg_model1_fluency_cf",
    "avg_model1_coherence_cf",
    "avg_model1_correctness_cf",
    "avg_model1_appropriateness_cf",
]]
model2 = df.loc[:, [
    "avg_model2_fluency_cf",
    "avg_model2_coherence_cf",
    "avg_model2_correctness_cf",
    "avg_model2_appropriateness_cf",
]]
model3 = df.loc[:, [
    "avg_model3_fluency_cf",
    "avg_model3_coherence_cf",
    "avg_model3_correctness_cf",
    "avg_model3_appropriateness_cf",
]]


In [14]:
def calculate_fleiss_kappa(dataframe, dataframe_name, num_categories=5):
    """Compute, print and return Fleiss' kappa for a frame of numeric ratings.

    A count table with one row per row of ``dataframe`` and one column per
    rating category is built: entry ``[i, k]`` is the number of columns in row
    ``i`` whose value, truncated to an integer, equals ``k + 1``. That table
    is passed to ``fleiss_kappa``.

    NOTE(review): values are truncated toward zero before counting, so a
    half-point average such as 3.5 is counted as 3. This is kept as-is so
    previously reported scores do not change; confirm it is intended.
    NOTE(review): ``fleiss_kappa`` treats table rows as the rated subjects and
    the counts as raters per category. The callers in this notebook pass one
    row per survey respondent and one column per averaged criterion; confirm
    that this orientation is the intended one.

    Args:
        dataframe: pandas DataFrame of numeric ratings in
            ``1..num_categories``.
        dataframe_name: Label printed next to the score.
        num_categories: Number of rating categories (default 5).

    Returns:
        The Fleiss' kappa score returned by ``fleiss_kappa``.

    Raises:
        ValueError: If a value is missing (NaN) or falls outside
            ``1..num_categories`` after truncation. Previously a value that
            truncated to 0 was silently counted in the last category through
            negative indexing.
    """
    values = dataframe.to_numpy(dtype=float)
    if np.isnan(values).any():
        raise ValueError("ratings contain missing values (NaN)")

    # astype(int) truncates toward zero, matching the int() call used before.
    categories = values.astype(int)
    if ((categories < 1) | (categories > num_categories)).any():
        raise ValueError(
            "ratings must lie in 1..{} after truncation".format(num_categories)
        )

    # Count, per row, how many columns fall into each category.
    ratings = np.zeros((len(dataframe), num_categories))
    for category in range(1, num_categories + 1):
        ratings[:, category - 1] = (categories == category).sum(axis=1)

    # Compute Fleiss' Kappa
    fleiss_kappa_score = fleiss_kappa(ratings)

    # Print Fleiss' Kappa score along with the name of the DataFrame
    print("Fleiss' Kappa of", dataframe_name, ":", fleiss_kappa_score)

    return fleiss_kappa_score

# Report the agreement score of each model's corrective-feedback frame.
# fleiss_kappa_score ends up holding the score of the last model.
for frame, label in (
    (model1, "Model 1 on Rephrasing Questions"),
    (model2, "Model 2 on Rephrasing Questions"),
    (model3, "Model 3 on Rephrasing Questions"),
):
    fleiss_kappa_score = calculate_fleiss_kappa(frame, label)

Fleiss' Kappa of Model 1 on Rephrasing Questions : 0.10502283105022835
Fleiss' Kappa of Model 2 on Rephrasing Questions : 0.024875621890547372
Fleiss' Kappa of Model 3 on Rephrasing Questions : 0.2195121951219512


### Inter-Annotator Agreement for Confirmatory Feedback

In [15]:
# One frame per model holding its four averaged confirmatory-feedback scores.
model1_1 = df.loc[:, [
    "avg_model1_fluency_f",
    "avg_model1_coherence_f",
    "avg_model1_correctness_f",
    "avg_model1_appropriateness_f",
]]
model2_2 = df.loc[:, [
    "avg_model2_fluency_f",
    "avg_model2_coherence_f",
    "avg_model2_correctness_f",
    "avg_model2_appropriateness_f",
]]
model3_3 = df.loc[:, [
    "avg_model3_fluency_f",
    "avg_model3_coherence_f",
    "avg_model3_correctness_f",
    "avg_model3_appropriateness_f",
]]

In [16]:
# calculate_fleiss_kappa is already defined in the corrective-feedback
# agreement cell earlier in this notebook. It is reused here rather than
# re-declared, so there is a single definition to maintain.

# Report the agreement score of each model's confirmatory-feedback frame.
# fleiss_kappa_score ends up holding the score of the last model.
for frame, label in (
    (model1_1, "Model 1 on Feedback on Question Answer"),
    (model2_2, "Model 2 on Feedback on Question Answer"),
    (model3_3, "Model 3 on Feedback on Question Answer"),
):
    fleiss_kappa_score = calculate_fleiss_kappa(frame, label)

Fleiss' Kappa of Model 1 on Feedback on Question Answer : 0.11280101394169835
Fleiss' Kappa of Model 2 on Feedback on Question Answer : 0.06666666666666664
Fleiss' Kappa of Model 3 on Feedback on Question Answer : 0.3920817369093229


### Calculate Average Fleiss' Kappa

In [24]:
def calculate_average(numbers):
    """Print and return the arithmetic mean of ``numbers``.

    Args:
        numbers: An iterable of numeric values (list, tuple, generator,
            NumPy array, pandas Series, ...). ``None`` is treated as empty.

    Returns:
        The mean of the values, or ``None`` when there are no values (a
        message is printed in that case).
    """
    # Materialise once: generators have no len(), and truth-testing a
    # multi-element NumPy array or Series raises instead of reporting emptiness.
    values = [] if numbers is None else list(numbers)
    if not values:
        print("The list is empty.")
        return None
    average = sum(values) / len(values)
    print("Average:", average)
    return average


# Fleiss' kappa scores collected by hand from separate runs of this notebook.
# Each list has seven entries and the last one matches the form-7 output
# printed above; presumably the entries are in form order (1..7) - TODO confirm.
# "rephrase" lists hold the scores printed with the "Rephrasing Questions"
# label, "feedback" lists those printed with "Feedback on Question Answer".
model1_rephrase = [0.2309612983770287, -0.11504424778761065, -0.009836065573770501, 0.25203252032520324, 0.10638297872340423, -0.10012674271229387, 0.10502283105022835]
# Backward-compatible alias for the original, misspelled variable name.
mode1_rephrase = model1_rephrase
model1_feedback = [0.46005509641873277, 0.08771929824561396, 0.033441208198489794, 0.2558139534883722, 0.6770472895040369, -0.006535947712418415, 0.11280101394169835]

model2_rephrase = [0.27721661054994384, 0.008547008547008517, 0.007092198581560292, 0.10600255427841633, 0.5185185185185187, -0.15393939393939401, 0.024875621890547372]
model2_feedback = [0.47967479674796765, 0.2056737588652482, 0.10756972111553774, 0.07038512616201846, 0.7624242424242423, 0.1080773606370876, 0.06666666666666664]

model3_rephrase = [0.24231242312423115, 0.22737306843267108, 0.3354908306364618, 0.2107843137254902, 0.5882352941176471, -0.018779342723004716, 0.2195121951219512]
model3_feedback = [0.3826199740596628, 0.23096129837702875, 0.13641755634638192, 0.21285140562248997, 0.6056338028169014, 0.4701601164483261, 0.3920817369093229]

# Print the mean score per model and question type, in the same order as
# before. The final call is the cell's last expression, so its value is also
# echoed as the cell output.
calculate_average(model1_rephrase)
calculate_average(model1_feedback)

calculate_average(model2_rephrase)
calculate_average(model2_feedback)

calculate_average(model3_rephrase)
calculate_average(model3_feedback)

Average: 0.06705608177174135
Average: 0.23147741601207508
Average: 0.11261615977522872
Average: 0.25721023894553835
Average: 0.2578469689193497
Average: 0.34724655579715913


0.34724655579715913

## Visualizations 

In [18]:
# Participant demographics (age range and gender columns).
age_gender_path = "age-gender.csv"
df_age = pd.read_csv(age_gender_path)

In [19]:
# Preview the first five rows of the demographics frame.
df_age.head(n=5)

Unnamed: 0,How old are you?,What is your gender
0,18 - 25,Male
1,26 - 35,Male
2,26 - 35,Female
3,26 - 35,Female
4,26 - 35,Male


In [36]:
# Tally respondents per age range; both columns are renamed for display.
age_counts = df_age["How old are you?"].value_counts().reset_index()
age_counts.columns = ["Age Group", "Count"]

# Pie chart of the age distribution (Pastel qualitative palette).
fig_pie = px.pie(
    age_counts,
    values="Count",
    names="Age Group",
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig_pie.show()

# Companion table with the exact counts. A single alignment value applies to
# every column; the previous five-element list was longer than this
# two-column table.
table_trace = go.Table(
    header=dict(
        values=["Age Group", "Count"],
        fill=dict(color='paleturquoise'),
        align='left',
    ),
    cells=dict(
        values=[age_counts['Age Group'], age_counts['Count']],
        fill=dict(color='lavender'),
        align='left',
    ),
)

# Wrap the table in its own figure and keep it narrow.
fig_table = go.Figure(data=[table_trace])
fig_table.update_layout(width=400)
fig_table.show()

In [39]:
# Tally respondents per gender; both columns are renamed for display.
gender_counts = df_age["What is your gender"].value_counts().reset_index()
gender_counts.columns = ["Gender", "Count"]

# Pie chart of the gender distribution (Pastel qualitative palette).
fig = px.pie(
    gender_counts,
    values="Count",
    names="Gender",
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig.show()

# Companion table with the exact counts. A single alignment value applies to
# every column; the previous five-element list was longer than this
# two-column table.
table_trace = go.Table(
    header=dict(
        values=["Gender", "Count"],
        fill=dict(color='paleturquoise'),
        align='left',
    ),
    cells=dict(
        values=[gender_counts['Gender'], gender_counts['Count']],
        fill=dict(color='lavender'),
        align='left',
    ),
)

# `fig` is rebound from the pie chart to the table figure, as before.
fig = go.Figure(data=[table_trace])
fig.update_layout(width=400)
fig.show()

