In [None]:
import pandas as pd
import plotly.graph_objects as go

# Build the three parallel lists that describe the hierarchy for the icicle
# trace: entry i holds a node's label, the label of its parent, and its value.
labels = []
parents = []
values = []


def _add_node(label, parent, value=0):
    """Append one node to the hierarchy lists and return its label.

    Args:
        label: Label of the node (also used as its identifier, so it must be unique).
        parent: Label of the parent node; the empty string marks the root.
        value: Value stored for the node. Inner nodes use 0, leaves use 1.

    Returns:
        The label that was added, so it can be reused as a parent.
    """
    labels.append(label)
    parents.append(parent)
    values.append(value)
    return label


# Shape of the hierarchy
num_exercises = 4
submissions_per_exercise = 25
feedback_types = ["Tutor", "LLM", "CoFee"]
metrics = ["Completeness", "Correctness", "Actionability", "Tone"]

# Root node
root_label = _add_node("All Exercises", "")

for ex in range(1, num_exercises + 1):
    exercise_label = _add_node(f"Exercise {ex}", root_label)

    # Submissions are shown as one grouped node per exercise. The count comes
    # from `submissions_per_exercise` instead of being hard-coded in the label.
    submission_label = _add_node(
        f"{exercise_label} - {submissions_per_exercise} Submissions",
        exercise_label,
    )

    # Feedback types; labels carry the full path so they stay unique.
    for feedback in feedback_types:
        feedback_label = _add_node(f"{submission_label} - {feedback} Feedback", submission_label)

        # Metrics are the leaves and the only nodes with a non-zero value.
        for metric in metrics:
            metric_label = _add_node(f"{feedback_label} - {metric}", feedback_label, 1)

# Create the icicle chart
# NOTE(review): Plotly documents `marker.colorscale` as taking effect only when
# `marker.colors` is a numerical array; no colors are passed here, so 'Blues'
# is probably ignored - confirm and either supply colors or drop the option.
fig = go.Figure(go.Icicle(
    labels=labels,
    parents=parents,
    values=values,
    tiling=dict(orientation='v'),  # 'v' for vertical orientation (root at the top)
    marker=dict(colorscale='Blues')
))

# Update layout for better visualization
fig.update_layout(
    title='Hierarchical Structure of Exercises, Submissions, Feedback, and Metrics',
    margin=dict(t=50, l=25, r=25, b=25),
    template='plotly_white'
)

# Display the figure
fig.show()

## Examples of Analysing the Sampled Exercises
The following examples demonstrate some basic analysis of the sampled exercises.

In [None]:
# Load the exercises table used by the analysis cells below.
# NOTE(review): this path is relative to the working directory ("data/..."),
# while the outputs further down are written to "../data/..." - confirm which
# working directory the notebook is meant to run from.
exercises_csv = "data/1_exercises/exercises.csv"
data = pd.read_csv(exercises_csv)

In [None]:
# Number of distinct (non-missing) submission IDs across the whole table.
distinct_submission_ids = data["submission_id"].dropna().unique()
overall_submissions = len(distinct_submission_ids)
print(f"Overall number of submissions: {overall_submissions}")

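The next cell groups the data by exercise and score, counts the distinct submissions and distinct feedback IDs in each group, and derives the average number of feedbacks per submission. Per-exercise totals are merged in as well.
The result is saved to a CSV file for further analysis.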

In [None]:
# Per (exercise, score) group: how many distinct submissions and how many
# distinct feedback IDs it contains.
# NOTE(review): both counts use `nunique`. If "total feedbacks" was meant to
# count rows rather than distinct IDs, `feedback_count` needs "count" - confirm.
grouped_data = (
    data
    .groupby(["exercise_id", "result_score"])
    .agg(
        submission_count=("submission_id", "nunique"),  # Distinct submissions per exercise and score
        feedback_count=("feedback_id", "nunique"),      # Distinct feedback IDs per exercise and score
    )
    .reset_index()
)

# Distinct feedback IDs per exercise (across all scores)
total_feedbacks_per_exercise = (
    data
    .groupby("exercise_id")["feedback_id"]
    .nunique()
    .reset_index()
    .rename(columns={"feedback_id": "total_feedback_count"})
)

# Distinct submissions per exercise (across all scores)
total_submissions_per_exercise = (
    data
    .groupby("exercise_id")["submission_id"]
    .nunique()
    .reset_index()
    .rename(columns={"submission_id": "total_submission_count"})
)

# Attach the per-exercise totals to every (exercise, score) row. A left join
# keeps each grouped row; both totals are derived from the same `data`, so
# every exercise_id has a match.
grouped_data = grouped_data.merge(total_feedbacks_per_exercise, on="exercise_id", how="left")
grouped_data = grouped_data.merge(total_submissions_per_exercise, on="exercise_id", how="left")

# Average number of feedbacks per submission within each (exercise, score) group
grouped_data["avg_feedbacks_per_score"] = (
    grouped_data["feedback_count"] / grouped_data["submission_count"]
)

# Fix the column order of the exported file
grouped_data = grouped_data[[
    "exercise_id",
    "result_score",
    "submission_count",
    "total_submission_count",
    "total_feedback_count",
    "feedback_count",
    "avg_feedbacks_per_score"
]]

grouped_data.to_csv("../data/6_analysis/grouped_data.csv", index=False)

Visualize the relationship between the scores and the average number of feedbacks per score using the grouped data from the previous step.


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Load the grouped data written by the aggregation cell
grouped_data = pd.read_csv("../data/6_analysis/grouped_data.csv")

# Create a color and marker map for exercises
exercise_ids = grouped_data["exercise_id"].unique()
# tab10 only has `N` distinct colors; cycle through them so that exercises
# beyond the palette size do not all collapse onto the last color.
colors = plt.cm.tab10(np.arange(len(exercise_ids)) % plt.cm.tab10.N)
markers = ['o', 's', 'D', '^', 'v', 'P', '*', 'X']  # Different marker styles
marker_map = {exercise_id: markers[i % len(markers)] for i, exercise_id in enumerate(exercise_ids)}
color_map = {exercise_id: colors[i] for i, exercise_id in enumerate(exercise_ids)}

# Create the scatter plot on an explicit Axes
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))

for exercise_id in exercise_ids:
    subset = grouped_data[grouped_data["exercise_id"] == exercise_id]
    # Rows with a missing coordinate cannot be drawn or fitted, so drop them first.
    points = subset[["avg_feedbacks_per_score", "result_score"]].dropna()
    x = points["avg_feedbacks_per_score"]
    y = points["result_score"]

    # Scatter points
    scatter_ax.scatter(
        x, y,
        label=f"Exercise {exercise_id}",
        color=color_map[exercise_id],
        marker=marker_map[exercise_id],
        s=100,  # Marker size
        alpha=0.7  # Transparency
    )

    # Regression line: a linear fit needs at least two distinct x values,
    # otherwise the least-squares problem is rank deficient.
    if x.nunique() > 1:
        slope, intercept = np.polyfit(x, y, 1)  # Linear regression (degree=1)
        # Two end points are enough to draw a straight line across the x-range.
        x_line = np.array([x.min(), x.max()])
        scatter_ax.plot(
            x_line, slope * x_line + intercept,
            color=color_map[exercise_id],
            linestyle='--',
            linewidth=2,
            alpha=0.7
        )

# Add labels and legend
scatter_ax.set_xlabel("Average Number of Feedbacks", fontsize=12)
scatter_ax.set_ylabel("Scores", fontsize=12)
scatter_ax.set_title("Scores vs. Average Number of Feedbacks", fontsize=14)
scatter_ax.legend(title="Exercises", loc="upper left", fontsize=10)
scatter_ax.grid(True)
scatter_fig.tight_layout()

# Show the plot
plt.show()

## Example of Analysing the Sampled Submissions with Feedback

In [None]:
# Load the sampled submissions together with their feedback.
# NOTE(review): read relative to the working directory ("data/..."), whereas
# results are written to "../data/..." - confirm the intended working directory.
feedback_suggestions_csv = "data/3_feedback_suggestions/feedback_suggestions.csv"
sampled_submissions_with_feedback = pd.read_csv(feedback_suggestions_csv)

In [None]:
def _distinct_per_group(frame, keys, column, out_name):
    """Count the distinct values of `column` within each `keys` group.

    Returns a flat DataFrame with the grouping key(s) as columns and the
    count stored under `out_name`.
    """
    return frame.groupby(keys)[column].nunique().reset_index(name=out_name)


score_keys = ["exercise_id", "result_score"]

# Distinct submissions for every (exercise, score) pair
grouped_data = _distinct_per_group(
    sampled_submissions_with_feedback, score_keys, "submission_id", "submission_count"
)

# Distinct submissions per exercise, attached to every row of that exercise
total_submissions_per_exercise = _distinct_per_group(
    sampled_submissions_with_feedback, "exercise_id", "submission_id", "total_submission_count"
)
grouped_data = grouped_data.merge(total_submissions_per_exercise, on="exercise_id", how="left")

# Add one set of count / total / average columns per feedback type
feedback_types = sampled_submissions_with_feedback["feedback_type"].unique()
for feedback_type in feedback_types:
    is_this_type = sampled_submissions_with_feedback["feedback_type"] == feedback_type
    feedback_data = sampled_submissions_with_feedback.loc[is_this_type]

    count_col = f"feedback_count_{feedback_type}"
    total_col = f"total_feedback_count_{feedback_type}"
    average_col = f"average_feedback_count_{feedback_type}"

    # Distinct feedback IDs of this type per (exercise, score); groups without
    # any feedback of this type are missing after the left join and become 0.
    feedback_count = _distinct_per_group(feedback_data, score_keys, "feedback_id", count_col)
    grouped_data = grouped_data.merge(feedback_count, on=score_keys, how="left")
    grouped_data[count_col] = grouped_data[count_col].fillna(0).astype(int)

    # Distinct feedback IDs of this type per exercise
    total_feedback_count = _distinct_per_group(feedback_data, "exercise_id", "feedback_id", total_col)
    grouped_data = grouped_data.merge(total_feedback_count, on="exercise_id", how="left")
    grouped_data[total_col] = grouped_data[total_col].fillna(0).astype(int)

    # Feedbacks of this type per distinct submission in the group
    grouped_data[average_col] = (
        grouped_data[count_col].div(grouped_data["submission_count"]).fillna(0)
    )


grouped_data.to_csv("../data/2_feedback_counts.csv", index=False)
grouped_data