In [None]:
import pandas as pd
import plotly.express as px
import seaborn as sns

In [None]:
# Load the question table first, then attach its columns to every annotation row.
question_df = pd.read_csv("data/question.csv", index_col="id")
annotation_df = (
    # "answer" is read as text so values such as "0" are not parsed as numbers.
    pd.read_csv("data/annotation.csv", dtype={"answer": str}, index_col="id")
    .merge(question_df, left_on="question_id", right_index=True)
    # Missing answers become empty strings so later string handling never sees NaN.
    .assign(answer=lambda frame: frame["answer"].fillna(""))
)

In [None]:
def rename_question_columns(name):
    """Append ``_answer`` to the answer-option column names.

    Args:
        name: A column name.

    Returns:
        ``f"{name}_answer"`` when ``name`` is ``"correct"`` or one of
        ``"distractor_1"`` .. ``"distractor_3"``; otherwise ``name`` unchanged.
    """
    option_columns = ("correct", "distractor_1", "distractor_2", "distractor_3")
    return f"{name}_answer" if name in option_columns else name

In [None]:
# Suffix the answer-option text columns with "_answer" so the bare names
# ("correct", ...) stay free for the per-annotation flags created further down.
annotation_df = annotation_df.rename(columns=rename_question_columns)

In [None]:
# Keep only rows not flagged as unfinished, then discard the flag column.
# NOTE(review): the `~` negation assumes "unfinished" is a boolean column — confirm.
annotation_df = annotation_df.loc[~annotation_df["unfinished"]].drop(columns="unfinished")

In [None]:
# Inspect the merged annotation table after the unfinished rows were removed.
annotation_df

In [None]:
# Per participant: share of annotations whose raw answer is exactly "0"
# (a multi-selection such as "0, 1" does not count here).
annotation_df["labelled_0"] = annotation_df["answer"].eq("0")
zero_rate_per_participant = annotation_df.groupby("participant_id")["labelled_0"].mean()
ax = sns.histplot(zero_rate_per_participant, bins=21, kde=True)
ax.set_title("Distribution of answer=='0' rate per participant")
ax

In [None]:
# Histogram of the time spent per annotation, coloured by dataset.
# Only rows with 1 < time < 180 are drawn; a log-scaled variant over all rows
# was tried before: sns.histplot(annotation_df, x="time", hue="dataset", log_scale=True)
time_in_range = (annotation_df["time"] > 1) & (annotation_df["time"] < 180)
ax = sns.histplot(annotation_df[time_in_range], x="time", hue="dataset")
ax.set_title("Time spent per annotation")
ax.get_figure().savefig("data/plots/annotation_time.png")

In [None]:
# Letter-value plot of annotation time per dataset, with outlier markers hidden.
sns.boxenplot(data=annotation_df, x="time", hue="dataset", showfliers=False)

In [None]:
# Number of distinct participants remaining after the filtering above.
participant_count = annotation_df["participant_id"].nunique()
print(f"Number of participants: {participant_count}")

In [None]:
def transform_answer(answer):
    """Convert a raw answer string into six boolean flags.

    The flags are ordered as
    ``[option 0, option 1, option 2, option 3, "Can't answer", "Odd question"]``.

    Args:
        answer: Raw answer text: the literal ``"Can't answer"`` or
            ``"Odd question"``, an empty string (nothing selected), or a
            comma-separated list of option indices such as ``"0, 2"``.

    Returns:
        A list of six booleans in the order given above.

    Raises:
        ValueError: If a comma-separated token is not an integer.
    """
    if answer == "Can't answer":
        return [False, ] * 4 + [True, False]
    elif answer == "Odd question":
        return [False, ] * 4 + [False, True]
    elif answer == "":
        return [False, ] * 6
    else:
        # Split on the bare comma and skip blank tokens so "0,2", "0, 2" and
        # "0, 2," all parse; int() itself ignores surrounding whitespace.
        selected = {int(token) for token in answer.split(",") if token.strip()}
        return [i in selected for i in range(4)] + [False, False]


In [None]:
# Replace each raw answer string with its six-element boolean list.
# Not idempotent: running this cell a second time passes lists (not strings)
# to transform_answer, which then fails on `answer.split`.
annotation_df["answer"] = annotation_df["answer"].apply(transform_answer)
annotation_df

In [None]:
# Position in the six-element "answer" list -> name of the resulting flag column.
answer_names_dict = {0: "correct", 1: "distractor1", 2: "distractor2", 3: "distractor3", 4: "cant_answer",
                     5: "odd_question"}
answer_names = list(answer_names_dict.values())
# Build all flag columns with one constructor call instead of creating a
# pd.Series per row via .apply(pd.Series); passing `columns` explicitly also
# keeps the expected columns when there are no rows.
answer_flags = pd.DataFrame(annotation_df["answer"].tolist(), index=annotation_df.index, columns=answer_names)
# Attach the flags by index and drop the list-valued source column.
annotation_df = annotation_df.merge(answer_flags, left_index=True, right_index=True).drop(columns=["answer"])

In [None]:
# Inspect the table with one boolean column per answer option.
annotation_df

In [None]:
# Remove annotations flagged "cant_answer"; the flag column is then no longer needed.
annotation_df = annotation_df[~annotation_df["cant_answer"]].drop(columns=["cant_answer"])

In [None]:
# Inspect the table after the "cant_answer" rows were removed.
annotation_df

In [None]:
# Distribution of annotation rows per question:
# index = rows per question, value = number of questions with that many rows.
annotation_df.groupby("question_id").size().value_counts()


In [None]:
# How often each (participant, question) pair occurs; any count above 1 means
# a participant annotated the same question more than once.
annotation_df.groupby(["participant_id", "question_id"]).size().value_counts()

In [None]:
# Number of distinct participants who annotated at least one question more than once.
pair_counts = annotation_df.groupby(["participant_id", "question_id"]).size()
repeated_pairs = pair_counts[pair_counts > 1]
repeated_pairs.index.get_level_values("participant_id").unique().size

In [None]:
# Drop every question that has fewer than 3 annotation rows.
# NOTE(review): this counts rows with a non-null participant_id, not distinct
# participants, whereas the later aggregation uses nunique — confirm which is intended.
annotation_count_qid = annotation_df.groupby("question_id")["participant_id"].count()
under_annotated_ids = annotation_count_qid[annotation_count_qid.lt(3)].index
annotation_df = annotation_df.loc[~annotation_df["question_id"].isin(under_annotated_ids)]

In [None]:
# Bar chart of the fraction of annotations selecting each remaining answer
# flag (cant_answer was removed above), overall and per dataset.
answer_names = sorted(set(answer_names) - {"cant_answer"})
is_sdd = annotation_df["dataset"] == "sdd"
is_musiccaps = annotation_df["dataset"] == "musiccaps"
all_counts = annotation_df[answer_names].mean()
sdd_counts = annotation_df.loc[is_sdd, answer_names].mean()
mc_counts = annotation_df.loc[is_musiccaps, answer_names].mean()
merged_counts = pd.concat([all_counts, sdd_counts, mc_counts], axis=1, keys=["both", "sdd", "mc"])
ax = merged_counts.plot(kind="bar", title="Answer distribution", ylabel="Fraction of answers")
ax.get_figure().savefig("data/plots/answer_distribution.png")

In [None]:
# Collapse the annotations to one row per question.
# Averaged over annotations: time and the per-option selection flags.
mean_columns = ["time", "correct", "distractor1", "distractor2", "distractor3", "odd_question"]
# Question-level columns: take the first value seen in each group.
first_columns = ["question", "dataset", "correct_answer", "distractor_1_answer", "distractor_2_answer",
                 "distractor_3_answer", "dataset_identifier"]
aggregators = {
    **{column: "mean" for column in mean_columns},
    **{column: "first" for column in first_columns},
    # Number of distinct participants who annotated the question.
    "participant_id": "nunique",
}
benchmark_df = annotation_df.groupby("question_id").agg(aggregators)

In [None]:
# Inspect the per-question aggregate.
benchmark_df

In [None]:
# "distractors" = the highest selection rate among the three distractors of a question.
benchmark_df["distractors"] = benchmark_df[["distractor1", "distractor2", "distractor3"]].max(axis=1)

In [None]:
# Candidate filter A: correct rate >= 0.5 and no distractor above 0.5 (ties allowed on both sides).
benchmark_df[(benchmark_df["correct"] >= 0.5) & (benchmark_df["distractors"] <= 0.5)]

In [None]:
# Candidate filter B: correct rate strictly above 0.5, distractors at most 0.5.
benchmark_df[(benchmark_df["correct"] > 0.5) & (benchmark_df["distractors"] <= 0.5)]

In [None]:
# Candidate filter C: strict on both sides (correct > 0.5, distractors < 0.5).
benchmark_df[(benchmark_df["correct"] > 0.5) & (benchmark_df["distractors"] < 0.5)]

In [None]:
# Number of questions whose correct option was selected in at least half of the annotations.
len(benchmark_df[benchmark_df["correct"] >= 0.5])

In [None]:
# Same count with a strict majority (correct rate > 0.5).
len(benchmark_df[benchmark_df["correct"] > 0.5])

In [None]:
# Print how many questions survive each candidate threshold combination.
correct_rate = benchmark_df["correct"]
distractor_rate = benchmark_df["distractors"]
threshold_variants = [
    ("correct >= 0.5\t& distractors <= 0.5", (correct_rate >= 0.5) & (distractor_rate <= 0.5)),
    ("correct >= 0.5 & distractors < 0.5", (correct_rate >= 0.5) & (distractor_rate < 0.5)),
    ("correct >  0.5\t& distractors <= 0.5", (correct_rate > 0.5) & (distractor_rate <= 0.5)),
    ("correct >  0.5\t& distractors <  0.5", (correct_rate > 0.5) & (distractor_rate < 0.5)),
    ("correct >= 0.5\t& distractors <= 0.5 & correct > distractors",
     (correct_rate >= 0.5) & (distractor_rate <= 0.5) & (correct_rate > distractor_rate)),
]
for label, mask in threshold_variants:
    print(label, len(benchmark_df[mask]), sep="\t")

In [None]:
# Final benchmark: questions with correct rate >= 0.5 and no distractor above 0.5.
keep_mask = (benchmark_df["correct"] >= 0.5) & (benchmark_df["distractors"] <= 0.5)
final_benchmark = (
    benchmark_df[keep_mask]
    .drop(columns=["time"])
    # After aggregation "participant_id" holds the number of distinct annotators.
    .rename(columns={"participant_id": "num_annotations"})
)

# Genre metadata of both source datasets, keyed by track identifier.
sdd_meta = pd.read_csv("data/SongDescriberDataset/song_describer-for-generation.csv", index_col="track_id",
                       usecols=["track_id", "genre"], dtype={"track_id": str})
mc_meta = pd.read_csv("data/musiccaps/musiccaps-for-generation.csv")
# MusicCaps clips are identified as "<ytid>_<start_s>".
# NOTE(review): presumably this matches the format of "dataset_identifier" — confirm.
mc_meta["identifier"] = mc_meta["ytid"] + "_" + mc_meta["start_s"].astype(str)
mc_meta = mc_meta.set_index("identifier")

# One lookup over both datasets; MusicCaps entries win on a key collision.
genres = {**sdd_meta["genre"].to_dict(), **mc_meta["genres"].to_dict()}
final_benchmark["genre"] = final_benchmark["dataset_identifier"].map(genres)

In [None]:
# Write the benchmark to CSV; reset_index turns question_id into a regular column.
final_benchmark.reset_index().to_csv("data/benchmark.csv", index=False)

In [None]:
# Count the questions that sit exactly on the 0.5 boundary of either criterion.
correct_tie = benchmark_df["correct"] == 0.5
distractor_tie = benchmark_df["distractors"] == 0.5
tie_variants = [
    ("correct == 0.5", correct_tie),
    ("distractors == 0.5", distractor_tie),
    ("correct == 0.5 & distractors == 0.5", correct_tie & distractor_tie),
    ("correct >= 0.5 & distractors == 0.5", (benchmark_df["correct"] >= 0.5) & distractor_tie),
]
for label, mask in tie_variants:
    print(label, len(benchmark_df[mask]), sep="\t")

In [None]:
# Question ids where a distractor reaches exactly 0.5 while the correct rate is at least 0.5.
benchmark_df[(benchmark_df["distractors"] == 0.5) & (benchmark_df["correct"] >= 0.5)].index.tolist()

In [None]:
# Borderline questions: correct rate in (0.3, 0.7), top distractor rate in (0.3, 0.7],
# and fewer than 5 distinct participants.
benchmark_df[(benchmark_df["distractors"] <= 0.7) & (benchmark_df["correct"] > 0.3) & (benchmark_df["correct"] < 0.7) & (benchmark_df["distractors"] > 0.3) & (benchmark_df["participant_id"] < 5)]

In [None]:
# Number of accepted questions that lie on a threshold boundary:
# (correct >= 0.5 & distractors == 0.5) plus (correct == 0.5 & distractors < 0.5).
len(benchmark_df[(benchmark_df["correct"] >= 0.5) & (benchmark_df["distractors"] == 0.5)]) + len(benchmark_df[(benchmark_df["correct"] == 0.5) & (benchmark_df["distractors"] < 0.5)])

In [None]:
# Strictly accepted questions that were annotated by more than 3 distinct participants.
benchmark_df[(benchmark_df["correct"] > 0.5) & (benchmark_df["distractors"] < 0.5) & (benchmark_df["participant_id"] > 3)]

In [None]:
# Question ids with exactly 3 distinct participants where the correct rate is above 0.75,
# yet a distractor was still chosen at a rate in (0.3, 0.5) and few marked the question as odd.
benchmark_df[(benchmark_df["correct"] > 0.75) & #(benchmark_df["correct"] < 1) &
             (benchmark_df["distractors"] < 0.5) & (benchmark_df["distractors"] > 0.3) &
             (benchmark_df["participant_id"] == 3)
             & (benchmark_df["odd_question"] < 0.2)
].index.tolist()

In [None]:
# Questions where a distractor beats the correct option (correct < 0.5, distractor > 0.5)
# with more than 2 distinct participants.
benchmark_df[(benchmark_df["correct"] < 0.5) &
                (benchmark_df["distractors"] > 0.5) &
                (benchmark_df["participant_id"] > 2)]

In [None]:
# Ids of questions with exactly 5 distinct participants where a distractor beats the
# correct option and fewer than 20% of annotations marked the question as odd.
benchmark_df[
    (benchmark_df["correct"] < 0.5) &
    (benchmark_df["distractors"] > 0.5) &
    (benchmark_df["participant_id"] == 5) &
    (benchmark_df["odd_question"] < 0.2)
].index.tolist()

In [None]:
# Sunburst chart preparation: the first two words of every question, stored as a list.
final_benchmark["first_words"] = final_benchmark["question"].str.split().str[:2]
# Lists are unhashable, so value_counts() on the list column raises TypeError;
# count the space-joined two-word prefix instead.
final_benchmark["first_words"].str.join(" ").value_counts()

In [None]:
# Frequency of the first word alone.
final_benchmark["first_words"].str[0].value_counts()

In [None]:
# Split the two-word prefix into separate columns and drop the list-valued helper.
# Not idempotent: "first_words" is removed here, so this cell cannot be re-run on its own.
final_benchmark = final_benchmark.assign(
    first_word=final_benchmark["first_words"].str[0],
    second_word=final_benchmark["first_words"].str[1],
).drop(columns=["first_words"])
# Number of questions per (first word, second word) combination.
question_start = (
    final_benchmark
    .groupby(["first_word", "second_word"])
    .size()
    .reset_index(name="count")
)

In [None]:
# Sunburst of the first two question words (plotly.express is imported as `px`
# in the notebook's import cell). Only prefixes occurring more than 9 times are drawn.
fig = px.sunburst(
    question_start[question_start["count"] > 9],
    path=["first_word", "second_word"],
    values="count",
    title="Sunburst chart of first two words of questions"
)
fig.show()


In [None]:
# Save the sunburst chart as a PNG.
# NOTE(review): plotly's write_image presumably needs a static-export backend
# (e.g. kaleido) installed — confirm for the target environment.
fig.write_image("data/plots/question_sunburst.png")

In [None]:
# Switch seaborn to its "paper" plotting context; the call sits before the remaining plots below.
sns.set_context("paper")

In [None]:
# Histogram of the per-question correct-answer rate in the final benchmark, by dataset.
sns.histplot(data=final_benchmark, x="correct", hue="dataset")

In [None]:
# Histogram of the highest distractor selection rate per question, by dataset.
sns.histplot(data=final_benchmark, x="distractors", hue="dataset")

In [None]:
# Histogram of the number of distinct annotators per question, by dataset.
sns.histplot(data=final_benchmark, x="num_annotations", hue="dataset")

In [None]:
# Number of distinct dataset identifiers in the final benchmark, followed by the question count per dataset.
print("distinct dataset identifiers", final_benchmark["dataset_identifier"].nunique())
final_benchmark["dataset"].value_counts()

In [None]:
# Bar chart: number of benchmark questions per dataset.
ax = sns.countplot(data=final_benchmark, x="dataset")
ax.set_title("Num questions per dataset")
ax.get_figure().savefig("data/plots/num_questions_per_dataset.png")

In [None]:
# Bar chart: number of distinct (dataset_identifier, dataset) pairs per dataset.
# A previous bare value_counts() call was removed: it was not the last
# expression of the cell, so its result was computed and discarded.
ax = sns.countplot(data=final_benchmark[["dataset_identifier", "dataset"]].drop_duplicates(), x="dataset")
ax.set_title("Num tracks per dataset")
ax.get_figure().savefig("data/plots/num_tracks_per_dataset.png")

In [None]:
# Number of distinct (dataset_identifier, dataset) pairs per dataset, as a table.
# (Selecting two columns needs the double brackets; a single-bracket pair would be read as one key.)
final_benchmark[["dataset_identifier", "dataset"]].drop_duplicates().groupby("dataset").size()

In [None]:
# Stacked histogram of genres in the final benchmark, split by dataset.
ax = sns.histplot(data=final_benchmark, x="genre", hue="dataset", multiple="stack")
ax.set_title("Genre distribution")
# Rotate the x tick labels through tick_params rather than re-setting the label
# texts: set_xticklabels(get_xticklabels()) depends on the labels already being
# populated and pins a fixed formatter onto a non-fixed locator.
ax.tick_params(axis="x", labelrotation=90)
ax.get_figure().savefig("data/plots/genre_distribution.png")