In [2]:
import re
import pandas as pd
import plotly.express as px
import spacy
from copy import deepcopy

## Information on answers
This table records which form, question type, input data, answer type, and metric each answer corresponds to.

In [3]:
# Load the per-answer metadata table from "info_answers.csv"
info_answers = pd.read_csv("info_answers.csv")
# Quick size check of the loaded table
print(info_answers.shape)
# Preview the first three rows
info_answers.head(3)

(144, 5)


Unnamed: 0,form,type_question,data,type_answer,metric
0,f1,summary,French_Revolution,triples_dbpedia,granularity
1,f1,summary,French_Revolution,triples_dbpedia,relevance
2,f1,summary,French_Revolution,triples_dbpedia,succinctness


## Get answers from forms
The forms collect ratings for four metrics: granularity, relevance, diversity, and succinctness.

In [4]:
def read_answers(data_path):
    """Load one exported form CSV and return its cells in long format.

    The CSV is read as-is and transposed, so every original column becomes a
    row and every original row becomes a column labelled by its position.
    The first transposed row (the first CSV column, presumably submission-time
    information -- confirm against the export) and the last transposed row
    (the last CSV column) are discarded.

    Parameters
    ----------
    data_path : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Long-format frame with three columns:
        ``answer`` (header of the original CSV column),
        ``n`` (positional label of the original CSV row) and
        ``value`` (the cell content).
    """
    # Transpose the whole frame *before* slicing so the resulting dtypes are
    # the same as when the first and last columns are still present.
    wide = pd.read_csv(data_path).transpose().iloc[1:-1]
    respondent_ids = list(wide.columns)

    # reset_index() returns a new frame, so nothing is mutated in place on a
    # slice; the former index (the CSV headers) becomes the first column.
    wide = wide.reset_index()
    wide.columns = ["answer"] + respondent_ids

    return wide.melt(id_vars='answer', value_vars=respondent_ids, var_name='n', value_name='value')

In [5]:
# Read "f1_answers.csv" and "f2_answers.csv" into long format, keyed by form id ("f1", "f2")
answers = {f"f{i}": read_answers(f"f{i}_answers.csv") for i in ["1", "2"]}
# Preview the first rows of form f1
answers["f1"].head(3)

Unnamed: 0,answer,n,value
0,"Answer:\n""""""\nThe French Revolution, a period ...",0,3
1,"Answer:\n""""""\nThe French Revolution, a period ...",0,4
2,"Answer:\n""""""\nThe French Revolution, a period ...",0,2


In [6]:
def build_data(answers, info_answers, columns):
    """Attach answer metadata to the ratings of every respondent.

    For each form, the metadata rows of that form are placed side by side
    (by row position) with the rows of each value of ``n`` found in that
    form's answers; all resulting pieces are stacked into one frame.

    Parameters
    ----------
    answers : dict
        Maps a form id to a frame that has at least an ``n`` column.
    info_answers : pandas.DataFrame
        Metadata frame with a ``form`` column holding the form ids.
    columns : list
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all pieces with a fresh 0..N-1 index.
    """
    pieces = []
    for form_id, form_answers in answers.items():
        form_info = info_answers.loc[info_answers["form"] == form_id].reset_index(drop=True)
        for respondent in form_answers["n"].unique():
            respondent_rows = form_answers.loc[form_answers["n"] == respondent].reset_index(drop=True)
            # Both sides carry a 0..k-1 index, so axis=1 pairs rows by position.
            pieces.append(pd.concat([form_info, respondent_rows], axis=1))
    return pd.concat(pieces, axis=0, ignore_index=True)

In [7]:
# Column names of the metadata table followed by those of the long-format answers.
# NOTE: `columns` is handed to build_data, whose body never reads it.
columns = list(info_answers.columns) + list(answers["f1"].columns)
# Combine metadata and ratings of all forms into one frame
data = build_data(answers=answers, info_answers=info_answers, columns=columns)
data

Unnamed: 0,form,type_question,data,type_answer,metric,answer,n,value
0,f1,summary,French_Revolution,triples_dbpedia,granularity,"Answer:\n""""""\nThe French Revolution, a period ...",0,3
1,f1,summary,French_Revolution,triples_dbpedia,relevance,"Answer:\n""""""\nThe French Revolution, a period ...",0,4
2,f1,summary,French_Revolution,triples_dbpedia,succinctness,"Answer:\n""""""\nThe French Revolution, a period ...",0,2
3,f1,summary,French_Revolution,triples_dbpedia,diversity,"Answer:\n""""""\nThe French Revolution, a period ...",0,4
4,f1,summary,French_Revolution,base,granularity,"Answer:\n""""""\nThe French Revolution, spanning ...",0,3
...,...,...,...,...,...,...,...,...
715,f2,actor_common,"(Guillaume_Brune, Magnus_Gustav_von_Essen)",triples_dbpedia,diversity,"Answer: \n""""""\nGuillaume Brune and Magnus Gust...",4,5
716,f2,actor_common,"(Guillaume_Brune, Magnus_Gustav_von_Essen)",triples_eckg,granularity,"Answer: \n""""""\nGuillaume Brune and Magnus Gust...",4,4
717,f2,actor_common,"(Guillaume_Brune, Magnus_Gustav_von_Essen)",triples_eckg,relevance,"Answer: \n""""""\nGuillaume Brune and Magnus Gust...",4,5
718,f2,actor_common,"(Guillaume_Brune, Magnus_Gustav_von_Essen)",triples_eckg,succinctness,"Answer: \n""""""\nGuillaume Brune and Magnus Gust...",4,5


## Add groundedness

In [8]:
def clean_answer(text):
    """Strip the form wrapper around an answer and return the bare text.

    The following fragments are removed, in this order:
    a closing triple quote followed by a bracketed tag (on the same line or
    on the next line), then the opening ``Answer:`` header with one, zero or
    two spaces before the line break and the opening triple quote.

    Parameters
    ----------
    text : str
        Answer text as it appears in the exported data.

    Returns
    -------
    str
        ``text`` with every matching fragment deleted.
    """
    # Order matters: each substitution runs on the result of the previous one.
    wrapper_patterns = (
        r'""" \[.+\]',
        r'"""\n \[.+\]',
        r'Answer: \n"""\n',
        r'Answer:\n"""\n',
        r'Answer:  \n"""\n',
    )
    cleaned = text
    for pattern in wrapper_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

# Load the grounding table, then derive two columns in one step:
#   answer_pp    -> the answer text with the form wrapper stripped
#   groundedness -> decimal commas turned into dots and parsed as float
grounding = pd.read_csv("grounding.csv")
grounding = grounding.assign(
    answer_pp=grounding["answer"].apply(clean_answer),
    groundedness=grounding["groundedness"].str.replace(',', '.').astype(float),
)
print(grounding.shape)
grounding.head(3)

(36, 6)


Unnamed: 0,answer,events,nb_events,nb_mentioned,groundedness,answer_pp
0,"Answer: \n""""""\nBetween January 1, 1792, and Ja...",http://dbpedia.org/resource/Battle_of_Jemappes...,11,2,0.909091,"Between January 1, 1792, and January 1, 1793, ..."
1,"Answer: \n""""""\nBetween January 1, 1792, and Ja...",http://dbpedia.org/resource/Battle_of_Jemappes...,11,8,3.636364,"Between January 1, 1792, and January 1, 1793, ..."
2,"Answer: \n""""""\nBetween January 1, 1792, and Ja...",http://dbpedia.org/resource/Battle_of_Jemappes...,11,2,0.909091,"Between January 1, 1792, and January 1, 1793, ..."


In [9]:
# Keep only the identifying columns of `data`, add the cleaned answer text,
# drop exact duplicate rows and order by the cleaned text.
data_grounded = (
    data[["form", "type_question", "data", "type_answer", "answer"]]
    .assign(answer_pp=lambda frame: frame["answer"].apply(clean_answer))
    .drop_duplicates()
    .sort_values(by="answer_pp")
)
print(data_grounded.shape)
data_grounded.head(3)

(144, 6)


Unnamed: 0,form,type_question,data,type_answer,answer,answer_pp
48,f1,actor_event,Antoine_Balland,base,"Answer:\n""""""\nAntoine Balland, a French priest...","Antoine Balland, a French priest, was executed..."
49,f1,actor_event,Antoine_Balland,base,"Answer:\n""""""\nAntoine Balland, a French priest...","Antoine Balland, a French priest, was executed..."
50,f1,actor_event,Antoine_Balland,base,"Answer:\n""""""\nAntoine Balland, a French priest...","Antoine Balland, a French priest, was executed..."


In [10]:
# Attach the groundedness score to every answer through the cleaned text.
# how='left' keeps answers that have no match in `grounding` (their score is NaN).
data_grounded = pd.merge(data_grounded, grounding[["answer_pp", "groundedness"]], on='answer_pp', how='left')
# Shape the rows like those of `data`: the score becomes `value` under its own metric name.
data_grounded["metric"] = "groundedness"
# NOTE(review): -1 presumably marks rows that do not come from a form respondent -- confirm.
data_grounded["n"] = -1
data_grounded = data_grounded.rename(columns={"groundedness": "value"})
# Rebuild a wrapped answer string. The backslash is written as an explicit
# '\\' so the literal no longer relies on the invalid escape sequence '\['
# (SyntaxWarning on recent Python versions); the resulting text is unchanged
# and still contains a literal backslash before '['.
# NOTE(review): confirm that this backslash is wanted in the stored answer.
data_grounded["answer"] = 'Answer:\n"""\n' + data_grounded["answer_pp"] + '""" \\[Groundedness]'
# Same column order as `data`, then collapse rows that are now identical.
data_grounded = data_grounded[data.columns].drop_duplicates()
print(data_grounded.shape)
data_grounded.head(3)

(36, 8)


Unnamed: 0,form,type_question,data,type_answer,metric,answer,n,value
0,f1,actor_event,Antoine_Balland,base,groundedness,"Answer:\n""""""\nAntoine Balland, a French priest...",-1,0.0
4,f2,cause_consequence,Battle_of_Winterthur,triples_dbpedia,groundedness,"Answer:\n""""""\nAt the end of the Battle of Wint...",-1,5.0
8,f2,cause_consequence,Battle_of_Winterthur,triples_eckg,groundedness,"Answer:\n""""""\nAt the end of the Battle of Wint...",-1,5.0


In [11]:
# Append the groundedness rows below the form ratings.
# ignore_index=True already yields a fresh 0..N-1 index.
data = pd.concat([data, data_grounded], ignore_index=True)
print(data.shape)
data.head(3)

(756, 8)


Unnamed: 0,form,type_question,data,type_answer,metric,answer,n,value
0,f1,summary,French_Revolution,triples_dbpedia,granularity,"Answer:\n""""""\nThe French Revolution, a period ...",0,3
1,f1,summary,French_Revolution,triples_dbpedia,relevance,"Answer:\n""""""\nThe French Revolution, a period ...",0,4
2,f1,summary,French_Revolution,triples_dbpedia,succinctness,"Answer:\n""""""\nThe French Revolution, a period ...",0,2


## Analyse results

In [16]:
# Mean and standard deviation of `value` for every (type_answer, metric) pair
grouped = data.groupby(["type_answer", "metric"]).agg({"value": ["mean", "std"]}).reset_index()
# Replace the two-level header produced by agg with flat column names
grouped.columns = ["type_answer", "metric", "mean", "std"]
grouped

Unnamed: 0,type_answer,metric,mean,std
0,base,diversity,3.833333,1.02786
1,base,granularity,3.916667,0.961843
2,base,groundedness,1.110942,1.470802
3,base,relevance,4.183333,1.033206
4,base,succinctness,4.216667,0.922261
5,triples_dbpedia,diversity,3.966667,1.08872
6,triples_dbpedia,granularity,4.15,0.898681
7,triples_dbpedia,groundedness,2.235281,1.597252
8,triples_dbpedia,relevance,4.016667,1.06551
9,triples_dbpedia,succinctness,3.35,1.005493


In [28]:
# Wide layout: one row per answer type, one mean/std column per metric
data_overleaf = grouped.pivot(index="type_answer", columns="metric", values=["mean", "std"]).reset_index()
# Join each (statistic, metric) header pair with "_"; the index column pairs
# with an empty second level and therefore ends up named "type_answer_"
data_overleaf.columns = ["_".join(x) for x in data_overleaf.columns]
data_overleaf 

Unnamed: 0,type_answer_,mean_diversity,mean_granularity,mean_groundedness,mean_relevance,mean_succinctness,std_diversity,std_granularity,std_groundedness,std_relevance,std_succinctness
0,base,3.833333,3.916667,1.110942,4.183333,4.216667,1.02786,0.961843,1.470802,1.033206,0.922261
1,triples_dbpedia,3.966667,4.15,2.235281,4.016667,3.35,1.08872,0.898681,1.597252,1.06551,1.005493
2,triples_eckg,3.816667,4.133333,2.845764,4.116667,3.616667,1.033206,0.964921,1.863222,1.009978,0.993055


In [38]:
def format(x):
    """Render one table cell as LaTeX source.

    Real numbers (Python or NumPy scalars, booleans excluded) are rounded to
    two decimals and returned as plain text. Anything else is converted with
    ``str``, has its underscores escaped and is wrapped in ``\\texttt{...}``.

    The decision is based on the value's type rather than on the first
    character of its string form, so negative numbers are treated as numbers,
    and strings that are empty or start with a digit are treated as text
    instead of raising. As a consequence NaN is rendered as plain ``nan``.

    NOTE: this name shadows the built-in ``format`` in the notebook; it is
    kept because other cells call it under this name.

    Parameters
    ----------
    x : object
        Value to render.

    Returns
    -------
    str
        LaTeX snippet for the cell.
    """
    # Local import keeps this cell runnable without touching the import cell.
    from numbers import Real

    if isinstance(x, Real) and not isinstance(x, bool):
        return str(round(x, 2))
    return "\\texttt{" + str(x).replace("_", "\\_") + "}"

# Display names used in the LaTeX table for each answer type
ta_to_table_name = {"base": "base", "triples_dbpedia": "db-kg", "triples_eckg": "ec-kg"}
# Rebuilt here so the cell does not depend on the pivot cell having been run
data_overleaf = grouped.pivot(index="type_answer", columns="metric", values=["mean", "std"]).reset_index()
data_overleaf.columns = ["_".join(x) for x in data_overleaf.columns]
# Left-to-right order of the metric columns in the printed table
columns = ["groundedness", "granularity", "relevance", "succinctness", "diversity"]

# Print one LaTeX table row per answer type: "name & mean $\pm$ std & ... \\".
# Every backslash meant for LaTeX is written as '\\' so that no string relies
# on an invalid escape sequence ('\p', '\s'); the printed text is unchanged.
for _, row in data_overleaf.iterrows():
    res = [format(ta_to_table_name[row.type_answer_])]
    for col in columns:
        if row.type_answer_ != "base" or col == "groundedness":
            res.append(f"{format(row['mean_' + col])} $\\pm$ {format(row['std_' + col])}")
        else:
            # For the "base" row every metric except groundedness is printed
            # small and in parentheses.
            res.append(f"\\scriptsize ({format(row['mean_' + col])} $\\pm$ {format(row['std_' + col])})")
    print(" & ".join(res) + " \\\\")

\texttt{base} & 1.11 $\pm$ 1.47 & \scriptsize (3.92 $\pm$ 0.96) & \scriptsize (4.18 $\pm$ 1.03) & \scriptsize (4.22 $\pm$ 0.92) & \scriptsize (3.83 $\pm$ 1.03) \\
\texttt{db-kg} & 2.24 $\pm$ 1.6 & 4.15 $\pm$ 0.9 & 4.02 $\pm$ 1.07 & 3.35 $\pm$ 1.01 & 3.97 $\pm$ 1.09 \\
\texttt{ec-kg} & 2.85 $\pm$ 1.86 & 4.13 $\pm$ 0.96 & 4.12 $\pm$ 1.01 & 3.62 $\pm$ 0.99 & 3.82 $\pm$ 1.03 \\


In [None]:
# One grouped histogram of `value` per metric, with bars coloured by answer type
for metric in data.metric.unique():
    fig = px.histogram(data[data.metric==metric], x="value", color="type_answer", barmode="group")
    # Label the figure in the cell output, since the figure itself gets no title here
    print(f"Metric: {metric}")
    fig.show()

In [None]:
# Mean `value` per (type_answer, metric) pair, displayed with the group keys as index
data.groupby(["type_answer", "metric"]).agg({"value": "mean"})

In [None]:
# Mean `value` per (type_answer, metric) pair, as flat columns.
# NOTE(review): this rebinds `grouped`, replacing the mean/std table built
# earlier in the notebook; cells that expect its "mean"/"std" columns must be
# re-run from the aggregation cell after executing this one.
grouped = data.groupby(["type_answer", "metric"]).agg({"value": "mean"}).reset_index()

# One marker per (metric, answer type): metric on x, mean value on y
fig = px.scatter(
    grouped,
    x='metric',
    y='value',
    color='type_answer',
    color_discrete_sequence=px.colors.qualitative.Set1,
)

# The x axis shows the `metric` column, so it is labelled accordingly
fig.update_layout(
    title='Scatter Plot',
    xaxis=dict(title='Metric'),
    yaxis=dict(title='Value', range=[1,5])
)

# Show the plot
fig.show()

In [None]:
# Display the nb_events and nb_mentioned columns of the grounding table
grounding[["nb_events", "nb_mentioned"]]