# Results Formatting

This notebook was used for collecting and formatting the results of the experiments, for the evaluation section of the paper.

### Imports

In [4]:
import pandas as pd
import os
import src.Common.Utils.Config.ConfigHelper as ConfigHelper
import shutil
import src.Common.EpisodeReplay.EpisodeReplay as EpisodeReplay
from tqdm import tqdm
import pyperclip as pc

### Define Constants

In [10]:
# Run-group identifier; used as a path component under Data/<env>/ by the cells below
RunGroup = "12"
# Environments to process
EnvNames = ["FrozenLake"]
# Behaviour types whose <type>_Episodes.json files are read and updated
BehaviouralTypes = ["Human", "Curated", "HighScore"]
# Subset of behaviour types for which manual-review files are generated and summarised
BehaviouralTypesToReview = ["Human", "Curated"]
# Agent types; each names a replays/<agentType> folder and a column of the result tables
AgentTypes = ["HardCoded", "ML", "Random", "Human"]

# Manual review configuration
# NOTE(review): MaxChoicesPerAgent is not referenced by any cell visible in this notebook — confirm it is still needed
MaxChoicesPerAgent = 5
# Number of Replay_<i> columns, i.e. the maximum number of replays grouped into one review row
MaxReplaysPerChoice = 5


In [3]:
def CopyDemos(envName, runGroup):
	"""Replace the run group's "Human" replay folder with a copy of the "dev" one.

	Copies Data/<envName>/dev/replays/Human to
	Data/<envName>/<runGroup>/replays/Human. An existing destination folder is
	deleted first, so files that only exist in the destination are removed.

	Args:
		envName: environment name (folder under Data/).
		runGroup: run-group folder name that receives the copy.
	"""
	source, destination = (
		os.path.join("Data", envName, group, "replays", "Human")
		for group in ("dev", runGroup)
	)

	# copytree refuses to write into an existing folder, so clear it first
	if os.path.exists(destination):
		shutil.rmtree(destination)
	shutil.copytree(source, destination)

	print("Copied Human Demos for: " + envName)

def AddDemoIdsToBehaviour(envName, runGroup, behaviourType):
	"""Register the human demo episode IDs in a behaviour type's episode file.

	Reads Data/<envName>/<runGroup>/replays/Human/stats.tsv, keeps the rows whose
	behaviour (second-to-last "_"-separated token of loggerName) equals the
	lower-cased behaviourType, and stores the unique EpisodeId values under the
	key "Human_<behaviourType>" in <behaviourType>_Episodes.json. For
	"HighScore" no filtering is applied, so every demo episode is stored.

	Args:
		envName: environment name (folder under Data/).
		runGroup: run-group folder name.
		behaviourType: behaviour type, e.g. "Human", "Curated" or "HighScore".
	"""
	runFolder = os.path.join("Data", envName, runGroup)
	demoStats = pd.read_csv(os.path.join(runFolder, "replays", "Human", "stats.tsv"), sep="\t")

	wantedBehaviour = behaviourType.lower()

	# the behaviour is encoded as the second-to-last token of the logger name
	demoStats["Behaviour"] = [name.split("_")[-2] for name in demoStats["loggerName"]]

	# "highscore" keeps every demo; any other type keeps only its own demos
	if wantedBehaviour != "highscore":
		demoStats = demoStats[demoStats["Behaviour"] == wantedBehaviour]

	episodeIds = demoStats["EpisodeId"].unique().tolist()

	# update the behaviour type's episode-ID file in place
	episodeIdsPath = os.path.join(runFolder, f"{behaviourType}_Episodes.json")
	episodeIdsJson = ConfigHelper.LoadConfig(episodeIdsPath)
	episodeIdsJson["Human_" + behaviourType] = episodeIds
	ConfigHelper.SaveConfig(episodeIdsJson, episodeIdsPath)

	print(f"Added {len(episodeIds)} {behaviourType} episodes to {envName}")

# For every environment: refresh the run group's copy of the human demos, then
# register the demo episode IDs under each behaviour type.
for envName in EnvNames:
	CopyDemos(envName, RunGroup)

	for behaviourType in BehaviouralTypes:
		AddDemoIdsToBehaviour(envName, RunGroup, behaviourType)


Copied Human Demos for: FrozenLake
Added 59 Human episodes to FrozenLake
Added 23 Curated episodes to FrozenLake
Added 82 HighScore episodes to FrozenLake


## Collect Replays For Manual Review

In [5]:
def LoadReplay(envName, runGroup, agentType, episodeId):
	"""Load one episode replay, returning None when it cannot be loaded.

	Args:
		envName: environment name (folder under Data/).
		runGroup: run-group folder name.
		agentType: agent-type folder under replays/.
		episodeId: episode folder name.

	Returns:
		Whatever EpisodeReplay.EpisodeReplay.LoadFromFolder returns for
		Data/<envName>/<runGroup>/replays/<agentType>/<episodeId>, or None if
		loading raised an exception.
	"""
	path = os.path.join("Data", envName, runGroup, "replays", agentType, episodeId)
	try:
		return EpisodeReplay.EpisodeReplay.LoadFromFolder(path)
	except Exception:
		# Best-effort load: a missing or unreadable replay is skipped by the caller.
		# Only Exception is caught so Ctrl-C / kernel interrupts still propagate.
		return None

def CollectReplaysToReview(envName, runGroup, behaviourType):
	"""Build the manual-review table for one environment and behaviour type.

	Reads the agent -> episode-ID mapping from
	Data/<envName>/<runGroup>/<behaviourType>_Episodes.json, keeps the episodes
	whose replay can be loaded, and groups them, per agent, into rows of at most
	MaxReplaysPerChoice replays.

	Args:
		envName: environment name (folder under Data/).
		runGroup: run-group folder name.
		behaviourType: behaviour type whose episode file is read.

	Returns:
		DataFrame with columns AgentId, Predicted (always None, to be filled in
		by the reviewer), AgentType (text before the first "_" of the agent id)
		and Replay_0 .. Replay_<MaxReplaysPerChoice - 1>. Unused replay slots of
		a partially filled row are NaN.
	"""
	episodeIdsPath = os.path.join("Data", envName, runGroup, f"{behaviourType}_Episodes.json")
	replays = ConfigHelper.LoadConfig(episodeIdsPath)

	columns = ["AgentId", "Predicted", "AgentType"]
	columns += [f"Replay_{slot}" for slot in range(MaxReplaysPerChoice)]

	rows = []
	for agentId, episodeIds in replays.items():
		agentType = agentId.split("_")[0]

		# only episodes whose replay actually loads can be reviewed
		loadableIds = [
			episodeId
			for episodeId in episodeIds
			if LoadReplay(envName, runGroup, agentType, episodeId) is not None
		]

		# Chunk the loadable episodes. The trailing partial chunk is always kept,
		# even when the agent's last listed episode failed to load.
		for start in range(0, len(loadableIds), MaxReplaysPerChoice):
			row = {"AgentId": agentId, "Predicted": None, "AgentType": agentType}
			for slot, episodeId in enumerate(loadableIds[start:start + MaxReplaysPerChoice]):
				row[f"Replay_{slot}"] = episodeId
			rows.append(row)

	# build the frame once instead of concatenating row by row
	return pd.DataFrame(rows, columns=columns)
	
# Build, shuffle and save the manual-review sheet for each environment / behaviour type.
# NOTE(review): this rewrites ReplaysToReview_<behaviourType>.json with "Predicted" reset to
# null — presumably any labels already entered by hand are lost on re-run; confirm before re-executing.
for envName in EnvNames:
	for behaviourType in BehaviouralTypesToReview:

		replaysToReview = CollectReplaysToReview(envName, RunGroup, behaviourType)
		# shuffle the rows; no random_state is passed, so the order differs between runs
		replaysToReview = replaysToReview.sample(frac=1)
		replaysToReviewPath = os.path.join("Data", envName, RunGroup, f"ReplaysToReview_{behaviourType}.json")
		replaysToReview.to_json(replaysToReviewPath, orient="records", indent=4)

		print(f"Collected {len(replaysToReview)} replays to review for {behaviourType} in {envName}")


Collected 92 replays to review for Human in FrozenLake
Collected 85 replays to review for Curated in FrozenLake


## Format the results of the review

In [12]:
def LoadReplaysToReview(envName, runGroup, behaviourType):
	"""Summarise a filled-in review file per agent type.

	Reads Data/<envName>/<runGroup>/ReplaysToReview_<behaviourType>.json and, for
	each AgentType, computes the count and sum of the "Predicted" column.

	Args:
		envName: environment name (folder under Data/).
		runGroup: run-group folder name.
		behaviourType: behaviour type whose review file is read.

	Returns:
		DataFrame indexed by AgentType with the columns Predicted/count,
		Predicted/sum, Percent (sum / count) and Norm_Percent (Percent divided by
		the "Human" row's Percent); both percent columns are formatted as strings
		such as "50%".
	"""
	reviewPath = os.path.join("Data", envName, runGroup, f"ReplaysToReview_{behaviourType}.json")
	reviewed = pd.read_json(reviewPath)

	summary = reviewed.groupby("AgentType").agg({"Predicted": ["count", "sum"]})

	predicted = summary["Predicted"]
	summary["Percent"] = predicted["sum"] / predicted["count"]
	# normalise against the rate obtained for the human agent type
	summary["Norm_Percent"] = summary["Percent"] / summary["Percent"]["Human"]

	# render both ratio columns as whole-number percentages
	for ratioColumn in ("Percent", "Norm_Percent"):
		summary[ratioColumn] = summary[ratioColumn].apply(lambda ratio: f"{ratio:.0%}")

	return summary


# Print a heading and display the per-agent-type review summary for each
# environment / behaviour type combination.
for envName in EnvNames:
	for behaviourType in BehaviouralTypesToReview:
		grouped = LoadReplaysToReview(envName, RunGroup, behaviourType)
		print(f"{envName} - {behaviourType}")
		display(grouped)

FrozenLake - Human


Unnamed: 0_level_0,Predicted,Predicted,Percent,Norm_Percent
Unnamed: 0_level_1,count,sum,Unnamed: 3_level_1,Unnamed: 4_level_1
AgentType,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
HardCoded,0,0.0,nan%,nan%
Human,0,0.0,nan%,nan%
ML,0,0.0,nan%,nan%
Random,0,0.0,nan%,nan%


FrozenLake - Curated


Unnamed: 0_level_0,Predicted,Predicted,Percent,Norm_Percent
Unnamed: 0_level_1,count,sum,Unnamed: 3_level_1,Unnamed: 4_level_1
AgentType,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
HardCoded,0,0.0,nan%,nan%
Human,0,0.0,nan%,nan%
ML,0,0.0,nan%,nan%
Random,0,0.0,nan%,nan%


## Formatting tables and graphs

In [54]:
def CollectEvalIds(runGroup, envNames, behaviouralTypes):
	"""List every evaluated episode with its environment, agent and behaviour type.

	For each environment and behaviour type, reads the agent -> episode-ID
	mapping from Data/<envName>/<runGroup>/<behaviourType>_Episodes.json and
	emits one row per episode ID.

	Args:
		runGroup: run-group folder name.
		envNames: environment names to include.
		behaviouralTypes: behaviour types to include.

	Returns:
		DataFrame with the columns EnvName, AgentId, AgentType (text before the
		first "_" of the agent id), EpisodeId and Behaviour.
	"""
	columns = ["EnvName", "AgentId", "AgentType", "EpisodeId", "Behaviour"]
	records = []

	for envName in envNames:
		for behaviourType in behaviouralTypes:
			episodeIdsPath = os.path.join("Data", envName, runGroup, f"{behaviourType}_Episodes.json")
			replays = ConfigHelper.LoadConfig(episodeIdsPath)

			for agentId, episodeIds in replays.items():
				agentType = agentId.split("_")[0]
				records.extend(
					{
						"EnvName": envName,
						"AgentId": agentId,
						"AgentType": agentType,
						"EpisodeId": episodeId,
						"Behaviour": behaviourType,
					}
					for episodeId in episodeIds
				)

	# Build the frame once; concatenating one row at a time is quadratic in the
	# number of episodes.
	return pd.DataFrame(records, columns=columns)

# One row per (environment, behaviour type, agent, episode) listed in the episode-ID files
evalIds = CollectEvalIds(RunGroup, EnvNames, BehaviouralTypes)

In [111]:
def CleanStats(df):
	"""Return a trimmed copy of a stats table with a Duration column added.

	Columns whose name starts with "LearnerConfig", "ModelConfigs" or
	"DataTables" are removed, and Duration is set to
	(EndTime - StartTime) / 1e9. The input frame is left unchanged.

	Args:
		df: stats table containing at least the EndTime and StartTime columns.

	Returns:
		New DataFrame without the dropped columns and with Duration.
	"""
	droppedPrefixes = ("LearnerConfig", "ModelConfigs", "DataTables")
	unwanted = [name for name in df.columns if name.startswith(droppedPrefixes)]

	# presumably the timestamps are in nanoseconds, making Duration seconds — TODO confirm
	return df.drop(columns=unwanted).assign(
		Duration=lambda trimmed: (trimmed["EndTime"] - trimmed["StartTime"]) / 1e9
	)

def CombinedStats(runGroup, envNames, agentTypes):
	"""Load, clean and stack the stats tables of every environment / agent type.

	Reads Data/<envName>/<runGroup>/replays/<agentType>/stats.tsv for each
	combination, passes it through CleanStats and concatenates the results with
	a fresh 0..n-1 index.

	Args:
		runGroup: run-group folder name.
		envNames: environment names to include.
		agentTypes: agent-type folder names to include.

	Returns:
		The combined DataFrame, or None when there was nothing to read (empty
		envNames or agentTypes).
	"""
	frames = []

	for envName in envNames:
		for agentType in agentTypes:
			statsPath = os.path.join("Data", envName, runGroup, "replays", agentType, "stats.tsv")
			frames.append(CleanStats(pd.read_csv(statsPath, sep="\t")))

	# pd.concat raises on an empty list; keep returning None in that case
	if not frames:
		return None

	# concatenate once instead of re-copying the accumulated frame on every iteration
	return pd.concat(frames, ignore_index=True)

stats = CombinedStats(RunGroup, EnvNames, AgentTypes)

# Left-join the stats onto the evaluation IDs by EpisodeId, then discard every
# row that still has a NaN in any column.
evalDf = evalIds.merge(stats, how="left", on=["EpisodeId"]).dropna()

In [155]:
def ConvertToLatex(df):
	"""Render a DataFrame as the source of a LaTeX longtable.

	The table has one centred, bordered column per DataFrame column, a bold
	header that is repeated on continuation pages, and placeholder caption and
	label text. The DataFrame index is not written, and cell values are
	inserted with str() without any LaTeX escaping.

	Args:
		df: table to render; column labels are converted with str().

	Returns:
		The LaTeX code as a single string.
	"""
	# Backslashes are escaped explicitly throughout: sequences such as "\h" or
	# "\m" are invalid string escapes and trigger a SyntaxWarning on Python 3.12+.
	columns = [str(column) for column in df.columns]
	columnCount = str(len(columns))

	headerCode = (
		"\\hline\n"
		+ "\t\\multicolumn{1}{|c|}{\\textbf{"
		+ "}} &\n\t\\multicolumn{1}{c|}{\\textbf{".join(columns)
		+ "}} \\\\\n\\hline\n"
	)

	parts = [
		"\\begin{longtable}{|" + "c|" * len(columns) + "}\n",
		# caption and label placeholders
		"\\caption{Insert Caption Here.}\n",
		"\\label{tab:InsertLabelHere} \\\\\n",
		# header of the first page
		headerCode,
		"\\endfirsthead\n\n",
		# header of the continuation pages
		"\\multicolumn{" + columnCount + "}{c}%\n",
		"{{\\bfseries \\tablename\\ \\thetable{} -- continued from previous page}} \\\\\n",
		headerCode,
		"\\endhead\n\n",
		# footers
		"\\hline \\multicolumn{" + columnCount + "}{|c|}{{Continued on next page}} \\\\ \\hline\n\n",
		"\\endfoot\n",
		"\\hline\n",
		"\\endlastfoot\n",
		"\n",
	]

	# data rows
	for _, row in df.iterrows():
		parts.append("\t" + " & ".join(str(value) for value in row.tolist()) + " \\\\\n")

	parts.append("\\hline\n")
	parts.append("\\end{longtable}")

	return "".join(parts)

In [142]:
# Aggregations applied per column when summarising the evaluation episodes
aggregateSettings = {
	"EpisodeId": "count",
	"Duration": ["mean"],
	"EpisodeTotalReward": ["mean", "std"],
	"EpisodeTotalCuratedReward": ["mean", "std"],
}

# summary per environment, behaviour type and agent
evalDf.groupby(["EnvName", "Behaviour", "AgentId"]).agg(aggregateSettings)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,EpisodeId,Duration,EpisodeTotalReward,EpisodeTotalReward,EpisodeTotalCuratedReward,EpisodeTotalCuratedReward
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,count,mean,mean,std,mean,std
EnvName,Behaviour,AgentId,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
FrozenLake,Curated,HardCoded_D_1_RT_True_Curated,100,0.029948,1.0,0.0,1.0,0.0
FrozenLake,Curated,Human_Curated,23,2.874005,1.0,0.0,1.0,0.0
FrozenLake,Curated,ML_D_10_RT_False_Curated,100,0.423779,1.0,0.0,0.0,0.0
FrozenLake,Curated,ML_D_10_RT_True_Curated,100,0.181596,1.0,0.0,0.0,0.0
FrozenLake,Curated,Random_D_1_RT_True_Curated,112,2.173379,0.035714,0.186411,0.116071,0.32175
FrozenLake,HighScore,HardCoded_D_1_RT_True_HighScore,1000,0.009263,1.0,0.0,0.0,0.0
FrozenLake,HighScore,Human_HighScore,82,2.292962,0.963415,0.188897,0.292683,0.457794
FrozenLake,HighScore,ML_D_10_RT_False_HighScore,1000,0.286177,1.0,0.0,0.0,0.0
FrozenLake,HighScore,ML_D_10_RT_True_HighScore,1000,0.159529,1.0,0.0,0.0,0.0
FrozenLake,HighScore,Random_D_1_RT_True_HighScore,1016,2.495324,0.047244,0.212265,0.095472,0.294011


In [144]:


# Mean / std of the total reward per agent, restricted to the HighScore episodes
(
	evalDf.loc[evalDf["Behaviour"] == "HighScore"]
	.groupby(["EnvName", "AgentId"])[["EpisodeTotalReward"]]
	.agg(aggregateSettings["EpisodeTotalReward"])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,EpisodeTotalReward,EpisodeTotalReward
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std
EnvName,AgentId,Unnamed: 2_level_2,Unnamed: 3_level_2
FrozenLake,HardCoded_D_1_RT_True_HighScore,1.0,0.0
FrozenLake,Human_HighScore,0.963415,0.188897
FrozenLake,ML_D_10_RT_False_HighScore,1.0,0.0
FrozenLake,ML_D_10_RT_True_HighScore,1.0,0.0
FrozenLake,Random_D_1_RT_True_HighScore,0.047244,0.212265


In [192]:
def CreateEnvAgentTypeTable(evalDf, envNames, agentTypes, metric):
	"""Show an environment x agent-type table of "mean ±std" for one metric.

	Each cell is the mean and standard deviation of `metric` over the rows of
	evalDf with that EnvName and AgentType, formatted to two decimals. The table
	is converted with ConvertToLatex; the LaTeX code is copied to the clipboard
	and printed on one line, and the table itself is displayed.

	Args:
		evalDf: episode table with EnvName, AgentType and `metric` columns.
		envNames: environments, one table row each.
		agentTypes: agent types, one table column each.
		metric: name of the column to summarise.

	Returns:
		None.
	"""
	columns = ["EnvName", *agentTypes]

	records = []
	for envName in envNames:
		envDf = evalDf[evalDf["EnvName"] == envName]
		record = {"EnvName": envName}

		for agentType in agentTypes:
			values = envDf.loc[envDf["AgentType"] == agentType, metric]
			record[agentType] = f"{values.mean():.2f} ±{values.std():.2f}"

		records.append(record)

	# Build the frame once instead of concatenating each row onto an empty frame,
	# then use the environment name as the index.
	df = pd.DataFrame(records, columns=columns).set_index("EnvName")

	text = ConvertToLatex(df)
	pc.copy(text)
	print(text.replace("\n", " "))
	display(df)
	print("Copied to clipboard")

	return

In [195]:
# One subset of evalDf per behaviour type
curatedDf = evalDf.loc[evalDf["Behaviour"].eq("Curated")]
humanDf = evalDf.loc[evalDf["Behaviour"].eq("Human")]
highScoreDf = evalDf.loc[evalDf["Behaviour"].eq("HighScore")]

In [196]:
# Total reward per agent type over the HighScore episodes
CreateEnvAgentTypeTable(highScoreDf, EnvNames, AgentTypes, metric="EpisodeTotalReward")

\begin{longtable}{|c|c|c|c|} \caption{Insert Caption Here.} \label{tab:InsertLabelHere} \\ \hline 	\multicolumn{1}{|c|}{\textbf{HardCoded}} & 	\multicolumn{1}{c|}{\textbf{ML}} & 	\multicolumn{1}{c|}{\textbf{Random}} & 	\multicolumn{1}{c|}{\textbf{Human}} \\ \hline \endfirsthead  \multicolumn{4}{c}% {{\bfseries \tablename\ \thetable{} -- continued from previous page}} \\ \hline 	\multicolumn{1}{|c|}{\textbf{HardCoded}} & 	\multicolumn{1}{c|}{\textbf{ML}} & 	\multicolumn{1}{c|}{\textbf{Random}} & 	\multicolumn{1}{c|}{\textbf{Human}} \\ \hline \endhead  \hline \multicolumn{4}{|c|}{{Continued on next page}} \\ \hline  \endfoot \hline \endlastfoot  	1.00 ±0.00 & 1.00 ±0.00 & 0.05 ±0.21 & 0.96 ±0.19 \\ \hline \end{longtable}


Unnamed: 0_level_0,HardCoded,ML,Random,Human
EnvName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FrozenLake,1.00 ±0.00,1.00 ±0.00,0.05 ±0.21,0.96 ±0.19


Copied to clipboard


In [188]:
# Curated reward per agent type over the Curated episodes
CreateEnvAgentTypeTable(curatedDf, EnvNames, AgentTypes, metric="EpisodeTotalCuratedReward")

\begin{longtable}{|c|c|c|c|} \caption{Insert Caption Here.} \label{tab:InsertLabelHere} \\ \hline 	\multicolumn{1}{|c|}{\textbf{HardCoded}} & 	\multicolumn{1}{c|}{\textbf{ML}} & 	\multicolumn{1}{c|}{\textbf{Random}} & 	\multicolumn{1}{c|}{\textbf{Human}} \\ \hline \endfirsthead  \multicolumn{4}{c}% {{\bfseries \tablename\ \thetable{} -- continued from previous page}} \\ \hline 	\multicolumn{1}{|c|}{\textbf{HardCoded}} & 	\multicolumn{1}{c|}{\textbf{ML}} & 	\multicolumn{1}{c|}{\textbf{Random}} & 	\multicolumn{1}{c|}{\textbf{Human}} \\ \hline \endhead  \hline \multicolumn{4}{|c|}{{Continued on next page}} \\ \hline  \endfoot \hline \endlastfoot  	1.00 ±0.00 & 0.00 ±0.00 & 0.12 ±0.32 & 1.00 ±0.00 \\ \hline \end{longtable}


Unnamed: 0_level_0,HardCoded,ML,Random,Human
EnvName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FrozenLake,1.00 ±0.00,0.00 ±0.00,0.12 ±0.32,1.00 ±0.00


Copied to clipboard


In [197]:
# Episode duration per agent type over the HighScore episodes
CreateEnvAgentTypeTable(highScoreDf, EnvNames, AgentTypes, metric="Duration")

\begin{longtable}{|c|c|c|c|} \caption{Insert Caption Here.} \label{tab:InsertLabelHere} \\ \hline 	\multicolumn{1}{|c|}{\textbf{HardCoded}} & 	\multicolumn{1}{c|}{\textbf{ML}} & 	\multicolumn{1}{c|}{\textbf{Random}} & 	\multicolumn{1}{c|}{\textbf{Human}} \\ \hline \endfirsthead  \multicolumn{4}{c}% {{\bfseries \tablename\ \thetable{} -- continued from previous page}} \\ \hline 	\multicolumn{1}{|c|}{\textbf{HardCoded}} & 	\multicolumn{1}{c|}{\textbf{ML}} & 	\multicolumn{1}{c|}{\textbf{Random}} & 	\multicolumn{1}{c|}{\textbf{Human}} \\ \hline \endhead  \hline \multicolumn{4}{|c|}{{Continued on next page}} \\ \hline  \endfoot \hline \endlastfoot  	0.01 ±0.00 & 0.22 ±2.54 & 2.50 ±2.15 & 2.29 ±1.36 \\ \hline \end{longtable}


Unnamed: 0_level_0,HardCoded,ML,Random,Human
EnvName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
FrozenLake,0.01 ±0.00,0.22 ±2.54,2.50 ±2.15,2.29 ±1.36


Copied to clipboard
