## Syntax Understanding

In [2]:
import pandas as pd
import json
from pathlib import Path
import os

In [6]:
# Root of the project checkout that contains the `evaluation/` folder.
# NOTE: with the empty default, the f-string paths below start with "/" and
# therefore resolve from the filesystem root -- set this before running.
root_folder = ""
df = pd.read_csv(f"{root_folder}/evaluation/data/syntax_qa_final.csv")

# Keep only the QA pairs of users who contributed more than 5 rows.
syntax_df = df.groupby("user_id").filter(lambda x: len(x) > 5)
# Keep only pairs whose question AND answer rating are both above 5.
# The explicit .copy() turns the filtered result into an independent frame, so
# the column assignment below does not write into a slice of another frame
# (which is what triggers pandas' SettingWithCopyWarning).
syntax_df = syntax_df[
    (syntax_df["question_rating"] > 5) & (syntax_df["answer_rating"] > 5)
].copy()
syntax_df["type"] = "syntax"

# Per-user mean ratings over the retained pairs. These must be computed here,
# before the rating columns are dropped by the projection below.
average_question_rating = syntax_df.groupby("user_id")["question_rating"].mean()
average_answer_rating = syntax_df.groupby("user_id")["answer_rating"].mean()

# Project onto the export columns and renumber the rows 0..n-1.
# drop=True discards the old index directly instead of materialising it as an
# "index" column that then has to be dropped again.
syntax_df = syntax_df[
    ["question", "answer", "type", "module", "file_name", "start_line", "end_line"]
].reset_index(drop=True)


#### User Stats

In [3]:
# Restrict to users with more than 5 QA rows. .copy() gives an independent
# frame so the datetime conversion below never writes into a slice of `df`.
filtered_df = df.groupby("user_id").filter(lambda x: len(x) > 5).copy()
filtered_df["created"] = pd.to_datetime(filtered_df["created"])
# Per-user span between the earliest and the latest "created" timestamp.
time_diff_per_user = filtered_df.groupby("user_id")["created"].agg(lambda x: x.max() - x.min())

min_duration = time_diff_per_user.min()
# This statistic is the MEDIAN of the per-user spans, so it is named as such.
median_duration = time_diff_per_user.median()
# Backward-compatible alias: this value was previously exposed under the
# misleading name `mean_duration` although it has always held the median.
mean_duration = median_duration
std_duration = time_diff_per_user.std()
max_duration = time_diff_per_user.max()

# Displayed as (min, median, std, max).
min_duration, median_duration, std_duration, max_duration


(Timedelta('0 days 00:38:58.042692'),
 Timedelta('0 days 01:21:42.323134'),
 Timedelta('0 days 02:43:36.669400473'),
 Timedelta('0 days 08:18:05.677610'))

In [4]:
# Load the per-user questionnaire data and average the self-reported scores of
# the users whose status is "finished".
# NOTE(review): this rebinds `df`, which earlier cells use for the syntax QA
# table -- re-running those cells after this one will see the user data instead.
user_data_path = f"{root_folder}/evaluation/data/syntax_user_data_final.csv"
df = pd.read_csv(user_data_path)

is_finished = df["status"] == "finished"
finished_users = df[is_finished]

# One column mean per questionnaire item, unpacked in the listed order.
mean_coding_knowledge, mean_python_knowledge, mean_working_experience = (
    finished_users[column].mean()
    for column in ("coding_knowledge", "python_knowledge", "working_experience")
)

mean_coding_knowledge, mean_python_knowledge, mean_working_experience

(3.888888888888889, 4.333333333333333, 3.5555555555555554)

## Dependency Understanding

In [5]:
# Question that is removed from the dependency set below.
excluded_question = "What files are imported directly in the file test_profiler.py within the programming repository?"
# Column order of the frame produced by this cell.
export_columns = ["marked", "question", "answer", "type", "module", "file_name", "start_line", "end_line"]

depend_df = pd.read_json(f"{root_folder}/evaluation/data/dependency_spyder_qa.jsonl", lines=True)

# Derive the dotted module path (directory part of "target_file" with "/"
# replaced by ".") and the bare file name (text after the last "/").
depend_df["module"] = (
    depend_df["target_file"].apply(os.path.dirname).str.replace("/", ".", regex=False)
)
depend_df["file_name"] = depend_df["target_file"].str.rsplit("/", n=1).str[-1]

# Constant columns: every row is a dependency question, not yet marked, and
# carries no line range.
depend_df["type"] = "dependencies"
depend_df["marked"] = "unmarked"
depend_df["start_line"] = None
depend_df["end_line"] = None

# Drop the excluded question and project onto the export columns in one step.
# .copy() makes the result an independent frame, so the text replacements below
# do not assign into a slice (avoids pandas' SettingWithCopyWarning).
depend_df = depend_df.loc[depend_df["question"] != excluded_question, export_columns].copy()

# Case-insensitively shorten the phrase "target file" to "file" in both texts.
for text_column in ("question", "answer"):
    depend_df[text_column] = depend_df[text_column].str.replace("target file", "file", case=False)

# Renumber the rows 0..n-1 after the filter, discarding the old index.
depend_df = depend_df.reset_index(drop=True)

# Uncomment to write the unmarked set to CSV.
# depend_df.to_csv(f"{root_folder}/evaluation/data/dependency_spyder_qa_unmarked.csv", index=False)




### After Marking

In [5]:
# NOTE(review): this path is hard-coded relative to the working directory,
# whereas the other cells build their paths from `root_folder` -- confirm which
# location is intended.
depend_df = pd.read_csv("ma_llm/evaluation/data/dependency_spyder_qa_good.csv", sep=";")
# Keep the columns at positions 1..7 (the first column is dropped). .copy()
# makes the projection an independent frame so the assignment below does not
# write into a slice (avoids pandas' SettingWithCopyWarning).
depend_df = depend_df[depend_df.columns[1:8]].copy()

# Prefix every module with "spyder."; rows without a module (rendered as the
# string "nan" by astype(str)) become the bare "spyder". This is done
# column-wise rather than with a per-row iterrows()/.at loop.
module_text = depend_df["module"].astype(str)
depend_df["module"] = ("spyder." + module_text).where(module_text != "nan", "spyder")

# Discard rows that have no file name.
depend_df = depend_df[~depend_df['file_name'].isna()]
depend_df


Unnamed: 0,question,answer,type,module,file_name,start_line,end_line
0,Which libraries are used in the file spyder/ap...,The libraries used in the file spyder/app/util...,dependencies,spyder.spyder.app,utils.py,,
1,What library is imported in the file 'spyder/p...,The library 'qtpy.QtCore' is imported in the f...,dependencies,spyder.spyder.plugins.statusbar,plugin.py,,
2,Which file does spyder.plugins.history.api.py ...,It imports classes from spyder.plugins.history...,dependencies,spyder.spyder.plugins.history,api.py,,
3,What library is directly imported in the file ...,The 'spyder/utils/snippets/ast.py' file direct...,dependencies,spyder.spyder.utils.snippets,ast.py,,
4,What libraries are imported directly in the fi...,The file pycodestyle_conf.py directly imports ...,dependencies,spyder.external-deps.python-lsp-server.pylsp.c...,pycodestyle_conf.py,,
...,...,...,...,...,...,...,...
134,Does the file JsonRequestBehaviorObsedian cont...,No,dependencies,spyder.spyder.api.widgets,mixins.py,,
135,Is the function to_text_string contained in th...,Yes,dependencies,spyder.spyder.plugins.explorer.widgets.tests,conftest.py,,
137,What libraries are directly imported in the fi...,"functools, copy, typing, uuid, qtpy.QtCore, qt...",dependencies,spyder.spyder.plugins.run,confpage.py,,
138,What library is directly imported in the file ...,logging,dependencies,spyder.spyder.plugins.editor.api,panel.py,,


## Meta Information Understanding

In [4]:
# Load the semicolon-separated meta-information QA table and display it.
meta_info_path = f"{root_folder}/evaluation/data/meta_info_modified.csv"
meta_df = pd.read_csv(meta_info_path, sep=";")
meta_df

Unnamed: 0,question,answer,type,module,file_name,start_line,end_line
0,In which language is the repository written?,The repository is written in the Python progra...,meta,spyder,README.md,,
1,What are the main features of the Spyder IDE?,It combines the functionality of a comprehensi...,meta,spyder,README.md,,
2,What is the recommended way to install the Spy...,The easiest way to install Spyder on any of ou...,meta,spyder,README.md,,
3,What are other methods besides anaconda to ins...,Unlock Python's full potential by choosing fro...,meta,spyder,README.md,,
4,What can happen without using Anaconda or Cond...,We cannot offer personalised assistance to use...,meta,spyder,README.md,,
5,What are the main components of the Spyder IDE?,The software has five main components:\n1. Edi...,meta,spyder,README.md,,
6,Where can I find the documentation of the Spyd...,"If you want detailed information about Spyder,...",meta,spyder,README.md,,
7,What are the dependencies to install Spyder IDE?,Spyder's dependencies come pre-installed with ...,meta,spyder,README.md,,
8,What is the LICENCE of the Spyder IDE?,MIT License Copyright (c) 2009- Spyder Project...,meta,spyder,LICENCE.txt,,
9,What are the guidelines for creating issues fo...,"When submitting an issue report for Spyder, pr...",meta,spyder,CONTRIBUTING.md,,


## Export Dataset

In [43]:
def format_output(df: pd.DataFrame):
    """Convert a QA DataFrame into a list of JSON-serialisable records.

    Args:
        df: Frame with the columns "file_name", "module", "start_line",
            "end_line", "question", "answer" and "type".

    Returns:
        One dict per row, in row order, shaped as
        ``{"meta_data": {"file_name", "module", "start_line", "end_line"},
        "question", "answer", "type"}``.

        Missing values (NaN / None / NA) are emitted as ``None`` so they
        serialise as JSON ``null``; passing NaN through would make
        ``json.dumps`` write the non-standard ``NaN`` token. Line numbers that
        are whole-number floats (e.g. ``12.0``) are emitted as ``int``.
    """

    def _clean(value):
        # Missing scalar -> None; everything else is passed through untouched.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return None
        return value

    def _line_number(value):
        # A column mixing line numbers with missing values is stored as float,
        # so restore whole-number floats to plain ints.
        value = _clean(value)
        if isinstance(value, float) and value.is_integer():
            return int(value)
        return value

    return [
        {
            "meta_data": {
                "file_name": _clean(row["file_name"]),
                "module": _clean(row["module"]),
                "start_line": _line_number(row["start_line"]),
                "end_line": _line_number(row["end_line"]),
            },
            "question": _clean(row["question"]),
            "answer": _clean(row["answer"]),
            "type": _clean(row["type"]),
        }
        for _, row in df.iterrows()
    ]


def dump_jsonlines(obj, filepath, **kwargs):
    """Write each item of `obj` to `filepath` as one JSON document per line.

    Missing parent directories of `filepath` are created first. The file is
    opened in text write mode as UTF-8, non-ASCII characters are written
    unescaped, and any extra keyword arguments are forwarded to `json.dumps`.
    """
    target = Path(filepath)
    target.parent.mkdir(parents=True, exist_ok=True)

    with target.open("wt", encoding="utf-8") as sink:
        # Every serialised item is terminated by a newline, including the last.
        sink.writelines(
            json.dumps(item, ensure_ascii=False, **kwargs) + "\n" for item in obj
        )


# Stack the syntax, dependency and meta QA frames, convert the rows to records
# and write them out as JSON Lines.
qa_frames = [syntax_df, depend_df, meta_df]
export_path = f"{root_folder}/evaluation/data/final_eval_custom_meta.jsonl"

df_complete = pd.concat(qa_frames)
final_json = format_output(df_complete)
dump_jsonlines(final_json, export_path)
