In [1]:
import json
import re 
import pandas as pd

data = []
buffer = ""
invalid_blocks = []


def fix_inner_quotes(line: str) -> str:
    # Find the pattern: text inside double quotes within a JSON string
    # Example: 'dfasf "sth" dfads' → replace inner quotes with escaped
    pattern = r'("question":\s*")(.*?)(?<!\\)"(.*?)(?<!\\)"(.*?)(")'
    fixed_line = re.sub(pattern, r'\1\2\"\3\"\4\5', line)
    return fixed_line


with open('GAIL-DA-tasks-questions.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        
        # Skip comment lines
        if line.startswith("##") or line.startswith('%'):
            continue
        
        # If line is empty and we have a JSON block collected
        if not line and buffer:
            try:
                obj = json.loads(buffer)
                data.append(obj)
            except json.JSONDecodeError as e:
                print("Invalid JSON block:\n", buffer)
                invalid_blocks.append(buffer)
                # print("Error:", e)

            buffer = ""
        else:
            buffer += line  # accumulate multi-line JSON objects

data.append({"id": 34, "question": "Create the histogram for the runners of the \"10 Mile\" event. Describe the overall shape of the histogram.What does this suggest about the structure of the age distribution of the runners?Calculate some simple summary statistics to support your comments.","concepts": ["Data Visualization-Summary"],"constraints": "Calculate the summary statistics and histogram using R's tidyverse functionalities","format": "@histogram_10mile_summary_stats[histogram_10mile_summary_stats] where \"histogram_10mile_summary_stats\" is the created histogram and additional summary statistics","file_name": "cherryblossom::run17", "level": "medium"})
data.append({"id": 34.1, "question": "Create a similar data visualisation for the \"5K\" race event.Describe the shape of this histogram and discuss the similarities/differences of the age distribution between the \"5K\" and \"10 Mile\"races.","concepts": ["Data Visualization-Interpretation"],"constraints": "Create the histogram and related comparison using R's tidyverse functionalities","format": "@histogram_10mile_5mile_comparison[histogram_10mile_5mile_comparison] where \"histogram_10mile_5mile_comparison\" is the created histogram and additional comparison comments","file_name": "cherryblossom::run17", "level": "hard"})
data.append({"id": 37, "question": "Create a new data frame called gss16_advfront that includes the variables advfront, emailhr (Number of hours spent on email weekly),educ (education level), polviews (political views) and wrkstat (working status). Remove any row that contains any NAs.Relevel the advfront variable based on \"Agree\" - combining the options \"Strongly agree\" and \"Agree\" and \"Not agree\" - combining the options \"Dont know\", \"Disagree\" and \"Strongly disagree\".Besides, relevel the polviews to simplify the range of options to 3 categories - \"Conservative\" , \"Moderate\", and \"Liberal\".Creating a new fulltime variable that is equal to TRUE if wrkstat is equal to \“Working fulltime\” and FALSE otherwise.Save the name data set as a new data frame called 'gss16_advfront'","concepts": ["Data Cleaning-Preparation"],"constraints": "Cleaning the data set using R's tidyverse functionalities","format": "@gss16_data_set_cleaning[gss16_data_set_cleaning] where \"gss16_data_set_cleaning\" is the cleaned data set based on given data and instructions","file_name": "gss16.csv", "level": "hard"})
data.append({"id": 45, "question": "For the given data frames in tibble format,```df1 <- tibble(x = 1:3)df2 <- tibble(x = c(1, 1, 2), y = c(\"first\", \"second\", \"third\"))```Apply the joining procedure results in a new data frame with missing value.","concepts": ["Data Preparation"],"constraints": "Merging the given specific dfs using R's tidyverse functionalities","format": "@df1_df2_merged[df1_df2_merged] where \"df1_df2_merged\" is the merged data frame based on given condition","file_name": "", "level": "easy"})
data.append({"id": 41, "question": "In the context of an email spam detection system, describe the terms \"True Positive\"\, \"False Positive\"\, \"True Negative\"\, and \"False Negative\"\ ","concepts": ["Specific Descriptions"],"constraints": "Giving the descriptions of the mentioned quantities","format": "@Confusion_definitions[Confusion_definitions] where \"Confusion_definitions\" is the verbal definitions for the given quantities.","file_name": "", "level": "easy"})

print(f"Parsed {len(data)} JSON objects")

Invalid JSON block:
 {"id": 34, "question": "Create the histogram for the runners of the \"10 Mile"\ event. Describe the overall shape of the histogram.What does this suggest about the structure of the age distribution of the runners?Calculate some simple summary statistics to support your comments.","concepts": ["Data Visualization-Summary"],"format": "@histogram_10mile_summary_stats[histogram_10mile_summary_stats] where \"histogram_10mile_summary_stats\" is the created histogram and additional summary statistics","file_name": "cherryblossom_run17.xlsx", "level": "medium"}
Invalid JSON block:
 {"id": 34.1, "question": "Create a similar data visualisation for the \"5K"\ race event.Describe the shape of this histogram and discuss the similarities/differences of the age distribution between the \"5K"\ and \"10 Mile"\ races.","concepts": ["Data Visualization-Interpretation"],"format": "@histogram_10mile_5mile_comparison[histogram_10mile_5mile_comparison] where \"histogram_10mile_5mile_com

In [2]:
import pandas as pd

num = len(data)
ids = [data[i]['id'] for i in range(num)]
levels = [data[i]['level'] for i in range(num)]
filenames = [data[i]['file_name'] for i in range(num)]
concepts = [data[i]['concepts'][0] for i in range(num)]

from collections import Counter

levels_counter = Counter(levels)
filenames_counter = Counter(filenames)
concepts_counter = Counter(concepts)

In [3]:
total_concept = sum(concepts_counter.values())
ratio_concept = {key: (value, value / total_concept) for key, value in concepts_counter.items()}

total_level = sum(levels_counter.values())
ratio_level = {key: (value, value / total_level) for key, value in levels_counter.items()}

In [4]:
ratio_level

{'easy': (22, 0.21782178217821782),
 'medium': (40, 0.39603960396039606),
 'hard': (39, 0.38613861386138615)}

In [5]:
key_data = {'id':ids, 'level':levels, 'filename':filenames, 'concept':concepts}
df = pd.DataFrame(data=key_data)
df

Unnamed: 0,id,level,filename,concept
0,0.0,easy,UK-visitor-numbers.csv,Data understanding
1,1.0,easy,UK-visitor-numbers.csv,Data understanding
2,2.0,easy,UK-visitor-numbers.csv,Data understanding
3,3.0,easy,UK-visitor-numbers.csv,Data understanding
4,4.0,medium,UK-visitor-numbers.csv,Data understanding
...,...,...,...,...
96,34.0,medium,cherryblossom::run17,Data Visualization-Summary
97,34.1,hard,cherryblossom::run17,Data Visualization-Interpretation
98,37.0,hard,gss16.csv,Data Cleaning-Preparation
99,45.0,easy,,Data Preparation


In [6]:
summary = []
for filename in set(filenames):
    df_file = df[df['filename'] == filename]



    summary.append([filename,
                    set(df_file['id']),
                    Counter(df_file['level']),
                    Counter(df_file['concept'])])



In [7]:
df_summary = pd.DataFrame(summary,columns=['dataset','questions','difficulty','concepts'])
df_summary

Unnamed: 0,dataset,questions,difficulty,concepts
0,,"{40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 52.0, 55.0}","{'easy': 6, 'medium': 2}","{'CI Interpretation': 1, 'Summary Statistics':..."
1,ggplot::diamonds,"{15.1, 16.0, 16.1, 17.0, 15.0, 18.0, 19.0, 20.0}","{'hard': 4, 'medium': 4}","{'General EDA': 1, 'Data cleaning': 1, 'Data t..."
2,Edinburgh_rainfall.csv,"{71.2, 71.4, 71.3, 71.1, 71.0, 72.0}","{'hard': 4, 'medium': 2}","{'Bayesian Regression Model': 4, 'Bayesian Reg..."
3,gss16_advfront.csv,"{38.1, 39.1, 38.0, 39.0}","{'medium': 1, 'hard': 3}","{'Regression Modeling': 2, 'Logistic Regressio..."
4,Gene_Data.RData / Gene_Data.xlsx,"{66.0, 67.0, 68.0, 69.0}","{'medium': 2, 'hard': 2}","{'Data Description': 1, 'PCA': 1, 'Clustering'..."
5,"edibnb.csv, council_assessments.csv",{33.0},{'hard': 1},{'Data Summary': 1}
6,cherryblossom_run17.xlsx,"{35.0, 36.0}",{'hard': 2},{'Data Visualization': 2}
7,mouse.txt,"{53.0, 54.0}","{'medium': 1, 'easy': 1}","{'Data Visualization': 1, 'Data Summary': 1}"
8,laptop_data_cleaned.csv,"{73.0, 46.0, 47.1, 48.0, 48.1, 47.0, 49.0, 50....","{'easy': 1, 'medium': 4, 'hard': 4}","{'Data Summary': 1, 'Data Visualization': 2, '..."
9,Stats_diamonds.xlsx,"{70.1, 70.3, 70.2, 70.0}","{'easy': 2, 'medium': 1, 'hard': 1}","{'Linear Regression': 1, 'Regression Model Dia..."


In [8]:
df_summary.iloc[14,1]

{21.0, 22.0, 23.0, 23.1, 23.2, 24.0, 25.0, 26.0, 26.1}

In [9]:
df_cat = pd.DataFrame([
    [f"{k}", v[0], f"{v[1]*100:.1f}%"] for k, v in ratio_concept.items()
], columns=["Statistic", "Count", "Percentage"])

df_cat=df_cat.sort_values(by='Count',ascending=False).reset_index(drop=True)
df_cat

Unnamed: 0,Statistic,Count,Percentage
0,Data Visualization,8,7.9%
1,Regression Modeling,6,5.9%
2,Data Summary,5,5.0%
3,Data Understanding,5,5.0%
4,Data understanding,5,5.0%
5,Data summary,4,4.0%
6,Bayesian Regression Model,4,4.0%
7,Linear Regression,3,3.0%
8,Hypothesis Testing,3,3.0%
9,Logistic Regression Model,3,3.0%


In [10]:
df_total = pd.DataFrame([["Total tasks", num, "100%"]], columns=df_cat.columns)
df_diff = pd.DataFrame([
    [f"{k}", v[0], f"{v[1]*100:.1f}%"] for k, v in ratio_level.items()
], columns=["Statistic", "Count", "Percentage"])

pd.concat([df_total, df_diff], ignore_index=True)

Unnamed: 0,Statistic,Count,Percentage
0,Total tasks,101,100%
1,easy,22,21.8%
2,medium,40,39.6%
3,hard,39,38.6%


In [11]:
import matplotlib.pyplot as plt
n = 10 # number of top features

df_sorted = df_cat

top_df = df_sorted.iloc[:n].copy()
top_df["Label"] = top_df["Statistic"]

# Group the rest as 'Others'
other_count = df_sorted.iloc[n:]["Count"].sum()
other_ratio = df_sorted.iloc[n:]["Percentage"].sum()
others = pd.DataFrame([{
    "Statistic": "Others",
    "Count": other_count,
    "Percentage": other_ratio,
    "Label": "Others"
}])

# Combine top n + others
pie_df = pd.concat([top_df, others], ignore_index=True)

# Custom colors: unique for top n + one for Others
# colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d3d3d3"]  # last one is light gray for "Others"

# Pie chart
plt.figure(figsize=(7, 7))
plt.pie(
    pie_df["Count"],
    labels=pie_df["Label"],
    # colors=colors,
    autopct="%1.1f%%",
    startangle=140
)
plt.title(f"Top {n} Categories + Others")
plt.axis("equal")  # Equal aspect ratio
plt.tight_layout()
plt.show()


ModuleNotFoundError: No module named 'matplotlib'

In [None]:
import pandas as pd

# Load the Excel file
df = pd.read_excel('cherryblossom_run17.xlsx')
# Save it as a CSV file
df.to_csv('cherryblossom_run17.csv', index=False)


In [None]:
%pip install openpyxl

Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Using cached et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [openpyxl]1/2[0m [openpyxl]
[1A[2KSuccessfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
Note: you may need to restart the kernel to use updated packages.
