## 1. Install libraries
First, create a new conda environment named BI2025 and install the required packages from requirements.txt


In [None]:
#!conda install -n .conda ipykernel --update-deps --force-reinstall
#!conda create -n BI2025 python=3.11 -y
#!conda activate BI2025
#%pip install -r requirements.txt

Collecting starvers@ git+https://github.com/AllStarsAT/starvers.git (from -r requirements.txt (line 4))
  Cloning https://github.com/AllStarsAT/starvers.git to c:\users\dell\appdata\local\temp\pip-install-fqjm528r\starvers_cbc21f6f50da48869d049e77c242dfa6
  Resolved https://github.com/AllStarsAT/starvers.git to commit 5e6e112e2b37cb5a27af9585bd572d27187ef735
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting seaborn (from -r requirements.txt (line 2))
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting matplotlib (from -r requirements.txt (line 3))
  Downloading matplotlib-3.10.7-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting plotly (from -r requirements.txt (line 5))
  

  Running command git clone --filter=blob:none --quiet https://github.com/AllStarsAT/starvers.git 'C:\Users\Dell\AppData\Local\Temp\pip-install-fqjm528r\starvers_cbc21f6f50da48869d049e77c242dfa6'


In [42]:
# DO NOT MODIFY OR COPY THIS CELL!! 
# Note: The only imports allowed are Python's standard library, pandas, numpy, scipy, matplotlib, seaborn and scikit-learn
import numpy as np
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt
import plotly.express as px
import datetime
import typing
import requests
import time
import shutil
import json
from starvers.starvers import TripleStoreEngine
import uuid


## 2. Graph-based documentation preliminaries

**!!!IMPORTANT!!!**

Everytime you work on this notebook, enter your student ID in the `executed_by` variable so that the cell executions are accredited to you.

In [3]:
executed_by ='stud-id_12307565'     #Lili 
#executed_by = 'stud-id_12340246'    #Zita

Set your group and student IDs. Do this only once.

In [4]:
# group id for this project
group_id = '009'  # Replace the digits with your group id

# Students working on this notebook
student_a = 'stud-id_12307565'  # Replace the digits after "id_" with student A's student ID
student_b = 'stud-id_12340246'  # Replace the digits after "id_" with student B's student ID

In [5]:
# Roles. Don't change these values.
code_writer_role = 'code_writer'
code_executor_role = 'code_executor'

Setup the starvers API for logging your steps into our server-sided graph database.

In [6]:
get_endpoint = "https://starvers.ec.tuwien.ac.at/BI2025"
post_endpoint = "https://starvers.ec.tuwien.ac.at/BI2025/statements"
engine = TripleStoreEngine(get_endpoint, post_endpoint, skip_connection_test=True)

Use these prefixes in your notebooks. You can extend this dict with your prefixes of additional ontologies that you use in this notebook. Replace 00 with your group id

In [7]:
prefixes = {
    'xsd': 'http://www.w3.org/2001/XMLSchema#',
    'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
    'foaf': 'http://xmlns.com/foaf/0.1/',
    'prov': 'http://www.w3.org/ns/prov#',
    'sc': 'https://schema.org/',
    'cr': 'http://mlcommons.org/croissant/',
    'mls': 'http://www.w3.org/ns/mls#',
    'mlso': 'http://w3id.org/mlso',
    'siu': 'https://si-digital-framework.org/SI/units/',
    'siq': 'https://si-digital-framework.org/SI/quantities/',
    'qudt': 'http://qudt.org/schema/qudt/',
    '': f'https://starvers.ec.tuwien.ac.at/BI2025/{group_id}/',
}

prefix_header = '\n'.join([f'PREFIX {k}: <{v}>' for k, v in prefixes.items()]) + '\n\n'

Ontologies to use
* Provenance of the experiment process
    * PROV-O: 
        * doc: https://www.w3.org/TR/prov-o/
        * serialization: https://www.w3.org/ns/prov-o
* Data used and created
    * schema.org - Dataset: 
        * doc: https://schema.org/Dataset
        * serialization: https://schema.org/version/latest/schemaorg-current-https.ttl
    * Crossaint
        * doc: https://docs.mlcommons.org/croissant/docs/croissant-spec.html
        * serialization: https://github.com/mlcommons/croissant/blob/main/docs/croissant.ttl
* ML experiments performed
    * MLSO: 
        * doc: https://github.com/dtai-kg/MLSO
        * doc: https://dtai-kg.github.io/MLSO/#http://w3id.org/
        * serialization: https://dtai-kg.github.io/MLSO/ontology.ttl
* Measurements, Metrics, Units
    * QUDT
        * doc:https://qudt.org/
        * doc: https://github.com/qudt/qudt-public-repo
        * serialization: https://github.com/qudt/qudt-public-repo/blob/main/src/main/rdf/schema/SCHEMA_QUDT.ttl
    * SI Digital Framework
        * doc: https://github.com/TheBIPM/SI_Digital_Framework/blob/main/SI_Reference_Point/docs/README.md
        * doc: https://si-digital-framework.org/
        * doc: https://si-digital-framework.org/SI
        * serialization: https://github.com/TheBIPM/SI_Digital_Framework/blob/main/SI_Reference_Point/TTL/si.ttl
    * Quantities and Units
        * doc: https://www.omg.org/spec/Commons
        * serialization: https://www.omg.org/spec/Commons/QuantitiesAndUnits.ttl

Use this function to record execution times.

In [8]:
def now() -> str:
    """
    Returns the current time in ISO 8601 format with UTC timezone in the following format:
    YYYY-MM-DDTHH:MM:SS.sssZ
    """
    timestamp = datetime.datetime.now(datetime.timezone.utc)
    timestamp_formated = timestamp.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3]  +"Z"

    return timestamp_formated

Register yourself in the Knowledge Graph using ProvO. Change the given name, family name and immatriculation number to reflect your own data.

In [45]:
# Ontologies used: foaf, prov, IAO
reigstration_triples_a = [
f':{student_a} rdf:type foaf:Person .',
f':{student_a} rdf:type prov:Agent .',
f':{student_a} foaf:givenName "Liliana" .',
f':{student_a} foaf:familyName "Sulyok" .',
f':{student_a} <http://vivoweb.org/ontology/core#identifier> :{student_a} .',
f':{student_a} rdf:type <http://purl.obolibrary.org/obo/IAO_0000578> .',
f':{student_a} <http://www.w3.org/2000/01/rdf-schema#label> "Immatriculation number" .',
f':{student_a} <http://purl.obolibrary.org/obo/IAO_0000219> "12307565"^^xsd:string .',
]

reigstration_triples_b = [
f':{student_b} rdf:type foaf:Person .',
f':{student_b} rdf:type prov:Agent .',
f':{student_b} foaf:givenName "Zita Dorina" .',
f':{student_b} foaf:familyName "Marton" .',
f':{student_b} <http://vivoweb.org/ontology/core#identifier> :{student_b} .',
f':{student_b} rdf:type <http://purl.obolibrary.org/obo/IAO_0000578> .',
f':{student_b} <http://www.w3.org/2000/01/rdf-schema#label> "Immatriculation number" .',
f':{student_b} <http://purl.obolibrary.org/obo/IAO_0000219> "12340246"^^xsd:string .',
]

role_triples = [
    f':{code_writer_role} rdf:type prov:Role .',
    f':{code_executor_role} rdf:type prov:Role .',
]


engine.insert(reigstration_triples_a, prefixes=prefixes)
engine.insert(reigstration_triples_b, prefixes=prefixes)
engine.insert(role_triples, prefixes=prefixes)

In [56]:
# UUIDs will be generated for each data processing step, and entered manually

str(uuid.uuid4())

'96d0a477-2bca-4012-ab6e-b017a083705d'

**What not do do**

Do not use [blank nodes](https://www.w3.org/wiki/BlankNodes).

PROV-O uses blank nodes to connect multiple elements with each other.
Such blank nodes (such as _:association) should not be used.
Instead, assign a fixed node ID such as
:5119fcd7-b571-41e0-9464-a37c7be0f574 by generating them outside of the
notebook.
We suggest that, for each setting where such a blank node is needed to
connect multiple elements, you create a unique hash (using uuid.uuid4())
and keep this as hard-coded identifier for the blank node. The template
notebook contains examples of this. Do *not* use these provided values,
as otherwise, your provenance documentations will all be connected via
these identifiers!
Also, do not generate them dynamically in every cell execution, e.g. by
using uuid.uuid4() in a cell. This would generate many new linking nodes
for connecting the same elements.
Compute one for each node (cell) where you need them and make sure to
use the same one on each re-execution of the notebook.

## 3. Business Understanding 

In [46]:
## Each Activity that follows is part of the Business Understanding Phase

business_understanding_phase_executor = [
f':business_understanding_phase rdf:type prov:Activity .',
f':business_understanding_phase rdfs:label "Business Understanding Phase" .', ## Phase 1: Business Understanding
]
engine.insert(business_understanding_phase_executor, prefixes=prefixes)


In [47]:
data_src_and_scenario_comment = """
The dataset is the Online Shoppers Purchasing Intention Dataset from the 
UCI Machine Learning Repository. It contains 12,330 anonymized, unique user
sessions from an e-commerce website collected over one year. The detaset 
includes behavioral, technical, and user-related attributes such as page
duration, bounce rates, visitor type. The binary target variable indicates 
whether a session resulted in revenue. 

Business Scenario:
We assume the role of data scientists for a mid-sized online retail company
that seeks to improve its marketing efficiency by identifing sessions with high
purchase intent in real time. This allows the marketing team to apply targeted
interventions (such as live chat support or limited-time discount pop-ups 
specifically for users likely to purchase) only when beneficial. 
Predicting purchase likelihood enables cost-efficient personalization, 
optimizing marketing resources and strategic decision-making
"""

business_objectives_comment = """
Primary Objective:
Increase online revenue by identifying sessions with high purchase intent and 
by proactively engaging customers using targeted, cost-efficient interventions 
that increase the probability of conversion.

Secondary Objectives:
Improve the overall conversion rate through real-time personalization.
Reduce unnecessary promotional costs on users who are highly unlikely to buy
regardless of incentives.
"""

business_success_criteria_comment = """
The project is considered a success if the resulting model can be deployed to:

Profitability: Identify purchasing customers with sufficient precision that the 
cost of the intervention (e.g., the discount margin lost) is outweighed by the 
revenue gained from "saved" sales.

Deployment Feasibility: The model must be lightweight enough to infer intent in
near real-time (milliseconds) as the user browses.
"""

data_mining_goals_comment = """
Primary Data Mining Goal: Binary Classification
Develop a supervised machine learning model to predict the target variable 
Revenue and estimates the likelihood that a given web session will result in 
a purchase.


Secondary Goal: Imbalance Management
The dataset is imbalanced (approx. 85% negative / 15% positive), therefore the
goal is to handle this skew effectively without biasing the model toward the 
majority class (No Purchase).
"""

data_mining_success_criteria_comment = """
Model-level performance targets

AUC-ROC Score: 
Achieving a score greater than 0.80 on the test set, indicating a strong 
ability to distinguish between buyers and non-buyers.

F1-Score: 
Achieving a score greater than 0.60, balancing Precision

Baseline Comparison:
The model must significantly outperform a trivial classifier (which would 
achieve ~85% accuracy by always predicting "No Purchase") and a random guess 
baseline.

Stable performance across validation splits.
"""

ai_risk_aspects_comment = """
Key AI risk considerations:

1. Bias and Fairness:
The dataset contains features such as browser, operating system, and region, 
which may act as proxies for socioeconomic status, potentially leading to unfair
treatment across user groups. Class imbalance may also bias predictions towards 
the majority class.

2. Privacy and Profiling:
The analysis uses tracking data (Operating Systems, Browser, Duration). 
Deployment requires strict adherence to privacy regulations (e.g., GDPR), 
ensuring users have consented to behavioral tracking, knowing that bhavioral 
data may enable profiling of users.

3. Transparency:
If personalized offers are triggered based on model predictions, stakeholders 
must be able to understand and justify the reasoning behind these decisions.

4. Operational Risk:
Incorrect predictions may lead to unnecessary promotion costs (false positives)
or missed revenue opportunities (false negatives). 
Thresholding and monitoring must be carefully designed.

5. Manipulation:
There is an ethical risk in exploiting behavioral data to target vulnerable
users with aggressive sales tactics.
"""



business_understanding_uuid_executor = '53996d94-9b75-4479-9046-e15a53067259'
business_understanding_executor = [
f':business_understanding rdf:type prov:Activity .',
f':business_understanding sc:isPartOf :business_understanding_phase .',
f':business_understanding prov:qualifiedAssociation :{business_understanding_uuid_executor} .',
f':{business_understanding_uuid_executor} prov:agent :{executed_by} .',
f':{business_understanding_uuid_executor} rdf:type prov:Association .',
f':{business_understanding_uuid_executor} prov:hadRole :{code_executor_role} .',
]
engine.insert(business_understanding_executor, prefixes=prefixes)


business_understanding_data_executor = [
# 1a
f':bu_data_source_and_scenario rdf:type prov:Entity .',
f':bu_data_source_and_scenario prov:wasGeneratedBy :business_understanding .',
f':bu_data_source_and_scenario rdfs:label "1a Data Source and Scenario" .',
f':bu_data_source_and_scenario rdfs:comment """{data_src_and_scenario_comment}""" .',
# 1b
f':bu_business_objectives rdf:type prov:Entity .',
f':bu_business_objectives prov:wasGeneratedBy :business_understanding .',
f':bu_business_objectives rdfs:label "1b Business Objectives" .',
f':bu_business_objectives rdfs:comment """{business_objectives_comment}""" .',
# 1c
f':bu_business_success_criteria rdf:type prov:Entity .',
f':bu_business_success_criteria prov:wasGeneratedBy :business_understanding .',
f':bu_business_success_criteria rdfs:label "1c Business Success Criteria" .',
f':bu_business_success_criteria rdfs:comment """{business_success_criteria_comment}""" .',
# 1d
f':bu_data_mining_goals rdf:type prov:Entity .',
f':bu_data_mining_goals prov:wasGeneratedBy :business_understanding .',
f':bu_data_mining_goals rdfs:label "1d Data Mining Goals" .',
f':bu_data_mining_goals rdfs:comment """{data_mining_goals_comment}""" .',
# 1e
f':bu_data_mining_success_criteria rdf:type prov:Entity .',
f':bu_data_mining_success_criteria prov:wasGeneratedBy :business_understanding .',
f':bu_data_mining_success_criteria rdfs:label "1e Data Mining Success Criteria" .',
f':bu_data_mining_success_criteria rdfs:comment """{data_mining_success_criteria_comment}""" .',
# 1f
f':bu_ai_risk_aspects rdf:type prov:Entity .',
f':bu_ai_risk_aspects prov:wasGeneratedBy :business_understanding .',
f':bu_ai_risk_aspects rdfs:label "1f AI risk aspects" .',
f':bu_ai_risk_aspects rdfs:comment """{ai_risk_aspects_comment}""" .',

]
engine.insert(business_understanding_data_executor, prefixes=prefixes)

## Data Understanding

### Load data

In [None]:
## Each Activity that follows is part of the Data Understanding Phase

data_understanding_phase_executor = [
f':data_understanding_phase rdf:type prov:Activity .',
f':data_understanding_phase rdfs:label "Data Understanding Phase" .', 
]
engine.insert(business_understanding_phase_executor, prefixes=prefixes)


In [69]:
load_shopping_data_code_writer = student_a

def load_shopping_data(filepath: str)-> pd.DataFrame:
    """
    Load the online shoppers purchasing intention dataset from a CSV file.
    The nature of the data requires no index set.

    Returns:
        pd.DataFrame: DataFrame containing the loaded dataset.
    """
    
    raw_data = pd.read_csv(shopping_data_path,  sep=',', header = 0)
    # Check for invalid month values
    valid_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    raw_data['Month'] = raw_data['Month'].apply(lambda x: x if x in valid_months else None)
    raw_data['VisitorType'] = raw_data['VisitorType'].apply(lambda x: 1 if x == 'Returning_Visitor' else 2 if x == 'New_Visitor' else 3 if x == 'Other' else 4)

    # Convert valid month names to numeric values
    raw_data['Month'] = pd.to_datetime(raw_data['Month'], format='%b', errors='coerce').dt.month

    
    return raw_data



start_time_ld = now()
shopping_data_path = os.path.join("data", "online_shoppers_intention.csv")
shopping_df = load_shopping_data(shopping_data_path)
end_time_ld = now()

display(shopping_df.head())


#############################################
# Documentation
#############################################

ld_ass_uuid_executor = "fa0f4ddf-4a31-40bd-b2a0-98146b8f2de5"
load_shopping_data_executor = [
    f':load_shopping_data prov:qualifiedAssociation :{ld_ass_uuid_executor} .',
    f':{ld_ass_uuid_executor} prov:agent :{executed_by} .',
    f':{ld_ass_uuid_executor} rdf:type prov:Association .',
    f':{ld_ass_uuid_executor} prov:hadRole :{code_executor_role} .',
]
engine.insert(load_shopping_data_executor, prefixes=prefixes)

ld_ass_uuid_writer = "8fc07fd5-de82-4b44-a7b7-ecafc0fdbe09"
ld_report = """
Load all records of the shopping dataset. The activity reads the CSV file,
converts the Month attribute to a machine-readable textual representation, 
and returns a dataframe containing 18 semantically interpretable fields.
"""
load_shopping_data_activity = [
    ':load_shopping_data rdf:type prov:Activity .',
    ':load_shopping_data sc:isPartOf :data_understanding_phase .',
    ':load_shopping_data rdfs:comment \'Data Understanding\' .',
    f':load_shopping_data rdfs:comment """{ld_report}""" .', 
    f':load_shopping_data prov:startedAtTime "{start_time_ld}"^^xsd:dateTime .',
    f':load_shopping_data prov:endedAtTime "{end_time_ld}"^^xsd:dateTime .',

    #Writer association
    f':load_shopping_data prov:qualifiedAssociation :{ld_ass_uuid_writer} .',
    f':{ld_ass_uuid_writer} prov:agent :{load_shopping_data_code_writer} .',
    f':{ld_ass_uuid_writer} rdf:type prov:Association .',
    f':{ld_ass_uuid_writer} prov:hadRole :{code_writer_role} .',

    # INPUT of activity
    ':load_shopping_data prov:used :raw_data .',
    ':load_shopping_data prov:used :raw_data_path .',
    ':raw_data rdf:type prov:Entity .',
    ':raw_data_path rdf:type prov:Entity .',
    ':raw_data prov:wasDerivedFrom :raw_data_path .',

    # OUTPUT of activity
    ':data rdf:type prov:Entity .',
    ':data prov:wasGeneratedBy :load_shopping_data .',
    ':data prov:wasDerivedFrom :raw_data .',
]
engine.insert(load_shopping_data_activity, prefixes=prefixes)

# Description of the raw data using Croissant
raw_data_triples = [
    ':raw_data rdf:type sc:Dataset .',
    ':raw_data sc:name \'Online Shoppers Purchasing Intention Dataset\' .',
    ':raw_data sc:description \'Describe the data set.\' .',

    # The dataset
    ':online_shoppers_intention_csv rdf:type cr:FileObject .',
    ':online_shoppers_intention_csv sc:name \'online_shoppers_intention.csv\' .',
    ':online_shoppers_intention_csv sc:encodingFormat \'text/csv\' .',
    ':raw_data sc:distribution :online_shoppers_intention_csv .',

    # Distribution 
    ':raw_recordset rdf:type cr:RecordSet .',
    ':raw_recordset sc:name \'Table of online shopping sessions over a year\' .',
    ':raw_recordset cr:source :online_shoppers_intention_csv .',
    ':raw_data cr:recordSet :raw_recordset .'
]
engine.insert(raw_data_triples, prefixes=prefixes)

# Description of all the fields in the dataset using Croissant
fields = {
    "Administrative": ("Number of administrative pages visited.", "xsd:integer"),
    "Administrative_Duration": ("Time spent on administrative pages (sec).", "xsd:double"),
    "Informational": ("Number of informational pages visited.", "xsd:integer"),
    "Informational_Duration": ("Time spent on informational pages (sec).", "xsd:double"),
    "ProductRelated": ("Number of product-related pages viewed.", "xsd:integer"),
    "ProductRelated_Duration": ("Total time on product pages (sec).", "xsd:double"),
    "BounceRates": ("Average bounce rate.", "xsd:double"),
    "ExitRates": ("Average exit rate.", "xsd:double"),
    "PageValues": ("Value estimate of viewed pages.", "xsd:double"),
    "SpecialDay": ("Proximity to a special day (0â€“1).", "xsd:double"),
    "Month": ("Month of session.", "xsd:string"),
    "OperatingSystems": ("Operating system code.", "xsd:integer"),
    "Browser": ("Browser code.", "xsd:integer"),
    "Region": ("Region code.", "xsd:integer"),
    "TrafficType": ("Traffic source category.", "xsd:integer"),
    "VisitorType": ("Visitor category (Returning/New).", "xsd:string"),
    "Weekend": ("Boolean indicating weekend visit.", "xsd:boolean"),
    "Revenue": ("Class label: whether purchase occurred.", "xsd:boolean")
}

field_triples = []
for field_name, (desc, datatype) in fields.items():
    field_id = f":field_{field_name}"
    field_triples += [
        f'{field_id} rdf:type cr:Field .',
        f'{field_id} sc:name "{field_name}" .',
        f'{field_id} sc:description "{desc}" .',
        f'{field_id} cr:dataType {datatype} .',
        f':raw_recordset cr:field {field_id} .',
    ]

engine.insert(field_triples, prefixes=prefixes)

units_triples = [
    # page counts = arbitrary count unit
    ':field_Administrative qudt:unit qudt:CountingUnit .',
    ':field_Informational qudt:unit qudt:CountingUnit .',
    ':field_ProductRelated qudt:unit qudt:CountingUnit .',

    # durations = seconds
    ':field_Administrative_Duration qudt:unit siu:second .',
    ':field_Informational_Duration qudt:unit siu:second .',
    ':field_ProductRelated_Duration qudt:unit siu:second .',

    # bounce, exit, special day, page values = dimensionless ratios
    ':field_BounceRates qudt:unit qudt:DimensionlessUnit .',
    ':field_ExitRates qudt:unit qudt:DimensionlessUnit .',
    ':field_SpecialDay qudt:unit qudt:DimensionlessUnit .',
    ':field_PageValues qudt:unit qudt:DimensionlessUnit .',

    # categorical / boolean: no units needed
]

engine.insert(units_triples, prefixes=prefixes)


Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2.0,1,1,1,1,1,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,2.0,2,2,1,2,1,False,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2.0,4,1,9,3,1,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,2.0,3,2,2,4,1,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,2.0,3,3,1,4,1,True,False


### 2.a Attribute types, units of measurement, and the semantics of attributes

In [59]:
data_understanding_2a_code_writer = student_a


start_time_data_understanding_2a = now()


display(shopping_df.info())
## CONCLUSIONS (manual summary for provenance):
# - Number of rows and columns is correct.
# - Semantically, all data types appear correct; object-type columns require closer inspection.
# - Non-null counts look correct; NA types still need deeper evaluation.


attribute_info = {
    "Administrative": "Number of administrative pages visited.",
    "Administrative_Duration": "Total time spent on administrative pages (seconds).",
    "Informational": "Number of informational pages visited.",
    "Informational_Duration": "Time spent on informational pages (seconds).",
    "ProductRelated": "Number of product-related pages viewed.",
    "ProductRelated_Duration": "Total time on product pages (seconds).",
    "BounceRates": "Percentage of visitors who leave from that page during that session.",
    "ExitRates": "Percentage of last pageviews in the session.",
    "PageValues": "Average value a user visited before completing an e-commerce transaction.",
    "SpecialDay": "Proximity to a special day (0â€“1).",
    "Month": "Month of session.",
    "OperatingSystems": "Recorded OS identifier.",
    "Browser": "Browser identifier.",
    "Region": "Region code.",
    "TrafficType": "Traffic source category.",
    "VisitorType": "Returning or new visitor.",
    "Weekend": "Boolean indicating if visit was on a weekend.",
    "Revenue": "Target variable: purchase occurred (True/False)."
}
display(attribute_info)

display(shopping_df.describe())
## CONCLUSIONS (manual summary for provenance):
# - Numerical columns show semantically meaningful ranges.
# - Minimum/maximum values appear correct
# - Means indicate class imbalanceâ€”skew analysis required later.
# - No semantic outliers identified, but statistical outliers will be checked later.

display(shopping_df.describe(include=["object", "bool"]))
## CONCLUSTIONS (manual summary for provenance):
# - Categorical fields have expected number of unique values
# - Unexpected additional default visitor type will be checked later.

display(shopping_df.nunique())

## CONCLUSION:
# - The 'Month' column is missing two months; need to inspect if the dataset covers whole year.

display(shopping_df["TrafficType"].unique())
display(shopping_df["Month"].unique())
display(shopping_df["OperatingSystems"].unique())
display(shopping_df["SpecialDay"].unique())
display(shopping_df["VisitorType"].unique())
## Conclusion
# All categorical, boolean fields have expected unique values except 'VisitorType',
# which includes an unexpected 'Other' category. The 'Month' field is missing two months


end_time_data_understanding_2a = now()



############################################################
# 2a Documentation
############################################################

data_understanding_2a_ass_uuid_executor = "f359c376-274b-4da4-ba6b-b4e2788109f9"
data_understanding_2a_ass_uuid_writer = "96d0a477-2bca-4012-ab6e-b017a083705d"

data_understanding_2a_comment = """
Data Understanding 2a: Attribute types, semantics, units, and initial interpretation.
The dataset structure (rows, columns, dtypes) matches expectations. Numerical attributes 
reflect semantically meaningful ranges with plausible min/max values. Object-typed fields 
require follow-up semantic validation. No semantic anomalies detected at this stage but 
statistical outlier analysis will follow. The Month attribute shows missing months, requiring 
investigation of dataset completeness. VisitorType contains an unexpected 'Other' category.
"""

info_output = str(shopping_df.info())
describe_numeric_output = shopping_df.describe().to_json()
describe_categorical_output = shopping_df.describe(include=["object","bool"]).to_json()
nunique_output = shopping_df.nunique().to_json()


traffic_type_unique_output = shopping_df["TrafficType"].unique().tolist()
month_unique_output = shopping_df["Month"].unique().tolist()
os_unique_output = shopping_df["OperatingSystems"].unique().tolist()
specialday_unique_output = shopping_df["SpecialDay"].unique().tolist()
visitor_unique_output = shopping_df["VisitorType"].unique().tolist()



data_understanding_2a_activity = [

    # Activity definition
    ':data_understanding_2a rdf:type prov:Activity .',
    ':data_understanding_2a sc:isPartOf :data_understanding_phase .',
    ':data_understanding_2a rdfs:label "2a Attribute Types and Semantics" .',
    f':data_understanding_2a rdfs:comment """{data_understanding_2a_comment}""" .',

    # Timing
    f':data_understanding_2a prov:startedAtTime "{start_time_data_understanding_2a}"^^xsd:dateTime .',
    f':data_understanding_2a prov:endedAtTime "{end_time_data_understanding_2a}"^^xsd:dateTime .',

    # Executor association
    f':data_understanding_2a prov:qualifiedAssociation :{data_understanding_2a_ass_uuid_executor} .',
    f':{data_understanding_2a_ass_uuid_executor} prov:agent :{executed_by} .',
    f':{data_understanding_2a_ass_uuid_executor} rdf:type prov:Association .',
    f':{data_understanding_2a_ass_uuid_executor} prov:hadRole :{code_executor_role} .',

    # Writer association
    f':data_understanding_2a prov:qualifiedAssociation :{data_understanding_2a_ass_uuid_writer} .',
    f':{data_understanding_2a_ass_uuid_writer} prov:agent :{data_understanding_2a_code_writer} .',
    f':{data_understanding_2a_ass_uuid_writer} rdf:type prov:Association .',
    f':{data_understanding_2a_ass_uuid_writer} prov:hadRole :{code_writer_role} .',


    ### ENTITY: Attribute Semantics -----------------------------------------
    ':data_understanding_2a_attribute_info rdf:type prov:Entity .',
    ':data_understanding_2a_attribute_info rdfs:label "Attribute Semantics" .',
    f':data_understanding_2a_attribute_info rdfs:comment """{attribute_info}""" .',
    ':data_understanding_2a_attribute_info prov:wasGeneratedBy :data_understanding_2a .',


    ### ENTITY: shopping_df.info() -------------------------------------------
    ':data_understanding_2a_info rdf:type prov:Entity .',
    ':data_understanding_2a_info rdfs:label "DataFrame Info Output" .',
    f':data_understanding_2a_info rdfs:comment """{info_output}""" .',
    ':data_understanding_2a_info prov:wasGeneratedBy :data_understanding_2a .',


    ### ENTITY: shopping_df.describe() numeric summary ------------------------
    ':data_understanding_2a_describe_numeric rdf:type prov:Entity .',
    ':data_understanding_2a_describe_numeric rdfs:label "Numeric Summary Statistics" .',
    f':data_understanding_2a_describe_numeric rdfs:comment """{describe_numeric_output}""" .',
    ':data_understanding_2a_describe_numeric prov:wasGeneratedBy :data_understanding_2a .',


    ### ENTITY: describe() categorical summary --------------------------------
    ':data_understanding_2a_describe_categorical rdf:type prov:Entity .',
    ':data_understanding_2a_describe_categorical rdfs:label "Categorical Summary Statistics" .',
    f':data_understanding_2a_describe_categorical rdfs:comment """{describe_categorical_output}""" .',
    ':data_understanding_2a_describe_categorical prov:wasGeneratedBy :data_understanding_2a .',


    ### ENTITY: nunique() -----------------------------------------------------
    ':data_understanding_2a_nunique rdf:type prov:Entity .',
    ':data_understanding_2a_nunique rdfs:label "Unique Value Counts" .',
    f':data_understanding_2a_nunique rdfs:comment """{nunique_output}""" .',
    ':data_understanding_2a_nunique prov:wasGeneratedBy :data_understanding_2a .',


    ### ENTITY: Unique categorical values -------------------------------------
    ':data_understanding_2a_unique_values rdf:type prov:Entity .',
    ':data_understanding_2a_unique_values rdfs:label "Unique Values of Key Categorical Columns" .',
    f':data_understanding_2a_unique_values rdfs:comment """TrafficType: {traffic_type_unique_output}\n'
    f'Month: {month_unique_output}\n'
    f'OperatingSystems: {os_unique_output}\n'
    f'SpecialDay: {specialday_unique_output}\n'
    f'VisitorType: {visitor_unique_output}""" .',
    ':data_understanding_2a_unique_values prov:wasGeneratedBy :data_understanding_2a .',
]


engine.insert(data_understanding_2a_activity, prefixes=prefixes)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12042 non-null  float64
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

None

{'Administrative': 'Number of administrative pages visited.',
 'Administrative_Duration': 'Total time spent on administrative pages (seconds).',
 'Informational': 'Number of informational pages visited.',
 'Informational_Duration': 'Time spent on informational pages (seconds).',
 'ProductRelated': 'Number of product-related pages viewed.',
 'ProductRelated_Duration': 'Total time on product pages (seconds).',
 'BounceRates': 'Percentage of visitors who leave from that page during that session.',
 'ExitRates': 'Percentage of last pageviews in the session.',
 'PageValues': 'Average value a user visited before completing an e-commerce transaction.',
 'SpecialDay': 'Proximity to a special day (0â€“1).',
 'Month': 'Month of session.',
 'OperatingSystems': 'Recorded OS identifier.',
 'Browser': 'Browser identifier.',
 'Region': 'Region code.',
 'TrafficType': 'Traffic source category.',
 'VisitorType': 'Returning or new visitor.',
 'Weekend': 'Boolean indicating if visit was on a weekend.',
 

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12042.0,12330.0,12330.0,12330.0,12330.0
mean,2.315166,80.818611,0.503569,34.472398,31.731468,1194.74622,0.022191,0.043073,5.889258,0.061427,7.691496,2.124006,2.357097,3.147364,4.069586
std,3.321784,176.779107,1.270156,140.749294,44.475503,1913.669288,0.048488,0.048597,18.568437,0.198917,3.423429,0.911325,1.717277,2.401591,4.025169
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,1.0,1.0
25%,0.0,0.0,0.0,0.0,7.0,184.1375,0.0,0.014286,0.0,0.0,5.0,2.0,2.0,1.0,2.0
50%,1.0,7.5,0.0,0.0,18.0,598.936905,0.003112,0.025156,0.0,0.0,8.0,2.0,2.0,3.0,2.0
75%,4.0,93.25625,0.0,0.0,38.0,1464.157214,0.016813,0.05,0.0,0.0,11.0,3.0,2.0,4.0,4.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0,12.0,8.0,13.0,9.0,20.0


Unnamed: 0,VisitorType,Weekend,Revenue
count,12330,12330,12330
unique,3,2,2
top,Returning_Visitor,False,False
freq,10551,9462,10422


Administrative               27
Administrative_Duration    3335
Informational                17
Informational_Duration     1258
ProductRelated              311
ProductRelated_Duration    9551
BounceRates                1872
ExitRates                  4777
PageValues                 2704
SpecialDay                    6
Month                         9
OperatingSystems              8
Browser                      13
Region                        9
TrafficType                  20
VisitorType                   3
Weekend                       2
Revenue                       2
dtype: int64

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 18, 19,
       16, 17, 20])

array([ 2.,  3.,  5., 10., nan,  7.,  8., 11.,  9., 12.])

array([1, 2, 4, 3, 7, 6, 8, 5])

array([0. , 0.4, 0.8, 1. , 0.2, 0.6])

array(['Returning_Visitor', 'New_Visitor', 'Other'], dtype=object)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12042 non-null  float64
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

### 2.b Statistical properties describing the dataset including correlations

In [72]:
shopping_df.kurtosis()

Administrative               4.701146
Administrative_Duration     50.556739
Informational               26.932266
Informational_Duration      76.316853
ProductRelated              31.211707
ProductRelated_Duration    137.174164
BounceRates                  7.723159
ExitRates                    4.017035
PageValues                  65.635694
SpecialDay                   9.913659
Month                       -1.630700
OperatingSystems            10.456843
Browser                     12.746733
Region                      -0.148680
TrafficType                  3.479711
VisitorType                  4.544444
Weekend                     -0.397404
Revenue                      1.646493
dtype: float64

In [71]:
shopping_df.skew()

Administrative             1.960357
Administrative_Duration    5.615719
Informational              4.036464
Informational_Duration     7.579185
ProductRelated             4.341516
ProductRelated_Duration    7.263228
BounceRates                2.947855
ExitRates                  2.148789
PageValues                 6.382964
SpecialDay                 3.302667
Month                     -0.087167
OperatingSystems           2.066285
Browser                    3.242350
Region                     0.983549
TrafficType                1.962987
VisitorType                2.326762
Weekend                    1.265962
Revenue                    1.909509
dtype: float64

In [73]:
shopping_df.corr(method='pearson')

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
Administrative,1.0,0.601583,0.37685,0.255848,0.431119,0.373939,-0.223563,-0.316483,0.09899,-0.094778,0.098092,-0.006347,-0.025035,-0.005487,-0.033561,0.01668,0.026417,0.138917
Administrative_Duration,0.601583,1.0,0.30271,0.238031,0.289087,0.355422,-0.14417,-0.205798,0.067608,-0.073304,0.056891,-0.007343,-0.015392,-0.005561,-0.014376,0.01912,0.01499,0.093587
Informational,0.37685,0.30271,1.0,0.618955,0.374164,0.387505,-0.116114,-0.163666,0.048632,-0.048219,0.065079,-0.009527,-0.038235,-0.029169,-0.034491,-0.058211,0.035785,0.0952
Informational_Duration,0.255848,0.238031,0.618955,1.0,0.280046,0.347364,-0.074067,-0.105276,0.030861,-0.030577,0.043469,-0.009579,-0.019285,-0.027144,-0.024675,-0.045372,0.024078,0.070345
ProductRelated,0.431119,0.289087,0.374164,0.280046,1.0,0.860927,-0.204578,-0.292526,0.056282,-0.023958,0.160138,0.00429,-0.013146,-0.038122,-0.043064,-0.127916,0.016092,0.158538
ProductRelated_Duration,0.373939,0.355422,0.387505,0.347364,0.860927,1.0,-0.184541,-0.251984,0.052823,-0.03638,0.140064,0.002976,-0.00738,-0.033091,-0.036377,-0.118273,0.007311,0.152373
BounceRates,-0.223563,-0.14417,-0.116114,-0.074067,-0.204578,-0.184541,1.0,0.913004,-0.119386,0.072702,-0.064823,0.023823,-0.015772,-0.006485,0.078286,-0.114916,-0.046514,-0.150673
ExitRates,-0.316483,-0.205798,-0.163666,-0.105276,-0.292526,-0.251984,0.913004,1.0,-0.174498,0.102242,-0.093698,0.014567,-0.004442,-0.008907,0.078616,-0.152678,-0.062587,-0.207071
PageValues,0.09899,0.067608,0.048632,0.030861,0.056282,0.052823,-0.119386,-0.174498,1.0,-0.063541,0.066204,0.018508,0.045592,0.011315,0.012532,0.120077,0.012002,0.492569
SpecialDay,-0.094778,-0.073304,-0.048219,-0.030577,-0.023958,-0.03638,0.072702,0.102242,-0.063541,1.0,-0.261537,0.012652,0.003499,-0.016098,0.052301,-0.086854,-0.016767,-0.082305


In [75]:
shopping_df.corr(method='spearman')

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
Administrative,1.0,0.940725,0.369194,0.362861,0.460204,0.421613,-0.155219,-0.434389,0.32835,-0.125391,0.078856,-0.004963,-0.012359,0.009413,-0.011739,0.094197,0.039527,0.167945
Administrative_Duration,0.940725,1.0,0.35715,0.35206,0.430072,0.413765,-0.163609,-0.437912,0.317329,-0.131622,0.07929,-0.007425,-0.023003,0.018761,-0.015486,0.11292,0.042549,0.16396
Informational,0.369194,0.35715,1.0,0.950958,0.368673,0.367522,0.005753,-0.185691,0.219471,-0.054115,0.059021,1.7e-05,-0.020386,-0.02292,-0.029061,-0.05911,0.043098,0.113876
Informational_Duration,0.362861,0.35206,0.950958,1.0,0.361032,0.36272,-0.002474,-0.200056,0.223919,-0.054314,0.05077,0.003003,-0.013451,-0.015007,-0.026349,-0.053374,0.046284,0.11212
ProductRelated,0.460204,0.430072,0.368673,0.361032,1.0,0.882672,-0.052305,-0.51892,0.341975,-0.021536,0.140575,0.021218,0.043554,-0.0214,-0.069877,-0.129735,0.034369,0.198545
ProductRelated_Duration,0.421613,0.413765,0.367522,0.36272,0.882672,1.0,-0.079768,-0.476935,0.360282,-0.049578,0.134856,0.023485,0.045758,-0.009666,-0.072516,-0.119205,0.027364,0.216764
BounceRates,-0.155219,-0.163609,0.005753,-0.002474,-0.052305,-0.079768,1.0,0.602276,-0.123726,0.135341,-0.001271,0.053448,-0.047057,-0.01796,0.015583,-0.303654,-0.032165,-0.148977
ExitRates,-0.434389,-0.437912,-0.185691,-0.200056,-0.51892,-0.476935,0.602276,1.0,-0.308002,0.151278,-0.063413,0.022357,-0.016319,-0.003934,0.02244,-0.268306,-0.06764,-0.254488
PageValues,0.32835,0.317329,0.219471,0.223919,0.341975,0.360282,-0.123726,-0.308002,1.0,-0.07048,0.061524,-0.012409,0.025728,0.001202,-0.017504,0.023391,0.020695,0.626363
SpecialDay,-0.125391,-0.131622,-0.054115,-0.054314,-0.021536,-0.049578,0.135341,0.151278,-0.07048,1.0,-0.247726,0.022679,0.020734,-0.014977,0.11015,-0.093918,-0.074242,-0.086878


In [None]:
shopping_df["OperatingSystems"].value_counts()

### 2.c. Data quality aspects
e.g. missing values and their potential effects and reasons, uneven distributions in certain attribute types, plausibility of values, outliers

### 2.d Visual exploration of data properties and hypotheses

### 2.e Sensitive data
Evaluate and document whether the data set contains attributes that are potentially ethically sensitive, minority classes or underrepresented data groups, unbalanced distributions with respect to bias (to guide over- and under-sampling, micro- and macro evaluation criteria)

### 2.f Potential risk and bias
What potential risks and additional types of bias exist in the data? What questions would you need to have answered by an external expert in order to determine potential bias or data quality issues?


### 2.g Which actions are likely required in data preparation based on this analysis?

## Data Preparation

In [None]:
## Each Activity that follows is part of the Data Preparation Phase

data_preparation_phase_executor = [
f':data_preparation_phase rdf:type prov:Activity .',
f':data_preparation_phase rdfs:label "Data Preparation Phase" .', 
]
engine.insert(data_preparation_phase_executor, prefixes=prefixes)

In [None]:
handle_outliers_code_writer = student_b
def handle_outliers(df:pd.DataFrame, outliers_report: dict) -> pd.DataFrame:
    # REMOVE OUTLIERS
    return df

start_time_td = now()
handle_outliers(data, outliers_report)
end_time_td = now()

#############################################
# Documentation
#############################################
# This is the continuation of the example from the Data Understanding phase above.
# There are three steps involved in this process:
# 1. activity creates a figure, report etc. => already done in data understanding phase
# 2. activity inspects the outcome and derives decisions => already done in data understanding phase
# 3. activity follows up on the decision by changing the data => in this case by removing the the outliers that were found

ro_ass_uuid_executor = "ec7e81e1-86ea-475a-a8d4-c7d8ee535488"
handle_outliers_executor = [
    f':handle_outliers prov:qualifiedAssociation :{ro_ass_uuid_executor} .',
    f':{ro_ass_uuid_executor} prov:agent :{executed_by} .',
    f':{ro_ass_uuid_executor} rdf:type prov:Association .',
    f':{ro_ass_uuid_executor} prov:hadRole :{code_executor_role} .',
]
engine.insert(handle_outliers_executor, prefixes=prefixes)

td_ass_uuid_writer = "1405f15a-3545-4014-a962-637f3c10a137"
td_comment = """
Removing all outliers that were identifying in the Data Understanding Phase.
"""
handle_outliers_activity = [
    ':handle_outliers rdf:type prov:Activity .',
    ':handle_outliers sc:isPartOf :data_preparation_phase .',
    ':handle_outliers rdfs:comment \'Data Preparation\' .', 
    f':handle_outliers rdfs:comment """{td_comment}""" .', 
    f':handle_outliers prov:startedAtTime "{start_time_td}"^^xsd:dateTime .',
    f':handle_outliers prov:endedAtTime "{end_time_td}"^^xsd:dateTime .',
    f':handle_outliers prov:qualifiedAssociation :{td_ass_uuid_writer} .',
    f':{td_ass_uuid_writer} prov:agent :{handle_outliers_code_writer} .',
    f':{td_ass_uuid_writer} rdf:type prov:Association .',
    f':{td_ass_uuid_writer} prov:hadRole :{code_writer_role} .',
    ':handle_outliers prov:used :data .',
    ':handle_outliers prov:used :outlier_decision .',
    ':cleaned_data rdf:type prov:Entity .',
    ':cleaned_data prov:wasGeneratedBy :handle_outliers .',
    ':cleaned_data prov:wasDerivedFrom :data .',
]
engine.insert(handle_outliers_activity, prefixes=prefixes)

**Continue with other tasks of the Data Preparation phase such as binning, scaling etc...**

In [None]:
# Your final transformed dataset should also be documented appropriately using Croissant, SI, etc.

prepared_data_triples = [
    ':prepared_data rdf:type prov:Entity .',
    ':prepared_data prov:wasDerivedFrom :cleaned_data .',
    ':prepared_data rdf:type sc:Dataset .',
    # ....
]
engine.insert(prepared_data_triples, prefixes=prefixes)

## Modeling

In [None]:
## Each Activity that follows is part of the Modeling Phase

modeling_phase_executor = [
f':modeling_phase rdf:type prov:Activity .',
f':modeling rdfs:label "Modeling Phase" .', 
]
engine.insert(modeling_phase_executor, prefixes=prefixes)


In [None]:
model_data_code_writer = student_a

#############################################
# Documentation 4a
#############################################

dma_ass_uuid_writer = "b3e840ab-ac23-415e-bd9c-6d00bb79c37a"
dma_comment = """
...
"""

identify_data_mining_algorithm_activity = [
    f':define_algorithm rdf:type prov:Activity .',
    f':define_algorithm sc:isPartOf :modeling_phase .',
    f':define_algorithm rdfs:comment """{dma_comment}""" .',
    f':define_algorithm prov:qualifiedAssociation :{dma_ass_uuid_writer} .',
    f':{dma_ass_uuid_writer} prov:agent :{model_data_code_writer} .',
    f':{dma_ass_uuid_writer} rdf:type prov:Association .',
    f':{dma_ass_uuid_writer} prov:hadRole :{code_writer_role} .',
    
    # example algorithm definition
    f':random_forest_algorithm rdf:type mls:Algorithm .',
    f':random_forest_algorithm rdfs:label "Random Forest Algorithm" .',

    # example implementation
    f':random_forrest_classifier_implementation rdf:type mls:Implementation .',
    f':random_forrest_classifier_implementation rdfs:label "Scikit-learn RandomForestClassifier" .',
    f':random_forrest_classifier_implementation mls:implements :random_forest_algorithm .',
    f':random_forrest_classifier_implementation prov:wasGeneratedBy :define_algorithm .',

    
    # you can also define your Evaluation Measures here
    
    # example evaluation 
    f':r2_score_measure rdf:type mls:EvaluationMeasure .',
    f':r2_score_measure rdfs:label "R-squared Score" .',
    f':r2_score_measure rdfs:comment "xxx" .',
    f':r2_score_measure prov:wasGeneratedBy :define_algorithm .',

    
]
engine.insert(identify_data_mining_algorithm_activity, prefixes=prefixes)

In [None]:
#############################################
# Documentation 4b
#############################################

hp_ass_uuid_writer = "fff582a8-c5cd-4030-978b-9f56b603167c"
hp_comment = """
...
"""
identify_hp_activity = [
    f':identify_hyperparameters rdf:type prov:Activity .',
    f':identify_hyperparameters sc:isPartOf :modeling_phase .',
    f':identify_hyperparameters rdfs:comment """{hp_comment}""" .',
    f':identify_hyperparameters prov:qualifiedAssociation :{hp_ass_uuid_writer} .',
    f':{hp_ass_uuid_writer} prov:agent :{model_data_code_writer} .',
    f':{hp_ass_uuid_writer} rdf:type prov:Association .',
    f':{hp_ass_uuid_writer} prov:hadRole :{code_writer_role} .',
    
    # example parameter
    f':hp_learning_rate rdf:type mls:HyperParameter .',
    f':hp_learning_rate rdfs:label "Learning Rate" .',
    f':hp_learning_rate rdfs:comment "..." .',
    f':random_forrest_classifier_implementation mls:hasHyperParameter :hp_learning_rate .',
    f':hp_learning_rate prov:wasGeneratedBy :identify_hyperparameters .',

    # continue with your identified hyperparameters
    
]
engine.insert(identify_hp_activity, prefixes=prefixes)

In [None]:
def split_data(df: pd.DataFrame):
    #do something
    return 'train_set', 'validation_set', 'test_set'

#############################################
# Documentation 4c
#############################################

### Define Train/Validation/Test splits
split_ass_uuid_writer = "fb58ae6c-9d58-44c9-ac7e-529111bdf7fc"
split_comment = """
...
"""
## Use your prepared dataset
input_dataset = ":prepared_data" 

define_split_activity = [
    f':define_data_split rdf:type prov:Activity .',
    f':define_data_split sc:isPartOf :modeling_phase .',
    f':define_data_split rdfs:comment "Train/Validation/Test Split Definition" .',
    f':define_data_split rdfs:comment """{split_comment}""" .',
    f':define_data_split prov:qualifiedAssociation :{split_ass_uuid_writer} .',
    f':{split_ass_uuid_writer} prov:agent :{model_data_code_writer} .',
    f':{split_ass_uuid_writer} rdf:type prov:Association .',
    f':{split_ass_uuid_writer} prov:hadRole :{code_writer_role} .',
    f':define_data_split prov:used {input_dataset} .',
    
    # Training Set
    f':training_set rdf:type sc:Dataset .',
    f':training_set rdfs:label "Training Set" .',
    f':training_set prov:wasGeneratedBy :define_data_split .',
    f':training_set prov:wasDerivedFrom {input_dataset} .',
    f':training_set rdfs:comment "Contains xx samples" .', 

    # Validation Set
    f':validation_set rdf:type sc:Dataset .',
    f':validation_set rdfs:label "Validation Set" .',
    f':validation_set prov:wasGeneratedBy :define_data_split .',
    f':validation_set prov:wasDerivedFrom {input_dataset} .',
    f':validation_set rdfs:comment "Contains xx samples" .', 

    # Test Set
    f':test_set rdf:type sc:Dataset .',
    f':test_set rdfs:label "Test Set" .',
    f':test_set prov:wasGeneratedBy :define_data_split .',
    f':test_set prov:wasDerivedFrom {input_dataset} .',
    f':test_set rdfs:comment "Contains xx samples" .', 

    
]
engine.insert(define_split_activity, prefixes=prefixes)

In [None]:
def train_and_finetune_model(training_set, validation_set):
    # do something here

    # Try to automate as much documentation work as possible.
    # Define your training runs with their respective hyperparameter settings, etc.
    # Document each time a training run, model, its hp_settings, evaluations, ...  
    # Create performance figures/graphs

    return 'Find most suitable model'


start_time_tafm = now()
# train_and_finetune_model()
end_time_tafm = now() 


#############################################
# Documentation 4d & e & f
#############################################

tafm_ass_uuid_writer = "21d60fe3-c9ab-4a0a-bae7-b9fe9653c755"
tafm_comment = """
...
"""

# EXAMPLE output from your training
training_run1 = "run_1" 
model_run1 = "model_run1"
hp1_setting_run1 = "hp_setting_run1"
eval_train_run1 = "metric_train_run1"
eval_validation_run1 = "metric_validation_run1"


train_model_activity = [
    # Activity 
    f':train_and_finetune_model rdf:type prov:Activity .',
    f':train_and_finetune_model sc:isPartOf :modeling_phase .',
    f':train_and_finetune_model rdfs:comment """{tafm_comment}""" .',
    f':train_and_finetune_model prov:startedAtTime "{start_time_tafm}"^^xsd:dateTime .',
    f':train_and_finetune_model prov:endedAtTime "{end_time_tafm}"^^xsd:dateTime .',
    f':train_and_finetune_model prov:qualifiedAssociation :{tafm_ass_uuid_writer} .',
    f':{tafm_ass_uuid_writer} prov:agent :{model_data_code_writer} .',
    f':{tafm_ass_uuid_writer} rdf:type prov:Association .',
    f':{tafm_ass_uuid_writer} prov:hadRole :{code_writer_role} .',
    
    ########################################
    # ONE model run - automate everything below!

    # Parameter settings
    f':{hp1_setting_run1} rdf:type mls:HyperParameterSetting .',
    f':{hp1_setting_run1} mls:specifiedBy :hp_learning_rate .',
    f':{hp1_setting_run1} mls:hasValue "1.23"^^xsd:double .',
    f':{hp1_setting_run1} prov:wasGeneratedBy :train_and_finetune_model .',
    # add your further parameters

    # Describe your Run
    f':{training_run1} rdf:type mls:Run .',
    f':{training_run1} sc:isPartOf :train_and_finetune_model .',
    f':{training_run1} mls:realizes :random_forest_algorithm .',
    f':{training_run1} rdf:label "Training Run 1 with..." .',
    f':{training_run1} mls:executes :your_implementation .', 
    f':{training_run1} mls:hasInput :training_set .',
    f':{training_run1} mls:hasInput :validation_set .',
    f':{training_run1} mls:hasInput :{hp1_setting_run1} .',     
    # list all your used parameters here
    f':{training_run1} mls:hasOutput :{model_run1} .',
    f':{training_run1} mls:hasOutput :{eval_train_run1} .',
    f':{training_run1} mls:hasOutput :{eval_validation_run1} .',

    # Describe your Model
    f':{model_run1} rdf:type mls:Model .',
    f':{model_run1} prov:label "xxx" .',
    f':{model_run1} prov:wasGeneratedBy :{training_run1} .',
    f':{model_run1} mlso:trainedOn :training_set .',
    f':{model_run1} mlso:hasAlgorithmType :random_forest_algorithm .',

    # Describe your evaluations
    # You can have multiple evaluations per model 
    f':{eval_train_run1} rdf:type mls:ModelEvaluation .',
    f':{eval_train_run1} prov:wasGeneratedBy :{training_run1} .',
    f':{eval_train_run1} mls:hasValue "1.23"^^xsd:double .',
    f':{eval_train_run1} mls:specifiedBy :r2_score_measure .',
    f':{eval_train_run1} prov:used :training_set .',

    f':{eval_validation_run1} rdf:type mls:ModelEvaluation .',
    f':{eval_validation_run1} prov:wasGeneratedBy :{training_run1} .',
    f':{eval_validation_run1} mls:hasValue "1.23"^^xsd:double .',
    f':{eval_validation_run1} mls:specifiedBy :r2_score_measure .',
    f':{eval_validation_run1} prov:used :validation_set .',

    # Dont forget to document any visualizations

]
engine.insert(train_model_activity, prefixes=prefixes)


In [None]:
def retrain_model_full_data(training_set, validation_set):
    
    # create your
    return "Final Trained Model"


start_time_tafm = now()
# train_and_finetune_model()
end_time_tafm = now() 


#############################################
# Documentation 4g
#############################################

retrain_ass_uuid_writer = "96815ee0-524c-437b-b5fa-2e15b945c993" # Generate once

final_training_activity = ":retrain_final_model"
final_model = ":final_model_entity"

# Document the retraining activity.
# Hint: This activity is still part of the :modeling_phase

retrain_documentation = [
    # your documentation here    
]
engine.insert(retrain_documentation, prefixes=prefixes)


## Evaluation

In [None]:
## Each Activity that follows is part of the Evaluation Phase

evaluation_phase_executor = [
f':evaluation_phase rdf:type prov:Activity .',
f':evaluation_phase rdfs:label "Evaluation Phase" .', 
]
engine.insert(evaluation_phase_executor, prefixes=prefixes)

In [None]:
eval_code_writer = student_b
def evaluate_on_test_data(final_model, test_set):

    # Predict and evaluation on test data
        
    return 'Performance'

start_time_eval = now()
#evaluate_on_test_data()
end_time_eval = now() 

#############################################
# Documentation
#############################################

eval_ass_uuid = "7f1431e9-feed-429a-92ed-c131b23cbe79" # Generate once
final_model = ":final_model_entity" 
test_set = ":test_set" 

eval_comment = """
...
"""

evaluate_activity = [
    f':evaluate_final_model rdf:type prov:Activity .',
    f':evaluate_final_model sc:isPartOf :evaluation_phase .',
    f':evaluate_final_model rdfs:label "Final Model Evaluation on Test Set" .',
    f':evaluate_final_model rdfs:comment """{eval_comment}""" .',
    f':evaluate_final_model prov:startedAtTime "{start_time_eval}"^^xsd:dateTime .',
    f':evaluate_final_model prov:endedAtTime "{end_time_eval}"^^xsd:dateTime .',
    f':evaluate_final_model prov:qualifiedAssociation :{eval_ass_uuid} .',
    
    f':{eval_ass_uuid} prov:agent :{eval_code_writer} .',
    f':{eval_ass_uuid} rdf:type prov:Association .',
    f':{eval_ass_uuid} prov:hadRole :{code_writer_role} .',

    # Inputs
    f':evaluate_final_model prov:used {final_model} .',
    f':evaluate_final_model prov:used {test_set} .',
    
    # Reference to Data Mining Success Criteria from Phase 1
    f':evaluate_final_model prov:used :bu_data_mining_success_criteria .',

    # Document you final model performance
 
    # Hint: you evaluate bias in this way:
    f':bias_evaluation_result rdf:type mls:ModelEvaluation .',
    f':bias_evaluation_result prov:wasGeneratedBy :evaluate_final_model .',
    f':bias_evaluation_result rdfs:label "Bias Analysis" .',
    f':bias_evaluation_result rdfs:comment "..." .',
    
]
engine.insert(evaluate_activity, prefixes=prefixes)

## Deployment

In [None]:
## Each Activity that follows is part of the Deployment Phase

deployment_phase_executor = [
f':deployment_phase rdf:type prov:Activity .',
f':deployment_phase rdfs:label "Deployment Phase" .', 
]
engine.insert(deployment_phase_executor, prefixes=prefixes)

In [None]:
#############################################
# Documentation
#############################################

comparison_and_recommendations_comment = """
...
"""

ethical_aspects_comment = """
...
"""

monitoring_plan_comment = """
...
"""

reproducibility_reflection_comment = """
...
"""

dep_ass_uuid_executor = "72a921e0-1234-4567-89ab-cdef01234567" # Generate once
deployment_executor = [
f':plan_deployment rdf:type prov:Activity .',
f':plan_deployment sc:isPartOf :deployment_phase .', # Connect to Parent Phase
f':plan_deployment rdfs:label "Plan Deployment"@en .',

f':plan_deployment prov:qualifiedAssociation :{dep_ass_uuid_executor} .',
f':{dep_ass_uuid_executor} prov:agent :{executed_by} .',
f':{dep_ass_uuid_executor} rdf:type prov:Association .',
f':{dep_ass_uuid_executor} prov:hadRole :{code_executor_role} .', 
]
engine.insert(deployment_executor, prefixes=prefixes)


deployment_data_executor = [
#6a
f':dep_recommendations rdf:type prov:Entity .',
f':dep_recommendations prov:wasGeneratedBy :plan_deployment .',
f':dep_recommendations rdfs:label "6a Business Objectives Reflection and Deployment Recommendations" .',
f':dep_recommendations rdfs:comment """{comparison_and_recommendations_comment}""" .',
#6b
f':dep_ethical_risks rdf:type prov:Entity .',
f':dep_ethical_risks prov:wasGeneratedBy :plan_deployment .',
f':dep_ethical_risks rdfs:label "6b Ethical Aspects and Risks" .',
f':dep_ethical_risks rdfs:comment """{ethical_aspects_comment}""" .',
#6c
f':dep_monitoring_plan rdf:type prov:Entity .',
f':dep_monitoring_plan prov:wasGeneratedBy :plan_deployment .',
f':dep_monitoring_plan rdfs:label "6c Monitoring Plan" .',
f':dep_monitoring_plan rdfs:comment """{monitoring_plan_comment}""" .',
#6d
f':dep_reproducibility_reflection rdf:type prov:Entity .',
f':dep_reproducibility_reflection prov:wasGeneratedBy :plan_deployment .',
f':dep_reproducibility_reflection rdfs:label "6d Reproducibility Reflection" .',
f':dep_reproducibility_reflection rdfs:comment """{reproducibility_reflection_comment}""" .',

]
engine.insert(deployment_data_executor, prefixes=prefixes)

# Generate Latex Report

The following cells give you an example of how to automatically create a Latex Report from your provenance documentation.

Feel free to use the example provided. If you use it, you should adapt and extend it with relevant sections/tables/plots/... 

In [76]:
base_iri = f"https://starvers.ec.tuwien.ac.at/BI2025/{group_id}/"

In [77]:
# This cell includes cleaning functions

from datetime import datetime

def latex_escape(text: str | None) -> str:
    if text is None: return ""
    text = str(text)
    text = text.replace("\\", r"\textbackslash{}")
    pairs = [
        ("&", r"\&"), ("%", r"\%"), ("$", r"\$"), ("#", r"\#"), 
        ("_", r"\_"), ("{", r"\{"), ("}", r"\}"), 
        ("~", r"\textasciitilde{}"), ("^", r"\textasciicircum{}")
    ]
    for k, v in pairs:
        text = text.replace(k, v)
    return text

def clean_rdf(x) -> str:
    if hasattr(x, "toPython"): return str(x.toPython())
    if x is None: return ""
    s = str(x).strip()
    s = s.strip('"').strip("'")
    s = s.strip()
    if "^^" in s:
        s = s.split("^^")[0].strip('"')
        
    return s

def fmt_iso(ts: str) -> str:
    if not ts: return ""
    try:
        clean_ts = ts.split("^^")[0].strip('"')
        clean_ts = clean_ts.replace("Z", "+00:00") if clean_ts.endswith("Z") else clean_ts
        return datetime.fromisoformat(clean_ts).strftime("%Y-%m-%d %H:%M:%S")
    except:
        return latex_escape(str(ts))

In [78]:
# This cell includes exemplary queries for different phases


### Author Block
author_query = f"""
{prefix_header}
PREFIX iao: <http://purl.obolibrary.org/obo/>

SELECT DISTINCT ?uri ?given ?family ?matr WHERE {{
  VALUES ?uri {{ :{student_a} :{student_b} }}
  
  ?uri a foaf:Person .
  ?uri foaf:givenName ?given .
  ?uri foaf:familyName ?family .
  ?uri iao:IAO_0000219 ?matr .
}}
"""

res_authors = engine.query(author_query)
author_block_latex = ""

if not res_authors.empty: # type:ignore
    for _, row in res_authors.iterrows(): # type:ignore

        uri_str = str(row['uri'])
        given = latex_escape(clean_rdf(row['given']))
        family = latex_escape(clean_rdf(row['family']))
        matr = latex_escape(clean_rdf(row['matr']))
        if student_a in uri_str:
            responsibility = "Student A"
        elif student_b in uri_str:
            responsibility = "Student B"
        else:
            responsibility = "Student"
        
        author_block_latex += rf"""
          \author{{{given} {family}}}
          \authornote{{{responsibility}, Matr.Nr.: {matr}}}
          \affiliation{{
            \institution{{TU Wien}}
            \country{{Austria}}
          }}
          """

### Business Understanding example
bu_query = f"""
{prefix_header}

SELECT ?ds_comment ?bo_comment WHERE {{
  OPTIONAL {{ :bu_data_source_and_scenario rdfs:comment ?ds_comment . }}
  OPTIONAL {{ :bu_business_objectives rdfs:comment ?bo_comment . }}
}} LIMIT 1
"""
res_bu = engine.query(bu_query)
row_bu = res_bu.iloc[0] if not res_bu.empty else {} # type:ignore
bu_data_source = latex_escape(clean_rdf(row_bu.get("ds_comment", "")))
bu_objectives  = latex_escape(clean_rdf(row_bu.get("bo_comment", "")))


### Data Understanding examples
# Example Dataset Description
du_desc_query = f"""
{prefix_header}
SELECT ?desc WHERE {{ :raw_data sc:description ?desc . }} LIMIT 1
"""
res_du_desc = engine.query(du_desc_query)
row_du_desc = res_du_desc.iloc[0] if not res_du_desc.empty else {} # type:ignore
du_description = latex_escape(clean_rdf(row_du_desc.get("desc", "")))

# Example Feature Columns Table
du_query = f"""
{prefix_header}

SELECT ?name (SAMPLE(?dtypeRaw) as ?dtype) (SAMPLE(?descRaw) as ?desc) WHERE {{
  :raw_data cr:recordSet ?rs .
  ?rs cr:field ?field .
  ?field sc:name ?name .
  ?field sc:description ?descRaw .
  ?field cr:dataType ?dtypeRaw .
}} 
GROUP BY ?name
ORDER BY ?name
"""
res_du = engine.query(du_query)
du_rows = []
if not res_du.empty: # type:ignore
    for _, f in res_du.iterrows(): # type:ignore
        dtype_raw = clean_rdf(f.get("dtype", ""))
        if '#' in dtype_raw: dtype = dtype_raw.split('#')[-1]
        elif '/' in dtype_raw: dtype = dtype_raw.split('/')[-1]
        else: dtype = dtype_raw
        
        desc = clean_rdf(f.get("desc", ""))
        row_str = f"{latex_escape(clean_rdf(f['name']))} & {latex_escape(dtype)} & {latex_escape(desc)} \\\\"
        du_rows.append(row_str)
du_table_rows = "\n    ".join(du_rows)

### Modeling example
# Hyperparameters
hp_query = f"""
{prefix_header}

SELECT ?hpName (SAMPLE(?hpValRaw) as ?hpVal) (MAX(?hpDescRaw) as ?hpDesc) WHERE {{
  ?run sc:isPartOf :train_and_finetune_model .
  ?run mls:hasInput ?setting .
  ?setting a mls:HyperParameterSetting .
  ?setting mls:hasValue ?hpValRaw .
  ?setting mls:specifiedBy ?hpDef .
  ?hpDef rdfs:label ?hpName .
  OPTIONAL {{ ?hpDef rdfs:comment ?hpDescRaw . }}
}} 
GROUP BY ?hpName
ORDER BY ?hpName
"""
res_hp = engine.query(hp_query)
hp_rows = []
if not res_hp.empty: #type:ignore
    for _, row in res_hp.iterrows(): #type:ignore
        name = latex_escape(clean_rdf(row['hpName']))
        val  = latex_escape(clean_rdf(row['hpVal']))
        desc = latex_escape(clean_rdf(row.get('hpDesc', '')))
        hp_rows.append(rf"{name} & {desc} & {val} \\")

hp_table_rows = "\n    ".join(hp_rows)

# Run Info
run_query = f"""
{prefix_header}

SELECT ?algoLabel ?start ?end ?metricLabel ?metricVal WHERE {{
  OPTIONAL {{ :train_and_finetune_model prov:startedAtTime ?start ; prov:endedAtTime ?end . }}
  OPTIONAL {{
      ?run sc:isPartOf :train_and_finetune_model .
      ?run mls:realizes ?algo .
      ?algo rdfs:label ?algoLabel .
  }}
  OPTIONAL {{
    ?run sc:isPartOf :train_and_finetune_model .
    ?run mls:hasOutput ?eval .
    ?eval a mls:ModelEvaluation ; mls:hasValue ?metricVal .
    OPTIONAL {{ ?eval mls:specifiedBy ?m . ?m rdfs:label ?metricLabel . }}
  }}
}} LIMIT 1
"""
res_run = engine.query(run_query)
row_run = res_run.iloc[0] if not res_run.empty else {} #type:ignore
mod_algo  = latex_escape(clean_rdf(row_run.get("algoLabel", "")))
mod_start = latex_escape(fmt_iso(clean_rdf(row_run.get("start"))))
mod_end   = latex_escape(fmt_iso(clean_rdf(row_run.get("end"))))
mod_m_lbl = latex_escape(clean_rdf(row_run.get("metricLabel", "")))
raw_val = clean_rdf(row_run.get('metricVal', ''))
mod_m_val = f"{float(raw_val):.4f}" if raw_val else ""

print("Data extraction done.")

ZoneInfoNotFoundError: 'No time zone found with key W. Europe Standard Time'

The following includes the Latex report itself. It fills in the query-results from the cell before. The ACM Template is already filled. 
Make sure that you update Student A and B accordingly.

In [79]:
latex_content = rf"""\documentclass[sigconf]{{acmart}}

\AtBeginDocument{{ \providecommand\BibTeX{{ Bib\TeX }} }}
\setcopyright{{acmlicensed}}
\copyrightyear{{2025}}
\acmYear{{2025}}
\acmDOI{{XXXXXXX.XXXXXXX}}

\acmConference[BI 2025]{{Business Intelligence}}{{-}}{{-}}

\begin{{document}}

\title{{BI2025 Experiment Report - Group {group_id}}}
%% ---Authors: Dynamically added ---
{author_block_latex}

\begin{{abstract}}
  This report documents the machine learning experiment for Group {group_id}, following the CRISP-DM process model.
\end{{abstract}}

\ccsdesc[500]{{Computing methodologies~Machine learning}}
\keywords{{CRISP-DM, Provenance, Knowledge Graph, Machine Learning}}

\maketitle

%% --- 1. Business Understanding ---
\section{{Business Understanding}}

\subsection{{Data Source and Scenario}}
{bu_data_source}

\subsection{{Business Objectives}}
{bu_objectives}

%% --- 2. Data Understanding ---
\section{{Data Understanding}}
\textbf{{Dataset Description:}} {du_description}

The following features were identified in the dataset:

\begin{{table}}[h]
  \caption{{Raw Data Features}}
  \label{{tab:features}}
  \begin{{tabular}}{{lp{{0.2\linewidth}}p{{0.4\linewidth}}}}
    \toprule
    \textbf{{Feature Name}} & \textbf{{Data Type}} & \textbf{{Description}} \\
    \midrule
    {du_table_rows}
    \bottomrule
  \end{{tabular}}
\end{{table}}

%% --- 3. Data Preparation ---
\section{{Data Preparation}}
\subsection{{Data Cleaning}}
Describe your Data preparation steps here and include respective graph data.


%% --- 4. Modeling ---
\section{{Modeling}}

\subsection{{Hyperparameter Configuration}}
The model was trained using the following hyperparameter settings:

\begin{{table}}[h]
  \caption{{Hyperparameter Settings}}
  \label{{tab:hyperparams}}
  \begin{{tabular}}{{lp{{0.4\linewidth}}l}}
    \toprule
    \textbf{{Parameter}} & \textbf{{Description}} & \textbf{{Value}} \\
    \midrule
    {hp_table_rows}
    \bottomrule
  \end{{tabular}}
\end{{table}}

\subsection{{Training Run}}
A training run was executed with the following characteristics:
\begin{{itemize}}
    \item \textbf{{Algorithm:}} {mod_algo}
    \item \textbf{{Start Time:}} {mod_start}
    \item \textbf{{End Time:}} {mod_end}
    \item \textbf{{Result:}} {mod_m_lbl} = {mod_m_val}
\end{{itemize}}

%% --- 5. Evaluation ---
\section{{Evaluation}}

%% --- 6. Deployment ---
\section{{Deployment}}

\section{{Conclusion}}

\end{{document}}
"""

NameError: name 'author_block_latex' is not defined

In [None]:
# This cell stores the Latex report to the data/report directory

out_dir = os.path.join("data", "report")
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, "experiment_report.tex")

with open(out_path, "w", encoding="utf-8") as f:
    f.write(latex_content)

print(f"Report written to: {out_path}")

Report written to: data/report/experiment_report.tex
