## First attempt

In [None]:
import requests
import os
import sys
import subprocess
import time
import selectors 

# --- CONFIGURATION ---
OLLAMA_URL = "http://localhost:11434/api/generate"
MODEL_NAME = "llama3"  # Ensure you have this model: `ollama pull llama3`
WORKSPACE_DIR = "./contest_workspace"
PYTHON_EXEC = sys.executable 

# ROOT DATA PATH (Adjust this to your actual path)
ALL_DATA_PATH = "/Users/minhtuan/Documents/Documents/Work/Hanoi/crawler/all_datasets" 

# Setup Directories
os.makedirs(WORKSPACE_DIR, exist_ok=True)
os.makedirs(ALL_DATA_PATH, exist_ok=True)


# --- 1. LLM COMMUNICATION LAYER ---
def call_llm(system_prompt, user_prompt):
    """Sends request to Ollama with a strict JSON payload."""
    full_prompt = f"System: {system_prompt}\nUser: {user_prompt}"
    payload = {
        "model": MODEL_NAME, 
        "prompt": full_prompt, 
        "stream": False,
        "options": {
            "temperature": 0.2, # Low temp for precise coding
            "num_ctx": 16384,    # Large context for code + error logs
            "stop": ["User:", "System:", "```python\n\n"] # Stop tokens
        }
    }
    try:
        print("   ðŸ§  (Agent is thinking...)", end="\r")
        response = requests.post(OLLAMA_URL, json=payload)
        response.raise_for_status()
        return response.json().get('response', '')
    except Exception as e:
        return f"LLM_ERROR: {str(e)}"

# def save_code_to_file(code_text, filename):
#     """Extracts Python block and saves to file."""
#     clean_code = code_text
#     # Extract code between markdown ticks
#     if "```python" in code_text:
#         clean_code = code_text.split("```python")[1].split("```")[0]
#     elif "```" in code_text:
#         clean_code = code_text.split("```")[1].split("```")[0]
    
#     clean_code = clean_code.strip()
    
#     # Remove accidental starting text
#     lines = clean_code.splitlines()
#     if lines and lines[0].lower().startswith("python"):
#         clean_code = "\n".join(lines[1:])

#     filepath = os.path.join(WORKSPACE_DIR, filename)
#     with open(filepath, "w") as f:
#         f.write(clean_code)
#     return filepath, clean_code

# def execute_code(filepath):
#     """Executes the script and returns (Success_Bool, Output/Error)."""
#     print(f"   âš¡ Executing {os.path.basename(filepath)}...")
#     try:
#         result = subprocess.run(
#             [PYTHON_EXEC, os.path.basename(filepath)], 
#             cwd=WORKSPACE_DIR,       
#             capture_output=True,     
#             text=True,               
#             timeout=120 
#         )
#         if result.returncode == 0:
#             return True, result.stdout
#         else:
#             return False, result.stderr
#     except subprocess.TimeoutExpired:
#         return False, "TIMEOUT: Script took longer than 120 seconds."
#     except Exception as e:
#         return False, str(e)

def execute_script(script_name, work_dir = ".", device="0"):    
    if not os.path.exists(os.path.join(work_dir, script_name)):
        raise Exception(f"The file {script_name} does not exist.")
    try:
        script_path = script_name
        device = device        
        cmd = f"CUDA_VISIBLE_DEVICES={device} python -u {script_path}"
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=True, cwd=work_dir)

        stdout_lines = []
        stderr_lines = []

        selector = selectors.DefaultSelector()
        selector.register(process.stdout, selectors.EVENT_READ)
        selector.register(process.stderr, selectors.EVENT_READ)

        while process.poll() is None and selector.get_map():
            events = selector.select(timeout=1)

            for key, _ in events:
                line = key.fileobj.readline()
                if key.fileobj == process.stdout:
                    # print("STDOUT:", line, end =" ")
                    stdout_lines.append(line)
                else:
                    # print("STDERR:", line, end =" ")
                    stderr_lines.append(line)

        for line in process.stdout:
            line = line
            # print("STDOUT:", line, end =" ")
            stdout_lines.append(line)
        for line in process.stderr:
            line = line
            # print("STDERR:", line, end =" ")
            stderr_lines.append(line)

        return_code = process.returncode

        if return_code != 0:
            observation = "".join(stderr_lines)
        else:
            observation = "".join(stdout_lines)
        if observation == "" and return_code == 0:
            # printed to stderr only
            observation = "".join(stderr_lines)
        return return_code, "The script has been executed. Here is the output:\n" + observation
    
    except Exception as e:
        print("++++", "Wrong!")
        # raise Exception(f"Something went wrong in executing {script_name}: {e}. Please check if it is ready to be executed.")
        return -1, f"Something went wrong in executing {script_name}: {e}. Please check if it is ready to be executed."


class BaseAgent:
    """Parent class capable of Generating and Fixing code."""
    def __init__(self, role, system_instruction, specs, agent_profile):
        self.specs = specs
        self.role = role
        self.system = system_instruction
        self.code_path = "/Users/minhtuan/Documents/Documents/Work/Hanoi/crawler/foundation-ml/bucket-code"
        self.root_path_data = "/Users/minhtuan/Documents/Documents/Work/Hanoi/crawler/all_datasets"
        self.agent_profile = agent_profile
        self.device = 0

    def make_plan(self, data_description, requirement, plan_conditions):

        prompt = """Devise an end-to-end actionable plan for {} task according to the user's requirements described in the following JSON object.

        # User Requirements & Data Info
        ```json
        {{
            "Data Description": "{}",
            "Requirements": "{}"
            "Root path data": "{}"
        }}
        ```
        When devising a plan, follow these instructions and do not forget them:
        {}
        """
        prompt = prompt.format(
                self.role,
                data_description,
                requirement,
                self.root_path_data,
                self.system
        )

        return call_llm(self.agent_profile, prompt)

    def implement_solution(self, code_instructions, code="", n_attempts=5):

        print(
            self.role,
            "I am implementing the following instruction"
        )

        log = "Nothing. This is your first attempt."
        error_logs = []
        code = code  # if a template/skeleton code is provided
        iteration = 0
        completion = None
        action_result = ""
        rcode = -1


        while iteration < n_attempts:
            try:
                exec_prompt = """Carefully read the following instructions to write Python code for {} task.
                {}
                
                # Previously Written Code
                ```python
                {}
                ```
                
                # Error from the Previously Written Code
                {}
                

                Note that you need to write the python code for the {}. 
                Start the python code with "```python". Please ensure the completeness of the code so that it can be run without additional modifications.
                If there is any error from the previous attempt, please carefully fix it first."""
       
                exec_prompt = exec_prompt.format(
                    self.specs['requirements'],
                    code_instructions,
                    code,
                    log,
                    "data processing part",
                )

                res = call_llm(self.agent_profile, exec_prompt)
                raw_completion = res
                completion = raw_completion.split("```python")[1].split("```")[0]
                
                if not completion.strip(" \n"):
                    continue
                
                self.code_name = self.role 
                filename = f"{self.code_path}{self.code_name}.py"
                os.makedirs(os.path.dirname(filename), exist_ok=True)
                with open(filename, "wt") as file:
                    file.write(completion)
                code = completion
                rcode, log = self.self_validation(filename)
                if rcode == 0:
                    action_result = log
                    break
                else:
                    log = log
                    error_logs.append(log)
                    action_result = log
                    print(self.agent_type, f"I got this error (itr #{iteration}): {log}")
                    iteration += 1                    
                    # break
            except Exception as e:
                iteration += 1
                print(self.role, f"===== Retry: {iteration} =====")
                print(self.role, f"Executioin error occurs: {e}")
            continue
        if not completion:
            completion = ""

        print(
            self.role,
            f"I executed the given plan and got the follow results:\n\n{action_result}",
        )
        return {"rcode": rcode, "action_result": action_result, "code": completion, "error_logs": error_logs}


    def run_code(self, completion, code="", n_attempts=5):

        print(
            self.role,
            "I am implementing the following instruction"
        )

        log = "Nothing. This is your first attempt."
        error_logs = []
        code = code  # if a template/skeleton code is provided
        iteration = 0
        action_result = ""
        rcode = -1

        while iteration < n_attempts:
            try:
                raw_completion = completion
                if "```python" in raw_completion:
                    completion = raw_completion.split("```python")[1].split("```")[0]
                elif "```Python" in raw_completion:
                    completion = raw_completion.split("```Python")[1].split("```")[0]
                else:
                    completion = raw_completion.split("```")[1].split("```")[0]

                if not completion.strip(" \n"):
                    continue
                self.code_name = self.role 
                filename = f"{self.code_path}/{self.code_name}.py"
                os.makedirs(os.path.dirname(filename), exist_ok=True)
                with open(filename, "wt") as file:
                    file.write(completion)
                code = completion
                rcode, log = self.self_validation(filename)
                if rcode == 0:
                    action_result = log
                    break
                else:
                    log = log
                    error_logs.append(log)
                    action_result = log
                    print(self.role, f"I got this error (itr #{iteration}): {log}")
                    iteration += 1                    
                    # break
            except Exception as e:
                iteration += 1
                print(self.role, f"===== Retry: {iteration} =====")
                print(self.role, f"Executioin error occurs: {e}")
            continue
        if not completion:
            completion = ""

        print(
            self.role,
            f"I executed the given plan and got the follow results:\n\n{action_result}",
        )
        return {"rcode": rcode, "action_result": action_result, "code": completion, "error_logs": error_logs}


    def self_validation(self, filename):
        rcode, log = execute_script(filename, device=self.device)
        return rcode, log

class Reflexion:

    def __init__(self, specs, code):

        self.specs = specs 
        self.code = code 

    def self_reflexion(self):
        """ 
        This is for self-reflexion to self check code to make sure logic + syntax is right before executive
        """

        agent_profile = """ 
                    You are a Senior Python Code Reviewer and Quality Assurance Engineer
                    
                    Objective: 

                    Audit the generated code to ensure it is runnable, efficient, and error-free. You must strictly enforce the following rules:
                    
                    1. Logical Correctness: Verify the code logic is sound. Ensure variables are defined before use and data flows correctly through the pipeline.
                    2. Strict Import Verification (CRITICAL):
                        - No "Ghost" Imports: Scan the code for every external class or function used.
                        - Mapping Check: You must verify that a corresponding import or from ... import ... statement exists at the top of the file for every single tool used.
                    
                    3. Structural Efficiency: Ensure all classes and functions have a clear purpose. Remove any redundant code or "pass" blocks that do not contribute to the solution.
                    4. Requirement Compliance: The code must produce the exact output format requested by the user (e.g., specific CSV filenames or print statements).
                    5. Using complete: Ensure all library from import is using right syntax in the solution.  
                    """

        prompt = """ 
                    Carefully read the requirement for task: 
                    {} 

                    Carefully read the description of data for task
                    {} 

                    And review code result below: 
                    {}

                    If there is any error from the previous attempt, please carefully fix it first. Delete all columns uesless!

                    Response return will have two components (in two list):
                    - Error you check and solution
                    - Full code complete 
                """    

        prompt = prompt.format(
                    self.specs['requirements'],
                    self.specs['data_description'],
                    self.code
        )

        response = call_llm(agent_profile, prompt) 
        return response 

class DataAgent(BaseAgent):
    def __init__(self, specs):
        system = (
            "TASK: Write a robust Python script to Find, Load, and Preprocess data.\n"
            "STRICT RULES:\n"
            "1. USE `os.walk` to find the folder matching the Contest Name.\n"
            "2. CHECK file count using `glob`. IF 1 file -> Split 80/20. IF 2 files -> Load separately.\n"
            "3. PREPROCESS: Handle missing values and categorical encoding and delete cols useless.\n"
            "4. OUTPUT: Save 'processed_train.csv' and 'processed_test.csv'.\n"
            "5. PRINT 'DATA_READY: processed_train.csv processed_test.csv' at the very end.\n"
            "OUTPUT FORMAT: Single valid ```python block."
        )

        agent_profile = """You are the world's best data scientist of an automated machine learning project (AutoML) that can find the most relevant datasets,run useful preprocessing, perform suitable data augmentation, and make meaningful visulaization to comprehensively understand the data based on the user requirements. You have the following main responsibilities to complete.
                        1. Retrieve a dataset from the user or search for the dataset based on the user instruction.
                        2. Perform data preprocessing based on the user instruction or best practice based on the given tasks.
                        3. Perform data augmentation as neccesary.
                        4. Extract useful information and underlying characteristics of the dataset."""

        super().__init__("Data_Engineering", system, specs, agent_profile)


# --- 3. THE MANAGER (WORKFLOW ORCHESTRATOR) ---

class CopilotManager:
    def __init__(self, specs):
        self.specs = specs
        self.data_agent = DataAgent(specs)
        # self.model_agent = ModelAgent(specs)

    def run_agent_cycle(self, agent, max_retries=3):
        """
        The Core Loop: Gen -> Exec -> Fix -> Retry
        """
        print(f"\nðŸ”¹ STARTED AGENT: {agent.role}")

        print(f"\n Generating plan and code for: {agent.role} ")

        code = agent.make_plan(self.specs["data_description"], self.specs["requirements"], self.data_agent.system)
        
        print("__"* 69 )
        print(f"\n Self-reflexion code for: {agent.role} ")

        # re_code = Reflexion(self.specs, code)
        # print(re_code)

        # print(f"\n Generating code and self-reflexion for: {agent.role} ")
        # # results = agent.implement_solution(plan)
        # results = agent.run_code(code)
            
        return code

    def start_workflow(self):
        print(f"ðŸš€ STARTING WORKFLOW: {self.specs['name']}")
        start_time = time.time()

        # ==========================================
        # 1. DATA AGENT PHASE
        # ==========================================
        output = self.run_agent_cycle(self.data_agent)

        return output
        
# --- 4. EXECUTION ---
if __name__ == "__main__":
    
    # Requirement Spec
    my_challenge = {
        "name": "Titanic",
        "requirements": "Predict survival (0/1)",
        
        "data_description": """
        The dataset contains the following columns:
        - PassengerId: Unique ID (Drop this for training)
        - Survived: Target variable (0 = No, 1 = Yes)
        - Pclass: Ticket class (1, 2, 3). Treat as Ordinal.
        - Name: Passenger name. (Extract Title like Mr/Mrs to create new feature 'Title')
        - Sex: 'male'/'female'. (Map to 0/1)
        - Age: Numeric. Contains missing values (Fill with Median by Title).
        - Cabin: Cabin number. Many missing. (Extract first letter as 'Deck', fill missing with 'Unknown')
        - Embarked: Port of Embarkation (C, Q, S). (One-Hot Encode).
        """,
        
        "target_metric": "Accuracy",
        "output_expectations": "submission.csv with PassengerId, Survived"
    }

    # Run
    manager = CopilotManager(my_challenge)
    # manager.start_workflow()

In [52]:
data_agent = DataAgent(my_challenge)
a = manager.run_agent_cycle(data_agent)


ðŸ”¹ STARTED AGENT: Data_Engineering

 Generating plan and code for: Data_Engineering 
__________________________________________________________________________________________________________________________________________


In [53]:
if "```Python" in a:
    code = a.split("```Python")[1].split("```")[0]
    review = Reflexion(my_challenge, code)
    re_code = review.self_reflexion()

re_code

'As a Senior Python Code Reviewer and Quality Assurance Engineer, I\'ve reviewed the provided code and identified some issues that need to be addressed. Here are my findings:\n\n**Error 1:**\nThe code is trying to split a single file into train and test sets using `train_test_split`. However, this approach is not suitable for a single file. Instead, we should use stratified splitting or simply use the entire dataset as the training set.\n\n**Solution:** Remove the `train_test_split` variable and use the entire dataset as the training set.\n\n**Error 2:**\nThe code is trying to load two files (train and test) when there\'s only one file. This will cause an error when trying to split the data into train and test sets.\n\n**Solution:** Load the single file and use it as both the training and testing sets.\n\n**Error 3:**\nThe code is not handling missing values in the `Age` column correctly. The median imputer should be used with caution, as it can introduce bias if there are many missing

In [54]:
m = re_code.split("the corrected code:\n\n```")[1].split("```")[0]
n = m.strip("\n")


with open('/Users/minhtuan/Documents/Documents/Work/Hanoi/crawler/foundation-ml/bucket-code/re_write.py', "wt") as file:
                    file.write(n)

In [31]:
import os
import glob
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define constants
CONTEST_NAME = 'Titanic'
TRAIN_FILE = f'{CONTEST_NAME}_train.csv'
TEST_FILE = f'{CONTEST_NAME}_test.csv'

def find_files(contest_name):
    root_dir = '/Users/minhtuan/Documents/Documents/Work/Hanoi/crawler/all_datasets'
    for dirpath, dirnames, filenames in os.walk(root_dir):
        if contest_name in dirpath:
            return [os.path.join(dirpath, f) for f in filenames]
    return []

def preprocess_data(train_file, test_file):
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(test_file)

    categorical_cols = ['Pclass', 'Sex', 'Embarked']
    numerical_cols = ['Age']

    imputer = SimpleImputer(strategy='median')
    categorical_encoder = OneHotEncoder(handle_unknown='ignore')

    preprocessor = ColumnTransformer(
        transformers=[
            ('categorical', categorical_encoder, categorical_cols),
            ('numerical', imputer, numerical_cols)
        ]
    )

    train_df[['Age']] = preprocessor.fit_transform(train_df[['Age']])
    test_df[['Age']] = preprocessor.transform(test_df[['Age']])

    train_df['Title'] = train_df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip().title())
    test_df['Title'] = test_df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip().title())

    sex_mapping = {'male': 0, 'female': 1}
    train_df['Sex'] = train_df['Sex'].map(sex_mapping)
    test_df['Sex'] = test_df['Sex'].map(sex_mapping)

    cabin_mapping = {'': 'Unknown'}
    train_df['Cabin'] = train_df['Cabin'].apply(lambda x: x[0] if pd.notna(x) else 'Unknown')
    test_df['Cabin'] = test_df['Cabin'].apply(lambda x: x[0] if pd.notna(x) else 'Unknown')

    embarked_encoder = OneHotEncoder(handle_unknown='ignore')
    train_df[['Embarked']] = embarked_encoder.fit_transform(train_df[['Embarked']])
    test_df[['Embarked']] = embarked_encoder.transform(test_df[['Embarked']])

    train_df.drop('PassengerId', axis=1, inplace=True)
    test_df.drop('PassengerId', axis=1, inplace=True)

    return train_df, test_df

# Find files
files = find_files(CONTEST_NAME)
if len(files) == 1:
    file_count = 1
    train_file, test_file = files[0], None
elif len(files) == 2:
    file_count = 2
    train_file, test_file = files
else:
    print("Error: No matching files found.")
    exit()

# Split data if necessary
if file_count == 1:
    from sklearn.model_selection import train_test_split
    train_df, test_df = train_test_split(pd.read_csv(train_file), test_size=0.2)
    train_file, test_file = 'processed_train.csv', 'processed_test.csv'
else:
    train_df, test_df = preprocess_data(train_file, test_file)

# Save preprocessed data
train_df.to_csv('processed_train.csv', index=False)
test_df.to_csv('processed_test.csv', index=False)

print("DATA_READY: processed_train.csv processed_test.csv")

DATA_READY: processed_train.csv processed_test.csv


In [56]:
import os
import glob
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Set root path to find the dataset folder
root_path = "/Users/minhtuan/Documents/Documents/Work/Hanoi/crawler/all_datasets"

# Find the folder matching the Contest Name
contest_folder = [folder for folder in os.listdir(root_path) if "titanic" in folder][0]
data_folder = os.path.join(root_path, contest_folder)

# Check file count using glob
files = glob.glob(os.path.join(data_folder, "*.csv"))
file_count = len(files)
if file_count == 1:
    # Load the single file as both train and test sets
    data_file = files[0]
    train_data = pd.read_csv(os.path.join(data_folder, data_file))
    test_data = train_data.copy()
elif file_count > 1:
    raise ValueError("Invalid number of files found")

# Define preprocessing steps
categorical_cols = ["Pclass", "Sex", "Embarked"]
numerical_cols = ["Age"]

# Preprocess categorical columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('encoder', OrdinalEncoder(categories=[['1st', '2nd', '3rd'], ['male', 'female'], ['C', 'Q', 'S']]))
])

# Preprocess numerical columns
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', categorical_transformer, categorical_cols),
        ('numerical', numerical_transformer, numerical_cols)
    ]
)

train_data_processed = preprocessor.fit_transform(train_data.drop("PassengerId", axis=1))
test_data_processed = preprocessor.transform(test_data.drop("PassengerId", axis=1))

# Verify preprocessing steps
assert train_data_processed.shape[0] == train_data.shape[0]
assert test_data_processed.shape[0] == test_data.shape[0]

# Save processed data
train_data_processed_df = pd.DataFrame(train_data_processed, columns=categorical_cols + numerical_cols)
test_data_processed_df = pd.DataFrame(test_data_processed, columns=categorical_cols + numerical_cols)

train_data_processed_df.to_csv('processed_train.csv', index=False)
test_data_processed_df.to_csv('processed_test.csv', index=False)

print("DATA_READY: processed_train.csv processed_test.csv")

ValueError: Found unknown categories [1, 2, 3] in column 0 during fit

In [None]:
class ModelAgent(BaseAgent):
    def __init__(self, specs):
        self.specs = specs
        # Placeholder system prompt, will be updated when data is ready
        self.train_file = "processed_train.csv"
        self.test_file = "processed_test.csv"
        self._update_system()

    def _update_system(self):
        system = (
            f"You are a Kaggle Grandmaster. Goal: Maximize {self.specs['target_metric']}.\n"
            f"INPUT FILES: {self.train_file}, {self.test_file}\n"
            "TASK: Write Python code to Train a Model and Generate Submission.\n"
            "STEPS:\n"
            "1. Load the processed CSV files.\n"
            "2. Train a robust model (RandomForest/XGBoost/LightGBM).\n"
            "3. Evaluate on internal validation set.\n"
            "4. Generate 'submission.csv'.\n"
            "5. PRINT 'FINAL_SCORE: <score>' at the end.\n"
            "OUTPUT FORMAT: Single valid ```python block."
        )
        super().__init__("Model Training", system)
    
    def set_data_files(self, train, test):
        self.train_file = train
        self.test_file = test
        self._update_system()


    def generate_initial_code(self, plan):
        """Step 1: Create code from scratch."""
        prompt = """Write the Python code for the {} task based on your System Instructions:
                    {}"""

        prompt = prompt.format(
                        self.role, 
                        plan
                    )

        return call_llm(self.system, prompt)

    def fix_code(self, broken_code, error_log):
        """Step 2: Self-Reflection & Fix."""
        prompt = f"""
        ### SELF-REFLECTION & FIX TASK
        Your previous code failed to execute.
        
        **1. THE BROKEN CODE:**
        ```python
        {broken_code}
        ```
        **2. THE ERROR LOG:**
        {error_log}
        
        **INSTRUCTION:**
        - Analyze the error log carefully.
        - Rewrite the code to fix the specific bug.
        - Return the COMPLETE corrected Python script.
        """
        return call_llm(self.system, prompt)


In [82]:

import os
import glob
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Task: Write a robust Python script to Find, Load, and Preprocess data.
# Follow these instructions:
# 1. Use `os.walk` to find the folder matching the Contest Name.
# 2. Check file count using `glob`. IF 1 file -> Split 80/20. IF 2 files -> Load separately.
# 3. Preprocess: Handle missing values and categorical encoding, delete useless cols.
# 4. Output: Save 'processed_train.csv' and 'processed_test.csv'.
# 5. Print 'DATA_READY: processed_train.csv processed_test.csv' at the very end.

def find_data(contest_name):
    # Use os.walk to find the folder matching the Contest Name
    for root, dirs, files in os.walk('/path/to/contest'):
        if contest_name in root:
            return root

def load_data(data_dir):
    # Check file count using glob. IF 1 file -> Split 80/20. IF 2 files -> Load separately.
    files = glob.glob(os.path.join(data_dir, '*.csv'))
    if len(files) == 1:
        train_test_df = pd.read_csv(files[0])
        train_df, test_df = train_test_split(train_test_df, test_size=0.2, random_state=42)
    elif len(files) == 2:
        train_df = pd.read_csv(files[0])
        test_df = pd.read_csv(files[1])
    else:
        raise ValueError("Invalid number of files found")

    return train_df, test_df

def preprocess_data(train_df):
    # Preprocess: Handle missing values and categorical encoding, delete useless cols.
    # Extract Title from Name
    train_df['Title'] = train_df['Name'].apply(lambda x: x.split(', ')[0].strip().title())

    # Map Sex to 0/1
    sex_mapping = {'male': 0, 'female': 1}
    train_df['Sex'] = train_df['Sex'].map(sex_mapping)

    # Fill missing Age values with Median by Title
    imputer = SimpleImputer(strategy='median')
    age_pipeline = Pipeline(steps=[('imputer', imputer)])
    train_df[['Age']] = age_pipeline.fit_transform(train_df[['Age']])

    # Extract first letter of Cabin as 'Deck'
    train_df['Cabin'] = train_df['Cabin'].apply(lambda x: x[0].upper() if pd.notna(x) else 'Unknown')

    # One-Hot Encode Embarked
    embarked_encoder = OneHotEncoder(handle_unknown='ignore')
    embarked_pipeline = Pipeline(steps=[('encoder', embarked_encoder)])
    embarked_pipeline.fit(train_df[['Embarked']])
    train_df[['Embarked']] = embarked_pipeline.transform(train_df[['Embarked']])

    return train_df

# Find the folder matching the Contest Name
data_dir = find_data('titanic')

# Load data
train_df, test_df = load_data(data_dir)

# Preprocess data
processed_train_df = preprocess_data(train_df)

# Save preprocessed data
processed_train_df.to_csv('processed_train.csv', index=False)
test_df.to_csv('processed_test.csv', index=False)

print("DATA_READY: processed_train.csv processed_test.csv")


TypeError: expected str, bytes or os.PathLike object, not NoneType

## Second attempt

REDESIGN WORKFLOW from FIRST ATTEMPT

1. Based on previous foundation attempt, I created a workflow basis: 
```Gen -> Self check -> Exec -> Fix -> Retry. ```

2. But in real life, this is quite not suitable, circle in second will focus on loop: ``` gen -> self check -> exec -> re-gen.... ```

In [83]:
import requests
import os
import sys
import subprocess
import time
import selectors 

# --- CONFIGURATION ---
OLLAMA_URL = "http://localhost:11434/api/generate"
MODEL_NAME = "llama3"  # Ensure you have this model: `ollama pull llama3`
WORKSPACE_DIR = "./contest_workspace"
PYTHON_EXEC = sys.executable 

# ROOT DATA PATH (Adjust this to your actual path)
ALL_DATA_PATH = "/Users/minhtuan/Documents/Documents/Work/Hanoi/crawler/all_datasets" 

# Setup Directories
os.makedirs(WORKSPACE_DIR, exist_ok=True)
os.makedirs(ALL_DATA_PATH, exist_ok=True)


# --- 1. LLM COMMUNICATION LAYER ---
def call_llm(system_prompt, user_prompt):
    """Sends request to Ollama with a strict JSON payload."""
    full_prompt = f"System: {system_prompt}\nUser: {user_prompt}"
    payload = {
        "model": MODEL_NAME, 
        "prompt": full_prompt, 
        "stream": False,
        "options": {
            "temperature": 0.2, # Low temp for precise coding
            "num_ctx": 16384,    # Large context for code + error logs
            "stop": ["User:", "System:", "```python\n\n"] # Stop tokens
        }
    }
    try:
        print("   ðŸ§  (Agent is thinking...)", end="\r")
        response = requests.post(OLLAMA_URL, json=payload)
        response.raise_for_status()
        return response.json().get('response', '')
    except Exception as e:
        return f"LLM_ERROR: {str(e)}"

def execute_script(script_name, work_dir = ".", device="0"):    
    if not os.path.exists(os.path.join(work_dir, script_name)):
        raise Exception(f"The file {script_name} does not exist.")
    try:
        script_path = script_name
        device = device        
        cmd = f"CUDA_VISIBLE_DEVICES={device} python -u {script_path}"
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=True, cwd=work_dir)

        stdout_lines = []
        stderr_lines = []

        selector = selectors.DefaultSelector()
        selector.register(process.stdout, selectors.EVENT_READ)
        selector.register(process.stderr, selectors.EVENT_READ)

        while process.poll() is None and selector.get_map():
            events = selector.select(timeout=1)

            for key, _ in events:
                line = key.fileobj.readline()
                if key.fileobj == process.stdout:
                    # print("STDOUT:", line, end =" ")
                    stdout_lines.append(line)
                else:
                    # print("STDERR:", line, end =" ")
                    stderr_lines.append(line)

        for line in process.stdout:
            line = line
            # print("STDOUT:", line, end =" ")
            stdout_lines.append(line)
        for line in process.stderr:
            line = line
            # print("STDERR:", line, end =" ")
            stderr_lines.append(line)

        return_code = process.returncode

        if return_code != 0:
            observation = "".join(stderr_lines)
        else:
            observation = "".join(stdout_lines)
        if observation == "" and return_code == 0:
            # printed to stderr only
            observation = "".join(stderr_lines)
        return return_code, "The script has been executed. Here is the output:\n" + observation
    
    except Exception as e:
        print("++++", "Wrong!")
        # raise Exception(f"Something went wrong in executing {script_name}: {e}. Please check if it is ready to be executed.")
        return -1, f"Something went wrong in executing {script_name}: {e}. Please check if it is ready to be executed."


class BaseAgent:
    """Parent class capable of Generating and Fixing code."""
    def __init__(self, role, system_instruction, specs, agent_profile):
        self.specs = specs
        self.role = role
        self.system = system_instruction
        self.code_path = "/Users/minhtuan/Documents/Documents/Work/Hanoi/crawler/foundation-ml/bucket-code"
        self.root_path_data = "/Users/minhtuan/Documents/Documents/Work/Hanoi/crawler/all_datasets"
        self.agent_profile = agent_profile
        self.device = 0

    def make_plan(self, data_description, requirement, plan_conditions, code="", error_logs=[]):
        
        log = "Nothing. This is your first attempt."
        code = code 

        prompt = """Devise an end-to-end actionable plan for {} task according to the user's requirements described in the following JSON object.

        # User Requirements & Data Info
        ```json
        {{
            "Data Description": "{}",
            "Requirements": "{}"
            "Root path data": "{}"
        }}
        ```
        When devising a plan, follow these instructions and do not forget them:
        {}
                        
        # Error from the Previously Written Code
                        {}
        """
        prompt = prompt.format(
                self.role,
                data_description,
                requirement,
                self.root_path_data,
                self.system,
                # code, 
                error_logs
        )

        print(prompt)
        return call_llm(self.agent_profile, prompt)

    def implement_solution(self, code_instructions, code="", n_attempts=5):

        print(
            self.role,
            "I am implementing the following instruction"
        )

        log = "Nothing. This is your first attempt."
        error_logs = []
        code = code  # if a template/skeleton code is provided
        iteration = 0
        completion = None
        action_result = ""
        rcode = -1


        while iteration < n_attempts:
            try:
                exec_prompt = """Carefully read the following instructions to write Python code for {} task.
                {}
                
                # Previously Written Code
                ```python
                {}
                ```
                
                # Error from the Previously Written Code
                {}
                

                Note that you need to write the python code for the {}. 
                Start the python code with "```python". Please ensure the completeness of the code so that it can be run without additional modifications.
                If there is any error from the previous attempt, please carefully fix it first."""
       
                exec_prompt = exec_prompt.format(
                    self.specs['requirements'],
                    code_instructions,
                    code,
                    log,
                    "data processing part",
                )

                res = call_llm(self.agent_profile, exec_prompt)
                raw_completion = res
                completion = raw_completion.split("```python")[1].split("```")[0]
                
                if not completion.strip(" \n"):
                    continue
                
                self.code_name = self.role 
                filename = f"{self.code_path}{self.code_name}.py"
                os.makedirs(os.path.dirname(filename), exist_ok=True)
                with open(filename, "wt") as file:
                    file.write(completion)
                code = completion
                rcode, log = self.self_validation(filename)
                if rcode == 0:
                    action_result = log
                    break
                else:
                    log = log
                    error_logs.append(log)
                    action_result = log
                    print(self.agent_type, f"I got this error (itr #{iteration}): {log}")
                    iteration += 1                    
                    # break
            except Exception as e:
                iteration += 1
                print(self.role, f"===== Retry: {iteration} =====")
                print(self.role, f"Executioin error occurs: {e}")
            continue
        if not completion:
            completion = ""

        print(
            self.role,
            f"I executed the given plan and got the follow results:\n\n{action_result}",
        )
        return {"rcode": rcode, "action_result": action_result, "code": completion, "error_logs": error_logs}


    def run_code(self, completion, code="", n_attempts=5):

        print(
            self.role,
            "I am implementing the following instruction"
        )

        log = "Nothing. This is your first attempt."
        error_logs = []
        code = code  # if a template/skeleton code is provided
        iteration = 0
        action_result = ""
        rcode = -1

        while iteration < n_attempts:
            try:
                raw_completion = completion
                if "```python" in raw_completion:
                    completion = raw_completion.split("```python")[1].split("```")[0]
                elif "```Python" in raw_completion:
                    completion = raw_completion.split("```Python")[1].split("```")[0]
                else:
                    completion = raw_completion.split("```")[1].split("```")[0]

                if not completion.strip(" \n"):
                    continue
                self.code_name = self.role 
                filename = f"{self.code_path}/{self.code_name}.py"
                os.makedirs(os.path.dirname(filename), exist_ok=True)
                with open(filename, "wt") as file:
                    file.write(completion)
                code = completion
                rcode, log = self.self_validation(filename)
                if rcode == 0:
                    action_result = log
                    break
                else:
                    log = log
                    error_logs.append(log)
                    action_result = log
                    print(self.role, f"I got this error (itr #{iteration}): {log}")
                    iteration += 1                    
                    # break
            except Exception as e:
                iteration += 1
                print(self.role, f"===== Retry: {iteration} =====")
                print(self.role, f"Executioin error occurs: {e}")
            continue
        if not completion:
            completion = ""

        print(
            self.role,
            f"I executed the given plan and got the follow results:\n\n{action_result}",
        )
        return {"rcode": rcode, "action_result": action_result, "code": completion, "error_logs": error_logs}


    def self_validation(self, filename):
        rcode, log = execute_script(filename, device=self.device)
        return rcode, log

class Reflexion:

    def __init__(self, specs, code, rules):

        self.specs = specs 
        self.code = code 
        self.rules = rules

    def self_reflexion(self):
        """ 
        This is for self-reflexion to self check code to make sure logic + syntax is right before executive
        """

        agent_profile = """ 
                    You are a Senior Python Code Reviewer and Quality Assurance Engineer
                    
                    Objective: 

                    Audit the generated code to ensure it is runnable, efficient, and error-free. You must strictly enforce the following rules:
                    
                    1. Logical Correctness: Verify the code logic is sound. Ensure variables are defined before use and data flows correctly through the pipeline.
                    2. Strict Import Verification (CRITICAL):
                        - No "Ghost" Imports: Scan the code for every external class or function used.
                        - Mapping Check: You must verify that a corresponding import or from ... import ... statement exists at the top of the file for every single tool used.
                    
                    3. Structural Efficiency: Ensure all classes and functions have a clear purpose. Remove any redundant code or "pass" blocks that do not contribute to the solution.
                    4. Requirement Compliance: The code must produce the exact output format requested by the user (e.g., specific CSV filenames or print statements).
                    5. Using complete: Ensure all library from import is using right syntax in the solution.  
                    """

        prompt = """ 
                    Carefully read the requirement for task: 
                    {} 

                    Carefully read the description of data for task
                    {} 

                    Carefully read the rule for task engineering data
                    {}

                    And review code result below: 
                    {}

                    If there is any error from the previous attempt, please carefully fix it first. Delete all columns uesless!

                    Response return will have two components (in two list):
                    - Error you check and solution
                    - Full code complete 
                """    

        prompt = prompt.format(
                    self.specs['requirements'],
                    self.specs['data_description'],
                    self.rules,
                    self.code
        )

        response = call_llm(agent_profile, prompt) 
        return response 

class DataAgent(BaseAgent):
    def __init__(self, specs):
        system = (
            "TASK: Write a robust Python script to Find, Load, and Preprocess data.\n"
            "STRICT RULES:\n"
            "1. USE `os.walk` to find the folder matching the Contest Name.\n"
            "2. CHECK file count using `glob`. IF 1 file -> Split 80/20. IF 2 files -> Load separately.\n"
            "3. PREPROCESS: Handle missing values and categorical encoding and delete cols useless.\n"
            "4. OUTPUT: Save 'processed_train.csv' and 'processed_test.csv'.\n"
            "5. PRINT 'DATA_READY: processed_train.csv processed_test.csv' at the very end.\n"
            "OUTPUT FORMAT: Single valid ```python block."
        )

        agent_profile = """You are the world's best data scientist of an automated machine learning project (AutoML) that can find the most relevant datasets,run useful preprocessing, perform suitable data augmentation, and make meaningful visulaization to comprehensively understand the data based on the user requirements. You have the following main responsibilities to complete.
                        1. Retrieve a dataset from the user or search for the dataset based on the user instruction.
                        2. Perform data preprocessing based on the user instruction or best practice based on the given tasks.
                        3. Perform data augmentation as neccesary.
                        4. Extract useful information and underlying characteristics of the dataset."""

        super().__init__("Data_Engineering", system, specs, agent_profile)


# --- 3. THE MANAGER (WORKFLOW ORCHESTRATOR) ---

class CopilotManager:
    def __init__(self, specs):
        self.specs = specs
        self.data_agent = DataAgent(specs)
        # self.model_agent = ModelAgent(specs)

    def run_agent_cycle(self, agent, max_retries=3):
        """
        The Core Loop: Gen -> Exec -> Fix -> Retry
        """
        print(f"\nðŸ”¹ STARTED AGENT: {agent.role}")

        print(f"\n Generating plan and code for: {agent.role} ")

        code = agent.make_plan(self.specs["data_description"], self.specs["requirements"], self.data_agent.system)
        
        print("__"* 69 )
        print(f"\n Self-reflexion code for: {agent.role} ")

        # re_code = Reflexion(self.specs, code)
        # print(re_code)

        # print(f"\n Generating code and self-reflexion for: {agent.role} ")
        # # results = agent.implement_solution(plan)
        # results = agent.run_code(code)
            
        return code

    def start_workflow(self):
        print(f"ðŸš€ STARTING WORKFLOW: {self.specs['name']}")
        start_time = time.time()

        # ==========================================
        # 1. DATA AGENT PHASE
        # ==========================================
        output = self.run_agent_cycle(self.data_agent)

        return output
        
# --- 4. EXECUTION ---
if __name__ == "__main__":
    
    # Requirement Spec
    my_challenge = {
        "name": "Titanic",
        "requirements": "Predict survival (0/1)",
        
        "data_description": """
        The dataset contains the following columns:
        - PassengerId: Unique ID (Drop this for training)
        - Survived: Target variable (0 = No, 1 = Yes)
        - Pclass: Ticket class (1, 2, 3). Treat as Ordinal.
        - Name: Passenger name. (Extract Title like Mr/Mrs to create new feature 'Title')
        - Sex: 'male'/'female'. (Map to 0/1)
        - Age: Numeric. Contains missing values (Fill with Median by Title).
        - Cabin: Cabin number. Many missing. (Extract first letter as 'Deck', fill missing with 'Unknown')
        - Embarked: Port of Embarkation (C, Q, S). (One-Hot Encode).
        """,
        
        "target_metric": "Accuracy",
        "output_expectations": "submission.csv with PassengerId, Survived"
    }

    # Run
    # manager = CopilotManager(my_challenge)
    # manager.start_workflow()


my_challenge = {
        "name": "Titanic",
        "requirements": "Predict survival (0/1)",
        
        "data_description": """
        The dataset contains the following columns:
        - PassengerId: Unique ID (Drop this for training)
        - Survived: Target variable (0 = No, 1 = Yes)
        - Pclass: Ticket class (1, 2, 3). Treat as Ordinal.
        - Name: Passenger name. (Extract Title like Mr/Mrs to create new feature 'Title')
        - Sex: 'male'/'female'. (Map to 0/1)
        - Age: Numeric. Contains missing values (Fill with Median by Title).
        - Cabin: Cabin number. Many missing. (Extract first letter as 'Deck', fill missing with 'Unknown')
        - Embarked: Port of Embarkation (C, Q, S). (One-Hot Encode).
        """,
        
        "target_metric": "Accuracy",
        "output_expectations": "submission.csv with PassengerId, Survived"
    }

specs = my_challenge
data_agent = DataAgent(specs)

print(f"ðŸš€ STARTING WORKFLOW: {specs['name']}")

"""
The Core Loop: Gen -> Exec -> Fix -> Retry
"""
print(f"\nðŸ”¹ STARTED AGENT: {data_agent.role}")

print(f"\n Generating plan and code for: {data_agent.role} ")

print(
    data_agent.role,
    "I am implementing the following instruction"
)

log = "Nothing. This is your first attempt."
error_logs = []
code = ""  # if a template/skeleton code is provided
iteration = 0
action_result = ""
rcode = -1
code_path = "/Users/minhtuan/Documents/Documents/Work/Hanoi/crawler/foundation-ml/bucket-code"

n_attempts = 5

while iteration < n_attempts:
    print("Attempt: ", iteration)
    try:

        raw_completion = data_agent.make_plan(specs["data_description"], 
                                        specs["requirements"], 
                                        data_agent.system, 
                                        code,
                                        log)


        if "```python" in raw_completion:
            completion = raw_completion.split("```python")[1].split("```")[0]
        elif "```Python" in raw_completion:
            completion = raw_completion.split("```Python")[1].split("```")[0]
        else:
            completion = raw_completion.split("```")[1].split("```")[0]

        if not completion.strip(" \n"):
            continue

        review = Reflexion(specs, completion, data_agent.system)
        re_code_raw = review.self_reflexion()

        if "```python" in re_code_raw:
            re_code = re_code_raw.split("```python")[1].split("```")[0]
        elif "```Python" in raw_completion:
            re_code = re_code_raw.split("```Python")[1].split("```")[0]
        else:
            re_code = re_code_raw.split("```")[1].split("```")[0]
            
        re_code = re_code.strip("\n")

        completion = re_code

        code_name = data_agent.role 
        filename = f"{code_path}/{code_name}.py"
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        with open(filename, "wt") as file:
            file.write(completion)
        code = completion
        rcode, log = data_agent.self_validation(filename)
        if rcode == 0:
            action_result = log
            break
        else:
            log = log
            error_logs.append(log)
            action_result = log
            print(data_agent.role, f"I got this error (itr #{iteration}): {log}")
            iteration += 1                    
            # break
    except Exception as e:
        iteration += 1
        print(data_agent.role, f"===== Retry: {iteration} =====")
        print(data_agent.role, f"Executioin error occurs: {e}")
    continue

print(
    data_agent.role,
    f"I executed the given plan and got the follow results:\n\n{action_result}",
)



ðŸš€ STARTING WORKFLOW: Titanic

ðŸ”¹ STARTED AGENT: Data_Engineering

 Generating plan and code for: Data_Engineering 
Data_Engineering I am implementing the following instruction
Attempt:  0
Devise an end-to-end actionable plan for Data_Engineering task according to the user's requirements described in the following JSON object.

        # User Requirements & Data Info
        ```json
        {
            "Data Description": "
        The dataset contains the following columns:
        - PassengerId: Unique ID (Drop this for training)
        - Survived: Target variable (0 = No, 1 = Yes)
        - Pclass: Ticket class (1, 2, 3). Treat as Ordinal.
        - Name: Passenger name. (Extract Title like Mr/Mrs to create new feature 'Title')
        - Sex: 'male'/'female'. (Map to 0/1)
        - Age: Numeric. Contains missing values (Fill with Median by Title).
        - Cabin: Cabin number. Many missing. (Extract first letter as 'Deck', fill missing with 'Unknown')
        - Embarked: Por

In [80]:
import os
import glob
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Set root path to the dataset
root_path = "/Users/minhtuan/Documents/Documents/Work/Hanoi/crawler/all_datasets"

# Find the folder matching the Contest Name (Titanic)
contest_folder = [folder for folder in os.listdir(root_path) if "titanic" in folder.lower()][0]

# Load the dataset
train_file = glob.glob(os.path.join(root_path, contest_folder, "*_train.csv"))[0]
test_file = glob.glob(os.path.join(root_path, contest_folder, "*_test.csv"))[0]

# Check file count using glob. If 1 file -> Split 80/20. If 2 files -> Load separately.
if len(train_file) == 1:
    train_data = pd.read_csv(train_file)
    test_data = pd.read_csv(test_file)
else:
    train_data = pd.concat([pd.read_csv(f) for f in glob.glob(os.path.join(root_path, contest_folder, "*_train.csv"))], ignore_index=True)
    test_data = pd.concat([pd.read_csv(f) for f in glob.glob(os.path.join(root_path, contest_folder, "*_test.csv"))], ignore_index=True)

# Preprocess data
categorical_cols = ["Pclass", "Sex", "Embarked"]
numerical_cols = ["Age"]

# Handle missing values and categorical encoding
imputer = SimpleImputer(strategy="median")
preprocessor = ColumnTransformer(
    transformers=[
        ("num", imputer, numerical_cols),
        ("cat", OrdinalEncoder(handle_unknown="use_encoded_value", unknown="missing"), categorical_cols)
    ]
)

train_data_preprocessed = pd.DataFrame(preprocessor.fit_transform(train_data.drop("PassengerId", axis=1)), columns=train_data.drop("PassengerId", axis=1).columns)
test_data_preprocessed = pd.DataFrame(preprocessor.transform(test_data.drop("PassengerId", axis=1)), columns=test_data.drop("PassengerId", axis=1).columns)

# Extract Title from Name
train_data_preprocessed["Title"] = train_data_preprocessed["Name"].apply(lambda x: x.split(",")[1].split(".")[0].strip().title())
test_data_preprocessed["Title"] = test_data_preprocessed["Name"].apply(lambda x: x.split(",")[1].split(".")[0].strip().title())

# Extract first letter of Cabin as 'Deck'
train_data_preprocessed["Cabin"] = train_data_preprocessed["Cabin"].str[0]
test_data_preprocessed["Cabin"] = test_data_preprocessed["Cabin"].str[0]

# Fill missing values in Age with Median by Title
title_age_median = imputer.fit_transform(train_data_preprocessed[["Age", "Title"]].groupby("Title")["Age"].apply(lambda x: x.fillna(x.median())))
train_data_preprocessed.loc[(train_data_preprocessed["Age"].isna()) & (train_data_preprocessed["Title"].isin(title_age_median[:, 0])), "Age"] = title_age_median[:, 1]
test_data_preprocessed.loc[test_data_preprocessed["Age"].isna(), "Age"] = title_age_median[:, 1]

# One-Hot Encode Embarked
ohe = OneHotEncoder(handle_unknown="ignore")
embarked_encoded = ohe.fit_transform(train_data_preprocessed[["Embarked"]])
train_data_preprocessed = pd.DataFrame(embarked_encoded.toarray(), columns=ohe.get_feature_names_out(["Embarked"]))
test_data_preprocessed = pd.DataFrame(ohe.transform(test_data_preprocessed[["Embarked"]]).toarray(), columns=ohe.get_feature_names_out(["Embarked"]))

# Save preprocessed data
train_data_preprocessed.to_csv("processed_train.csv", index=False)
test_data_preprocessed.to_csv("processed_test.csv", index=False)

print("DATA_READY: processed_train.csv processed_test.csv")

IndexError: list index out of range

In [71]:
import os
import glob
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define the root path for the dataset
root_path = "/Users/minhtuan/Documents/Documents/Work/Hanoi/crawler/all_datasets"

# Find the folder matching the Contest Name (Titanic)
contest_folder = [folder for folder in os.listdir(root_path) if "titanic" in folder][0]

# Load the dataset
train_files = glob.glob(os.path.join(root_path, contest_folder, "*.csv"))
test_file = None

if len(train_files) > 1:
    train_file = train_files[0]
    test_file = train_files[1] if len(train_files) > 1 else None
else:
    train_file = train_files[0]

train_df = pd.read_csv(train_file)
test_df = None

if test_file:
    test_df = pd.read_csv(test_file)

# Define preprocessing steps
categorical_cols = ["Pclass", "Sex", "Embarked"]
numerical_cols = ["Age"]

# Preprocess categorical columns
categorical_transformer = Pipeline([
    ("ordinal_encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
])

# Preprocess numerical columns
numerical_transformer = SimpleImputer(strategy='median')

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_transformer, categorical_cols),
        ("numerical", numerical_transformer, numerical_cols)
    ]
)

# Apply preprocessing to the data
train_df_preprocessed = preprocessor.fit_transform(train_df.drop("Survived", axis=1))
test_df_preprocessed = None

if test_df:
    test_df_preprocessed = preprocessor.transform(test_df.drop("Survived", axis=1))

# Convert back to DataFrame
train_df_preprocessed = pd.DataFrame(train_df_preprocessed, columns=categorical_cols + numerical_cols)
test_df_preprocessed = None

if test_df_preprocessed:
    test_df_preprocessed = pd.DataFrame(test_df_preprocessed, columns=categorical_cols + numerical_cols)

# Save the processed data
train_df_preprocessed.to_csv("processed_train.csv", index=False)
test_df_preprocessed.to_csv("processed_test.csv", index=False) if test_df_preprocessed else None

print("DATA_READY: processed_train.csv" + (" processed_test.csv" if test_df_preprocessed else ""))

DATA_READY: processed_train.csv
