In [2]:
import os
import random

import pandas as pd
from ucimlrepo import fetch_ucirepo, list_available_datasets


In [3]:
%%capture output
# list_available_datasets() prints its listing rather than returning it, so the
# cell magic above captures the printed text into `output` for later parsing.
list_available_datasets()

In [4]:
# Persist the captured listing text so it can be parsed from disk.
# The encoding is pinned to UTF-8 because open() otherwise uses the platform
# default, while pd.read_csv decodes files as UTF-8 by default; a mismatch
# would corrupt (or fail on) any non-ASCII dataset names.
prev_cell_output = str(output)
with open('datasets.txt', 'w', encoding='utf-8') as f:
    f.write(prev_cell_output)

In [5]:
# Parse the saved listing into a two-column table (dataset name, id).
# Columns are separated by runs of two or more whitespace characters, which
# keeps single spaces inside dataset names intact; a regex separator requires
# the python parsing engine. The first 5 lines are skipped — presumably the
# printed banner/header of the listing (confirm against datasets.txt).
datasets = pd.read_csv(
    'datasets.txt',
    sep=r'\s{2,}',
    engine='python',
    skiprows=5,
    header=None,
    names=['name', 'id'],
    skipinitialspace=True,
)
datasets

Unnamed: 0,name,id
0,Abalone,1
1,Adult,2
2,Annealing,3
3,Audiology (Standardized),8
4,Auto MPG,9
...,...,...
204,Infrared Thermography Temperature,925
205,National Poll on Healthy Aging (NPHA),936
206,Regensburg Pediatric Appendicitis,938
207,RT-IoT2022,942


In [6]:
# Pick a random entry of the `datasets` table and fetch it, re-drawing until
# the fetched dataset's metadata reports missing values.
#
# A new entry is drawn on every attempt (drawing once before the loop would
# refetch the same dataset forever whenever the first pick has no missing
# values), and the number of attempts is bounded so the cell always terminates.
MAX_ATTEMPTS = 50

adequate = False
for attempt in range(MAX_ATTEMPTS):
    # NOTE: dataset_id is the row position inside `datasets`, not the UCI id.
    dataset_id = random.randint(0, len(datasets) - 1)
    dataset_name = datasets.iloc[dataset_id]['name']
    dataset = fetch_ucirepo(dataset_name)
    if dataset["metadata"]['has_missing_values'] == 'yes':
        adequate = True
        break

if not adequate:
    raise RuntimeError(
        f"No dataset with missing values found after {MAX_ATTEMPTS} random draws"
    )
dataset

{'data': {'ids': None,
  'features':      BI-RADS   Age  Shape  Margin  Density
  0        5.0  67.0    3.0     5.0      3.0
  1        4.0  43.0    1.0     1.0      NaN
  2        5.0  58.0    4.0     5.0      3.0
  3        4.0  28.0    1.0     1.0      3.0
  4        5.0  74.0    1.0     5.0      NaN
  ..       ...   ...    ...     ...      ...
  956      4.0  47.0    2.0     1.0      3.0
  957      4.0  56.0    4.0     5.0      3.0
  958      4.0  64.0    4.0     5.0      3.0
  959      5.0  66.0    4.0     5.0      3.0
  960      4.0  62.0    3.0     3.0      3.0
  
  [961 rows x 5 columns],
  'targets':      Severity
  0           1
  1           1
  2           1
  3           0
  4           1
  ..        ...
  956         0
  957         1
  958         0
  959         1
  960         0
  
  [961 rows x 1 columns],
  'original':      BI-RADS   Age  Shape  Margin  Density  Severity
  0        5.0  67.0    3.0     5.0      3.0         1
  1        4.0  43.0    1.0     1.0      N

In [7]:
# Display the feature table of the fetched dataset.
dataset['data']["features"]

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density
0,5.0,67.0,3.0,5.0,3.0
1,4.0,43.0,1.0,1.0,
2,5.0,58.0,4.0,5.0,3.0
3,4.0,28.0,1.0,1.0,3.0
4,5.0,74.0,1.0,5.0,
...,...,...,...,...,...
956,4.0,47.0,2.0,1.0,3.0
957,4.0,56.0,4.0,5.0,3.0
958,4.0,64.0,4.0,5.0,3.0
959,5.0,66.0,4.0,5.0,3.0


<class 'ucimlrepo.dotdict.dotdict'>


TypeError: Object of type DataFrame is not JSON serializable

In [163]:
# smolagents provides the agent class and the LiteLLM-backed model wrapper used below.
from smolagents import CodeAgent, LiteLLMModel
# NOTE(review): the "ollama/" prefix presumably routes to a locally served
# Ollama model named qwen2.5-coder — confirm the server is running before use.
model = LiteLLMModel("ollama/qwen2.5-coder")

In [164]:
# Build the agent around `model`: no extra tools, a cap of 10 steps, and
# pandas/numpy added to the imports the agent is authorized to use.
agent = CodeAgent(
    tools=[],
    model=model,
    max_steps=10,
    additional_authorized_imports=["pandas", "numpy"],
)

In [165]:
# Build the task prompt from the fetched dataset's metadata and variable table.
#
# 'additional_info' can be None for datasets that ship without it; subscripting
# it directly would raise TypeError for such a (randomly chosen) dataset, so an
# empty mapping is substituted and the missing fields render as "None".
# When the info is present the resulting prompt text is unchanged.
dataset_metadata = dataset['metadata']
dataset_info = dataset_metadata['additional_info'] or {}
prompt = f"""Given the dataset with the following description: 
            {dataset_metadata["name"]} - {dataset_metadata["abstract"]} - {dataset_info.get("summary")} 
            and the following columns:
            {dataset_info.get('variable_info')}
            {dataset['variables']}
            Study if the dataset has missing values, and if so, propose and execute the best strategy to deal with them.
            Please remember that the features are in variable 'X' and the target is in variable 'y', 
            you do not need to read any additional data from files, simply use those variables."""
prompt

"Given the dataset with the following description: \n            Pedestrians in Traffic - This data-set contains a number of pedestrian tracks recorded from a vehicle driving in a town in southern Germany. The data is particularly well-suited for multi-agent motion prediction tasks. - The raw data was acquired from a vehicle equipped with multiple sensors while driving, for approximately five hours, in an urban area in southern Germany. The sensor set included one mono-RGB camera, one stereo-RGB camera, an inertial measurement system with differential GPS and a lidar system. The preprocessed data available from this repository consists of 45 pedestrian tracks (in world coordinates) together with a semantic map of the static environment. For each track and at each time-step, not only the agent position is provided, but also body and head orientation attributes, as well as the position of all other agents and their type (e.g. car, cyclist, pedestrian etc.). Additional details about the p

In [166]:
# Run the agent on the prompt, handing it the feature table as 'X' and the
# target table as 'y' — the variable names the prompt tells it to use.
result = agent.run(
    prompt,
    additional_args={
        "X": dataset['data']["features"],
        "y": dataset['data']["targets"],
    },
)

In [167]:
# Save the agent's answer as a Markdown file named after the dataset.
# - The output directory is created if needed, so a fresh checkout does not
#   fail with FileNotFoundError.
# - The answer is converted with str() because the agent may return a
#   non-string object, which f.write() would reject with TypeError.
# - UTF-8 is pinned so the file does not depend on the platform's default encoding.
os.makedirs("results", exist_ok=True)
with open(f"results/{dataset_name}.md", "w", encoding="utf-8") as f:
    f.write(str(result))

In [168]:
# Display the feature table again after the agent run.
dataset["data"]["features"]

Unnamed: 0,oid,timestamp,x,y,body_roll,body_pitch,body_yaw,head_roll,head_pitch,head_yaw,other_oid,other_class,other_x,other_y
0,50187,1842.4,495854.6403,5405750.912,,,,,,,"[47646, 50181, 50184, 50187]","[0, 4, 4, 4]","[495923.373133135, 495899.069769386, 495899.05...","[5405744.32136751, 5405738.47595118, 5405739.1..."
1,50187,1842.5,495854.7921,5405750.939,,,,,,,"[50181, 50187, 50184, 47646]","[4, 4, 4, 0]","[495899.234566716, 495854.792078353, 495899.22...","[5405738.39126416, 5405750.93930797, 5405739.2..."
2,50187,1842.6,495854.9438,5405750.966,,,,,,,"[47646, 50187, 50184, 50181]","[0, 4, 4, 4]","[495921.779445452, 495854.943847121, 495899.35...","[5405744.51929698, 5405750.96626812, 5405739.1..."
3,50187,1842.7,495855.0956,5405750.993,,,,,,,"[50187, 47646, 50184, 50181]","[4, 0, 4, 4]","[495855.09561589, 495920.943052671, 495899.490...","[5405750.99322827, 5405744.63008031, 5405739.1..."
4,50187,1842.8,495855.2569,5405751.022,,,,,,,"[50187, 50184, 50181, 47646]","[4, 4, 4, 0]","[495855.256935427, 495899.585908147, 495899.72...","[5405751.02150176, 5405739.0332702, 5405738.08..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4754,39406,694.2,496244.5401,5406055.865,,,,,,,"[39406, 38509, 39402, 39414, 39415, 39416]","[4, 0, 3, 3, 1, 3]","[496244.54009617, 496227.152782719, 496217.565...","[5406055.86543169, 5406069.02898164, 5406073.3..."
4755,39406,694.3,496244.6329,5406055.854,,,,,,,"[39416, 39406, 39415, 39402, 39414, 38509]","[3, 4, 1, 3, 3, 0]","[496213.205162677, 496244.63289458, 496208.076...","[5406066.94295635, 5406055.85444835, 5406071.5..."
4756,39406,694.4,496244.7329,5406055.847,,,,,,,"[38509, 39414, 39415, 39416, 39406, 39402]","[0, 3, 1, 3, 4, 3]","[496227.152782719, 496206.206044983, 496208.70...","[5406069.02898164, 5406068.88933245, 5406071.2..."
4757,39406,694.5,496244.8396,5406055.840,,,,,,,"[39415, 38509, 39402, 39406, 39414, 39416]","[1, 0, 3, 4, 3, 3]","[496209.35021584, 496227.152782719, 496217.590...","[5406070.75167456, 5406069.02898164, 5406073.4..."
