In [1]:
# Create a smolagents agent that has access to the following tools:
# Read metadata about the dataset
# Read a sample of the dataset and produce some summary statistics
# Produce an R script to preprocess the data
# Execute the R script and return the stdout and stderr
# The agent should be able to call these tools in a sequence to achieve the goal of understanding the dataset and preparing it for analysis.

# Extra: can we look at integrating Langfuse for tracking?

In [2]:
from smolagents import tool
from striprtf.striprtf import rtf_to_text

@tool
def read_metadata(filename: str, columns: list[str]) -> str:
    """Read an RTF metadata file and return the passages that mention the requested columns.

    The file is converted to plain text. Each requested column name is searched for
    case-insensitively, and every hit is returned with up to 300 characters of text
    on either side. Overlapping passages are merged, and separate passages are
    divided by a line of '=' characters. With an empty column list the whole
    plain-text document is returned.

    Args:
        filename: The name or path of the metadata file to read. It must end in .rtf (upper or lower case).
        columns: The list of column names to search for in the metadata. Pass an empty list to get the full text.

    Returns:
        A header line with the number of occurrences and merged chunks followed by the chunks, or a not-found message, or the full plain text when no columns are given.

    Raises:
        ValueError: If filename does not have an .rtf extension.
    """
    import re  # imported locally, as the other tools in this file do

    # Reject non-RTF input up front; the check ignores letter case so '.RTF' is accepted.
    if not filename.lower().endswith('.rtf'):
        raise ValueError("Metadata file must be an RTF file with .rtf extension.")

    # errors='replace' stops a single undecodable byte from aborting the whole read.
    with open(filename, 'r', errors='replace') as file:
        plain_text = rtf_to_text(file.read())

    # Tolerate a bare string, then drop blanks (an empty pattern matches at every
    # position) and duplicates (they would inflate the occurrence count).
    if isinstance(columns, str):
        columns = [columns]
    wanted = list(dict.fromkeys(name for name in (columns or []) if name and name.strip()))
    if not wanted:
        return plain_text

    def quoted(names):
        return ", ".join(f"'{name}'" for name in names)

    padding = 300  # characters of context kept on each side of a match
    text_length = len(plain_text)
    windows = []   # (start, end) slices of plain_text, one per occurrence
    missing = []   # requested names with no occurrence at all
    for name in wanted:
        hits = [
            (max(0, match.start() - padding), min(text_length, match.end() + padding))
            for match in re.finditer(re.escape(name), plain_text, re.IGNORECASE)
        ]
        if hits:
            windows.extend(hits)
        else:
            missing.append(name)

    if not windows:
        noun = "Variable name" if len(wanted) == 1 else "Variable names"
        return f"{noun} {quoted(wanted)} not found in metadata."

    # Merge overlapping or touching windows so no text is repeated in the output.
    windows.sort()
    merged = []
    for start, end in windows:
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))

    chunks = [plain_text[start:end] for start, end in merged]
    # The rule must sit BETWEEN chunks, so the whole separator is the join string.
    separator = "\n\n" + "=" * 50 + "\n\n"

    summary = f"Found {len(windows)} occurrence(s) of {quoted(wanted)} in {len(chunks)} merged chunk(s)"
    if missing:
        summary += f"; no match for {quoted(missing)}"
    return summary + ":\n\n" + separator.join(chunks)

@tool
def read_sample_and_summarise(filename: str, columns: list[str]) -> str:
    """Read a tab-separated data file and return summary statistics plus the first five rows for the chosen columns.

    Args:
        filename: The name or path of the tab-separated data file to read.
        columns: The list of column names to summarise. At least one is required and names must match the file header exactly.

    Returns:
        A text report with a summary statistics section followed by a sample section holding the first five rows.

    Raises:
        ValueError: If columns is empty.
        KeyError: If any requested column is absent from the file. The message suggests similarly named columns.
    """
    import difflib
    import pandas as pd

    # Tolerate a bare string so one column name is not treated as a list of characters.
    if isinstance(columns, str):
        columns = [columns]
    if not columns:
        raise ValueError("No columns were given to summarise.")

    df = pd.read_csv(filename, sep="\t")

    # Fail with a message the caller can act on instead of pandas' bare KeyError.
    missing = [name for name in columns if name not in df.columns]
    if missing:
        # Compare in lower case: difflib is case-sensitive and wrong letter case is a likely slip.
        by_lower = {str(col).lower(): str(col) for col in df.columns}
        hints = []
        for name in missing:
            for key in difflib.get_close_matches(str(name).lower(), list(by_lower), n=3):
                if by_lower[key] not in hints:
                    hints.append(by_lower[key])
        message = f"Column(s) {missing} not found in '{filename}'."
        if hints:
            message += f" Did you mean {hints}?"
        raise KeyError(message)

    selected = df[columns]

    # include='all' so non-numeric columns are described too
    summary = selected.describe(include='all').to_string()
    sample = selected.head(5).to_string()

    return f"Summary statistics:\n{summary}\n\nSample:\n{sample}"

@tool
def produce_r_script(rscript: str) -> str:
    """Write an R script to disk, execute it with Rscript and return its stdout, stderr and exit code.

    Args:
        rscript: The R source code to execute. This should be correct R.

    Returns:
        A text report with STDOUT, STDERR and EXIT CODE sections. An exit code of 0 means the script ran without error.
    """
    import subprocess

    # The script is written under a fixed name in the current working directory,
    # overwriting any previous version, and is left on disk after the run.
    rscript_filename = "preprocess_data.R"
    with open(rscript_filename, 'w') as file:
        file.write(rscript)

    # Run the script and capture both output streams as text
    result = subprocess.run(['Rscript', rscript_filename], capture_output=True, text=True)

    # R writes warnings and messages to stderr even when a script succeeds, so the
    # exit code is reported as the unambiguous success/failure signal.
    return (
        f"STDOUT:\n{result.stdout}\n\nSTDERR:\n{result.stderr}"
        f"\n\nEXIT CODE: {result.returncode}"
    )

# Smoke-test each tool once by calling it directly, outside of any agent.

# 1) Metadata lookup for a single variable
print("Reading metadata...")
filename = "metadata.rtf"
metadata = read_metadata(filename, ["W9CLENDYR"])
print(metadata)

# 2) Summary statistics and first rows for the same variable
print("Reading sample and summarizing...")
filename = "sample_data.tab"
sample_summary = read_sample_and_summarise(filename, ["W9CLENDYR"])
print(sample_summary)

# 3) Round-trip a trivial script through Rscript
print("Producing and executing R script...")
rscript = "\n".join(
    [
        "",
        "# Example hello world R script",
        'print("Hello, World!")',
        "",
    ]
)
rscript_output = produce_r_script(rscript)
print(rscript_output)

Reading metadata...
Found 2 occurrence(s) of '['W9CLENDYR']' in 1 merged chunk(s):



	Value = 4.0	Label = April
	Value = 5.0	Label = May
	Value = 6.0	Label = June
	Value = 7.0	Label = July
	Value = 8.0	Label = August
	Value = 9.0	Label = September
	Value = 10.0	Label = October
	Value = 11.0	Label = November
	Value = 12.0	Label = December

Pos. = 6	Variable = W9CLENDYR	Variable label = When period of economic activity ended - year
This variable is  numeric, the SPSS measurement level is SCALE
SPSS user missing values = -9.0 thru -1.0
	Value label information for W9CLENDYR
	Value = -9.0	Label = Refused
	Value = -8.0	Label = Don't know
	Value = -3.0	Label = Not asked at fieldwork stage
	Value = -1.0	Label = Not applicable

Pos. = 7	Variable = W9CLSEMPCHK	Variable label = Recorded period of economic activity ended before stopping previous activity
This variable is  num
Reading sample and summarizing...
Summary statistics:
         W9CLENDYR
count    20.000000
mean   2017.100000
std       

In [None]:
from smolagents import LiteLLMModel, ToolCallingAgent

# Ensure you have Ollama running with the correct model downloaded
model = LiteLLMModel(model_id="ollama_chat/gemma3:12b", api_key="ollama")

# Note: the sampling and metadata tools could be replaced by programmatically
# embedding their output in the prompt.
agent = ToolCallingAgent(
    tools=[read_metadata, read_sample_and_summarise, produce_r_script],
    model=model,
    stream_outputs=True,
)

columns = ["W9CLENDYR"]
data_filename = "sample_data.tab"
metadata_filename = "metadata.rtf"

# Render the columns as "A", "B" rather than interpolating the Python list repr,
# which would put "['W9CLENDYR']" into the prompt.
column_list = ", ".join(f'"{column}"' for column in columns)

task = f"""Your task is to preprocess the dataset so that it is clean and ready for epidemiological analysis.

The dataset can be found in the file '{data_filename}'.

And the metadata can be found in the file '{metadata_filename}'.

I want the R script to do the following:

- Only preprocess the columns {column_list}.
- Use only base R functions
- Include installs for all required packages
- Save the cleaned data to a new file called 'clean_{data_filename}'.

The script should look like this:
# imports
```all imports here```
# data cleaning steps
```all data cleaning steps here```
# save cleaned data
```all data saving steps here```


IMPORTANT: sample the data and read the metadata to inform how you preprocess the data giving the column a new and more useful name.

If the R script has errors fix them and re-run the script until it runs without error.
"""

agent.run(task)


Output()

Output()

Output()

Output()

Output()

Output()

'clean_sample_data.tab'

In [4]:
# read in th