In [3]:
# https://bhavikjikadara.medium.com/data-science-automation-a-step-by-step-guide-using-crewai-e1468823e0f8


from crewai import Agent, Task, Crew, Process, LLM
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from dotenv import load_dotenv
import os
from crewai.tools import tool

# Load variables from a local .env file into the process environment.
load_dotenv()
# NOTE(review): this reads GEMINI_API_KEY, but the LLM configured below is a Groq
# model and API_KEY is not referenced anywhere else in the visible notebook —
# confirm whether this variable is still needed.
API_KEY = os.environ.get("GEMINI_API_KEY")



# LLM handle that is passed to the agents defined in later cells (llm=llm).
llm = LLM(
    model="groq/llama-3.3-70b-versatile",
    temperature=0.7
)


# llm = LLM(
#     model="ollama/mistral",
#     base_url="http://localhost:11434"
# )


In [4]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from crewai_tools import BaseTool

class DataPreprocessor(BaseTool):
    """CrewAI tool that cleans a CSV file and writes a model-ready copy to disk.

    The tool loads the CSV at ``file_path``, handles missing values, removes
    duplicate rows, label-encodes every ``object`` column and saves the result
    to ``processed_data/processed_data.csv``.
    """

    name: str = "Data Preprocessor"
    description: str = "Preprocesses data by handling missing values, removing duplicates, and encoding categorical variables."

    def _run(self, file_path: str) -> str:
        """Preprocess the CSV at ``file_path`` and return a text summary.

        Args:
            file_path: Path of the CSV file to load with ``pandas.read_csv``.

        Returns:
            A multi-line report with the shape and missing-value count before
            and after processing, the encoded columns and the output path.

        Raises:
            Any exception raised by ``pandas.read_csv`` or by writing the
            output file is propagated unchanged.
        """
        # Load the data
        df = pd.read_csv(file_path)

        # Get initial info
        initial_shape = df.shape
        initial_missing = df.isnull().sum().sum()

        # Percentage of missing cells. A header-only CSV has df.size == 0; the
        # unguarded 0/0 would yield NaN, fall through to the imputation branch
        # and fail there, so treat an empty frame as having nothing missing.
        missing_percentage = (initial_missing / df.size) * 100 if df.size else 0.0

        # Handle missing values
        if missing_percentage < 5:
            df = df.dropna()
        else:
            num_cols = df.select_dtypes(include=['number']).columns
            cat_cols = df.select_dtypes(include=['object']).columns
            for cols, strategy in ((num_cols, 'mean'), (cat_cols, 'most_frequent')):
                # Impute only columns that have gaps AND at least one observed
                # value: SimpleImputer raises on an empty column selection and
                # drops all-NaN columns, which would break the assignment back.
                targets = [
                    col for col in cols
                    if df[col].isnull().any() and df[col].notnull().any()
                ]
                if targets:
                    imputer = SimpleImputer(strategy=strategy)
                    df[targets] = imputer.fit_transform(df[targets])

        # Remove duplicate entries
        df = df.drop_duplicates()

        # Identify categorical columns
        categorical_columns = df.select_dtypes(include=['object']).columns.tolist()

        # Convert categorical variables to numerical. astype(str) keeps the
        # encoder from failing on columns that mix strings with other values.
        label_encoder = LabelEncoder()
        for col in categorical_columns:
            df[col] = label_encoder.fit_transform(df[col].astype(str))

        # Get final info
        final_shape = df.shape
        final_missing = df.isnull().sum().sum()

        # Save the processed data
        processed_file_path = os.path.join('processed_data', 'processed_data.csv')
        os.makedirs(os.path.dirname(processed_file_path), exist_ok=True)
        df.to_csv(processed_file_path, index=False)

        return f"""
        Data preprocessing completed:
        - Initial shape: {initial_shape}
        - Initial missing values: {initial_missing}
        - Final shape: {final_shape}
        - Final missing values: {final_missing}
        - Categorical variables encoded: {categorical_columns}
        - Duplicates removed
        - Processed data saved to: {processed_file_path}
        """


In [5]:
from crewai_tools import DirectoryReadTool

# NOTE(review): DirectoryReadTool takes a `directory` argument but is given
# 'data.csv', which looks like a file path — confirm this is intended.
docs_tool_a = DirectoryReadTool(directory='data.csv')
# Instance of the custom preprocessing tool defined in the previous cell.
data_processing_tool = DataPreprocessor()
# Agent that is handed both tools and the shared `llm`.
data_preprocessing_agent = Agent(
    role="Data Preprocessing Specialist",
    goal="Load, clean, and perform initial transformations on datasets",
    backstory="Expert in data cleaning and preprocessing using pandas, numpy, and sklearn libraries",
    llm=llm,
    tools=[docs_tool_a, data_processing_tool],
    verbose=True
    # allow_code_execution=True
)

# Task assigned to the preprocessing agent.
# NOTE(review): the description says "Load the file" without naming a path, so
# the agent has to work out which file to pass to the tool — confirm.
data_preprocessing_task = Task(
  description="""
  Load the file, handle missing values (remove missing values if number of missing values is less than 5 percent of dataset else use imputer), remove duplicates, and convert categorical variables to numerical values to make the dataset model-ready.
  """,
  expected_output='Processed dataset saved successfully',
  agent=data_preprocessing_agent,
  )

In [None]:
# Single-agent crew that runs only the preprocessing task, sequentially.
# Depends on `data_preprocessing_agent` / `data_preprocessing_task` from the
# previous cell.
crew = Crew(
    agents=[data_preprocessing_agent], 
    tasks=[data_preprocessing_task], 
    process=Process.sequential
)

# Run the crew and keep its output for the next cell.
result = crew.kickoff()

In [None]:
# Show the output of the crew.kickoff() call made in the previous cell.
print(result)

In [None]:
import pandas as pd
# Load the raw CSV directly, independent of the agents.
df = pd.read_csv("data.csv")
# Total count of missing cells across all columns (shown as the cell output).
df.isnull().sum().sum()

In [None]:
from crewai import Agent, Task, Crew, Process, LLM
from crewai.tools import tool
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
import os
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from crewai_tools import BaseTool

# llm = LLM(
#     model="ollama/mistral",
#     base_url="http://localhost:11434"
# )


# Rebinds `llm` with the same model and temperature as the first cell's
# definition; this handle is passed to the agent created later in this cell.
llm = LLM(
    model="groq/llama-3.3-70b-versatile",
    temperature=0.7
)



class FeatureEngineeringTool(BaseTool):
    """CrewAI tool that label-encodes categorical columns and standardises numeric ones.

    The result is written to ``engineered_features.csv`` in the current
    working directory.
    """

    name: str = "Feature Scaling Tool"
    description: str = "Scales numerical features to a standard range using techniques like normalization or standardization also encode categorical values"

    def _run(self, file_path: str) -> str:
        """Encode and scale the CSV at ``file_path``.

        Args:
            file_path: Path of the CSV file to load with ``pandas.read_csv``.

        Returns:
            ``"file saved to engineered_features.csv"`` on success, otherwise
            a message starting with ``"Error engineering features:"``. A
            string is always returned so the calling agent can see the
            failure instead of receiving ``None``.
        """
        try:
            df = pd.read_csv(file_path)
            df_engineered = df.copy()

            # Decide which columns are numeric BEFORE encoding; otherwise the
            # integer IDs produced by LabelEncoder would be picked up as
            # numeric features and standardised as if they were continuous.
            numerical_cols = df_engineered.select_dtypes(include=['number']).columns
            categorical_cols = df_engineered.select_dtypes(include=['object']).columns

            # Encode categorical variables
            le = LabelEncoder()
            for col in categorical_cols:
                df_engineered[col] = le.fit_transform(df_engineered[col].astype(str))

            # Scale numerical features. StandardScaler raises on an empty
            # selection (no numeric columns or no rows), so skip it then.
            if len(numerical_cols) > 0 and len(df_engineered) > 0:
                scaler = StandardScaler()
                df_engineered[numerical_cols] = scaler.fit_transform(df_engineered[numerical_cols])

            output_path = 'engineered_features.csv'
            df_engineered.to_csv(output_path, index=False)

            return f"file saved to {output_path}"

        except Exception as e:
            error_message = f"Error engineering features: {str(e)}"
            print(error_message)
            return error_message
        

from crewai_tools import DirectoryReadTool

# NOTE(review): DirectoryReadTool takes a `directory` argument but is given
# 'data.csv', which looks like a file path — confirm this is intended.
docs_tool_a = DirectoryReadTool(directory='data.csv')
# Create Feature Engineering agent (uses the directory tool and a fresh
# FeatureEngineeringTool instance).
feature_engineering_agent = Agent(
    role="Feature Engineering Specialist",
    goal="Transform and optimize features for machine learning models and save the transformed file",
    backstory="Expert in feature scaling, encoding",
    tools=[docs_tool_a,
        FeatureEngineeringTool(),
        # FeatureEngineeringTool().generate_interaction_features
    ],
    llm=llm,
    verbose=True
)

# Create Feature Engineering task
# NOTE(review): the description does not name the file to load — confirm the
# agent is expected to discover the path itself.
feature_engineering_task = Task(
    description="""
    Load the file, convert categorical variables to numerical values and scale the numerical values to make the dataset model-ready.
    """,
    agent=feature_engineering_agent,
    expected_output="transformed dataset saved. also provide short description about which technologies are used for feature engineering"
)

# Create and run the crew (rebinds the `crew` and `result` names used by
# earlier cells).
crew = Crew(
    agents=[feature_engineering_agent],
    tasks=[feature_engineering_task],
    process=Process.sequential
)

result = crew.kickoff()
print(result)

In [None]:
# Two-step pipeline: preprocessing task first, then feature engineering, run
# sequentially. Depends on the agents and tasks defined in earlier cells, so
# those cells must have been executed first.
crew = Crew(
    agents=[data_preprocessing_agent,feature_engineering_agent],
    tasks=[data_preprocessing_task,feature_engineering_task],
    process=Process.sequential
)

result = crew.kickoff()
print(result)

In [None]:
pip install langtrace-python-sdk

In [None]:
# Must precede any llm module imports
# NOTE(review): earlier cells in this notebook already import crewai, so on a
# top-to-bottom run this cell does not come first — confirm intended placement.
import os

from langtrace_python_sdk import langtrace

# The API key is read from the environment (e.g. via the .env file) instead
# of being hardcoded in the notebook. The key that used to be committed here
# should be treated as leaked and revoked.
langtrace_api_key = os.environ.get("LANGTRACE_API_KEY")
if not langtrace_api_key:
    raise RuntimeError("Set the LANGTRACE_API_KEY environment variable before initialising langtrace")

langtrace.init(api_key=langtrace_api_key)

## Feature Engineering Agent

In [None]:
from crewai import Agent, Task, Crew, Process, LLM
from crewai.tools import tool
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
import os
import pandas as pd


from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_csv_agent
from langchain_groq import ChatGroq


# llm = LLM(
#     model="ollama/mistral",
#     base_url="http://localhost:11434"
# )

# Rebinds `llm` with the same Groq model and temperature used in earlier
# cells; it is passed to the agent created later in this cell.
llm = LLM(
    model="groq/llama-3.3-70b-versatile",
    temperature=0.7
)


class CsvRAGtool(BaseTool):
    """CrewAI tool that answers natural-language questions about ``data.csv``.

    Every call builds a new ChatGroq client and a LangChain CSV agent over
    the hard-coded file ``data.csv`` and forwards the query to it.
    """

    name: str = "CSV Query Tool"
    description: str = "A tool that analyzes CSV data and answers questions about its content using natural language queries."

    def _run(self, query: str) -> str:
        """Run ``query`` against the CSV agent.

        Args:
            query: Natural-language question about the CSV content.

        Returns:
            The agent's answer as a string, or a message starting with
            ``"Error processing CSV query:"`` when the call fails. A string is
            always returned so the calling agent can see the failure instead
            of receiving ``None``.
        """
        try:
            # Named csv_llm so it does not shadow the module-level `llm`.
            csv_llm = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768")

            # SECURITY: allow_dangerous_code=True lets the agent execute
            # model-generated Python in this process; only use it with
            # trusted data and prompts.
            agent = create_csv_agent(
                csv_llm,
                "data.csv",
                verbose=True,
                agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
                handle_parsing_errors=True,
                allow_dangerous_code=True
            )

            # `run` is deprecated on LangChain chains; `invoke` returns a dict
            # whose "output" entry holds the final answer.
            response = agent.invoke({"input": query})
            return str(response["output"])

        except Exception as e:
            error_message = f"Error processing CSV query: {str(e)}"
            print(error_message)
            return error_message



from crewai_tools import DirectoryReadTool

# NOTE(review): DirectoryReadTool takes a `directory` argument but is given
# 'data.csv', which looks like a file path — confirm this is intended.
docs_tool_a = DirectoryReadTool(directory='data.csv')
# Instance of the CSV question-answering tool defined above.
csv_rag = CsvRAGtool()
# Rebinds `feature_engineering_agent`: this version only assesses whether
# feature engineering is needed, using the directory and CSV query tools.
feature_engineering_agent = Agent(
    role="Feature Engineering Specialist",
    goal="Your task is to analyze the different features of data and tell that is feature engineering really required for data or not",
    backstory="Expert in feature scaling, encoding",
    tools=[docs_tool_a,csv_rag],
    llm=llm,
    verbose=True
)

# Create Feature Engineering task
# NOTE(review): the description refers to "the preprocessed dataset", while
# CsvRAGtool reads the hard-coded 'data.csv' — confirm which file is meant.
feature_engineering_task = Task(
    description="""
    1. Load the preprocessed dataset
    2. Check if feature scaling really reqd or not
    3. answer should be YES or NO along with a description
    """,
    agent=feature_engineering_agent,
    expected_output="Final answer ",
    human_input= True
)

# Create and run the crew
crew = Crew(
    agents=[feature_engineering_agent],
    tasks=[feature_engineering_task],
    process=Process.sequential
)

result = crew.kickoff()
print(result)

## Feature Selection Agent

In [1]:
from crewai import Agent, Task, Crew, Process, LLM
from crewai.tools import tool
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
import os
import pandas as pd
from crewai_tools import BaseTool

from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_csv_agent
from langchain_groq import ChatGroq


# llm = LLM(
#     model="ollama/mistral",
#     base_url="http://localhost:11434"
# )

# This cell uses a different Groq model (llama3-8b-8192) from the earlier
# cells, which use llama-3.3-70b-versatile; the temperature is the same.
llm = LLM(
    model="groq/llama3-8b-8192",
    temperature=0.7
)


class CsvRAGtool(BaseTool):
    """CrewAI tool that answers natural-language questions about ``data.csv``.

    Every call builds a new ChatGroq client and a LangChain CSV agent over
    the hard-coded file ``data.csv`` and forwards the query to it.
    """

    name: str = "CSV Query Tool"
    description: str = "A tool that analyzes CSV data and answers questions about its content using natural language queries."

    def _run(self, query: str) -> str:
        """Run ``query`` against the CSV agent.

        Args:
            query: Natural-language question about the CSV content.

        Returns:
            The agent's answer as a string, or a message starting with
            ``"Error processing CSV query:"`` when the call fails. A string is
            always returned so the calling agent can see the failure instead
            of receiving ``None``.
        """
        try:
            # Named csv_llm so it does not shadow the module-level `llm`.
            csv_llm = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768")

            # SECURITY: allow_dangerous_code=True lets the agent execute
            # model-generated Python in this process; only use it with
            # trusted data and prompts.
            agent = create_csv_agent(
                csv_llm,
                "data.csv",
                verbose=True,
                agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
                handle_parsing_errors=True,
                allow_dangerous_code=True
            )

            # `run` is deprecated on LangChain chains; `invoke` returns a dict
            # whose "output" entry holds the final answer.
            response = agent.invoke({"input": query})
            return str(response["output"])

        except Exception as e:
            error_message = f"Error processing CSV query: {str(e)}"
            print(error_message)
            return error_message



from crewai_tools import DirectoryReadTool

# NOTE(review): DirectoryReadTool takes a `directory` argument but is given
# 'data.csv', which looks like a file path — confirm this is intended.
docs_tool_a = DirectoryReadTool(directory='data.csv')
# Instance of the CSV question-answering tool defined above.
csv_rag = CsvRAGtool()
# Feature selection agent.
# NOTE(review): the variable is still called `feature_engineering_agent`
# although the role is feature selection; it rebinds the name used by the
# feature-engineering cells — consider a dedicated name.
feature_engineering_agent = Agent(
    role="Feature selection Specialist",
    goal="Your task is to analyze the different features of data and tell that which features we can drop that are not usefull for the data to train model",
    backstory="Expert in feature selection",
    tools=[docs_tool_a,csv_rag],
    llm=llm,
    verbose=True
)

# Create the feature selection task (variable name kept from the
# feature-engineering cells).
feature_engineering_task = Task(
    description="""
    1. Load the preprocessed dataset
    2. Check features that would not be having any impact on the dataset to train model
    3. answer should be list of features to drop along with a description
    """,
    agent=feature_engineering_agent,
    expected_output="Final answer "
)

# Create and run the crew
crew = Crew(
    agents=[feature_engineering_agent],
    tasks=[feature_engineering_task],
    process=Process.sequential
)

result = crew.kickoff()
print(result)

[1m[95m# Agent:[00m [1m[92mFeature selection Specialist[00m
[95m## Task:[00m [92m
    1. Load the preprocessed dataset
    2. Check features that would not be having any impact on the dataset to train model
    3. answer should be list of features to drop along with a description
    [00m


  return agent.run(query)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: To answer this question, I need to describe each feature (column) in the dataframe and calculate its correlation with the target variable, which is the 'Price' column. I will use the pandas `describe()` function to get a summary of each feature and the `corr()` function to calculate the correlation.

Action: python_repl_ast
Action Input: df.describe()[0m[36;1m[1;3m                Id         Area     Bedrooms   Bathrooms       Floors  \
count  1999.000000  1998.000000  2000.000000  2000.00000  2000.000000   
mean   1000.993497  2787.113113     3.003500     2.55250     1.993500   
std     577.217113  1295.444785     1.424606     1.10899     0.809188   
min       1.000000   501.000000     1.000000     1.00000     1.000000   
25%     501.500000  1654.250000     2.000000     2.00000     1.000000   
50%    1001.000000  2834.500000     3.000000     3.00000     2.000000   
75%    1500.500000  3888.500000     4.000000     





[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: To describe the dataset, I need to understand the number of rows and columns, as well as the data types and values of each column.
Action: python_repl_ast
Action Input: `print(df.info())`[0m[36;1m[1;3m<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Id         1999 non-null   float64
 1   Area       1998 non-null   float64
 2   Bedrooms   2000 non-null   int64  
 3   Bathrooms  2000 non-null   int64  
 4   Floors     2000 non-null   int64  
 5   YearBuilt  2000 non-null   int64  
 6   Location   2000 non-null   object 
 7   Condition  2000 non-null   object 
 8   Garage     2000 non-null   object 
 9   Price      2000 non-null   int64  
dtypes: float64(2), int64(5), object(3)
memory usage: 156.4+ KB
None
[0m[32;1m[1;3mThe dataset has 2000 rows and 10 columns. The data ty





[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: To answer this question, I need to find out the correlation between each feature and the target variable, which is the 'Price' column. I also need to check for missing values in each feature.

Action: python_repl_ast
Action Input: `df.isnull().sum()`[0m[36;1m[1;3mId           1
Area         2
Bedrooms     0
Bathrooms    0
Floors       0
YearBuilt    0
Location     0
Condition    0
Garage       0
Price        0
dtype: int64[0m[32;1m[1;3mThere are no missing values in the dataset, so I can move on to checking the correlation between each feature and the target variable, 'Price'.

Action: python_repl_ast
Action Input: `df.corr()['Price']`[0m[36;1m[1;3mId          -0.026091
Area         0.002139
Bedrooms    -0.003471
Bathrooms   -0.015737
Floors       0.055890
YearBuilt    0.004845
Price        1.000000
Name: Price, dtype: float64[0m



[32;1m[1;3mThe correlation between 'Price' and 'Id' is very low, and the correlation between 'Price' and 'Floors' is the highest among the features. However, the correlation between 'Price' and other features is negligible. Therefore, all features except 'Floors' have low correlation with the target variable.

Final Answer: The features with low correlation with the target variable 'Price' are 'Id', 'Area', 'Bedrooms', 'Bathrooms', 'YearBuilt', 'Location', 'Condition', and 'Garage'. The feature with the highest correlation with 'Price' is 'Floors'. However, the correlation between 'Price' and all features is generally low.[0m

[1m> Finished chain.[0m


[1m[95m# Agent:[00m [1m[92mFeature selection Specialist[00m
[95m## Thought:[00m [92mThought: I will analyze the preprocessed dataset to identify features that would not be having any impact on the dataset to train a model. I will use the CSV Query Tool to ask questions about the dataset and identify the features with low co

ValueError: Invalid response from LLM call - None or empty.

In [None]:
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_csv_agent
from langchain_groq import ChatGroq


# NOTE: this rebinds `llm` to a LangChain ChatGroq client; earlier cells bind
# the same name to a crewai LLM, so re-run those cells before building agents.
llm = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768")

# SECURITY: allow_dangerous_code=True lets the agent execute model-generated
# Python in this kernel; only use it with trusted data and prompts.
agent = create_csv_agent(
    llm,
    "data.csv",
    verbose=True,
    agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    handle_parsing_errors=True,
    allow_dangerous_code=True  # Add this line
)

# `agent.run(...)` is deprecated on LangChain chains; `invoke` returns a dict
# whose "output" entry is the final answer, displayed as the cell output.
agent.invoke({"input": "how many columns are there"})["output"]

In [12]:
class CsvRAGtool(BaseTool):
    """CrewAI tool that answers natural-language questions about ``data.csv``.

    Every call builds a new ChatGroq client and a LangChain CSV agent over
    the hard-coded file ``data.csv`` and forwards the query to it.
    """

    name: str = "CSV Query Tool"
    description: str = "A tool that analyzes CSV data and answers questions about its content using natural language queries."

    def _run(self, query: str) -> str:
        """Run ``query`` against the CSV agent.

        Args:
            query: Natural-language question about the CSV content.

        Returns:
            The agent's answer as a string, or a message starting with
            ``"Error processing CSV query:"`` when the call fails. A string is
            always returned so the calling agent can see the failure instead
            of receiving ``None``.
        """
        try:
            # Named csv_llm so it does not shadow the module-level `llm`.
            csv_llm = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768")

            # SECURITY: allow_dangerous_code=True lets the agent execute
            # model-generated Python in this process; only use it with
            # trusted data and prompts.
            agent = create_csv_agent(
                csv_llm,
                "data.csv",
                verbose=True,
                agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
                handle_parsing_errors=True,
                allow_dangerous_code=True
            )

            # `run` is deprecated on LangChain chains; `invoke` returns a dict
            # whose "output" entry holds the final answer.
            response = agent.invoke({"input": query})
            return str(response["output"])

        except Exception as e:
            error_message = f"Error processing CSV query: {str(e)}"
            print(error_message)
            return error_message