In [3]:
# https://bhavikjikadara.medium.com/data-science-automation-a-step-by-step-guide-using-crewai-e1468823e0f8


from crewai import Agent, Task, Crew, Process, LLM
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from dotenv import load_dotenv
import os
from crewai.tools import tool

# Pull variables from a local .env file into the process environment before
# any client object is built.
load_dotenv()
API_KEY = os.getenv("GEMINI_API_KEY")  # None when the variable is not set



# LLM handle shared by the agents defined in the following cells.
llm = LLM(
    temperature=0.7,
    model="groq/llama-3.3-70b-versatile",
)


# llm = LLM(
#     model="ollama/mistral",
#     base_url="http://localhost:11434"
# )


In [4]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from crewai_tools import BaseTool

class DataPreprocessor(BaseTool):
    """CrewAI tool that cleans a CSV file and writes an encoded copy to disk.

    The tool handles missing values, removes duplicate rows, label-encodes
    object-dtype columns and saves the result to
    ``processed_data/processed_data.csv``.
    """
    name: str = "Data Preprocessor"
    description: str = "Preprocesses data by handling missing values, removing duplicates, and encoding categorical variables."

    def _run(self, file_path: str) -> str:
        """Clean the CSV at ``file_path`` and return a textual summary.

        Args:
            file_path: Path of the CSV file to load with ``pandas.read_csv``.

        Returns:
            A multi-line report with the shapes and missing-value counts
            before and after processing, the encoded columns and the output
            path.

        Raises:
            Whatever ``pandas.read_csv``, the imputers or the file system
            raise; errors are not caught here.
        """
        # Load the data
        df = pd.read_csv(file_path)

        # Get initial info
        initial_shape = df.shape
        initial_missing = df.isnull().sum().sum()

        # Share of missing cells; an empty frame has size 0, so avoid 0/0.
        missing_percentage = (initial_missing / df.size) * 100 if df.size else 0.0

        # Handle missing values: drop rows when little is missing, impute otherwise.
        if missing_percentage < 5:
            df = df.dropna()
        else:
            # Only impute columns holding at least one observed value. The
            # imputer cannot derive a statistic from an all-missing column and
            # would return fewer columns than it was given.
            num_cols = [
                col for col in df.select_dtypes(include=['number']).columns
                if df[col].notna().any()
            ]
            if num_cols:
                num_imputer = SimpleImputer(strategy='mean')
                df[num_cols] = num_imputer.fit_transform(df[num_cols])

            cat_cols = [
                col for col in df.select_dtypes(include=['object']).columns
                if df[col].notna().any()
            ]
            if cat_cols:
                cat_imputer = SimpleImputer(strategy='most_frequent')
                df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

        # Remove duplicate entries
        df = df.drop_duplicates()

        # Identify categorical columns
        categorical_columns = df.select_dtypes(include=['object']).columns.tolist()

        # Convert categorical variables to numerical; a fresh encoder per
        # column keeps each column's fitted classes independent.
        for col in categorical_columns:
            df[col] = LabelEncoder().fit_transform(df[col])

        # Get final info
        final_shape = df.shape
        final_missing = df.isnull().sum().sum()

        # Save the processed data
        processed_file_path = os.path.join('processed_data', 'processed_data.csv')
        os.makedirs(os.path.dirname(processed_file_path), exist_ok=True)
        df.to_csv(processed_file_path, index=False)

        return f"""
        Data preprocessing completed:
        - Initial shape: {initial_shape}
        - Initial missing values: {initial_missing}
        - Final shape: {final_shape}
        - Final missing values: {final_missing}
        - Categorical variables encoded: {categorical_columns}
        - Duplicates removed
        - Processed data saved to: {processed_file_path}
        """


In [5]:
from crewai_tools import DirectoryReadTool

# Tools handed to the agent: the custom cleaning tool and a directory lister.
# NOTE(review): 'data.csv' looks like a file path rather than a directory — confirm this is intended.
data_processing_tool = DataPreprocessor()
docs_tool_a = DirectoryReadTool(directory='data.csv')

# Agent responsible for the cleaning step.
data_preprocessing_agent = Agent(
    llm=llm,
    tools=[docs_tool_a, data_processing_tool],
    verbose=True,
    role="Data Preprocessing Specialist",
    goal="Load, clean, and perform initial transformations on datasets",
    backstory="Expert in data cleaning and preprocessing using pandas, numpy, and sklearn libraries",
    # allow_code_execution=True
)

# Task describing the cleaning rules the agent has to follow.
data_preprocessing_task = Task(
  agent=data_preprocessing_agent,
  expected_output='Processed dataset saved successfully',
  description="""
  Load the file, handle missing values (remove missing values if number of missing values is less than 5 percent of dataset else use imputer), remove duplicates, and convert categorical variables to numerical values to make the dataset model-ready.
  """,
  )

In [None]:
# Single-agent crew that runs only the preprocessing task.
crew = Crew(
    process=Process.sequential,
    tasks=[data_preprocessing_task],
    agents=[data_preprocessing_agent],
)

# Start the run and keep its output for the next cell.
result = crew.kickoff()

In [None]:
# Show the output returned by the crew run in the previous cell.
print(result)

In [None]:
import pandas as pd

# Total number of missing cells in the raw dataset (shown as the cell result).
df = pd.read_csv("data.csv")
df.isna().sum().sum()

In [None]:
from crewai import Agent, Task, Crew, Process, LLM
from crewai.tools import tool
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
import os
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from crewai_tools import BaseTool

# llm = LLM(
#     model="ollama/mistral",
#     base_url="http://localhost:11434"
# )


# LLM handle used by the agent defined further down in this cell.
llm = LLM(
    temperature=0.7,
    model="groq/llama-3.3-70b-versatile",
)



class FeatureEngineeringTool(BaseTool):
    """CrewAI tool that label-encodes categorical columns and standardises numeric ones.

    The transformed frame is written to ``engineered_features.csv`` in the
    current working directory.
    """
    name: str = "Feature Scaling Tool"
    description: str = "Scales numerical features to a standard range using techniques like normalization or standardization also encode categorical values"

    def _run(self, file_path: str) -> str:
        """Encode and scale the CSV at ``file_path``.

        Args:
            file_path: Path of the CSV file to load with ``pandas.read_csv``.

        Returns:
            ``"file saved to <path>"`` on success, otherwise a string starting
            with ``"Error engineering features:"`` describing the failure, so
            the declared ``str`` return type always holds.
        """
        try:
            df_engineered = pd.read_csv(file_path)

            # Decide which columns count as numeric BEFORE encoding. Selecting
            # afterwards with a fixed ['int64', 'float64'] filter makes the
            # result depend on the integer width the encoder happens to emit,
            # so encoded category codes were scaled on some platforms only.
            numerical_cols = df_engineered.select_dtypes(include=['number']).columns
            categorical_cols = df_engineered.select_dtypes(include=['object']).columns

            # Encode categorical variables, one freshly fitted encoder per column.
            for col in categorical_cols:
                df_engineered[col] = LabelEncoder().fit_transform(df_engineered[col].astype(str))

            # Scale numerical features; skipped when there are none because
            # the scaler cannot be fitted on zero columns.
            if not numerical_cols.empty:
                scaler = StandardScaler()
                df_engineered[numerical_cols] = scaler.fit_transform(df_engineered[numerical_cols])

            output_path = 'engineered_features.csv'
            df_engineered.to_csv(output_path, index=False)

            return f"file saved to {output_path}"

        except Exception as e:
            # Hand the failure back as text instead of None so the calling
            # agent can see what went wrong.
            return f"Error engineering features: {str(e)}"
        

from crewai_tools import DirectoryReadTool

# NOTE(review): 'data.csv' looks like a file path rather than a directory — confirm this is intended.
docs_tool_a = DirectoryReadTool(directory='data.csv')

# Agent that encodes and scales the dataset with the custom tool.
feature_engineering_agent = Agent(
    llm=llm,
    verbose=True,
    tools=[docs_tool_a,
        FeatureEngineeringTool(),
        # FeatureEngineeringTool().generate_interaction_features
    ],
    role="Feature Engineering Specialist",
    goal="Transform and optimize features for machine learning models and save the transformed file",
    backstory="Expert in feature scaling, encoding",
)

# Task the agent has to complete.
feature_engineering_task = Task(
    agent=feature_engineering_agent,
    expected_output="transformed dataset saved. also provide short description about which technologies are used for feature engineering",
    description="""
    Load the file, convert categorical variables to numerical values and scale the numerical values to make the dataset model-ready.
    """,
)

# Single-agent crew for the feature-engineering step.
crew = Crew(
    process=Process.sequential,
    tasks=[feature_engineering_task],
    agents=[feature_engineering_agent],
)

result = crew.kickoff()
print(result)

In [None]:
# Two-step crew: preprocessing first, then feature engineering.
# Relies on the agents and tasks defined in earlier cells.
crew = Crew(
    process=Process.sequential,
    tasks=[data_preprocessing_task,feature_engineering_task],
    agents=[data_preprocessing_agent,feature_engineering_agent],
)

result = crew.kickoff()
print(result)

In [None]:
# Must precede any llm module imports

import os

from langtrace_python_sdk import langtrace

# The API key is read from the environment so that no credential is stored in
# the notebook. A key that was previously committed here should be treated as
# leaked and rotated.
_langtrace_api_key = os.environ.get("LANGTRACE_API_KEY")
if not _langtrace_api_key:
    raise RuntimeError("Set the LANGTRACE_API_KEY environment variable before initialising langtrace.")

langtrace.init(api_key=_langtrace_api_key)

## Feature Engineering Agent

In [None]:
from crewai import Agent, Task, Crew, Process, LLM
from crewai.tools import tool
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
import os
import pandas as pd


from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_csv_agent
from langchain_groq import ChatGroq


# llm = LLM(
#     model="ollama/mistral",
#     base_url="http://localhost:11434"
# )

# LLM handle used by the agent defined further down in this cell.
llm = LLM(
    temperature=0.7,
    model="groq/llama-3.3-70b-versatile",
)


class CsvRAGtool(BaseTool):
    """CrewAI tool that answers natural-language questions about ``data.csv``.

    Every call builds a LangChain CSV agent backed by a Groq chat model and
    forwards the query to it.
    """
    name: str = "CSV Query Tool"
    description: str = "A tool that analyzes CSV data and answers questions about its content using natural language queries."

    def _run(self, query: str) -> str:
        """Run ``query`` against ``data.csv`` and return the agent's answer.

        Args:
            query: Question about the CSV content, in natural language.

        Returns:
            The answer produced by the CSV agent, or a string starting with
            ``"Error processing CSV query:"`` when anything fails, so the
            declared ``str`` return type always holds.
        """
        try:
            # Named differently from the notebook-level `llm` so the two are
            # not confused when reading the cell.
            csv_llm = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768")

            # allow_dangerous_code=True acknowledges the security implications
            # of this agent; only point it at trusted data.
            agent = create_csv_agent(
                csv_llm,
                "data.csv",
                verbose=True,
                agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
                handle_parsing_errors=True,
                allow_dangerous_code=True
            )

            return agent.run(query)

        except Exception as e:
            # Hand the failure back as text instead of None so the calling
            # agent can see what went wrong.
            return f"Error processing CSV query: {str(e)}"



from crewai_tools import DirectoryReadTool

# NOTE(review): 'data.csv' looks like a file path rather than a directory — confirm this is intended.
csv_rag = CsvRAGtool()
docs_tool_a = DirectoryReadTool(directory='data.csv')

# Agent that judges whether feature engineering is needed at all.
feature_engineering_agent = Agent(
    llm=llm,
    verbose=True,
    tools=[docs_tool_a,csv_rag],
    role="Feature Engineering Specialist",
    goal="Your task is to analyze the different features of data and tell that is feature engineering really required for data or not",
    backstory="Expert in feature scaling, encoding",
)

# Task for the agent; human_input asks for feedback from the user during the run.
feature_engineering_task = Task(
    agent=feature_engineering_agent,
    human_input=True,
    expected_output="Final answer ",
    description="""
    1. Load the preprocessed dataset
    2. Check if feature scaling really reqd or not
    3. answer should be YES or NO along with a description
    """,
)

# Single-agent crew for this check.
crew = Crew(
    process=Process.sequential,
    tasks=[feature_engineering_task],
    agents=[feature_engineering_agent],
)

result = crew.kickoff()
print(result)

## Check whether data cleaning and feature engineering are really required

In [1]:
from crewai import Agent, Task, Crew, Process, LLM
from crewai.tools import tool
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
import os
import pandas as pd


from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_csv_agent
from langchain_groq import ChatGroq


# llm = LLM(
#     model="ollama/mistral",
#     base_url="http://localhost:11434"
# )

# LLM handle used by the agent defined further down in this cell.
llm = LLM(
    temperature=0.7,
    model="groq/llama-3.3-70b-versatile",
)


class CsvRAGtool(BaseTool):
    """CrewAI tool that answers natural-language questions about ``data.csv``.

    Every call builds a LangChain CSV agent backed by a Groq chat model and
    forwards the query to it.
    """
    name: str = "CSV Query Tool"
    description: str = "A tool that analyzes CSV data and answers questions about its content using natural language queries."

    def _run(self, query: str) -> str:
        """Run ``query`` against ``data.csv`` and return the agent's answer.

        Args:
            query: Question about the CSV content, in natural language.

        Returns:
            The answer produced by the CSV agent, or a string starting with
            ``"Error processing CSV query:"`` when anything fails, so the
            declared ``str`` return type always holds.
        """
        try:
            # Named differently from the notebook-level `llm` so the two are
            # not confused when reading the cell.
            csv_llm = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768")

            # allow_dangerous_code=True acknowledges the security implications
            # of this agent; only point it at trusted data.
            agent = create_csv_agent(
                csv_llm,
                "data.csv",
                verbose=True,
                agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
                handle_parsing_errors=True,
                allow_dangerous_code=True
            )

            return agent.run(query)

        except Exception as e:
            # Hand the failure back as text instead of None so the calling
            # agent can see what went wrong.
            return f"Error processing CSV query: {str(e)}"



from crewai_tools import DirectoryReadTool

# NOTE(review): 'data.csv' looks like a file path rather than a directory — confirm this is intended.
docs_tool_a = DirectoryReadTool(directory='data.csv')
csv_rag = CsvRAGtool()

# Agent that decides whether cleaning and feature engineering are needed.
data_analysis_agent = Agent(
    role="Data analysis Specialist",
    goal="Your task is to analyze the different features of data and tell that is data cleaning and feature engineering really required for data or not",
    backstory="Expert in data cleaning, feature scaling, encoding",
    tools=[docs_tool_a,csv_rag],
    llm=llm,
    verbose=True
)

# Create the data analysis task.
# The task is bound to data_analysis_agent, the agent defined in this cell.
# It previously referenced feature_engineering_agent, which this cell never
# defines and which the crew below does not contain.
data_analysis_task = Task(
    description="""
    1. Load the dataset
    2. Check data cleaning is required or not :
      - Check if there are any null values in the data 
      - check if there are dduplicate values in the data 
    3. Feature engineering is required or not by analysing following points:
      - Check each columns data types
      - check min max values of columns for feature scaling 
    
    3. answer should be YES or NO along with a description
    """,
    agent=data_analysis_agent,
    expected_output="""Final answer should be in following format:
    data cleaning : required or not 
    Feature engineering : required or not
    """,

)

# Create and run the crew
crew = Crew(
    agents=[data_analysis_agent],
    tasks=[data_analysis_task],
    process=Process.sequential
)

result = crew.kickoff()
print(result)

ImportError: cannot import name 'InstanceOf' from 'pydantic' (C:\Users\mayur\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pydantic\__init__.cp311-win_amd64.pyd)

## Feature Selection agent

In [None]:
from crewai import Agent, Task, Crew, Process, LLM
from crewai.tools import tool
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
import os
import pandas as pd
from crewai_tools import BaseTool

from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_csv_agent
from langchain_groq import ChatGroq


# llm = LLM(
#     model="ollama/mistral",
#     base_url="http://localhost:11434"
# )

# LLM handle used by the agent defined further down in this cell.
llm = LLM(
    temperature=0.7,
    model="groq/llama3-8b-8192",
)


class CsvRAGtool(BaseTool):
    """CrewAI tool that answers natural-language questions about ``data.csv``.

    Every call builds a LangChain CSV agent backed by a Groq chat model and
    forwards the query to it.
    """
    name: str = "CSV Query Tool"
    description: str = "A tool that analyzes CSV data and answers questions about its content using natural language queries."

    def _run(self, query: str) -> str:
        """Run ``query`` against ``data.csv`` and return the agent's answer.

        Args:
            query: Question about the CSV content, in natural language.

        Returns:
            The answer produced by the CSV agent, or a string starting with
            ``"Error processing CSV query:"`` when anything fails, so the
            declared ``str`` return type always holds.
        """
        try:
            # Named differently from the notebook-level `llm` so the two are
            # not confused when reading the cell.
            csv_llm = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768")

            # allow_dangerous_code=True acknowledges the security implications
            # of this agent; only point it at trusted data.
            agent = create_csv_agent(
                csv_llm,
                "data.csv",
                verbose=True,
                agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
                handle_parsing_errors=True,
                allow_dangerous_code=True
            )

            return agent.run(query)

        except Exception as e:
            # Hand the failure back as text instead of None so the calling
            # agent can see what went wrong.
            return f"Error processing CSV query: {str(e)}"



from crewai_tools import DirectoryReadTool

# NOTE(review): 'data.csv' looks like a file path rather than a directory — confirm this is intended.
csv_rag = CsvRAGtool()
docs_tool_a = DirectoryReadTool(directory='data.csv')

# Feature-selection agent. It reuses the `feature_engineering_*` variable
# names although its role is feature selection.
feature_engineering_agent = Agent(
    llm=llm,
    verbose=True,
    tools=[docs_tool_a,csv_rag],
    role="Feature selection Specialist",
    goal="Your task is to analyze the different features of data and tell that which features we can drop that are not usefull for the data to train model",
    backstory="Expert in feature selection",
)

# Task asking for the list of features that can be dropped.
feature_engineering_task = Task(
    agent=feature_engineering_agent,
    expected_output="Final answer ",
    description="""
    1. Load the preprocessed dataset
    2. Check features that would not be having any impact on the dataset to train model
    3. answer should be list of features to drop along with a description
    """,
)

# Single-agent crew for the feature-selection step.
crew = Crew(
    process=Process.sequential,
    tasks=[feature_engineering_task],
    agents=[feature_engineering_agent],
)

result = crew.kickoff()
print(result)

In [None]:
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_csv_agent
from langchain_groq import ChatGroq


# Rebinds the notebook-level `llm` name to a LangChain Groq chat model.
llm = ChatGroq(model_name="mixtral-8x7b-32768", temperature=0)

# allow_dangerous_code=True acknowledges the security implications of the CSV agent.
agent = create_csv_agent(
    llm,
    "data.csv",
    allow_dangerous_code=True,
    handle_parsing_errors=True,
    agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)

# Smoke-test query; the answer is displayed as the cell result.
agent.run("how many columns are there")

In [12]:
class CsvRAGtool(BaseTool):
    """CrewAI tool that answers natural-language questions about ``data.csv``.

    Every call builds a LangChain CSV agent backed by a Groq chat model and
    forwards the query to it.
    """
    name: str = "CSV Query Tool"
    description: str = "A tool that analyzes CSV data and answers questions about its content using natural language queries."

    def _run(self, query: str) -> str:
        """Run ``query`` against ``data.csv`` and return the agent's answer.

        Args:
            query: Question about the CSV content, in natural language.

        Returns:
            The answer produced by the CSV agent, or a string starting with
            ``"Error processing CSV query:"`` when anything fails, so the
            declared ``str`` return type always holds.
        """
        try:
            # Named differently from the notebook-level `llm` so the two are
            # not confused when reading the cell.
            csv_llm = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768")

            # allow_dangerous_code=True acknowledges the security implications
            # of this agent; only point it at trusted data.
            agent = create_csv_agent(
                csv_llm,
                "data.csv",
                verbose=True,
                agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
                handle_parsing_errors=True,
                allow_dangerous_code=True
            )

            return agent.run(query)

        except Exception as e:
            # Hand the failure back as text instead of None so the calling
            # agent can see what went wrong.
            return f"Error processing CSV query: {str(e)}"

## End-to-end ML model

### 1. Load required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pickle


### 2. Load Data

In [2]:
# Raw dataset, plus a feature-only view with the target column removed.
df = pd.read_csv("Health_insurance.csv")
new_df = df.drop(columns=["charges"])


### 3. Preprocess data by feature scaling and label encoding

In [3]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd
import pickle
import os

def create_artifacts_directory():
    """Create the ``artifacts`` directory in the current working directory if it is missing."""
    # exist_ok=True replaces the separate existence check, which could race
    # with another process creating the directory between check and creation.
    os.makedirs('artifacts', exist_ok=True)

def feature_scaler_encoder(new_df):
    """Label-encode and standard-scale a feature frame, saving the fitted objects.

    Object-dtype columns are label-encoded with one ``LabelEncoder`` per
    column. Columns that are int64/float64 on entry are standardised with a
    single ``StandardScaler``. The encoders, the scaler and the column order
    are pickled under ``artifacts/``.

    Args:
        new_df: pandas DataFrame of input features. It is not modified; the
            transformation is applied to a copy.

    Returns:
        A new DataFrame with the same columns in the same order, encoded and
        scaled.
    """
    # Work on a copy so that calling this function again with the same input
    # gives the same result instead of re-scaling already scaled data.
    transformed = new_df.copy()

    # Store the original column order
    original_columns = transformed.columns.tolist()

    # Column groups are fixed before any transformation so that encoded
    # categoricals are never treated as numeric features.
    categorical_cols = transformed.select_dtypes(include=['object']).columns
    numerical_cols = transformed.select_dtypes(include=['int64', 'float64']).columns

    # Apply LabelEncoder to each categorical column
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        transformed[col] = le.fit_transform(transformed[col])
        label_encoders[col] = le

    # Scale numerical features; skipped when there are none because the
    # scaler cannot be fitted on zero columns.
    scaler = StandardScaler()
    if len(numerical_cols) > 0:
        transformed[numerical_cols] = scaler.fit_transform(transformed[numerical_cols])

    # Ensure columns are in original order
    transformed = transformed[original_columns]

    # Save preprocessing objects
    create_artifacts_directory()
    artifacts = {
        'artifacts/label_encoders.pkl': label_encoders,
        'artifacts/scaler.pkl': scaler,
        'artifacts/columns.pkl': original_columns,
    }
    for path, obj in artifacts.items():
        with open(path, 'wb') as f:
            pickle.dump(obj, f)

    return transformed


### 4. Train the model and save it to a pickle file

In [23]:
# Build the features from a fresh drop of the target column. Feeding `new_df`
# back into itself would re-encode and re-scale already transformed data every
# time this cell is run again.
new_df = feature_scaler_encoder(df.drop(columns="charges"))

# Splitting the dataset
X = new_df
y = df['charges']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model Training
model = LinearRegression()
model.fit(X_train, y_train)

# Save the model next to the preprocessing objects
model_path = os.path.join('artifacts', 'health_insurance.pkl')
with open(model_path, 'wb') as f:
    pickle.dump(model, f)


In [None]:
# Predict on the held-out split and report the mean squared error.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')

### 5. Model Evaluation

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

# Error metrics on the held-out split; RMSE is derived from MSE.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'R²: {r2}')
print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'RMSE: {rmse}')



### 6. Load the saved preprocessing objects and model, and define the prediction function

In [7]:
def load_preprocessing_objects():
    """Load the pickled label encoders, scaler and model from ``artifacts/``.

    ``artifacts/columns.pkl`` is not loaded here.

    Returns:
        Tuple ``(label_encoders, scaler, model)``.

    Raises:
        FileNotFoundError: If any of the three files is missing. The original
            error is kept as the cause so the missing path stays visible.
    """
    artifact_paths = (
        'artifacts/label_encoders.pkl',
        'artifacts/scaler.pkl',
        'artifacts/health_insurance.pkl',
    )

    loaded = []
    try:
        for path in artifact_paths:
            with open(path, 'rb') as f:
                # pickle.load can execute arbitrary code; only load files
                # written by this notebook's own training step.
                loaded.append(pickle.load(f))
    except FileNotFoundError as err:
        raise FileNotFoundError(
            "Preprocessing objects not found. Please run training first."
        ) from err

    label_encoders, scaler, model = loaded
    return label_encoders, scaler, model


In [21]:
label_encoders, scaler, model = load_preprocessing_objects()

def make_prediction(data):
    """Predict the target for one record using the saved preprocessing objects.

    Uses the notebook-level ``label_encoders``, ``scaler`` and ``model``.

    Args:
        data: Mapping of feature name to raw value for a single record.

    Returns:
        The first element of ``model.predict`` for that record.

    Raises:
        Whatever the encoders, scaler or model raise, for example when a
        category was not seen during training.
    """
    # Convert input dictionary to DataFrame
    input_df = pd.DataFrame([data])

    # Decide what to scale BEFORE encoding: prefer the exact columns the
    # scaler was fitted on, otherwise the columns that are numeric in the raw
    # input. Selecting numeric columns after encoding would also pick up the
    # encoded categoricals and hand the scaler more columns than it was
    # fitted with.
    fitted_scaler_cols = getattr(scaler, "feature_names_in_", None)
    if fitted_scaler_cols is not None:
        numerical_cols = list(fitted_scaler_cols)
    else:
        numerical_cols = list(input_df.select_dtypes(include=['number']).columns)

    # Encode categorical variables
    categorical_cols = input_df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        if col in label_encoders:
            input_df[col] = label_encoders[col].transform(input_df[col].astype(str))

    # Scale numerical features
    if numerical_cols:
        input_df[numerical_cols] = scaler.transform(input_df[numerical_cols])

    # Put the columns in the order the model was trained on, when the model
    # recorded it, so the key order of `data` does not matter.
    trained_columns = getattr(model, "feature_names_in_", None)
    if trained_columns is not None:
        input_df = input_df[list(trained_columns)]

    res = model.predict(input_df)

    return res[0]


### 7. Make Predictions for new data

In [None]:

# Example new data for prediction
# One raw record, keyed by feature name.
new_data = dict(
    age=19,
    sex='female',
    bmi=27,
    children=0,
    smoker='yes',
    region='southwest',
)

# Run it through the saved preprocessing objects and model.
res = make_prediction(new_data)
print(res)

########################################################################################################################

## Model Trainer AI Agent

In [None]:
import pandas as pd

# Dataset saved under artifacts/, displayed as the cell result.
df = pd.read_csv(filepath_or_buffer="artifacts/after_drop_col.csv")
df

In [6]:
from crewai import Agent, Task, Crew, Process, LLM
from crewai_tools import CodeInterpreterTool, DirectoryReadTool
from textwrap import dedent
import os

# LLM handle used by the agent defined further down in this cell.
llm = LLM(
    temperature=0.7,
    model="groq/gemma2-9b-it",
)

from pydantic import BaseModel


# Structured-output schema handed to the task through `output_pydantic`.
class output_format(BaseModel):
    # Single string field; read back later as result["model_code"].
    model_code: str

# Values substituted into the {target} and {model_type} placeholders of the task.
target = "Price"
model_type = "regression"

# NOTE(review): data_path points at a CSV file, yet it is passed as `directory` — confirm this is intended.
data_path = os.path.join('artifacts','engineered_features.csv')
docs_tool = DirectoryReadTool(directory=data_path)


# ML coding agent
ml_coder_agent = Agent(
    llm=llm,
    verbose=True,
    tools=[docs_tool , CodeInterpreterTool()],
    role='Machine Learning Engineer',
    goal='Write and optimize machine learning code to solve data science problems',
    backstory=dedent("""
        You are an expert machine learning engineer with extensive experience in 
        implementing ML algorithms, data preprocessing, and model optimization. 
        You excel at writing clean, efficient Python code for ML tasks.
    """),
)


# Task asking for model code, returned through the `output_format` schema.
ml_coder_task = Task(
    agent=ml_coder_agent,
    output_pydantic=output_format,
    expected_output="ML model code",
    description="""
    1. Load the preprocessed dataset
    2. Assume target column as {target} and model type {model_type}
    3. Do train test split
    4. provide ML code for {model_type} model
    """,
)


# Single-agent crew for the code-writing step.
crew = Crew(
    process=Process.sequential,
    tasks=[ml_coder_task],
    agents=[ml_coder_agent],
)

result = crew.kickoff(inputs=dict(target=target, model_type=model_type))
print(result)

ImportError: cannot import name 'InstanceOf' from 'pydantic' (C:\Users\mayur\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pydantic\__init__.cp311-win_amd64.pyd)

In [32]:
# Pull the `model_code` field out of the crew result and show it.
code = result["model_code"]
print(code)

import pandas as pd

# Load the preprocessed dataset
features = pd.read_csv('/path/to/your/data/engineered_features.csv')

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X = features.drop(columns=['Price'], axis=1)
Y = features['Price']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Choose a regression model (example: Linear Regression)
from sklearn.linear_model import LinearRegression
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)



In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset
# Load the dataset
df = pd.read_csv('engineered_features.csv')

# Assume 'Price' is the target column
y = df['Price']
x = df.drop('Price', axis=1)

# Split the data into training and test sets
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=42)

# Create and train the Random Forest regressor. The forest is seeded like the
# split above so that re-running the cell reproduces the same model.
model = RandomForestRegressor(random_state=42)
model.fit(train_x, train_y)

# Make predictions on the test set
test_predictions = model.predict(test_x)



In [7]:
# Import necessary tools
from crewai_tools import CodeInterpreterTool, DirectoryReadTool
from crewai import Agent, Task, Crew, Process, LLM
from crewai_tools import CodeInterpreterTool, DirectoryReadTool
from textwrap import dedent
import os

# LLM handle shared by all agents in this cell.
llm = LLM(
    temperature=0.7,
    model="groq/gemma2-9b-it",
)

# NOTE(review): data_path points at a CSV file, yet it is passed as `directory` — confirm this is intended.
data_path = os.path.join('artifacts','after_drop_col.csv')
docs_tool = DirectoryReadTool(directory=data_path)


# Define Agents
# Writes the model code.
ml_code_writer = Agent(
    llm=llm,
    verbose=True,
    tools=[CodeInterpreterTool(), docs_tool],
    role='ML Code Writer',
    goal='Write efficient and scalable machine learning model code',
    backstory='An experienced ML engineer with expertise in writing robust ML code.',
)

# Executes the model code.
ml_code_runner = Agent(
    llm=llm,
    verbose=True,
    tools=[CodeInterpreterTool(), docs_tool],
    role='ML Model Code Runner',
    goal='Execute machine learning model code and handle runtime environments',
    backstory='A proficient coder with a knack for managing and running complex ML models.',
)

# Evaluates the trained model.
ml_model_evaluator = Agent(
    llm=llm,
    verbose=True,
    tools=[CodeInterpreterTool(), docs_tool],
    role='ML Model Evaluator',
    goal='Evaluate the performance of machine learning models',
    backstory='A meticulous analyst specializing in model evaluation and performance metrics.',
)

# Suggests improvements; this agent is given no tools.
extra_suggestions_agent = Agent(
    llm=llm,
    verbose=True,
    role='Model Improvement Advisor',
    goal='Provide suggestions to improve model accuracy if it is low',
    backstory='A seasoned ML researcher with deep knowledge of optimization techniques.',
)

# Define Tasks
# One task per agent.
write_ml_code = Task(
    agent=ml_code_writer,
    description='Write code for a machine learning model based on the provided specifications.',
    expected_output='A well-documented script implementing the specified ML model.',
)

run_ml_code = Task(
    agent=ml_code_runner,
    description='Run the ML model code and ensure it executes correctly in the given environment.',
    expected_output='Logs and results from the model run, including any errors or issues encountered.',
)

evaluate_ml_model = Task(
    agent=ml_model_evaluator,
    description='Evaluate the ML model’s performance using appropriate metrics and datasets.',
    expected_output='A detailed report on the model’s performance, including metrics like accuracy, precision, recall, and F1-score.',
)

suggest_improvements = Task(
    agent=extra_suggestions_agent,
    description='Analyze the evaluation report and suggest improvements if the model accuracy is low.',
    expected_output='A list of actionable suggestions to improve the model’s accuracy, such as parameter tuning, feature engineering, or algorithm changes.',
)

# Assemble a crew with planning enabled
# Four-step sequential crew: write, run, evaluate, then suggest improvements.
crew = Crew(
    process=Process.sequential,
    verbose=True,
    tasks=[write_ml_code, run_ml_code, evaluate_ml_model, suggest_improvements],
    agents=[ml_code_writer, ml_code_runner, ml_model_evaluator, extra_suggestions_agent],
)

result = crew.kickoff()
print(result)


ImportError: cannot import name 'InstanceOf' from 'pydantic' (C:\Users\mayur\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pydantic\__init__.cp311-win_amd64.pyd)

## Testing

In [10]:
# Must precede any llm module imports
from crewai import Agent, Task, Crew, Process, LLM
from crewai.tools import tool
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_csv_agent
from langchain_groq import ChatGroq
from crewai_tools import BaseTool, DirectoryReadTool
import os
import pickle

# Initialize LLM
llm = LLM(
    temperature=0.7,
    model="groq/gemma2-9b-it",
)

# llm = LLM(
#     model="ollama/llama3.2",
#     base_url="http://localhost:11434"
# )

class DataPreprocessor(BaseTool):
    """CrewAI tool that cleans a CSV file and saves the result under ``artifacts/``.

    The tool handles missing values and removes duplicate rows. It does not
    encode categorical columns.
    """
    name: str = "Data Preprocessor"
    description: str = "Preprocesses data by handling missing values, removing duplicates, and encoding categorical variables."

    def _run(self, file_path: str) -> str:
        """Clean the CSV at ``file_path`` and write ``artifacts/processed_data.csv``.

        Args:
            file_path: Path of the CSV file to load with ``pandas.read_csv``.

        Returns:
            ``"Cleaned data saved to <path>"`` on success, otherwise a string
            starting with ``"Error in preprocessing:"``.
        """
        try:
            # Load the data
            df = pd.read_csv(file_path)

            # Share of missing cells; an empty frame has size 0, so avoid 0/0.
            total_cells = df.size
            missing_cells = df.isnull().sum().sum()
            missing_percentage = (missing_cells / total_cells) * 100 if total_cells else 0.0

            # Handle missing values: drop rows when little is missing, impute otherwise.
            if missing_percentage < 5:
                df = df.dropna()
            else:
                # Only impute columns holding at least one observed value. The
                # imputer cannot derive a statistic from an all-missing column
                # and would return fewer columns than it was given.
                num_cols = [
                    col for col in df.select_dtypes(include=['number']).columns
                    if df[col].notna().any()
                ]
                if num_cols:
                    num_imputer = SimpleImputer(strategy='mean')
                    df[num_cols] = num_imputer.fit_transform(df[num_cols])

                cat_cols = [
                    col for col in df.select_dtypes(include=['object']).columns
                    if df[col].notna().any()
                ]
                if cat_cols:
                    cat_imputer = SimpleImputer(strategy='most_frequent')
                    df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

            # Remove duplicate entries
            df = df.drop_duplicates()

            # Save the processed data
            processed_file_path = os.path.join('artifacts', 'processed_data.csv')
            os.makedirs(os.path.dirname(processed_file_path), exist_ok=True)
            df.to_csv(processed_file_path, index=False)

            return f"Cleaned data saved to {processed_file_path}"
        except Exception as e:
            # Failures are reported as text so the calling agent can read them.
            return f"Error in preprocessing: {str(e)}"

class FeatureEngineeringTool(BaseTool):
    """Tool that label-encodes categorical columns and standardizes numeric features.

    Files written under ``artifacts/``:
      * ``label_encoder.pkl``        - dict mapping feature column -> fitted LabelEncoder
      * ``target_label_encoder.pkl`` - fitted LabelEncoder for the target (only when
        the target is object-typed and the task is classification)
      * ``scaler.pkl``               - fitted StandardScaler (only when at least one
        int64/float64 feature column exists)
      * ``engineered_features.csv``  - the transformed dataset

    The artifacts are pickles; only load them back from a trusted location.
    """
    name: str = "Feature Scaling Tool"
    description: str = "Scales numerical features and encodes categorical values"

    def _run(self, file_path: str, target: str, model: str) -> str:
        """Encode and scale the CSV at ``file_path`` and save the result.

        Args:
            file_path: Path of the CSV file to transform.
            target: Name of the target column. It is never scaled, and it is
                label-encoded only for classification when it holds object values.
            model: Task type. Matched case-insensitively (surrounding whitespace
                ignored) against ``"classification"``.

        Returns:
            A status message containing the output path on success, or a message
            starting with ``"Error in feature engineering:"`` on failure.
            Exceptions are reported through the return value, not raised.
        """
        try:
            df_engineered = pd.read_csv(file_path)

            # Fail early with a readable message instead of a bare KeyError, and
            # before any artifact has been written for an unusable request.
            if target not in df_engineered.columns:
                raise ValueError(f"target column '{target}' not found in {file_path}")

            # This tool is handed to an agent, so the value may arrive as
            # "Classification" or " classification "; normalize before comparing.
            task_type = str(model).strip().lower()

            # Encode categorical feature columns (the target is handled separately)
            label_encoders = {}
            categorical_cols = [
                col
                for col in df_engineered.select_dtypes(include=['object']).columns
                if col != target
            ]
            for col in categorical_cols:
                le = LabelEncoder()
                df_engineered[col] = le.fit_transform(df_engineered[col].astype(str))
                label_encoders[col] = le

            # Create artifacts directory if it doesn't exist
            os.makedirs('artifacts', exist_ok=True)

            # Save the per-column label encoders
            encoder_filename = os.path.join('artifacts', 'label_encoder.pkl')
            with open(encoder_filename, 'wb') as file:
                pickle.dump(label_encoders, file)

            # The target needs label encoding only when it is object-typed
            # and the task is classification.
            dtype_target = df_engineered[target].dtype
            print(dtype_target)
            if dtype_target == "object" and task_type == "classification":
                print("Label encoding necessory")
                le_target = LabelEncoder()
                df_engineered[target] = le_target.fit_transform(df_engineered[target].astype(str))
                target_encoder_filename = os.path.join('artifacts', 'target_label_encoder.pkl')
                with open(target_encoder_filename, 'wb') as file:
                    pickle.dump(le_target, file)
            else:
                print("Not necessory")

            # Scale numerical features. The selection runs after encoding, so
            # encoded columns whose codes are int64 are standardized as well;
            # the target itself is always left unscaled.
            numerical_cols = [
                col
                for col in df_engineered.select_dtypes(include=['int64', 'float64']).columns
                if col != target
            ]
            if numerical_cols:
                scaler = StandardScaler()
                df_engineered[numerical_cols] = scaler.fit_transform(df_engineered[numerical_cols])

                # Save the scaler
                scaler_filename = os.path.join('artifacts', 'scaler.pkl')
                with open(scaler_filename, 'wb') as file:
                    pickle.dump(scaler, file)

            output_path = os.path.join('artifacts', 'engineered_features.csv')
            df_engineered.to_csv(output_path, index=False)

            return f"Feature engineering completed. File saved to {output_path}"

        except Exception as e:
            return f"Error in feature engineering: {str(e)}"

class CsvRAGtool(BaseTool):
    """Tool that answers a natural-language question about a CSV file.

    Every call builds a new chat model and a new CSV agent, then runs the
    query through that agent.
    """
    name: str = "CSV Query Tool"
    description: str = "Analyzes CSV data and answers questions using natural language queries."

    def _run(self, query: str, file_path: str) -> str:
        """Run ``query`` against the CSV at ``file_path``.

        Returns the agent's answer, or a message starting with
        ``"Error in CSV query:"`` if anything raises along the way.
        """
        try:
            chat_model = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768")
            agent_options = dict(
                verbose=True,
                agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
                handle_parsing_errors=True,
                # NOTE(review): this flag appears to let the agent execute
                # generated code - only point the tool at trusted files.
                allow_dangerous_code=True,
            )
            csv_agent = create_csv_agent(chat_model, file_path, **agent_options)
            answer = csv_agent.run(query)
        except Exception as err:
            return f"Error in CSV query: {str(err)}"
        return answer

# Initialize paths and tools
input_path = os.path.join('artifacts', 'after_drop_col.csv')
processed_file_path = os.path.join('artifacts', 'processed_data.csv')

# Initialize tools
# DirectoryReadTool lists the contents of a directory, so it has to be pointed
# at the folder holding each CSV; given the CSV path itself it has nothing to list.
docs_tool_preprocessing = DirectoryReadTool(directory=os.path.dirname(input_path))
data_processing_tool = DataPreprocessor()
docs_tool_engineering = DirectoryReadTool(directory=os.path.dirname(processed_file_path))
csv_rag = CsvRAGtool()
feature_eng = FeatureEngineeringTool()

# Create agents
# Agent 1: cleans the raw CSV with the DataPreprocessor tool
data_preprocessing_agent = Agent(
    role="Data Preprocessing Specialist",
    goal="Load, clean, and perform initial transformations on datasets",
    backstory="Expert in data cleaning and preprocessing using pandas, numpy, and sklearn libraries",
    llm=llm,
    tools=[docs_tool_preprocessing, data_processing_tool],
    verbose=True
)

# Agent 2: inspects the cleaned data and applies encoding/scaling when needed
feature_engineering_agent = Agent(
    role="Feature Engineering Specialist",
    goal="Analyze features and perform feature engineering if required",
    backstory="Expert in feature scaling and encoding",
    tools=[docs_tool_engineering, csv_rag, feature_eng],
    llm=llm,
    verbose=True
)

# Create tasks
data_preprocessing_task = Task(
    description="""
    1. Load the file 'artifacts/after_drop_col.csv'
    2. Handle missing values (remove if <5% missing, else use imputer)
    3. Remove duplicates
    Save the processed dataset.
    """,
    expected_output='Processed dataset saved successfully',
    agent=data_preprocessing_agent
)

# {target} and {model} are placeholders filled from the `inputs` dict passed to kickoff()
feature_engineering_task = Task(
    description="""
    1. Load the cleaned data
    2. Analyze if feature engineering is required
    3. input variable from user target - {target}, model - {model}
    4. If required, perform feature scaling and encoding
    5. Provide justification for decisions made and it should be in short
    """,
    agent=feature_engineering_agent,
    expected_output="Analysis and feature engineering report"
)

# Create and run the crew; tasks execute in the listed order
crew = Crew(
    agents=[data_preprocessing_agent, feature_engineering_agent],
    tasks=[data_preprocessing_task, feature_engineering_task],
    process=Process.sequential
)

target = 'Price'
model = 'regression'
# Execute the pipeline
try:
    result = crew.kickoff(inputs={'target': target, 'model': model})
    print(result)
except Exception as e:
    print(f"Error executing pipeline: {str(e)}")

ImportError: cannot import name 'InstanceOf' from 'pydantic' (C:\Users\mayur\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\pydantic\__init__.cp311-win_amd64.pyd)

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os
import pickle
# Load the working CSV and preview its first five rows
df = pd.read_csv(filepath_or_buffer="artifacts/after_drop_col.csv")
df.head(n=5)

In [None]:
# Work on a copy so the loaded frame `df` is left untouched
df_engineered = df.copy()

target, model = "Garage", "classification"

# The target needs label encoding only when it is object-typed and the task
# is classification; in every other case it is left as-is.
dtype_target = df_engineered[target].dtype
if not (dtype_target == "object" and model == "classification"):
    print("Not necessory")
else:
    print("Label encoding necessory")
    le_target = LabelEncoder()
    df_engineered[target] = le_target.fit_transform(df_engineered[target].astype(str))
    # Persist the fitted encoder next to the other artifacts
    target_encoder_filename = os.path.join('artifacts', 'target_label_encoder.pkl')
    with open(target_encoder_filename, 'wb') as file:
        pickle.dump(le_target, file)


In [None]:
import pandas as pd
import os
import pickle
from sklearn.preprocessing import LabelEncoder, StandardScaler
from pydantic import BaseModel

class feature_engineering_tool(BaseModel):
    """Standalone feature-engineering helper: label-encodes and standardizes a CSV.

    Written under ``artifacts/``: ``label_encoder.pkl`` (dict of per-column
    encoders), ``target_label_encoder.pkl`` (only when the target is encoded),
    ``scaler.pkl`` (only when numeric feature columns exist) and
    ``engineered_features.csv``. The ``.pkl`` files are pickles; only load them
    back from a trusted location.
    """
    name: str = "Feature Scaling Tool"
    description: str = "Scales numerical features and encodes categorical values"

    def _run(self, file_path: str, target: str, model: str) -> str:
        """Encode and scale the CSV at ``file_path`` and save the result.

        Args:
            file_path: Path of the CSV file to transform.
            target: Name of the target column. It is excluded from scaling so
                that encoded class labels stay integers, and it is label-encoded
                only for classification when it holds object values.
            model: Task type. Matched case-insensitively (surrounding whitespace
                ignored) against ``"classification"``.

        Returns:
            A status message containing the output path on success, or a message
            starting with ``"Error in feature engineering:"`` on failure.
        """
        try:
            df_engineered = pd.read_csv(file_path)

            # Report a missing target clearly, before any artifact is written
            if target not in df_engineered.columns:
                raise ValueError(f"target column '{target}' not found in {file_path}")

            task_type = str(model).strip().lower()

            # Encode categorical feature columns (the target is handled separately)
            label_encoders = {}
            categorical_cols = [
                col
                for col in df_engineered.select_dtypes(include=['object']).columns
                if col != target
            ]
            for col in categorical_cols:
                le = LabelEncoder()
                df_engineered[col] = le.fit_transform(df_engineered[col].astype(str))
                label_encoders[col] = le

            # Create artifacts directory if it doesn't exist
            os.makedirs('artifacts', exist_ok=True)

            # Save the per-column label encoders
            encoder_filename = os.path.join('artifacts', 'label_encoder.pkl')
            with open(encoder_filename, 'wb') as file:
                pickle.dump(label_encoders, file)

            # The target needs label encoding only when it is object-typed
            # and the task is classification.
            dtype_target = df_engineered[target].dtype
            print(dtype_target)
            if dtype_target == "object" and task_type == "classification":
                print("Label encoding necessory")
                le_target = LabelEncoder()
                df_engineered[target] = le_target.fit_transform(df_engineered[target].astype(str))
                target_encoder_filename = os.path.join('artifacts', 'target_label_encoder.pkl')
                with open(target_encoder_filename, 'wb') as file:
                    pickle.dump(le_target, file)
            else:
                print("Not necessory")

            # Scale numerical features, leaving the target out: standardizing a
            # label-encoded target would turn class ids into arbitrary floats.
            numerical_cols = [
                col
                for col in df_engineered.select_dtypes(include=['int64', 'float64']).columns
                if col != target
            ]
            if numerical_cols:
                scaler = StandardScaler()
                df_engineered[numerical_cols] = scaler.fit_transform(df_engineered[numerical_cols])

                # Save the scaler
                scaler_filename = os.path.join('artifacts', 'scaler.pkl')
                with open(scaler_filename, 'wb') as file:
                    pickle.dump(scaler, file)

            output_path = os.path.join('artifacts', 'engineered_features.csv')
            df_engineered.to_csv(output_path, index=False)

            return f"Feature engineering completed. File saved to {output_path}"

        except Exception as e:
            return f"Error in feature engineering: {str(e)}"


# Instantiate the tool and call it directly (no agent involved), then show its status message
tool_instance = feature_engineering_tool()
result = tool_instance._run(
    file_path="artifacts/after_drop_col.csv",
    target="Garage",
    model="classification",
)
print(result)


In [None]:
# LLM handed to the agent created later in this cell
llm = LLM(model="groq/gemma2-9b-it", temperature=0.7)

class CsvRAGtool(BaseTool):
    """Tool that lets an agent query a CSV file in natural language.

    A fresh chat model and CSV agent are created on every call.
    """
    name: str = "CSV Query Tool"
    description: str = "Analyzes CSV data and provides insights through natural language queries."

    def _run(self, query: str, file_path: str) -> str:
        """Answer ``query`` about the CSV at ``file_path``.

        Any exception is converted into a string starting with
        ``"Error in CSV query:"`` instead of being raised.
        """
        try:
            csv_agent = create_csv_agent(
                ChatGroq(temperature=0, model_name="mixtral-8x7b-32768"),
                file_path,
                agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
                # NOTE(review): presumably allows the agent to run generated
                # code - confirm the CSV source is trusted.
                allow_dangerous_code=True,
                handle_parsing_errors=True,
                verbose=True,
            )
            return csv_agent.run(query)
        except Exception as exc:
            return f"Error in CSV query: {exc!s}"

# Initialize paths and tools
input_path = os.path.join('artifacts', 'after_drop_col.csv')

# Initialize tools
# DirectoryReadTool lists a directory's contents, so give it the folder that
# contains the CSV rather than the CSV path itself.
docs_tool_preprocessing = DirectoryReadTool(directory=os.path.dirname(input_path))
csv_rag = CsvRAGtool()

from pydantic import BaseModel

# Structured result of the preprocessing check; passed as `output_pydantic`
# to `preprocessing_checker_task`. Each field holds the verdict for one step
# as free text (the task asks for "required or not").
class PreprocessingSteps(BaseModel):
    # Verdict on handling missing (null) values
    handle_missing_values: str
    # Verdict on removing duplicate rows
    remove_duplicates: str
    # Verdict on label encoding the categorical columns
    label_encode_categorical_columns: str
    # Verdict on scaling the numerical columns
    scale_numerical_columns: str

# Agent that inspects the dataset and decides which preprocessing steps apply
preprocessing_checker_agent = Agent(
    llm=llm,
    verbose=True,
    tools=[csv_rag, docs_tool_preprocessing],
    role="Preprocessing Checker",
    goal="Assess and recommend necessary preprocessing steps for the dataset",
    backstory="An expert in data cleaning and preprocessing using pandas, numpy, and sklearn libraries",
)

# Task whose result is parsed into the PreprocessingSteps schema
preprocessing_checker_task = Task(
    agent=preprocessing_checker_agent,
    output_pydantic=PreprocessingSteps,
    expected_output='A JSON file indicating the necessity of each preprocessing step',
    description="""
    1. Load the dataset 'artifacts/after_drop_col.csv'.
    2. Evaluate and determine if the following preprocessing steps are required:
       - Handling missing values: required or not (check for null values).
       - Removing duplicates: required or not
       - Label encoding categorical columns: required or not
       - Scaling numerical columns: required or not
    3. Save the results as a JSON file indicating whether each step is required or not.
    """,
)

# Single-agent, single-task crew executed sequentially
crew = Crew(
    process=Process.sequential,
    tasks=[preprocessing_checker_task],
    agents=[preprocessing_checker_agent],
)

# Run the crew; failures are printed rather than raised
try:
    result = crew.kickoff()
    print(result)
except Exception as err:
    print(f"Error executing pipeline: {str(err)}")
