<a href="https://colab.research.google.com/github/oluwafemidiakhoa/MLprject/blob/main/Data_Science__Automation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Install Required Packages
First, install all the necessary packages in your Colab environment.

In [1]:
# Install the notebook's dependencies with pip, one group per command:
# environment/Apify/CrewAI, CrewAI tools + LangChain providers, LangChain core
# packages, Google/Groq/OpenAI integrations, and the text/ML/UI libraries.
# NOTE(review): no versions are pinned, so each run resolves whatever is
# current on PyPI.
!pip install virtualenv apify_client apify_shared ipykernel crewai
!pip install crewai-tools langchain langchain-anthropic langchain-cohere
!pip install langchain-community langchain-core langchain-experimental
# NOTE(review): langchain-google-community is listed twice below (plain and
# with the [places] extra).
!pip install langchain-google-community langchain-google-community[places] langchain-groq langchain-openai
!pip install langchain-text-splitters langchainhub langdetect langsmith unstructured gradio scikit-learn pandas xgboost


Collecting virtualenv
  Downloading virtualenv-20.26.3-py3-none-any.whl.metadata (4.5 kB)
Collecting apify_client
  Downloading apify_client-1.7.1-py3-none-any.whl.metadata (5.0 kB)
Collecting apify_shared
  Downloading apify_shared-1.1.2-py3-none-any.whl.metadata (2.5 kB)
Collecting crewai
  Downloading crewai-0.41.1-py3-none-any.whl.metadata (13 kB)
Collecting distlib<1,>=0.3.7 (from virtualenv)
  Downloading distlib-0.3.8-py2.py3-none-any.whl.metadata (5.1 kB)
Collecting httpx>=0.25.0 (from apify_client)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting appdirs<2.0.0,>=1.4.4 (from crewai)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting embedchain<0.2.0,>=0.1.114 (from crewai)
  Downloading embedchain-0.1.119-py3-none-any.whl.metadata (9.1 kB)
Collecting instructor==1.3.3 (from crewai)
  Downloading instructor-1.3.3-py3-none-any.whl.metadata (13 kB)
Collecting json-repair<0.26.0,>=0.25.2 (from crewai)
  Downloading json_repair-0.25

# Step 2: Load API Keys
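Set your environment variables for the API keys before running the cells below. The notebook needs an OpenAI key (`OPENAI_API_KEY`) for `ChatOpenAI`, a Serper key (`SERPER_API_KEY`) for `SerperDevTool`, and Kaggle API credentials (`KAGGLE_USERNAME` and `KAGGLE_KEY`, or a `kaggle.json` file) for the dataset downloader.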

## Step 3: Define and Initialize Tools
Define the custom tools for Kaggle dataset downloading, data preprocessing, and model training.

## Kaggle Dataset Downloader

In [32]:
import os
from urllib.parse import urlparse

from crewai_tools import BaseTool
from kaggle.api.kaggle_api_extended import KaggleApi

class KaggleDatasetDownloader(BaseTool):
    """CrewAI tool that downloads a Kaggle dataset into ``./raw_data``.

    The tool takes a single input, the dataset URL (or an ``owner/dataset``
    slug), and always returns a human-readable status string so that the
    calling agent can react to failures instead of crashing.
    """

    name: str = "Kaggle Dataset Downloader"
    description: str = "Downloads datasets from Kaggle using a provided URL."

    @staticmethod
    def _dataset_ref(url: str) -> str:
        """Return the ``owner/dataset`` reference contained in *url*.

        Only the path component is inspected, so query strings, fragments and
        trailing slashes are ignored. When the path contains a ``datasets``
        segment, the two segments right after it are used (this skips suffixes
        such as ``/data`` or ``/versions/3``); otherwise the last two segments
        are used.

        Raises:
            ValueError: If fewer than two usable path segments are present.
        """
        segments = [part for part in urlparse(url.strip()).path.split('/') if part]
        if 'datasets' in segments:
            start = segments.index('datasets') + 1
            segments = segments[start:start + 2]
        else:
            segments = segments[-2:]
        if len(segments) < 2:
            raise ValueError(
                f"could not extract '<owner>/<dataset>' from URL: {url!r}"
            )
        owner, dataset_name = segments
        return f"{owner}/{dataset_name}"

    def _run(self, url: str) -> str:
        """Download and unzip the dataset referenced by *url*.

        Args:
            url: Kaggle dataset URL or ``owner/dataset`` slug.

        Returns:
            A success message, or an error message describing what went wrong.
        """
        # Broad catch on purpose: this is the tool boundary, and the agent
        # needs a message it can read rather than an exception.
        try:
            dataset_ref = self._dataset_ref(url)
            api = KaggleApi()
            api.authenticate()
            api.dataset_download_files(dataset_ref, path='./raw_data', unzip=True)
        except Exception as e:
            if '403' in str(e):
                return "Error 403: Forbidden. Please check your Kaggle API credentials and dataset permissions."
            return f"Error downloading dataset: {str(e)}"
        return f"Successfully downloaded dataset: {dataset_ref} to ./raw_data directory"


## Data Preprocessor

In [33]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

class DataPreprocessor(BaseTool):
    """CrewAI tool that cleans a CSV file and writes a model-ready copy.

    Rows with missing values and duplicate rows are dropped, and every
    ``object``-typed column is label-encoded. The result is written to
    ``processed_data/processed_data.csv``.
    """

    name: str = "Data Preprocessor"
    description: str = "Preprocesses data by handling missing values, removing duplicates, and encoding categorical variables."

    def _run(self, file_path: str) -> str:
        """Preprocess the CSV at *file_path* and save the cleaned data.

        Args:
            file_path: Path of the raw CSV file to load.

        Returns:
            A summary report of the preprocessing, or an error message when
            the input file cannot be loaded.
        """
        # Report unreadable input as a message (like the Kaggle tool does)
        # so the agent can pick another file instead of failing the task.
        try:
            df = pd.read_csv(file_path)
        except (FileNotFoundError, pd.errors.EmptyDataError) as exc:
            return f"Error loading data from {file_path}: {exc}"

        os.makedirs('processed_data', exist_ok=True)

        initial_shape = df.shape
        initial_missing = df.isnull().sum().sum()

        df = df.dropna()
        df = df.drop_duplicates()

        categorical_columns = df.select_dtypes(include=['object']).columns.tolist()

        # A fresh encoder per column keeps each column's mapping independent.
        for col in categorical_columns:
            df[col] = LabelEncoder().fit_transform(df[col])

        final_shape = df.shape
        final_missing = df.isnull().sum().sum()

        processed_file_path = os.path.join('processed_data', 'processed_data.csv')
        df.to_csv(processed_file_path, index=False)

        return f"""
        Data preprocessing completed:
        - Initial shape: {initial_shape}
        - Initial missing values: {initial_missing}
        - Final shape: {final_shape}
        - Final missing values: {final_missing}
        - Categorical variables encoded: {categorical_columns}
        - Duplicates removed
        - Processed data saved to: {processed_file_path}
        """


## Model Training Tool

In [34]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import pickle

class TrainingModelTool(BaseTool):
    """CrewAI tool that trains a ``RandomForestRegressor`` on a CSV dataset.

    The data is split 80/20 into train and test sets, features are
    standardised, and the fitted model, the scaler and both data splits are
    written to disk (``saved_model/`` and ``train_test_data/``).
    """

    name: str = "Random Forest Model Trainer"
    description: str = "Trains a Random Forest model for house price prediction and saves it as a pickle file"

    def _run(self, file_path: str, target_variable: str) -> str:
        """Train the model on *file_path* and return a text report.

        Args:
            file_path: Path of the processed CSV file to train on.
            target_variable: Name of the column to predict.

        Returns:
            A training report (RMSE, R-squared, top feature importances and
            the paths of the saved artefacts), or an error message when
            *target_variable* is not a column of the data.
        """
        data = pd.read_csv(file_path)

        # Tell the agent which columns exist so it can retry with a valid
        # target instead of hitting a bare KeyError.
        if target_variable not in data.columns:
            return (
                f"Error: target variable '{target_variable}' not found in {file_path}. "
                f"Available columns: {list(data.columns)}"
            )

        X = data.drop(target_variable, axis=1)
        y = data[target_variable]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Fit the scaler on the training split only, then reuse it for test.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        model = RandomForestRegressor(random_state=42)
        model.fit(X_train_scaled, y_train)

        y_pred = model.predict(X_test_scaled)

        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        feature_importance = pd.DataFrame({
            'feature': X.columns,
            'importance': model.feature_importances_
        }).sort_values('importance', ascending=False)

        os.makedirs('saved_model', exist_ok=True)
        os.makedirs('train_test_data', exist_ok=True)

        model_filename = os.path.join('saved_model', 'random_forest_model.pkl')
        with open(model_filename, 'wb') as file:
            pickle.dump(model, file)

        scaler_filename = os.path.join('saved_model', 'scaler.pkl')
        with open(scaler_filename, 'wb') as file:
            pickle.dump(scaler, file)

        # Save the unscaled splits together with their target column.
        train_filename = os.path.join('train_test_data', 'train_data.csv')
        test_filename = os.path.join('train_test_data', 'test_data.csv')
        pd.concat([X_train, y_train], axis=1).to_csv(train_filename, index=False)
        pd.concat([X_test, y_test], axis=1).to_csv(test_filename, index=False)

        # Build the report line by line; a trailing "\n" inside an entry
        # produces the blank separator lines of the report.
        report_lines = [
            "Random Forest Model Training Report:\n",
            f"Root Mean Squared Error: ${rmse:.2f}",
            f"R-squared Score: {r2:.4f}\n",
            "Top 5 Important Features:",
        ]
        report_lines.extend(
            f"- {row.feature}: {row.importance:.4f}"
            for row in feature_importance.head().itertuples(index=False)
        )
        report_lines.extend([
            f"\nModel saved as: {os.path.abspath(model_filename)}",
            f"Scaler saved as: {os.path.abspath(scaler_filename)}",
            f"Train data saved as: {os.path.abspath(train_filename)}",
            f"Test data saved as: {os.path.abspath(test_filename)}",
        ])

        return "\n".join(report_lines) + "\n"


## Step 4: Initialize Agents
Create and initialize the agents.

In [35]:
from crewai import Agent
from langchain_openai import ChatOpenAI
from crewai_tools import SerperDevTool, DirectoryReadTool

# One chat model (gpt-3.5-turbo, temperature 0) shared by all three agents.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

# Tool instances: web search plus the three custom tools defined above.
serper_search_tool = SerperDevTool()
kaggle_tool = KaggleDatasetDownloader()
data_preprocessor_tool = DataPreprocessor()
model_training_tool = TrainingModelTool()

# Agent 1: searches for datasets and downloads one from Kaggle.
data_collection_agent = Agent(
    llm=llm,
    tools=[serper_search_tool, kaggle_tool],
    role='Data Acquisition Specialist',
    goal='Find and download appropriate datasets on a given topic',
    backstory='Expert in acquiring datasets from various sources, specializing in climate data',
    verbose=True,
)

# Agent 2: reads the 'raw_data' directory and runs the preprocessing tool.
docs_tool_a = DirectoryReadTool(directory='raw_data')
data_preprocessing_agent = Agent(
    llm=llm,
    tools=[docs_tool_a, data_preprocessor_tool],
    role="Data Preprocessing Specialist",
    goal="Load, clean, and perform initial transformations on datasets",
    backstory="Expert in data cleaning and preprocessing using pandas, numpy, and sklearn libraries",
)

# Agent 3: reads the 'processed_data' directory and runs the training tool.
docs_tool_b = DirectoryReadTool(directory='processed_data')
model_training_agent = Agent(
    llm=llm,
    tools=[docs_tool_b, model_training_tool],
    role="Random Forest Model Trainer",
    goal="Train a Random Forest model for the dataset",
    backstory="You are an expert in machine learning, specializing in Random Forest for regression/classification tasks.",
)


## Step 5: Define and Execute Tasks
Define the tasks for data collection, preprocessing, and model training, then execute them.

In [36]:
from crewai import Crew, Task
from textwrap import dedent

# Inputs for the model-training step. They are written into the task
# description because the description is the text the agent actually reads.
PROCESSED_DATA_PATH = "processed_data/processed_data.csv"
TARGET_VARIABLE = "price"  # Ensure you set the correct target variable

# Task 1: find candidate datasets and download one of them.
# '{topic}' is left as a placeholder here; crew.kickoff(inputs=...) supplies it.
data_collection_task = Task(
    description=dedent("""
    Search for three appropriate datasets on the topic of {topic} and download one using the Kaggle Dataset Downloader.
    You can search for datasets using refined queries. Note that the Kaggle Dataset Downloader only requires one input, i.e., the URL.
    """),
    expected_output='Provide the full description of the downloaded dataset.',
    agent=data_collection_agent,
)

# Task 2: clean the downloaded data and save a model-ready copy.
data_preprocessing_task = Task(
    description=dedent("""
    Load the file, handle missing values, remove duplicates, and convert categorical variables to numerical values to make the dataset model-ready.
    """),
    expected_output='Processed dataset saved successfully',
    agent=data_preprocessing_agent,
)

# Task 3: train the model and write the report to output_file.
# The file path and target variable are spelled out in the description: the
# former `input_arguments=` keyword is not among the attributes crewAI's Task
# documents, so those values most likely never reached the agent.
# Keep literal curly braces out of the description, since crewAI fills
# '{placeholders}' in task descriptions from the kickoff inputs.
model_training_task = Task(
    description=dedent(f"""
    Load the processed data from the directory. Train a Random Forest model and save the trained model.
    Note that TrainingModelTool._run() has two positional arguments which are file_path and the target_variable.
    Use file_path="{PROCESSED_DATA_PATH}" and target_variable="{TARGET_VARIABLE}".
    """),
    expected_output="Model trained successfully",
    output_file='reports/training_report.txt',
    agent=model_training_agent,
)

# Assemble the crew from the three agents and their tasks, then run it.
crew = Crew(
    tasks=[data_collection_task, data_preprocessing_task, model_training_task],
    agents=[data_collection_agent, data_preprocessing_agent, model_training_agent],
    verbose=2,
)

# 'topic' fills the {topic} placeholder in the data-collection task description.
result = crew.kickoff(inputs=dict(topic='housing'))
print(result)




[1m[95m [2024-07-31 16:07:39][DEBUG]: == Working Agent: Data Acquisition Specialist[00m
[1m[95m [2024-07-31 16:07:39][INFO]: == Starting Task: 
Search for three appropriate datasets on the topic of housing and download one using the Kaggle Dataset Downloader.
You can search for datasets using refined queries. Note that the Kaggle Dataset Downloader only requires one input, i.e., the URL.
[00m


[1m> Entering new CrewAgentExecutor chain...[0m
[32;1m[1;3mI need to search for three appropriate datasets on the topic of housing and then download one using the Kaggle Dataset Downloader.

Action:
Search the internet

Action Input:
{"search_query": "housing datasets"}
[0m[95m 


Search results: Title: Housing Datasets - U.S. Census Bureau
Link: https://www.census.gov/topics/housing/data/datasets.html
Snippet: Housing Datasets. A dataset is the assembled result of one data collection operation (for example, the 2010 Census) as a whole or in major ...
---
Title: Housing Prices Datase