In [50]:
from crewai import Agent, Task, Crew, LLM

In [51]:
# LLM handle passed as `llm=llm` to the CrewAI agents defined in later cells.
# NOTE(review): the "vertex_ai/" prefix presumably selects Vertex AI as the provider — confirm credentials/project setup.
llm = LLM(
model="vertex_ai/gemini-2.0-flash-thinking-exp-01-21", temperature=0.6)

In [52]:
import os
import PyPDF2
import docx
import pandas as pd

def read_file_content(filepath):
    """Return text extracted from ``filepath``, picking a reader by file extension.

    The extension is compared case-insensitively. Supported types are
    .pdf, .docx, .xlsx and .csv.

    Raises:
        FileNotFoundError: ``filepath`` is not an existing regular file.
        ValueError: the extension is not one of the supported types.
    """
    # Guard clause: bail out early when the path is missing or not a regular file.
    if not os.path.isfile(filepath):
        raise FileNotFoundError(f"The file {filepath} does not exist.")

    # Lower-case the suffix so "REPORT.PDF" and "report.pdf" dispatch the same way.
    suffix = os.path.splitext(filepath)[1].lower()

    if suffix == '.pdf':
        return read_pdf(filepath)
    if suffix == '.docx':
        return read_docx(filepath)
    if suffix in ('.xlsx', '.csv'):
        return read_spreadsheet(filepath)

    # Nothing matched: report the accepted extensions.
    raise ValueError("Unsupported file type. Allowed types are: .pdf, .docx, .xlsx, .csv")

def read_pdf(filepath):
    """Extract text from the leading half of a PDF.

    Reads the first ``num_pages // 2`` pages, but never fewer than one page
    when the document has any pages, so a single-page PDF no longer yields
    an empty string.

    Args:
        filepath: Path to the PDF file.

    Returns:
        The extracted page texts joined with newlines; '' for a PDF with no pages.
    """
    with open(filepath, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        num_pages = len(reader.pages)
        # First 50% of pages (floor), clamped to at least one page for non-empty documents.
        pages_to_read = max(1, num_pages // 2) if num_pages else 0
        # `or ''` keeps the join below safe should extract_text() hand back None for a page.
        content = [
            reader.pages[page_num].extract_text() or ''
            for page_num in range(pages_to_read)
        ]
    return '\n'.join(content)

def read_docx(filepath):
    """Extract text from the leading half of a .docx document.

    Reads the first ``num_paragraphs // 2`` paragraphs, but never fewer than
    one when the document has any paragraphs, so a one-paragraph document no
    longer yields an empty string.

    Args:
        filepath: Path to the .docx file.

    Returns:
        The selected paragraph texts joined with newlines; '' when the
        document has no paragraphs.
    """
    doc = docx.Document(filepath)
    paragraphs = doc.paragraphs
    num_paragraphs = len(paragraphs)
    # First 50% of paragraphs (floor), clamped to at least one for non-empty documents.
    paragraphs_to_read = max(1, num_paragraphs // 2) if num_paragraphs else 0
    content = [paragraphs[i].text for i in range(paragraphs_to_read)]
    return '\n'.join(content)

def read_spreadsheet(filepath):
    """Return the leading 20% of a CSV/Excel sheet rendered as plain text.

    The CSV check is case-insensitive so that it agrees with the lower-cased
    extension dispatch in ``read_file_content`` (e.g. ``DATA.CSV`` is parsed
    as CSV instead of being handed to ``pd.read_excel``). At least one row is
    returned whenever the sheet has any rows, so small files do not produce
    an empty table.

    Args:
        filepath: Path (string) to a .csv file; any other path is read with
            ``pd.read_excel``.

    Returns:
        ``DataFrame.to_string(index=False)`` output for the selected rows.
    """
    if filepath.lower().endswith('.csv'):
        df = pd.read_csv(filepath)
    else:
        df = pd.read_excel(filepath)

    num_rows = len(df)
    # 20% of the rows (floor), clamped to at least one row for non-empty sheets.
    rows_to_read = max(1, num_rows // 5) if num_rows else 0

    # Render without the index column so only the file's own columns appear.
    return df.head(rows_to_read).to_string(index=False)



In [53]:
# Example usage:
# content = read_file_content('data.csv')
# print(content)


In [54]:
from langchain_google_vertexai import ChatVertexAI

In [55]:
from prompts import *

In [56]:
# Chat model invoked directly (outside the CrewAI agents) to produce the per-file summaries.
model = ChatVertexAI(model_name= "gemini-1.5-flash-001")

In [57]:
# Extract the leading portion of each sample file as text; read_file_content picks the
# reader from the file extension. The paths are relative, so both files are assumed to
# sit in the notebook's working directory.
content1 = read_file_content("FAhad Resume.pdf")
content2 = read_file_content("data.csv")

In [58]:
# Agent that picks which files are relevant to a user query, working only from
# the text summaries of each file.
file_selection_agent = Agent(
    llm=llm,
    role="File Selection Expert",
    goal="Analyze user queries and return relevant file names based on file content summaries.",
    backstory="A highly skilled data indexing expert who can efficiently match user needs with available document summaries.",
    allow_code_execution=False  # No need for code execution, just text processing
)


In [59]:
# Task for file_selection_agent. The description carries two placeholders,
# {file_summaries} and {user_query}, which are supplied through the `inputs`
# dict given to the crew's kickoff call.
file_matching_task = Task(
    description="Compare the user query with file content summaries and return a list of relevant file names that can be used for the user query. The file summaries are: {file_summaries}\n The user query is: {user_query}",
    agent=file_selection_agent,
    expected_output="A list of filenames that match the user query."
)


In [60]:
# Single-agent, single-task crew that runs the file-matching step.
file_selection_crew = Crew(
    agents=[file_selection_agent],
    tasks=[file_matching_task],
    verbose=True
)




In [61]:
# Summarise each file's extracted text with the ChatVertexAI model.
# NOTE(review): file_summary_generation_prompt is not defined in this notebook; presumably
# it is provided by the `from prompts import *` cell — verify.
file1_summary = model.invoke(file_summary_generation_prompt(content1))
file2_summary = model.invoke(file_summary_generation_prompt(content2))

In [62]:
# Map each filename to its summary text (the `.content` attribute of the model response).
file_summary_dict = {'FAhad Resume.pdf': file1_summary.content, "data.csv": file2_summary.content}

In [63]:
# Display the filename -> summary mapping for inspection.
file_summary_dict

{'FAhad Resume.pdf': 'This is a resume for Fahad Ul Rehman, an AI engineer with 5 years of experience. He specializes in generative AI projects and prompt engineering, with proficiency in FastAPI, MongoDB, and cloud-based AI services. He has worked on various AI applications, including procurement and financial data analysis, computer vision, and natural language processing, demonstrating his skills in machine learning, data analytics, and model development. The document showcases his expertise and career history, highlighting his capabilities and achievements in the field of artificial intelligence. \n',
 'data.csv': 'This document is a partial listing of passenger data from the Titanic.  It includes details such as passenger ID, survival status, passenger class, name, gender, age, number of siblings and spouses aboard, number of parents and children aboard, ticket number, fare, cabin number, and port of embarkation.  This data could be used for analysis of passenger demographics and 

In [64]:
# Query used for the file-selection step; `user_query` is reassigned in later cells.
user_query = "Generate a bar chart to show sales data"


In [65]:


# The keys here must match the {user_query} and {file_summaries} placeholders
# in file_matching_task's description.
inputs = {
    'user_query': user_query,
    'file_summaries': file_summary_dict
}

# Run the file-selection crew; the result object exposes the agent's answer via `.raw`.
files_to_use = file_selection_crew.kickoff(inputs=inputs)


[1m[95m# Agent:[00m [1m[92mFile Selection Expert[00m
[95m## Task:[00m [92mCompare the user query with file content summaries and return a list of relevant file names that can be used for the user query. The file summaries are: {'FAhad Resume.pdf': 'This is a resume for Fahad Ul Rehman, an AI engineer with 5 years of experience. He specializes in generative AI projects and prompt engineering, with proficiency in FastAPI, MongoDB, and cloud-based AI services. He has worked on various AI applications, including procurement and financial data analysis, computer vision, and natural language processing, demonstrating his skills in machine learning, data analytics, and model development. The document showcases his expertise and career history, highlighting his capabilities and achievements in the field of artificial intelligence. \n', 'data.csv': 'This document is a partial listing of passenger data from the Titanic.  It includes details such as passenger ID, survival status, passeng

In [67]:
# `.raw` is a string: the saved output for this cell is the text "['data.csv']", not a
# Python list, so it has to be parsed before the names can be used as file paths.
files_to_use.raw

"['data.csv']"

In [None]:
from crewai_tools import CodeInterpreterTool
# Code-execution tool handed to data_analyst_agent.
# NOTE(review): unsafe_mode=True presumably runs the generated code directly on the host
# instead of in a sandbox — confirm this is acceptable before using untrusted prompts or data.
run_codes = CodeInterpreterTool(unsafe_mode=True)

In [None]:
from crewai import Agent, LLM
from dotenv import load_dotenv
# NOTE(review): presumably loads credentials/settings from a local .env file — confirm what
# is expected there; note that `llm` was already constructed in an earlier cell.
load_dotenv()



# Agent that writes and runs Python against a dataset. It receives the `run_codes` tool
# and also sets allow_code_execution=True.
# NOTE(review): both code-execution switches are enabled; check whether one is redundant.
data_analyst_agent = Agent(
    llm= llm,
    role="Data Analyst",
    goal="Process user queries and file paths by writing and executing Python code.",
    backstory="You are an expert data analyst who can do detailed data analysis and create visualisations. You load the datasets and then do the analysis and visualization creation by writing and executing python code.",
    allow_code_execution=True,
    tools= [run_codes]
)


In [None]:
from crewai import Task

# Reassigns `user_query` (it was set for the file-selection step in an earlier cell).
user_query = "create a pairplot of the data."
file_path = "data.csv"


# Task for data_analyst_agent. The description is a template with {user_query} and
# {file_path} placeholders, so both values must be passed as kickoff inputs.
dataset_data_analysis_task = Task(
    description="{user_query} The data is located at {file_path}.",
    agent=data_analyst_agent,
    expected_output="answer in markdown"
    
)


In [49]:

# Single-agent, single-task crew for the dataset analysis step.
dataset_data_analysis_crew = Crew(
    agents=[data_analyst_agent],
    tasks=[dataset_data_analysis_task],
    verbose=True,
)

# The task description contains {user_query} and {file_path} placeholders; they are only
# filled in when the values are passed as kickoff inputs, so supply both here.
result = dataset_data_analysis_crew.kickoff(
    inputs={'user_query': user_query, 'file_path': file_path}
)

NameError: name 'data_analyst_agent' is not defined

In [None]:
# Values for the {user_query} and {file_path} placeholders of dataset_data_analysis_task.
# (The original line had no value after "file_path": and was a syntax error.)
inputs = {'user_query': user_query, "file_path": file_path}

In [None]:
# Show the final output of the dataset-analysis crew; requires the kickoff cell to have run successfully.
print(result)

In [None]:
# First agent of the two-step pipeline: reads the user query and raw file content and
# extracts the data relevant to the query. It works on text only.
data_preparation_agent = Agent(
    llm=llm,
    role="Data Preparation Specialist",
    goal="Interpret user queries and extract relevant data from provided file content.",
    backstory="An expert in data extraction and preparation, skilled at understanding user requirements and processing raw data accordingly.",
    allow_code_execution=False  # This agent doesn't execute code
)


In [None]:
# Second agent of the two-step pipeline: analyses the prepared data and produces visualisations.
# It shares the role string "Data Analyst" with data_analyst_agent and builds its own
# CodeInterpreterTool instance instead of reusing `run_codes`.
# NOTE(review): unsafe_mode=True presumably executes generated code on the host without a
# sandbox — confirm this is intended.
data_analysis_agent = Agent(
    llm=llm,
    role="Data Analyst",
    goal="Perform data analysis and create visualizations based on processed data.",
    backstory="A proficient data analyst capable of executing Python code to analyze data and generate insightful visualizations.",
    allow_code_execution=True,
    tools=[CodeInterpreterTool(unsafe_mode=True)]
)


In [None]:
from crewai import Agent, Task, Crew

In [None]:
# Step 1: extraction. The description is a template with {user_query} and {file_content}
# placeholders that must be supplied as kickoff inputs.
data_preparation_task = Task(
    description="Interpret the user query and extract the necessary data from the provided file content. The user query is : {user_query}\n The file content is: {file_content}",
    agent=data_preparation_agent,
    expected_output="Processed data ready for analysis."
)

# Step 2: analysis and visualisation. The description has no placeholders, so the query and
# data are not injected here directly.
# NOTE(review): presumably this task relies on the previous task's output being passed along
# as context by the crew — confirm.
data_analysis_task = Task(
    description="Analyze the processed data and generate visualizations as per the user query.",
    agent=data_analysis_agent,
    expected_output="Data analysis report with visualizations."
)


In [None]:
# Two-agent crew: the preparation task is listed first, followed by the analysis task.
data_analysis_crew = Crew(
    agents=[data_preparation_agent, data_analysis_agent],
    tasks=[data_preparation_task, data_analysis_task],
    verbose=True
)


In [None]:
# Query for the two-step preparation + analysis pipeline (reassigns `user_query`).
user_query = "Generate a bar chart to show the experience in years at each company"

# Alternative inline sample that can be used instead of reading a file:
# file_content = """
# Date,Product,Sales
# 2025-01-01,Product A,100
# 2025-01-01,Product B,150
# 2025-01-02,Product A,200
# 2025-01-02,Product B,250
# """

file_content = read_file_content("FAhad Resume.pdf")

# Values for the {user_query} and {file_content} placeholders in data_preparation_task.
inputs = {
    'user_query': user_query,
    'file_content': file_content
}

# Pass the dict built above rather than repeating the literal, so there is one source of
# truth for the inputs, and use the `inputs=` keyword as in the file-selection kickoff.
result = data_analysis_crew.kickoff(inputs=inputs)


In [None]:
# Show the final output of the preparation + analysis crew.
print(result)