In [9]:
# Simple three-agent crew for thesis writing
# Setup: imports, environment variables, LLM clients and tools
from crewai import Crew, Agent, Task, LLM
from crewai_tools import SerperDevTool, \
                         ScrapeWebsiteTool
from dotenv import load_dotenv
import os
from pydantic import BaseModel, Field
load_dotenv()
import os
import yaml

# LLM clients. API keys are looked up in the process environment;
# load_dotenv() has already been called earlier in this cell.
gemini_llm = LLM(
    api_key=os.getenv("GOOGLE_API_KEY"),
    model="gemini/gemini-2.5-flash-lite",
    temperature=0.5,
    # timeout=3  (left disabled)
)
cohere_llm = LLM(
    api_key=os.getenv("COHERE_API_KEY"),
    model="command-r",
    temperature=0.5,
)

# Tool instances; the researcher agent receives both of them further down.
search_tool, scrape_tool = SerperDevTool(), ScrapeWebsiteTool()

In [10]:
# Load the agent and task definitions from their YAML files.
# Paths are relative to the notebook's current working directory.
files = {
    'agents': 'yaml/agents.yaml',
    'tasks': 'yaml/tasks.yaml'
}

# Parse every YAML file into `configs`, keyed by its config type.
# The encoding is pinned to UTF-8 so that decoding does not depend on the
# platform's default locale encoding (which is not UTF-8 everywhere).
configs = {}
for config_type, file_path in files.items():
    with open(file_path, 'r', encoding='utf-8') as file:
        # safe_load only builds plain Python objects (no arbitrary constructors).
        configs[config_type] = yaml.safe_load(file)

# Expose the two parsed documents under the names the later cells use.
agents_config = configs['agents']
tasks_config = configs['tasks']

In [11]:
# ---- Agents ----
# Each agent takes its settings from the matching entry in agents_config.
# Only the researcher is given tools, and it runs on the Cohere model;
# the writer and the supervisor both use the Gemini model.
researcher = Agent(
    config=agents_config["researcher_agent"],
    tools=[search_tool, scrape_tool],
    llm=cohere_llm,
)
writer = Agent(config=agents_config["academic_writer_agent"], llm=gemini_llm)
supervisor = Agent(config=agents_config["supervisor_agent"], llm=gemini_llm)

# ---- Tasks ----
# One task per agent, configured from the matching entry in tasks_config.
research = Task(config=tasks_config["research"], agent=researcher)
write = Task(config=tasks_config["write"], agent=writer)
supervise = Task(config=tasks_config["supervisor"], agent=supervisor)

# ---- Inputs ----
# Values passed to crew.kickoff(); presumably they fill placeholders in the
# YAML definitions -- confirm against yaml/agents.yaml and yaml/tasks.yaml.
inputs = {
    "research_topic": "General Computer Vision Algorithm for QA",
    "academic_level": "PHD",
}

# ---- Crew ----
# Tasks are listed in the order research -> write -> supervise.
crew = Crew(
    agents=[researcher, writer, supervisor],
    tasks=[research, write, supervise],
)

In [13]:
# Run the crew with the inputs defined above and keep the returned output in `result`.
# NOTE(review): presumably this makes LLM and web-search calls and can be slow -- confirm.
result = crew.kickoff(inputs=inputs)

In [14]:
# Print the text form of the crew's output (the generated paper shown below).
print(result)

## A Comprehensive Review of General Computer Vision Algorithms for Visual Question Answering: Advances, Challenges, and Future Directions

**Abstract:**
Visual Question Answering (VQA) remains a cornerstone of multimodal AI, bridging computer vision and natural language understanding. This paper provides a PhD-level, comprehensive review of general computer vision algorithms for VQA, covering core components, advanced techniques, and emerging trends. We critically evaluate the technical accuracy and depth of explanations for foundational algorithms (CNNs, Transformers, Attention, GNNs) and identify cutting-edge innovations that are currently underrepresented. The paper also highlights recent (2023-2024) seminal contributions and suggests crucial references to ensure alignment with the latest advancements. By addressing challenges such as ambiguity, commonsense reasoning, and fairness, this review aims to serve as a definitive resource for researchers and practitioners in the field.

*