In [17]:
# Imports
import os
from crewai import Agent, Task, Crew, Process
from langchain_openai import ChatOpenAI
from crewai_tools import SerperDevTool, ScrapeWebsiteTool, WebsiteSearchTool
from crewai_tools import tool
from pydantic import BaseModel, EmailStr, Field
# SECURITY: never commit API keys. The key that used to be hardcoded on this line
# has been exposed in source control and should be revoked and rotated.
# Supply OPENAI_API_KEY from outside the notebook instead (shell `export`,
# a secrets manager, or an interactive `getpass.getpass()` prompt).
if not os.environ.get("OPENAI_API_KEY"):
    import warnings

    warnings.warn(
        "OPENAI_API_KEY is not set; export it in the environment before running the crew."
    )
from typing import Optional, List
import json
import csv
import re
import pandas as pd

In [2]:
# Create one instance of each crewai_tools tool imported at the top of the notebook.
# NOTE(review): website_search_rag_tool is not referenced again in the visible
# cells — confirm it is still needed.
serper_tool = SerperDevTool()
scrape_tool = ScrapeWebsiteTool()
website_search_rag_tool = WebsiteSearchTool()

In [3]:
# Contact search agent: looks up where contact details for the target client can be found.
# The {client} placeholders in the backstory are presumably filled from the
# `inputs` mapping given to crew.kickoff() — verify against the crewAI version in use.
search_agent = Agent(
    role='Contact Searcher',
    goal='Find the contact details of the marketing team or relevant people from a specified client.',
    backstory="""
    You are a dedicated member of the marketing team at Healee, a digital healthcare company. 
    Your expertise lies in leveraging online resources to quickly gather essential information. 
    You have identified {client} as a potential client for the Healee platform. 
    Your task is to find contact information such as email addresses, LinkedIn profiles, Phone no. 
    of key individuals within {client}'s organization who can be approached to set up a meeting. 
    This includes members of their marketing team or other relevant stakeholders. 
    """,
    # Reuse the shared search tool created earlier in the notebook rather than
    # constructing a second, identical SerperDevTool instance.
    tools=[serper_tool],
    allow_delegation=False
)

# Web scraper agent: extracts contact details from the pages it is given.
scraper_agent = Agent(
    role='Web Scraper',
    goal='Scrape relevant information from specified websites',
    backstory="""
    You are a skilled web scraper working for the marketing team at Healee, a digital healthcare company. 
    Your task is to extract crucial information from websites to support various marketing and research activities. 
    This includes gathering contact details, company profiles, and other relevant data from the given URLs.
    """,
    # Reuse the shared scraping tool created earlier in the notebook rather than
    # constructing a second, identical ScrapeWebsiteTool instance.
    tools=[scrape_tool],
    allow_delegation=False
)

# Consolidator agent: has no tools of its own; it merges what the other agents found.
consolidator = Agent(
    role="Result Consolidator",
    goal=(
        "Combine all the contact information gathered by the scraper agents "
        "into a final, organized list of relevant contacts."
    ),
    # One sentence per list item; joined with single spaces into the backstory text.
    backstory=" ".join([
        "You are a highly skilled data synthesizer, adept at merging information from multiple sources.",
        "Your expertise lies in compiling comprehensive, clear, and concise lists of relevant contact information.",
        "Your meticulous approach ensures no detail is overlooked, providing a valuable resource for your team.",
    ]),
    allow_delegation=False,
    verbose=True,
)


In [4]:
# Tasks
# Contact search task: asks the search agent for a short list of promising sources.
contact_search_task = Task(
    agent=search_agent,
    # Adjacent literals are concatenated; each sentence ends with its own trailing space.
    description=(
        "Use online resources to identify potential websites, LinkedIn profiles, and other "
        "online platforms where contact information of key individuals within {client}'s "
        "organization can be found. "
        "Focus on finding members of their marketing team or other relevant stakeholders. "
        "Your final list should include URLs or online resources where contact information "
        "might be available. "
        "Use general queries to search for information."
    ),
    expected_output=(
        "A list of at most 5 URLs or online resources where contact information of key "
        "individuals from {client} might be available. "
        "Ensure the sources are relevant and up-to-date."
    ),
)

# Web scraping task: asks the scraper agent to pull contact details out of the given sources.
web_scraping_task = Task(
    # Wording fixed: the prompt previously read "LinkedIn and Phone no. profiles",
    # which garbled the list of requested details.
    description=(
        "Scrape the provided URLs or online resources to extract contact details "
        "such as email addresses, LinkedIn profiles and Phone no. of key "
        "individuals within {client}'s organization. Ensure the information is "
        "accurate and relevant to the marketing team or other relevant stakeholders."
    ),
    expected_output=(
        "A detailed list of contact information extracted from the provided URLs. "
        "Include names, email addresses, LinkedIn profiles and Phone no. "
    ),
    agent=scraper_agent
)

In [5]:
# Pydantic schema for a single consolidated contact record
class ContactInfo(BaseModel):
    """One contact: a required name and role, plus optional email, LinkedIn and phone."""
    name: str
    role_in_company: str
    # Declared with the alias 'email'; defaults to None when absent.
    # NOTE(review): the consolidation task prompt asks for keys such as "Name" and
    # "Email address", which match neither these field names nor this alias —
    # confirm the intended mapping before validating crew output with this model.
    email_address: Optional[EmailStr] = Field(None, alias='email')
    linkedin: Optional[str] = None
    phone_no: Optional[str] = None

# Output function
def consolidation_task_function(results: List[dict]) -> str:
    """Validate raw contact records and serialise them to a JSON string.

    Args:
        results: One dict per contact; each is unpacked into ``ContactInfo``.

    Returns:
        A JSON array (indented by 2 spaces) of the validated contacts, dumped
        with field aliases. An empty input yields ``"[]"``.

    Raises:
        Whatever ``ContactInfo`` raises when a record fails validation.
    """
    contacts = [ContactInfo(**result) for result in results]

    # Dump with aliases so aliased fields are emitted under their alias name.
    json_output = [contact.model_dump(by_alias=True) for contact in contacts]

    # json.dumps returns the string; json.dump writes to a file object and
    # raises TypeError when called without one.
    return json.dumps(json_output, indent=2)

In [6]:
# Final consolidation task: merges the scraped contacts into one JSON list.
consolidation_task = Task(
    # Adjacent literals are concatenated, so every sentence needs its own trailing
    # space; one was missing before "Note:", producing "not found.Note:".
    description=(
        "Combine all the contact information gathered by the scraper agents into a final, organized list of relevant contacts. "
        "The output should be formatted as a JSON file with the fields: Name, Role in company, Email address, LinkedIn and Phone no. "
        "Leave any of Email address, LinkedIn or Phone no. empty if the information is not found. "
        "Note: Do not generate Linkedin URLs, Email Addresses or Phone no. on your own. Just leave the field empty if not found."
    ),
    expected_output=(
        "A JSON object containing the consolidated list of contact information. The fields should include: Name, Role in company, Email address, LinkedIn and Phone no."
    ),
    agent=consolidator,
    # NOTE(review): `output_function` does not look like a documented crewAI Task
    # field (the documented hooks are `callback`, `output_json`, `output_pydantic`
    # and `output_file`), so it may be silently ignored — confirm against the
    # installed crewAI version before relying on this hook.
    output_function=consolidation_task_function,
)

In [7]:
# Assemble the three agents and their tasks into a single crew.
# Process.sequential presumably runs the tasks in the order listed — confirm in the crewAI docs.
crew = Crew(
    process=Process.sequential,
    tasks=[contact_search_task, web_scraping_task, consolidation_task],
    agents=[search_agent, scraper_agent, consolidator],
)

# Run the pipeline for one client and show the final answer.
result = crew.kickoff(inputs=dict(client="Sutter Health"))
print(result)

[91m 

Action 'I will start by searching the internet for LinkedIn profiles and other potential websites where I can find this information.

Action: Search the internet' don't exist, these are the only available Actions:
 Search the internet(**kwargs: Any) -> Any - Search the internet(search_query: 'string') - A tool that can be used to search the internet with a search_query.
[00m
[95m 


Search results: Title: Sutter Health Plus health plan media contacts
Link: https://news.sutterhealthplus.org/media-contacts/
Snippet: Call the media relations hotline at 800-428-7377; please note that Sutter will only respond to working media. News Topics. 2019 · 2020 · 2021 · 2022 · 2023 ...
---
Title: Media Contact - Vitals by Sutter Health
Link: https://vitals.sutterhealth.org/media-contact/
Snippet: Media Contact. Media Contacts. (800) 428-7377 (monitored 24/7). Sutter Health Affiliates. Alta Bates Summit Medical Center
---
Title: Sutter Health Marketing Department - RocketReach
Link: https://

In [8]:
# Display the raw value returned by crew.kickoff() for inspection.
result

'```json\n[\n    {\n        "Name": "Grace Davis",\n        "Role in company": "Vice President, Chief Public Affairs and Marketing Officer",\n        "Email address": "",\n        "LinkedIn": "",\n        "Phone no.": "916358XXXX"\n    },\n    {\n        "Name": "Nancy Turner",\n        "Role in company": "Director, Strategy Communications and Clinician Engagement",\n        "Email address": "",\n        "LinkedIn": "",\n        "Phone no.": "530888XXXX, 916283XXXX"\n    },\n    {\n        "Name": "Margie O\'Clair",\n        "Role in company": "VP, Marketing",\n        "Email address": "",\n        "LinkedIn": "https://www.linkedin.com/in/margie-o-clair-584b4412",\n        "Phone no.": "650696XXXX, 707648XXXX, 707554XXXX, 650312XXXX, 916286XXXX, 650934XXXX"\n    },\n    {\n        "Name": "Sapna Parekh",\n        "Role in company": "Director, Brand Marketing",\n        "Email address": "caminomedical.org",\n        "LinkedIn": "",\n        "Phone no.": "408203XXXX, 408723XXXX"\n    },\

In [10]:
def extract_json_from_string(input_string):
    """Extract contact records from the first ```json fenced block in a string.

    Args:
        input_string: Text expected to contain a ```json ... ``` fenced block.

    Returns:
        On success, the parsed JSON. For a list, every dict entry is guaranteed
        to carry the "LinkedIn", "Phone no." and "Email address" keys (missing
        ones are filled with ""); non-dict entries are left untouched. A single
        top-level JSON object is wrapped in a one-element list so callers always
        get a list of records. Any other JSON value is returned unchanged.

        On failure, a human-readable error string (kept for backward
        compatibility): "No JSON found within triple backticks." or
        "Error decoding JSON: <details>".
    """
    # Non-greedy so only the first fenced block is captured; DOTALL lets it span lines.
    match = re.search(r"```json(.*?)```", input_string, re.DOTALL)
    if match is None:
        return "No JSON found within triple backticks."

    try:
        json_data = json.loads(match.group(1).strip())
    except json.JSONDecodeError as e:
        return f"Error decoding JSON: {e}"

    # A lone object would otherwise be iterated key by key (strings), which
    # crashes on item assignment; treat it as a single record instead.
    if isinstance(json_data, dict):
        json_data = [json_data]
    if not isinstance(json_data, list):
        return json_data

    # Ensure the optional fields exist on every record, in a stable order.
    for entry in json_data:
        if isinstance(entry, dict):
            for key in ("LinkedIn", "Phone no.", "Email address"):
                entry.setdefault(key, "")

    return json_data
    
# Parse the crew's answer. On failure json_out holds an error-message string
# rather than parsed data, so check its type before using it.
json_out = extract_json_from_string(result)

In [12]:
# Inspect the parsed output.
json_out

{'Name': 'Grace Davis',
 'Role in company': 'Vice President, Chief Public Affairs and Marketing Officer',
 'Email address': '',
 'LinkedIn': '',
 'Phone no.': '916358XXXX'}

In [15]:
def json_to_dataframe(json_data):
    """Build a pandas DataFrame from a list of record dictionaries.

    Args:
        json_data: A list whose items are all dicts (an empty list is accepted).

    Returns:
        A DataFrame constructed directly from ``json_data``.

    Raises:
        ValueError: If ``json_data`` is not a list or contains a non-dict item.
    """
    message = "Invalid JSON data format. Expected a list of dictionaries."

    # Guard clauses: reject anything that is not a list made up solely of dicts.
    if not isinstance(json_data, list):
        raise ValueError(message)
    for record in json_data:
        if not isinstance(record, dict):
            raise ValueError(message)

    return pd.DataFrame(json_data)

In [20]:
# Convert the parsed records to a DataFrame and preview up to the first 15 rows.
df = json_to_dataframe(json_out)
df.head(15)

Unnamed: 0,Name,Role in company,Email address,LinkedIn,Phone no.
0,Grace Davis,"Vice President, Chief Public Affairs and Marke...",,,916358XXXX
1,Nancy Turner,"Director, Strategy Communications and Clinicia...",,,"530888XXXX, 916283XXXX"
2,Margie O'Clair,"VP, Marketing",,https://www.linkedin.com/in/margie-o-clair-584...,"650696XXXX, 707648XXXX, 707554XXXX, 650312XXXX..."
3,Sapna Parekh,"Director, Brand Marketing",caminomedical.org,,"408203XXXX, 408723XXXX"
4,Giovanna Green,Brand Marketing Manager,gmail.com,,"415420XXXX, 415269XXXX, 415896XXXX"
5,Mark Riley,"Manager, Field Marketing",,,"916733XXXX, 707551XXXX, 831477XXXX, 831728XXXX..."
6,Christina Szeto,Brand Marketing,,,"530220XXXX, 408203XXXX, 408867XXXX"
7,Courtney Wilson,Brand Marketing,,,916768XXXX
8,Shannon Hare,Field Marketing,,,
9,Linda Eytcheson,Marketing Coordinator,yahoo.com,,"916733XXXX, 916797XXXX, 916969XXXX, 916972XXXX"
