In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from google.colab import files
import re

# Scraping the links of profiles

In [None]:

# Define headers to mimic a real browser request
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
}

# Base URL
base_url = "https://www.psychologytoday.com/us/therapists/texas?page="
# Seconds to wait for a response; without a timeout a stalled connection
# would block the whole run indefinitely.
REQUEST_TIMEOUT = 30
all_data = []


def _text_or_na(element):
    """Return the stripped text of a parsed element, or "N/A" when it is missing."""
    return element.text.strip() if element else "N/A"


for page in range(1, 501):  # pages 1 through 500
    url = base_url + str(page)
    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        # A single network failure must not abort the run and lose the rows
        # collected so far; report it and move on to the next page.
        print(f"❌ Failed to fetch page {page}: {e}")
        time.sleep(3)
        continue
    time.sleep(3)

    if response.status_code != 200:
        print(f"❌ Failed to fetch page {page}. Status code:", response.status_code)
        continue

    soup = BeautifulSoup(response.text, "html.parser")

    # Find all therapist listings
    therapists = soup.find_all("div", class_="results-row")

    for therapist in therapists:
        try:
            # The same anchor carries both the display name and the profile link.
            title_link = therapist.find("a", class_="profile-title")
            name = _text_or_na(title_link)
            profile_url = title_link.get("href") if title_link else "N/A"

            credentials = _text_or_na(therapist.find("div", class_="profile-subtitle-credentials"))
            description = _text_or_na(therapist.find("div", class_="profile-statement"))
            phone_number = _text_or_na(therapist.find("span", class_="results-row-phone"))
            location = _text_or_na(therapist.find("span", class_="address"))

            verified = "Verified" if therapist.find("div", class_="verified-badge") else "Not Verified"

            all_data.append({
                "Therapist Name": name,
                "Profile URL": profile_url,
                "Credentials": credentials,
                "Description": description,
                "Phone Number": phone_number,
                "Location": location,
                "Verified": verified
            })
        except Exception as e:
            # Best effort: one malformed listing should not stop the page.
            print(f"Error extracting data: {e}")

    if therapists:
        print(f"✅ Page {page} extracted successfully.")
    else:
        # A 200 response without any listing rows yields no data, so do not
        # report it as a success.
        print(f"⚠️ Page {page} contained no therapist listings.")
    time.sleep(2)

# Save to CSV
df = pd.DataFrame(all_data)
df.to_csv("therapist_data.csv", index=False)


✅ Page 1 extracted successfully.
✅ Page 2 extracted successfully.
✅ Page 3 extracted successfully.
✅ Page 4 extracted successfully.
✅ Page 5 extracted successfully.
✅ Page 6 extracted successfully.
✅ Page 7 extracted successfully.
✅ Page 8 extracted successfully.
✅ Page 9 extracted successfully.
✅ Page 10 extracted successfully.
✅ Page 11 extracted successfully.
✅ Page 12 extracted successfully.
✅ Page 13 extracted successfully.
✅ Page 14 extracted successfully.
✅ Page 15 extracted successfully.
✅ Page 16 extracted successfully.
✅ Page 17 extracted successfully.
✅ Page 18 extracted successfully.
✅ Page 19 extracted successfully.
✅ Page 20 extracted successfully.
✅ Page 21 extracted successfully.
✅ Page 22 extracted successfully.
✅ Page 23 extracted successfully.
✅ Page 24 extracted successfully.
✅ Page 25 extracted successfully.
✅ Page 26 extracted successfully.
✅ Page 27 extracted successfully.
✅ Page 28 extracted successfully.
✅ Page 29 extracted successfully.
✅ Page 30 extracted suc

# Scraping details of each profile

In [None]:
df = pd.read_csv("therapist_data.csv")

# The listing scrape stores "N/A" when a row has no profile link, and
# pd.read_csv parses "N/A" as NaN by default. Drop those entries so the URL
# lists contain only real strings that can be requested.
valid_profile_urls = df["Profile URL"].dropna().tolist()
split_index = len(valid_profile_urls) // 2

profile_urls_list_1 = valid_profile_urls[:split_index]  # First half
profile_urls_list_2 = valid_profile_urls[split_index:]  # Second half

## Functions

In [None]:
# General function to extract a list from a given keyword
def extract_list(soup, keyword):
    """Collect the <span> texts of the list that follows a matching heading.

    The first h2/h3/h4 whose text contains ``keyword`` (case-insensitive) is
    located, and the texts of all <span> elements inside its next sibling
    <ul> are joined with " | ".

    Returns "Not Found" when there is no such heading or the heading has no
    sibling <ul>.
    """
    heading_tags = ["h2", "h3", "h4"]

    def is_matching_heading(tag):
        if tag.name not in heading_tags:
            return False
        return keyword.lower() in tag.get_text(strip=True).lower()

    heading = soup.find(is_matching_heading)
    if not heading:
        return "Not Found"

    item_list = heading.find_next_sibling("ul")
    if not item_list:
        return "Not Found"

    labels = [span.get_text(strip=True) for span in item_list.find_all("span")]
    return " | ".join(labels)

# Extract fees, payment methods
def _first_dollar_amount(text):
    """Return the first "$<digits>" token in ``text``, or "Not Found"."""
    amount = re.search(r"\$\d+", text)
    return amount.group() if amount else "Not Found"


def extract_fees_payment(soup):
    """Extract session fees and accepted payment methods from a profile page.

    Every <li> in the page is scanned. An item mentioning "Individual
    Sessions" or "Couple Sessions" contributes its first dollar amount; an
    item mentioning "Pay by" contributes the remaining text as the payment
    methods. When several items match the same field, the last one wins.

    Returns a dict with the keys "Individual Sessions ($)",
    "Couple Sessions ($)" and "Pay By"; fields that were not found hold
    "Not Found".
    """
    data = {"Individual Sessions ($)": "Not Found", "Couple Sessions ($)": "Not Found", "Pay By": "Not Found"}
    for item in soup.find_all("li"):
        text = item.get_text(strip=True)
        if "Individual Sessions" in text:
            data["Individual Sessions ($)"] = _first_dollar_amount(text)
        elif "Couple Sessions" in text:
            data["Couple Sessions ($)"] = _first_dollar_amount(text)
        elif "Pay by" in text:
            # get_text(strip=True) joins the stripped text fragments without a
            # separator, so the label may be glued to the value ("Pay byCash").
            # Remove the label whether or not whitespace follows it.
            data["Pay By"] = re.sub(r"Pay by\s*", "", text).strip()
    return data


# Extract Specialties & Expertise
def extract_specialties_expertise(soup):
    """Extract the "Top Specialties" and "Expertise" attribute lists.

    Each div with class "attributes-group" is inspected. Its <h3> text decides
    which field the group's "attribute_base" span texts (joined with " | ")
    are stored under. Groups with no <h3>, with a heading that names neither
    field, or with no attribute spans are ignored, so they cannot overwrite a
    value that was already found.

    Returns a dict with the keys "Top Specialties" and "Expertise"; fields
    that were not found hold "Not Found".
    """
    data = {"Top Specialties": "Not Found", "Expertise": "Not Found"}
    for group in soup.find_all("div", class_="attributes-group"):
        heading = group.find("h3")
        if not heading:
            continue

        heading_text = heading.get_text(strip=True)
        if "Top Specialties" in heading_text:
            key = "Top Specialties"
        elif "Expertise" in heading_text:
            key = "Expertise"
        else:
            # Unrelated attribute group: do not file it under "Expertise".
            continue

        items = [span.get_text(strip=True) for span in group.find_all("span", class_="attribute_base")]
        if items:
            data[key] = " | ".join(items)
    return data


# Extract Types of Therapy
def extract_types_of_therapy(soup):
    """Return the therapy types listed on a profile, joined with " | ".

    The texts of all "attribute_base" spans inside the div with id
    "treatment-approach-attributes-section" are collected. Returns
    "Not Found" when that div is absent.
    """
    container = soup.find("div", id="treatment-approach-attributes-section")
    if not container:
        return "Not Found"

    therapy_names = []
    for span in container.find_all("span", class_="attribute_base"):
        therapy_names.append(span.get_text(strip=True))
    return " | ".join(therapy_names)


# Extract Personal Statement
def extract_personal_statement(soup):
    """Return the profile's personal statement as a single string.

    The texts of all "paragraph" spans inside the div with class
    "personal-statement-container" are joined with a single space. Returns
    "Not Found" when that div is absent.
    """
    container = soup.find("div", class_="personal-statement-container")
    if not container:
        return "Not Found"

    paragraphs = []
    for paragraph in container.find_all("span", class_="paragraph"):
        paragraphs.append(paragraph.get_text(strip=True))
    return " ".join(paragraphs)


# Extract Qualifications (including university, major, graduation year)
def extract_qualifications(soup):
    """Extract verification, experience and education details from a profile.

    Every <li> with class "qualifications-element" is classified by the first
    marker phrase found in its text ("Verified by", "In Practice",
    "Membership", "Certificate", "Attended") and the matching field is filled.

    Returns a dict with the keys "Verified by Psychology Today",
    "In Practice (years)", "Membership", "Certificate", "Attended University",
    "Major/Degree", "Graduation Year" and "Qualifications". Fields that were
    not found hold "Not Found" ("No" for the verification flag).
    """
    data = {
        "Verified by Psychology Today": "No",
        "In Practice (years)": "Not Found",
        "Membership": "Not Found",
        "Certificate": "Not Found",
        "Attended University": "Not Found",
        "Major/Degree": "Not Found",
        "Graduation Year": "Not Found",
        # NOTE(review): nothing below ever assigns this key, so the column is
        # always "Not Found". Kept so the output schema stays unchanged.
        "Qualifications": "Not Found"    }

    for item in soup.find_all("li", class_="qualifications-element"):
        text = item.get_text(strip=True)

        if "Verified by" in text:
            data["Verified by Psychology Today"] = "Yes"
        elif "In Practice" in text:
            years_match = re.search(r"\d+", text)
            data["In Practice (years)"] = years_match.group() if years_match else "Not Found"
        elif "Membership" in text:
            data["Membership"] = text.replace("Membership with", "").strip()
        elif "Certificate" in text:
            data["Certificate"] = text.replace("Certificate from", "").strip()
        elif "Attended" in text:
            # Extract University. The name ends at the first comma, at the
            # "Graduated" marker, or at the end of the text, so an entry
            # without a degree ("Attended X") is still captured.
            university_match = re.search(r"Attended\s(.*?)(?:,|\sGraduated|$)", text)
            if university_match:
                data["Attended University"] = university_match.group(1).strip()

            # Extract Major/Degree: the text after the first ", " up to the
            # next comma or the "Graduated" marker.
            major_match = re.search(r",\s(.*?)(?:,|\sGraduated)", text)
            if major_match:
                data["Major/Degree"] = major_match.group(1).strip()

            # Extract Graduation Year
            grad_year_match = re.search(r"Graduated\s(\d{4})", text)
            if grad_year_match:
                data["Graduation Year"] = grad_year_match.group(1)
    return data


# Extract State & ZIP Code
def extract_state_zip(soup):
    """Extract the state abbreviation and ZIP code from the address region.

    The text of the first span found for class_="address-region address-text"
    is searched for a standalone pair of capital letters (state) and a
    standalone run of five digits (ZIP code).

    Returns a dict with the keys "State" and "ZIP Code"; a field holds
    "Not Found" when the span is absent or its pattern does not match.
    """
    result = {"State": "Not Found", "ZIP Code": "Not Found"}

    region = soup.find("span", class_="address-region address-text")
    if not region:
        return result

    region_text = region.get_text(strip=True)
    field_patterns = (
        ("State", r"\b[A-Z]{2}\b"),
        ("ZIP Code", r"\b\d{5}\b"),
    )
    for field, pattern in field_patterns:
        found = re.search(pattern, region_text)
        result[field] = found.group() if found else "Not Found"
    return result


# Extract client focus details
def extract_client_focus(soup):
    """Extract the client-focus attributes shown on a profile.

    Each div with class "client-focus-tile" is read: its <h3> text is compared
    against the known field names, and the tile's "client-focus-description"
    span texts (commas removed, joined with " | ") are stored under every
    field name contained in that heading.

    Returns a dict with the keys "Age", "Participants", "Communities",
    "Religion" and "I also speak"; fields that were not found hold
    "Not Found".
    """
    fields = ("Age", "Participants", "Communities", "Religion", "I also speak")
    focus = dict.fromkeys(fields, "Not Found")

    for tile in soup.find_all("div", class_="client-focus-tile"):
        heading = tile.find("h3")
        title = heading.get_text(strip=True) if heading else ""

        descriptions = []
        for span in tile.find_all("span", class_="client-focus-description"):
            descriptions.append(span.get_text(strip=True).replace(",", ""))
        joined = " | ".join(descriptions)

        for field in fields:
            if field in title:
                focus[field] = joined
    return focus

# Extract endorsements
def extract_endorsements(soup):
    """Extract the endorsement count and the list of endorsers.

    The count is the first integer in the text of the endorsement badge div.
    Each div with class "endorsement" contributes one "title - subtitle"
    entry; an entry missing one of the two parts falls back to the part that
    is present, and an entry with neither is skipped, so incomplete markup no
    longer raises and discards the whole profile.

    Returns a dict with "Endorsement Count" (int, 0 when unknown) and
    "Endorsed By" (entries joined with " | ", or "Not Found").
    """
    data = {"Endorsement Count": 0, "Endorsed By": "Not Found"}

    endorsement_badge = soup.find("div", class_="endorsement-count clickable profile-badge")
    if endorsement_badge:
        count_match = re.search(r"\d+", endorsement_badge.get_text(strip=True))
        if count_match:
            data["Endorsement Count"] = int(count_match.group())

    endorsers = []
    for endorsement in soup.find_all("div", class_="endorsement"):
        title = endorsement.find("div", class_="title")
        subtitle = endorsement.find("div", class_="subtitle")
        name = title.get_text(strip=True) if title else ""
        role = subtitle.get_text(strip=True) if subtitle else ""

        if title and subtitle:
            entry = f"{name} - {role}"
        else:
            # Only one part (or none) is present: keep whatever text exists.
            entry = name or role
        if entry:
            endorsers.append(entry)

    data["Endorsed By"] = " | ".join(endorsers) if endorsers else "Not Found"

    return data


## Main Scraping

In [None]:
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"}

# Choose which half to scrape and the matching output file in one place.
# For the first half use profile_urls_list_1 and "therapists_profiles_data_1.csv".
urls_to_scrape = profile_urls_list_2
OUTPUT_CSV = "therapists_profiles_data_2.csv"
# Seconds to wait for a response; without a timeout a stalled connection
# would block the whole run indefinitely.
REQUEST_TIMEOUT = 30

all_therapists_data = []


def _first_text(soup, *args, **kwargs):
    """Return the stripped text of the first soup.find(...) match, or "N/A"."""
    element = soup.find(*args, **kwargs)
    return element.get_text(strip=True) if element else "N/A"


for idx, url in enumerate(urls_to_scrape):
    print(f"Scraping ({idx+1}/{len(urls_to_scrape)}): {url}")

    # Missing links read back from the CSV are NaN floats, not strings.
    if not isinstance(url, str):
        print(f"⚠️ Skipping invalid profile URL: {url}")
        continue

    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        time.sleep(2)
        if response.status_code != 200:
            # Report the failure so missing rows can be traced afterwards.
            print(f"⚠️ Failed to fetch {url}. Status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.text, "html.parser")

        therapist_data = {
            "Profile URL": url,
            "Name": _first_text(soup, "h1"),
            "Title": _first_text(soup, "h2"),
            "Location": _first_text(soup, "div", class_="address"),
            **extract_state_zip(soup),
            "Phone": _first_text(soup, "div", class_="profile-phone"),
            **extract_fees_payment(soup),
            "Insurance": extract_list(soup, "Insurance"),
            **extract_specialties_expertise(soup),
            "Types of Therapy": extract_types_of_therapy(soup),
            "Personal Statement": extract_personal_statement(soup),
            **extract_qualifications(soup),
            **extract_endorsements(soup),
            **extract_client_focus(soup),
        }

        all_therapists_data.append(therapist_data)

    except Exception as e:
        # Best effort: one failing profile should not stop the whole run.
        print(f"⚠️ Error scraping {url}: {e}")


df = pd.DataFrame(all_therapists_data)

df.to_csv(OUTPUT_CSV, index=False)
files.download(OUTPUT_CSV)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Scraping (8/5000): https://www.psychologytoday.com/us/therapists/gene-gibbs-dallas-tx/933321
Scraping (9/5000): https://www.psychologytoday.com/us/therapists/holly-lockett-frisco-tx/91628
Scraping (10/5000): https://www.psychologytoday.com/us/therapists/chen-song-houston-tx/1410919
Scraping (11/5000): https://www.psychologytoday.com/us/therapists/robert-anthony-dallas-tx/400885
Scraping (12/5000): https://www.psychologytoday.com/us/therapists/dana-vanrenterghem-missouri-city-tx/1409266
Scraping (13/5000): https://www.psychologytoday.com/us/therapists/janet-mize-austin-tx/821861
Scraping (14/5000): https://www.psychologytoday.com/us/therapists/shaina-edwards-houston-tx/1458470
Scraping (15/5000): https://www.psychologytoday.com/us/therapists/heather-m-walker-bay-city-tx/1428638
Scraping (16/5000): https://www.psychologytoday.com/us/therapists/michelle-quiroga-austin-tx/315109
Scraping (17/5000): https://www.psychologytoday

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>