<a href="https://colab.research.google.com/github/pallavmarch/Advanced-SQL-50/blob/main/psychologytoday_webscraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Start


In [2]:
from os import link
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from google.colab import files
import re
from tqdm import tqdm
import random


# --------- Set this variable to scrape a specific state ---------
selected_state = "Texas"  # Options: Texas, Florida, California, New York

# --------- Configuration Dictionary for States ---------
# Every state uses the same URL / file naming scheme, so the per-state entries
# are generated from (display name, URL slug, file slug) triples.
state_config = {
    state_name: {
        "base_url": "https://www.psychologytoday.com/us/therapists/" + url_slug + "?page=",
        "link_file": "/content/therapist_link_" + file_slug + ".csv",
        "data_file_1": "therapist_data_" + file_slug + "_1.csv",
        "data_file_2": "therapist_data_" + file_slug + "_2.csv",
    }
    for state_name, url_slug, file_slug in (
        ("Texas", "texas", "texas"),
        ("Florida", "florida", "florida"),
        ("California", "california", "california"),
        ("New York", "new-york", "new_york"),
    )
}

# --------- Use selected state config ---------
config = state_config[selected_state]
base_url, link_file = config["base_url"], config["link_file"]

# --------- Headers ---------
# Browser-like User-Agent sent with every request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/119.0.0.0 Safari/537.36"
    )
}

# --------- Initialize Containers ---------
total_pages = 500  # Set this as per need
all_data = []             # filled by Part 1: one {"Profile URL": ...} dict per listing row
all_therapists_data = []  # filled by Part 2: one dict of extracted fields per profile

# Part 1:
Scraping the links of profiles

In [None]:
# Walk the paginated state listing, collect every therapist profile link into
# `all_data`, then de-duplicate and save the links to `link_file`.

# Seconds to wait for a response; without a timeout a stalled connection would
# block the whole run indefinitely.
REQUEST_TIMEOUT = 30

for page in tqdm(range(1, total_pages + 1), desc="Page Extraction Progress", total=total_pages, unit="page", colour="red", ncols=100):
    url = base_url + str(page)
    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as e:
        # One bad page (timeout, dropped connection, ...) must not abort the run.
        print(f"❌ Failed to fetch page {page}: {e}")
        time.sleep(2)
        continue
    time.sleep(random.uniform(1, 2))

    if response.status_code != 200:
        print(f"❌ Failed to fetch page {page}. Status code:", response.status_code)
        continue

    soup = BeautifulSoup(response.text, "html.parser")

    for therapist in soup.find_all("div", class_="results-row"):
        try:
            profile_link_element = therapist.find("a", class_="profile-title")
            profile_url = profile_link_element.get("href") if profile_link_element else None
            if not profile_url:
                # A row without a link has nothing to scrape in Part 2; storing a
                # placeholder would only produce a failing request there.
                continue
            all_data.append({"Profile URL": profile_url})
        except Exception as e:
            print(f"Error extracting data: {e}")

    time.sleep(2)

# Passing `columns` keeps the 'Profile URL' column present even when nothing was
# collected, so the de-duplication below does not fail on an empty frame.
df = pd.DataFrame(all_data, columns=["Profile URL"])

print(f"Original dataset size: {len(df)}")
# Every row whose URL occurs more than once (kept for inspection).
duplicate_profiles = df[df['Profile URL'].duplicated(keep=False)]

df_no_duplicates = df.drop_duplicates(subset='Profile URL', keep='first')
num_duplicates_removed = len(df) - len(df_no_duplicates)

df = df_no_duplicates
print(f"Number of duplicates removed: {num_duplicates_removed}")
print(f"New dataset size: {len(df_no_duplicates)}")

df_no_duplicates.to_csv(link_file, index=False)

Page Extraction Progress: 100%|[31m█████████████████████████████████[0m| 500/500 [51:24<00:00,  6.17s/page][0m

Original dataset size: 10000
Number of duplicates removed: 1776
New dataset size: 8224





# Part 2
Scraping details of each profile

In [None]:
# Load the profile links saved by Part 1 and split them into two halves so the
# (long) detail scrape can be run in two separate sessions.
csvfile = pd.read_csv(link_file)

split_index = len(csvfile) // 2
profile_urls_list_1 = csvfile["Profile URL"][:split_index].tolist()  # First half
profile_urls_list_2 = csvfile["Profile URL"][split_index:].tolist()  # Second half

# --------- Set this variable to choose which half to scrape ---------
selected_half = 2  # Options: 1 | 2

# The half number drives both the URL list and the output file. Choosing the
# file by comparing list contents is fragile: two equal halves (e.g. both empty)
# would select the wrong file, and no match would leave `file_name` undefined.
profile_halves = {
    1: (profile_urls_list_1, config["data_file_1"]),
    2: (profile_urls_list_2, config["data_file_2"]),
}
if selected_half not in profile_halves:
    raise ValueError(f"selected_half must be 1 or 2, got {selected_half!r}")
current_profiles, file_name = profile_halves[selected_half]

print(f"Length of profile_urls_list_1: {len(profile_urls_list_1)}")
print(f"Length of profile_urls_list_2: {len(profile_urls_list_2)}")
print(f"Data file name: {file_name}")

Length of profile_urls_list_1: 7233
Length of profile_urls_list_2: 7234
Data file name: therapist_data_texas_2.csv


In [None]:
def extract_basic_info(soup, url):
    """Collect the headline fields of a therapist profile page.

    Args:
        soup: Parsed profile page (BeautifulSoup-style object with
            ``find`` / ``find_all``).
        url: Profile URL, copied into the result unchanged.

    Returns:
        dict with keys "Profile URL", "Name", "Title", "Credential", "Phone"
        and "Availability". Any field whose element is missing from the page
        keeps the placeholder "Not Found".
    """
    info = {"Profile URL": url}
    info.update(dict.fromkeys(["Name", "Title", "Credential", "Phone", "Availability"], "Not Found"))

    # Name: text of the first <h1>, lower-cased.
    heading = soup.find("h1")
    if heading:
        info["Name"] = heading.get_text(strip=True).lower()

    # Title: the profile-type suffix span.
    title_span = soup.find("span", {"data-x": "profile-suffix-profile-type"})
    if title_span:
        info["Title"] = title_span.get_text(strip=True)

    # Credential: distinct academic suffixes, sorted and comma-joined.
    credential_spans = soup.find_all("span", {"data-x": "profile-suffix-academic"})
    if credential_spans:
        distinct = {span.get_text(strip=True) for span in credential_spans}
        info["Credential"] = ", ".join(sorted(distinct))

    phone_div = soup.find("div", class_="profile-phone")
    if phone_div:
        info["Phone"] = phone_div.get_text(strip=True)

    # Availability: lower-cased text with the "available " prefix removed.
    # When the text does not contain it, replace() leaves the text as-is.
    appointments_div = soup.find("div", class_="at-a-glance_row_appointments")
    if appointments_div:
        lowered = appointments_div.get_text(strip=True).lower()
        info["Availability"] = lowered.replace("available ", "")

    return info


def extract_list(soup, keyword):
    """Return the items of the list that follows a heading containing `keyword`.

    The first <h2>/<h3>/<h4> whose text contains `keyword` (case-insensitive)
    is located, and the <span> texts inside its next sibling <ul> are
    lower-cased and joined with " | ".

    Returns:
        The joined string, or "Not Found" when there is no such heading or the
        heading has no following <ul> sibling.
    """
    def is_wanted_heading(tag):
        if tag.name not in ("h2", "h3", "h4"):
            return False
        return keyword.lower() in tag.get_text(strip=True).lower()

    heading = soup.find(is_wanted_heading)
    if not heading:
        return "Not Found"

    item_list = heading.find_next_sibling("ul")
    if not item_list:
        return "Not Found"

    entries = [span.get_text(strip=True).lower() for span in item_list.find_all("span")]
    return " | ".join(entries)




def _first_amount(text):
    """Return the first number in `text` as a digit string, or "Not Found".

    A thousands-separated amount such as "1,200" is read as one number
    ("1200") rather than being cut off at the first comma.
    """
    match = re.search(r"\d{1,3}(?:,\d{3})+|\d+", text)
    return match.group().replace(",", "") if match else "Not Found"


def extract_fees_payment(soup):
    """Extract the individual and couple session fees from a profile page.

    Every <li> on the page is inspected; an item mentioning
    "Individual Sessions" or "Couple Sessions" supplies the fee for the
    matching column (a later matching item overwrites an earlier one).

    Returns:
        dict with keys "Individual Sessions ($)" and "Couple Sessions ($)".
        Each value is the fee as a string of digits, "Not Found" when the
        item is present but contains no number, or 0 when the page has no
        such item at all.
    """
    # Label searched for in the item text -> output column.
    # "Individual Sessions" is checked first, so an item naming both fills
    # only the individual column.
    fee_columns = {
        "Individual Sessions": "Individual Sessions ($)",
        "Couple Sessions": "Couple Sessions ($)",
    }
    data = {column: 0 for column in fee_columns.values()}

    for item in soup.find_all("li"):
        text = item.get_text(strip=True)
        for label, column in fee_columns.items():
            if label in text:
                data[column] = _first_amount(text)
                break
    return data



def extract_specialties_expertise(soup):
    """Read the "Top Specialties" and "Expertise" attribute groups.

    Looks inside the div with id "specialty-attributes-section". Each
    "attributes-group" div that has an <h3> heading contributes its
    "attribute_base" span texts (lower-cased, joined with " | "). A heading
    containing "Top Specialties" fills that column; any other heading fills
    "Expertise".

    Returns:
        dict with keys "Top Specialties" and "Expertise"; a column keeps
        "Not Found" when no group supplied it.
    """
    result = dict.fromkeys(["Top Specialties", "Expertise"], "Not Found")

    section = soup.find("div", id="specialty-attributes-section")
    if not section:
        return result

    for group in section.find_all("div", class_="attributes-group"):
        heading = group.find("h3")
        if not heading:
            continue  # groups without a heading are ignored

        if "Top Specialties" in heading.get_text(strip=True):
            column = "Top Specialties"
        else:
            column = "Expertise"

        labels = [
            span.get_text(strip=True).lower()
            for span in group.find_all("span", class_="attribute_base")
        ]
        result[column] = " | ".join(labels)

    return result



def extract_types_of_therapy(soup):
    """Return the treatment approaches listed on a profile page.

    Reads the "attribute_base" spans inside the div with id
    "treatment-approach-attributes-section", lower-cases their text and joins
    them with " | ". Returns "Not Found" when that section is missing.
    """
    section = soup.find("div", id="treatment-approach-attributes-section")
    if not section:
        return "Not Found"

    approaches = [
        span.get_text(strip=True).lower()
        for span in section.find_all("span", class_="attribute_base")
    ]
    return " | ".join(approaches)



def extract_qualifications(soup):
    """Extract membership, certificate, education and experience details.

    Two parts of the page are read:
      * <li class="qualifications-element"> items, for "Membership" and
        "Certificate" lines;
      * "div.details span.primary-details" spans, for the "In Practice" and
        "Attended" lines.

    Returns:
        dict with keys "Membership", "Certificate", "Attended University",
        "Major/Degree", "Graduation Year" (lower-cased text, or "Not Found")
        and "In Practice (years)" (digit string, or 0 when absent).
    """
    result = dict.fromkeys(
        ["Membership", "Certificate", "Attended University", "Major/Degree", "Graduation Year"],
        "Not Found",
    )
    result["In Practice (years)"] = 0

    for element in soup.find_all("li", class_="qualifications-element"):
        line = element.get_text(strip=True)
        # "Membership" takes precedence when a line mentions both words.
        if "Membership" in line:
            result["Membership"] = line.replace("Membership with", "").strip().lower()
            continue
        if "Certificate" in line:
            result["Certificate"] = line.replace("Certificate from", "").strip().lower()

    # Column -> pattern applied to an "Attended ..." line; group 1 is the value.
    education_patterns = (
        ("Attended University", r"Attended (.*?)(?:,|$)"),
        ("Major/Degree", r",\s(.*?)(?:,|\sGraduated)"),
        ("Graduation Year", r"Graduated\s*(\d{4})"),
    )

    for span in soup.select("div.details span.primary-details"):
        line = span.get_text(strip=True)
        if "In Practice" in line:
            years = re.search(r"\d+", line)
            result["In Practice (years)"] = years.group() if years else 0
        elif "Attended" in line:
            for column, pattern in education_patterns:
                found = re.search(pattern, line)
                if found:
                    result[column] = found.group(1).strip().lower()

    return result



def extract_state_zip(soup):
    """Split the profile's address line into city, state and ZIP code.

    The text of <span class="address-region address-text"> must look like
    "<city>, <two capital letters> <five digits>" to be parsed.

    Returns:
        dict with keys "City", "State", "ZIP Code"; all three are
        "Not Found" when the span is missing or its text does not match.
    """
    columns = ("City", "State", "ZIP Code")

    region = soup.find("span", class_="address-region address-text")
    if region:
        parsed = re.match(r"^(.*?),\s*([A-Z]{2})\s*(\d{5})$", region.get_text(strip=True))
        if parsed:
            return dict(zip(columns, parsed.groups()))

    return dict.fromkeys(columns, "Not Found")




def extract_client_focus(soup):
    """Extract the "client focus" tiles (age, participants, languages, ...).

    Each <div class="client-focus-tile"> contributes its
    "client-focus-description" span texts (lower-cased, commas removed,
    joined with " | ") to every column whose name appears in the tile's <h3>
    heading text.

    Returns:
        dict with keys "Age", "Participants", "Communities", "Religion" and
        "I also speak"; a column keeps "Not Found" when no tile heading
        mentions it.
    """
    columns = ("Age", "Participants", "Communities", "Religion", "I also speak")
    focus = dict.fromkeys(columns, "Not Found")

    for tile in soup.find_all("div", class_="client-focus-tile"):
        heading = tile.find("h3")
        heading_text = heading.get_text(strip=True) if heading else ""

        descriptions = [
            span.get_text(strip=True).lower().replace(",", "")
            for span in tile.find_all("span", class_="client-focus-description")
        ]
        joined = " | ".join(descriptions)

        # Substring test: a column is filled when its name occurs in the heading.
        focus.update({column: joined for column in columns if column in heading_text})

    return focus


# 	def extract_endorsements(soup):
# 	    data = {"Endorsement Count": 0, "Endorsed By": "Not Found"}
# 	    endorsement_badge = soup.find("div", class_="endorsement-count clickable profile-badge")
# 	    if endorsement_badge:
# 	        data["Endorsement Count"] = int(re.search(r"\d+", endorsement_badge.get_text(strip=True)).group()) if re.search(r"\d+", endorsement_badge.get_text(strip=True)) else 0
# 	    endorsers = [
# 	        f"{e.find('div', class_='title').get_text(strip=True)} - {e.find('div', class_='subtitle').get_text(strip=True)}"
# 	        for e in soup.find_all("div", class_="endorsement")
# 	    ]
# 	    data["Endorsed By"] = " | ".join(endorsers) if endorsers else "Not Found"
# 	    return data

#def extract_personal_statement(soup):
#    statement_section = soup.find("div", class_="personal-statement-container")
#    return " ".join(p.get_text(strip=True) for p in statement_section.find_all("span", class_="paragraph")) if statement_section else "Not Found"

In [None]:
# Visit every profile URL in `current_profiles`, extract its fields with the
# extract_* helpers, and save the collected rows to `file_name`.

# Seconds to wait for a response; without a timeout a stalled connection would
# block the whole run indefinitely.
REQUEST_TIMEOUT = 30
# Write a partial CSV every this many profiles so an interrupted multi-hour run
# (crash, runtime disconnect) does not lose everything scraped so far.
CHECKPOINT_EVERY = 250

for position, url in enumerate(
    tqdm(current_profiles, desc="Scraping Progress", colour="red", ncols=100, unit="profile"),
    start=1,
):
    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")

            # Merge order defines the column order of the output CSV.
            therapist_data = {
                **extract_basic_info(soup, url),
                **extract_state_zip(soup),
                **extract_fees_payment(soup),
                "Insurance": extract_list(soup, "Insurance"),
                **extract_specialties_expertise(soup),
                "Types of Therapy": extract_types_of_therapy(soup),
                **extract_qualifications(soup),
                **extract_client_focus(soup),
            }

            all_therapists_data.append(therapist_data)
        else:
            # Report skipped profiles instead of dropping them silently.
            print(f"⚠️ Skipping {url}: status code {response.status_code}")

    except Exception as e:
        # Best effort: one failing profile must not stop the whole run.
        print(f"⚠️ Error scraping {url}: {e}")

    # Pause after every attempt, including failed ones, to stay polite to the server.
    time.sleep(random.uniform(1, 2))

    if position % CHECKPOINT_EVERY == 0:
        pd.DataFrame(all_therapists_data).to_csv(file_name, index=False)


df = pd.DataFrame(all_therapists_data)
df.to_csv(file_name, index=False)
files.download(file_name)


Scraping Progress:  45%|[31m█████████████▉                 [0m| 3261/7234 [1:53:03<2:07:29,  1.93s/profile][0m

# Merge

In [5]:
import pandas as pd
from google.colab import files


# --------- Set this variable to merge the files of a specific state ---------
selected_state = "new_york"  # Options: texas, florida, california, new_york

file_path = '/content/therapist_data_' + selected_state

# Stack the two half-files produced by Part 2 (<state>_1.csv, <state>_2.csv)
# into a single frame with a fresh 0..n-1 index.
part_paths = [file_path + "_1.csv", file_path + "_2.csv"]
therapist_data_merged = pd.concat(
    [pd.read_csv(part_path) for part_path in part_paths],
    ignore_index=True,
)

# Write the combined data next to the parts and offer it for download.
output_path = file_path + ".csv"
therapist_data_merged.to_csv(output_path, index=False)

files.download(output_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>