<a href="https://colab.research.google.com/github/pallavmarch/WebScraping_PsychologyToday/blob/main/psychologytoday_webscraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from google.colab import files
import re
from tqdm import tqdm
import random

In [None]:
# Listing pages are addressed as base_url + <page number>.
base_url = "https://www.psychologytoday.com/us/therapists/texas?page="
total_pages = 500

# Headers sent with every request: a desktop-Chrome User-Agent string.
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"}

# Accumulators appended to by the scraping loops further down:
# all_data holds the profile links, all_therapists_data the per-profile records.
all_data, all_therapists_data = [], []

# Scraping the links of profiles

In [None]:
# Walk the paginated listing and collect the profile link from every result row.
for page in tqdm(range(1, total_pages + 1), desc="Page Extraction Progress", total=total_pages, unit="page", colour="red", ncols=100):
    url = base_url + str(page)
    try:
        # The timeout stops a stalled connection from blocking the crawl indefinitely.
        response = requests.get(url, headers=HEADERS, timeout=30)
    except requests.RequestException as e:
        # A network failure on one page must not abort the loop (and skip the
        # CSV export below); report it and move on to the next page.
        print(f"❌ Failed to fetch page {page}: {e}")
        continue
    # Randomised pause after each request.
    time.sleep(random.uniform(1, 2))

    if response.status_code != 200:
        print(f"❌ Failed to fetch page {page}. Status code:", response.status_code)
        continue

    soup = BeautifulSoup(response.text, "html.parser")


    therapists = soup.find_all("div", class_="results-row")

    for therapist in therapists:
        try:

            # Rows without a profile link are recorded with the placeholder "N/A".
            profile_link_element = therapist.find("a", class_="profile-title")
            profile_url = profile_link_element.get("href") if profile_link_element else "N/A"

#            name_element = therapist.find("a", class_="profile-title")
#            name = name_element.text.strip() if name_element else "N/A"
#            credentials_element = therapist.find("div", class_="profile-subtitle-credentials")
#            credentials = credentials_element.text.strip() if credentials_element else "N/A"
#            description_element = therapist.find("div", class_="profile-statement")
#            description = description_element.text.strip() if description_element else "N/A"
#            phone_element = therapist.find("span", class_="results-row-phone")
#            phone_number = phone_element.text.strip() if phone_element else "N/A"
#            location_element = therapist.find("span", class_="address")
#            location = location_element.text.strip() if location_element else "N/A"
#            verified = "Verified" if therapist.find("div", class_="verified-badge") else "Not Verified"

            all_data.append({
                "Profile URL": profile_url
                # "Therapist Name": name, "Credentials": credentials, "Description": description, "Phone Number": phone_number, "Location": location, "Verified": verified
            })
        except Exception as e:
            print(f"Error extracting data: {e}")

    # Fixed extra pause between listing pages.
    time.sleep(2)

# Persist the collected links (one row per result row).
df = pd.DataFrame(all_data)
df.to_csv("therapist_data.csv", index=False)


### Duplicates?

In [None]:
# Select every row whose profile URL occurs more than once (all occurrences are kept).
duplicate_profiles = df.loc[df["Profile URL"].duplicated(keep=False)]
if duplicate_profiles.empty:
    print("No duplicate Profile URLs found.")
else:
    print("Duplicate Profile URLs found:")
    print(duplicate_profiles["Profile URL"])

In [None]:
# Keep only the first row seen for each profile URL, then overwrite the links CSV.
df_no_duplicates = df.loc[~df["Profile URL"].duplicated(keep="first")]
df_no_duplicates.to_csv("therapist_data.csv", index=False)
print("Duplicates removed.")

In [None]:
# Sanity check: the de-duplicated frame should contain no repeated profile URLs.
repeated_mask = df_no_duplicates["Profile URL"].duplicated(keep=False)
duplicate_profiles = df_no_duplicates[repeated_mask]
if duplicate_profiles.empty:
    print("No duplicate Profile URLs found.")
else:
    print("Duplicate Profile URLs found:")
    print(duplicate_profiles["Profile URL"])

# Scraping details of each profile

In [None]:
# Reload the saved links from the same relative path the earlier cells wrote to,
# so this cell also works when the working directory is not /content.
df = pd.read_csv("therapist_data.csv")
split_index = len(df) // 2

# Split the URLs into two halves -- presumably so each half can be scraped in its own run.
profile_urls_list_1 = df["Profile URL"][:split_index].tolist()  # First half
profile_urls_list_2 = df["Profile URL"][split_index:].tolist()  # Second half

## Functions

In [None]:
# Generic helper: pull the list that follows a heading containing a keyword
def extract_list(soup, keyword):
    """Return the list items that follow the first heading matching ``keyword``.

    The first h2/h3/h4 whose text contains ``keyword`` (case-insensitive) is
    located, then the text of every <span> inside its next sibling <ul> is
    joined with " | ".

    Returns "Not Found" when there is no such heading or no sibling <ul>.
    """
    def _is_matching_heading(tag):
        if tag.name not in ("h2", "h3", "h4"):
            return False
        return keyword.lower() in tag.get_text(strip=True).lower()

    heading = soup.find(_is_matching_heading)
    if not heading:
        return "Not Found"

    listing = heading.find_next_sibling("ul")
    if not listing:
        return "Not Found"

    labels = [entry.get_text(strip=True) for entry in listing.find_all("span")]
    return " | ".join(labels)

# Extract fees, payment methods
def extract_fees_payment(soup):
    """Collect session fees and accepted payment methods from the page's <li> items.

    Every <li> is inspected; when several items match the same field, the last
    one wins.

    Returns a dict with the keys "Individual Sessions ($)", "Couple Sessions ($)"
    and "Pay By". A fee is the first "$<digits>" token in the matching item;
    any field that cannot be determined is "Not Found".
    """
    data = {"Individual Sessions ($)": "Not Found", "Couple Sessions ($)": "Not Found", "Pay By": "Not Found"}
    for item in soup.find_all("li"):
        text = item.get_text(strip=True)
        if "Individual Sessions" in text:
            price = re.search(r"\$\d+", text)
            data["Individual Sessions ($)"] = price.group() if price else "Not Found"
        elif "Couple Sessions" in text:
            price = re.search(r"\$\d+", text)
            data["Couple Sessions ($)"] = price.group() if price else "Not Found"
        elif "Pay by" in text:
            # get_text(strip=True) joins the stripped text nodes without a
            # separator, so the label may be fused to the value ("Pay byCash");
            # remove the label whether or not whitespace follows it.
            data["Pay By"] = re.sub(r"Pay by\s*", "", text).strip()
    return data


# Extract Specialties & Expertise
def extract_specialties_expertise(soup):
    """Read the "Top Specialties" and "Expertise" attribute groups.

    Only groups inside the div with id "specialty-attributes-section" are
    considered. A group whose <h3> text contains "Top Specialties" fills that
    column; any other group with an <h3> fills "Expertise". Groups without an
    <h3> are ignored. Values are the group's "attribute_base" spans joined
    with " | "; columns that are never filled stay "Not Found".
    """
    result = {"Top Specialties": "Not Found", "Expertise": "Not Found"}

    container = soup.find("div", id="specialty-attributes-section")
    if not container:
        return result

    for group in container.find_all("div", class_="attributes-group"):
        title = group.find("h3")
        if not title:
            continue

        if "Top Specialties" in title.get_text(strip=True):
            column = "Top Specialties"
        else:
            column = "Expertise"

        chips = group.find_all("span", class_="attribute_base")
        result[column] = " | ".join(chip.get_text(strip=True) for chip in chips)

    return result


# Extract Types of Therapy
def extract_types_of_therapy(soup):
    """Return the treatment approaches joined with " | ".

    Reads the "attribute_base" spans inside the div with id
    "treatment-approach-attributes-section"; returns "Not Found" when that
    div is absent.
    """
    section = soup.find("div", id="treatment-approach-attributes-section")
    if not section:
        return "Not Found"
    approaches = [span.get_text(strip=True) for span in section.find_all("span", class_="attribute_base")]
    return " | ".join(approaches)


# Extract Personal Statement
def extract_personal_statement(soup):
    """Return the personal statement as a single space-joined string.

    Reads the "paragraph" spans inside the div with class
    "personal-statement-container"; returns "Not Found" when that div is
    absent.
    """
    container = soup.find("div", class_="personal-statement-container")
    if not container:
        return "Not Found"
    paragraphs = [span.get_text(strip=True) for span in container.find_all("span", class_="paragraph")]
    return " ".join(paragraphs)


# Extract Qualifications (including university, major, graduation year)
def extract_qualifications(soup):
    """Parse the "qualifications-element" list items into a flat record.

    Each item is matched against the first applicable category, in this order:
    "Verified by", "In Practice", "Membership", "Certificate", "Attended".
    When several items fall into the same category the last one wins.

    Returns a dict whose "Qualifications" entry holds the raw text of all
    items joined with " | "; fields that cannot be determined keep their
    defaults ("No" for the verification flag, "Not Found" otherwise).
    """
    record = {
        "Verified by Psychology Today": "No",
        "In Practice (years)": "Not Found",
        "Membership": "Not Found",
        "Certificate": "Not Found",
        "Attended University": "Not Found",
        "Major/Degree": "Not Found",
        "Graduation Year": "Not Found",
        "Qualifications": "Not Found",
    }

    # Patterns applied to an "Attended ..." item; each fills one column when it matches.
    education_patterns = (
        ("Attended University", r"Attended(.*?)(?:,|\s-)"),
        ("Major/Degree", r",\s(.*?)(?:,|\sGraduated)"),
        ("Graduation Year", r"Graduated(\d{4})"),
    )

    raw_entries = []  # raw text of every item, for the "Qualifications" column
    for entry in soup.find_all("li", class_="qualifications-element"):
        text = entry.get_text(strip=True)
        raw_entries.append(text)

        if "Verified by" in text:
            record["Verified by Psychology Today"] = "Yes"
            continue

        if "In Practice" in text:
            years = re.search(r"\d+", text)
            record["In Practice (years)"] = years.group() if years else "Not Found"
            continue

        if "Membership" in text:
            record["Membership"] = text.replace("Membership with", "").strip()
            continue

        if "Certificate" in text:
            record["Certificate"] = text.replace("Certificate from", "").strip()
            continue

        if "Attended" not in text:
            continue

        for column, pattern in education_patterns:
            hit = re.search(pattern, text)
            if hit:
                record[column] = hit.group(1).strip()

    if raw_entries:
        record["Qualifications"] = " | ".join(raw_entries)
    else:
        record["Qualifications"] = "Not Found"

    return record


# Extract State & ZIP Code
def extract_state_zip(soup):
    """Pull the state abbreviation and ZIP code out of the address-region span.

    The first standalone two-capital-letter token becomes "State" and the
    first standalone five-digit token becomes "ZIP Code". Either value is
    "Not Found" when the span is missing or the token is not present.
    """
    result = {"State": "Not Found", "ZIP Code": "Not Found"}

    region = soup.find("span", class_="address-region address-text")
    if not region:
        return result

    region_text = region.get_text(strip=True)
    for column, pattern in (("State", r"\b[A-Z]{2}\b"), ("ZIP Code", r"\b\d{5}\b")):
        hit = re.search(pattern, region_text)
        if hit:
            result[column] = hit.group()
    return result


# Extract client focus details
def extract_client_focus(soup):
    """Read the client-focus tiles into one column per known heading.

    For each "client-focus-tile" div, the tile's "client-focus-description"
    spans (with commas removed) are joined with " | " and stored under every
    known column name that occurs in the tile's <h3> text. Tiles without an
    <h3> match nothing. Columns never matched stay "Not Found".
    """
    columns = ["Age", "Participants", "Communities", "Religion", "I also speak"]
    result = dict.fromkeys(columns, "Not Found")

    for tile in soup.find_all("div", class_="client-focus-tile"):
        title = tile.find("h3")
        title_text = title.get_text(strip=True) if title else ""

        descriptions = tile.find_all("span", class_="client-focus-description")
        joined = " | ".join(d.get_text(strip=True).replace(",", "") for d in descriptions)

        for column in columns:
            if column in title_text:
                result[column] = joined

    return result

# Extract endorsements
def extract_endorsements(soup):
    """Return the endorsement count and the list of endorsers.

    "Endorsement Count" is the first integer in the endorsement badge text
    (0 when the badge or a number is absent). "Endorsed By" joins one entry
    per "endorsement" div with " | ": "<title> - <subtitle>" when both parts
    exist, the single available part when only one exists. Divs with neither
    part are skipped. "Not Found" is returned when no entry is produced.
    """
    data = {"Endorsement Count": 0, "Endorsed By": "Not Found"}

    endorsement_badge = soup.find("div", class_="endorsement-count clickable profile-badge")
    if endorsement_badge:
        count_match = re.search(r"\d+", endorsement_badge.get_text(strip=True))
        if count_match:
            data["Endorsement Count"] = int(count_match.group())

    endorsers = []
    for card in soup.find_all("div", class_="endorsement"):
        title_node = card.find("div", class_="title")
        subtitle_node = card.find("div", class_="subtitle")
        # find() returns None for a missing part; calling get_text() on it
        # used to raise AttributeError and discard the whole profile.
        if title_node and subtitle_node:
            endorsers.append(f"{title_node.get_text(strip=True)} - {subtitle_node.get_text(strip=True)}")
        elif title_node or subtitle_node:
            endorsers.append((title_node or subtitle_node).get_text(strip=True))

    data["Endorsed By"] = " | ".join(endorsers) if endorsers else "Not Found"

    return data


## Main Scraping

In [None]:
def _text_or_na(tag):
    """Return the stripped text of a parsed tag, or "N/A" when the tag was not found."""
    return tag.get_text(strip=True) if tag else "N/A"


# Fetch every profile page in the list and flatten it into one record per therapist.
for idx, url in enumerate(tqdm(profile_urls_list_2, desc="Scraping Progress", colour="red", ncols=100,unit="profile")):

    try:
        # The timeout keeps one stalled connection from hanging the whole run;
        # a timeout error is reported by the except clause below.
        response = requests.get(url, headers=HEADERS, timeout=30)
        time.sleep(random.uniform(1, 2))

        if response.status_code != 200:
            # Report skipped profiles instead of dropping them silently.
            print(f"⚠️ Skipping {url}: status code {response.status_code}")
            continue

        soup = BeautifulSoup(response.text, "html.parser")

        therapist_data = {
            "Profile URL": url,
            "Name": _text_or_na(soup.find("h1")),
            "Title": _text_or_na(soup.find("h2")),
            "Location": _text_or_na(soup.find("div", class_="address")),
            **extract_state_zip(soup),
            "Phone": _text_or_na(soup.find("div", class_="profile-phone")),
            **extract_fees_payment(soup),
            "Insurance": extract_list(soup, "Insurance"),
            **extract_specialties_expertise(soup),
            "Types of Therapy": extract_types_of_therapy(soup),
            "Personal Statement": extract_personal_statement(soup),
            **extract_qualifications(soup),
            **extract_endorsements(soup),
            **extract_client_focus(soup),
        }

        all_therapists_data.append(therapist_data)

    except Exception as e:
        # Best effort: one broken profile must not stop the remaining ones.
        print(f"⚠️ Error scraping {url}: {e}")


df = pd.DataFrame(all_therapists_data)

# Alternative output names -- presumably for a run over profile_urls_list_1.
#df.to_csv("therapists_profiles_data_1.csv", index=False)
#files.download("therapists_profiles_data_1.csv")

df.to_csv("therapists_profiles_data_2.csv", index=False)
files.download("therapists_profiles_data_2.csv")
df

Scraping Progress:  14%|[31m████▌                             [0m| 614/4537 [31:57<3:16:29,  3.01s/profile][0m

⚠️ Error scraping https://www.psychologytoday.com/us/therapists/amina-omar-ali-houston-tx/1111230: 'NoneType' object has no attribute 'get_text'


Scraping Progress:  20%|[31m██████▊                           [0m| 909/4537 [47:05<3:08:26,  3.12s/profile][0m

⚠️ Error scraping https://www.psychologytoday.com/us/therapists/online-support-groups-houston-tx/1143473: 'NoneType' object has no attribute 'get_text'


Scraping Progress:  30%|[31m█████████▏                     [0m| 1346/4537 [1:09:30<2:52:04,  3.24s/profile][0m

⚠️ Error scraping https://www.psychologytoday.com/us/therapists/new-season-counseling-houston-tx/909737: 'NoneType' object has no attribute 'get_text'


Scraping Progress:  33%|[31m██████████▏                    [0m| 1489/4537 [1:16:57<2:51:42,  3.38s/profile][0m

⚠️ Error scraping https://www.psychologytoday.com/us/therapists/jb-mccall-ma-sap-lpc-grand-prairie-tx/767770: 'NoneType' object has no attribute 'get_text'


Scraping Progress:  50%|[31m███████████████▍               [0m| 2267/4537 [1:57:05<2:01:28,  3.21s/profile][0m

⚠️ Error scraping https://www.psychologytoday.com/us/therapists/christy-graham-denton-tx/138276: 'NoneType' object has no attribute 'get_text'


Scraping Progress:  59%|[31m██████████████████▍            [0m| 2695/4537 [2:19:14<1:39:39,  3.25s/profile][0m

⚠️ Error scraping https://www.psychologytoday.com/us/therapists/mayra-cano-san-antonio-tx/268445: 'NoneType' object has no attribute 'get_text'


Scraping Progress:  65%|[31m████████████████████           [0m| 2936/4537 [2:31:46<1:18:08,  2.93s/profile][0m

⚠️ Error scraping https://www.psychologytoday.com/us/therapists/cameron-house-bryan-tx/354678: 'NoneType' object has no attribute 'get_text'


Scraping Progress:  65%|[31m████████████████████▏          [0m| 2946/4537 [2:32:15<1:16:29,  2.88s/profile][0m

⚠️ Error scraping https://www.psychologytoday.com/us/therapists/mylene-mackey-san-antonio-tx/779220: 'NoneType' object has no attribute 'get_text'


Scraping Progress:  67%|[31m████████████████████▋          [0m| 3034/4537 [2:36:49<1:17:38,  3.10s/profile][0m

⚠️ Error scraping https://www.psychologytoday.com/us/therapists/crystal-ross-houston-tx/1250674: 'NoneType' object has no attribute 'get_text'


Scraping Progress:  71%|[31m█████████████████████▉         [0m| 3205/4537 [2:45:41<1:08:37,  3.09s/profile][0m

⚠️ Error scraping https://www.psychologytoday.com/us/therapists/kevin-r-mack-houston-tx/844923: 'NoneType' object has no attribute 'get_text'


Scraping Progress:  79%|[31m██████████████████████████▏      [0m| 3593/4537 [3:05:34<48:46,  3.10s/profile][0m

⚠️ Error scraping https://www.psychologytoday.com/us/therapists/melissa-garcia-houston-tx/1136025: 'NoneType' object has no attribute 'get_text'


Scraping Progress:  84%|[31m███████████████████████████▊     [0m| 3829/4537 [3:17:49<37:57,  3.22s/profile][0m

⚠️ Error scraping https://www.psychologytoday.com/us/therapists/emma-taylor-houston-tx/1156543: 'NoneType' object has no attribute 'get_text'


Scraping Progress:  89%|[31m█████████████████████████████▍   [0m| 4047/4537 [3:29:10<24:11,  2.96s/profile][0m

⚠️ Error scraping https://www.psychologytoday.com/us/therapists/steven-moss-houston-tx/1082445: 'NoneType' object has no attribute 'get_text'


Scraping Progress: 100%|[31m█████████████████████████████████[0m| 4537/4537 [3:54:30<00:00,  3.10s/profile][0m


Unnamed: 0,Profile URL,Name,Title,Location,State,ZIP Code,Phone,Individual Sessions ($),Couple Sessions ($),Pay By,...,Major/Degree,Graduation Year,Qualifications,Endorsement Count,Endorsed By,Age,Participants,Communities,Religion,I also speak
0,https://www.psychologytoday.com/us/therapists/...,Jeremy Capello,"Psychologist,PhD","Jeremy Capello1304 Goeth CircleAustin, TX 7874...",TX,78746,(512) 772-5397,$150,Not Found,"American Express, Cash, Check, Discover, Healt...",...,Not Found,2008,Verified byPsychology TodayLicensed by State o...,2,Michael Uebel - Clinical Social Work/Therapist...,Adults | Elders (65+),Individuals | Couples,Bisexual Allied | Gay Allied | HIV / AIDS Alli...,Not Found,Not Found
1,https://www.psychologytoday.com/us/therapists/...,Awakening Connections,"Licensed Professional Counselor,BSW,,MS,,LPC","Virtual OnlyHouston, TX 77096(346) 460-4798",TX,77096,(346) 460-4798,$110,$150,"ACH Bank transfer, American Express, Discover,...",...,Not Found,Not Found,Verified byPsychology TodayLicensed by State o...,1,"360 Therapy & Psychological Services, LLC - Ps...",Preteen | Teen | Adults | Elders (65+),Individuals | Couples | Group,Sex Worker Allied | Single Mother,Not Found,Not Found
2,https://www.psychologytoday.com/us/therapists/...,Johanna Austin-Azadi,"Marriage & Family Therapist,MA,LMFT","Austin Therapy and CoachingAustin, TX 78722(51...",TX,78722,(512) 359-3503,$375,Not Found,"American Express, Cash, Check, Discover, Maste...",...,Not Found,Not Found,Verified byPsychology TodayLicensed by State o...,1,Shannon Haragan - Licensed Professional Counse...,Adults,Individuals | Couples | Family,Not Found,Not Found,Not Found
3,https://www.psychologytoday.com/us/therapists/...,Stacia Daniel,"Psychologist,PhD","True Life Counseling18838 Stone Oak Parkway, S...",TX,78258,(210) 209-8626,Not Found,Not Found,"Cash, Check, Mastercard, Visa",...,Not Found,1997,Verified byPsychology TodayLicensed by State o...,0,Not Found,Adults | Elders (65+),Individuals,Not Found,Christian,Not Found
4,https://www.psychologytoday.com/us/therapists/...,Marcos Kito Holtzman,"Licensed Professional Counselor,MEd,LPC-S,LCDC...","New Wave Behavioral HealthBrownsville, TX 7852...",TX,78520,(512) 729-7544,$175,$200,"American Express, Cash, Check, Discover, Healt...",...,Brownsville,2001,Verified byPsychology TodayLicensed by State o...,2,David Armando Tejeda - Licensed Professional C...,Teen | Adults,Individuals | Couples | Family,Aviation Professionals | Gay Allied | HIV / AI...,Christian,Spanish
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4519,https://www.psychologytoday.com/us/therapists/...,"Renewing The Mind Counseling Services Tx, LLC","Licensed Professional Counselor,MS, LPC,LCPC,L...","Renewing The Mind Counseling Services Tx, LLCH...",TX,77338,(281) 957-1270,$175,Not Found,"American Express, Cash, Discover, Health Savin...",...,MS,2019,Verified byPsychology TodayLicensed by State o...,0,Not Found,Teen | Adults | Elders (65+),Individuals | Group,Gay Allied | HIV / AIDS Allied | Immuno-disord...,Christian,Not Found
4520,https://www.psychologytoday.com/us/therapists/...,Brenda Lisbeth Medina,"Marriage & Family Therapist Associate,MS(she, ...","Therapy Works4645 Avon LaneSuite 120AFrisco, T...",TX,75033,(903) 496-3243,$100,$100,"Cash, Visa, Zelle",...,Not Found,Not Found,Verified byPsychology TodayPrelicense by State...,1,Ana Marcela Rodriguez - Marriage & Family Ther...,Teen | Adults,Individuals | Couples | Family,Not Found,Not Found,Spanish
4521,https://www.psychologytoday.com/us/therapists/...,Anna Graybeal,"Psychologist,PhD,CGP,SEP","4315 Guadalupe StSuite 208Austin, TX 78751(512...",TX,78751,(512) 540-5328,$180,Not Found,"Cash, Check",...,Not Found,Not Found,Verified byPsychology TodayLicensed by State o...,0,Not Found,Not Found,Individuals | Couples | Group,Not Found,Not Found,Not Found
4522,https://www.psychologytoday.com/us/therapists/...,Kayleen Thorson,"Clinical Social Work/Therapist,LCSW(she, her)","1525 Lakeville Dr. Ste 114Kingwood, TX 77339(9...",TX,77339,(936) 296-3101,$125,$175,"American Express, Cash, Check, Discover, Maste...",...,Not Found,Not Found,Verified byPsychology TodayLicensed by State o...,0,Not Found,Preteen | Teen | Adults,Individuals | Couples | Family,Gay Allied | Sex Worker Allied | Single Mother...,Not Found,Not Found
