In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import asyncio
import aiohttp
import time

In [None]:
# Load the doctor table; later cells read its "profile" column for the links to scrape.
doktorlar = pd.read_csv(filepath_or_buffer="doktorlar_id.csv")

# Preview the last five rows of the loaded table.
doktorlar.tail(n=5)

In [None]:
# Spot-check the profile links stored at index labels 9941 through 9951.
for row_label in range(9941, 9952):
    print(doktorlar.loc[row_label, "profile"])

In [None]:
# Row labels whose profile could not be scraped; filled in by the scraping coroutine.
error_indices = list()

In [None]:
import json

# get the details of the profile
def _text_or_none(tag):
    """Return the stripped text of ``tag``, or None when the tag was not found."""
    return tag.text.strip() if tag is not None else None


def _texts_as_json(tags):
    """Serialise the stripped text of every tag in ``tags`` as a JSON array string."""
    return json.dumps([tag.text.strip() for tag in tags])


async def get_profile_details(url, session, i):
    """Scrape one doctor profile page and store its fields in row ``i`` of ``doktorlar``.

    Args:
        url: Address of the profile page to download.
        session: Open ``aiohttp.ClientSession`` used to issue the GET request.
        i: Index label of the row in the global ``doktorlar`` DataFrame to fill.

    Returns:
        None. The scraped values are written to ``doktorlar``; list-valued
        fields (services, eed_items, expert_interests) are stored as JSON
        strings.

    The ``expert-profile-header`` block and its ``data-id`` attribute are
    required. Every other section is optional: a missing single-value section
    is stored as None and a missing list section as an empty JSON array, so
    one absent widget does not discard the whole profile.

    Any exception (HTTP error status, network failure, missing header) is
    caught, printed together with the row label and URL, and ``i`` is recorded
    once in the global ``error_indices`` list.
    """
    try:
        async with session.get(url) as response:
            # Reject 4xx/5xx responses up front instead of parsing an error page.
            response.raise_for_status()
            html = await response.text()

        # The body has been read in full, so parsing needs nothing more from the response.
        soup = BeautifulSoup(html, "html.parser")

        expert_profile_header = soup.find("div", class_="expert-profile-header")
        if expert_profile_header is None:
            raise ValueError("expert-profile-header block not found")

        about_modal = soup.find("div", id="aboutModal")
        about_content = (
            about_modal.find("div", id="aboutContent") if about_modal is not None else None
        )

        interests_box = soup.find("div", class_="ds-expert-interest")
        expert_interests = interests_box.find_all("a") if interests_box is not None else []

        # Key order matches the column order the dataframe is filled in.
        doctor_details = {
            'data_id': expert_profile_header["data-id"],
            'expert_point': _text_or_none(
                expert_profile_header.find("div", class_="expert-point")
            ),
            'services': _texts_as_json(
                expert_profile_header.find_all("a", class_="service-list-item")
            ),
            'about_content': _text_or_none(about_content),
            'eed_items': _texts_as_json(soup.find_all("div", class_="eed-item")),
            'expert_interests': _texts_as_json(expert_interests),
            'experience_company': _text_or_none(
                soup.find("div", class_="experience-company")
            ),
            # Raw contents of the first <script type="application/ld+json"> tag.
            'application_ld_json': _text_or_none(
                soup.find("script", type="application/ld+json")
            ),
        }

        # All parsing succeeded; only now touch the dataframe so a failure
        # never leaves a half-filled row behind.
        for column, value in doctor_details.items():
            doktorlar.loc[i, column] = value

        print(f"Profile {i} details added to the dataframe")

    except Exception as e:
        # Best-effort scraping: report which row failed and remember it for a retry.
        print(f"Error at index {i} ({url}): {e!r}")
        if i not in error_indices:
            error_indices.append(i)

In [None]:
start_time = time.time()
async with aiohttp.ClientSession() as session:
    tasks = []
    for i in range(13538, len(doktorlar)):
        url = doktorlar["profile"][i]
        tasks.append(get_profile_details(url, session, i))
    await asyncio.gather(*tasks)
end_time = time.time()

print(f"Time taken to get the details of profiles: {end_time - start_time} seconds")

In [None]:
# get the error indices by checking data_id column in the doktorlar dataframe
error_indices = doktorlar[doktorlar["data_id"].isnull()].index.tolist()
print(len(error_indices))

# call the function again for the error indices
start_time = time.time()
async with aiohttp.ClientSession() as session:
    tasks = []
    for i in error_indices:
        url = doktorlar["profile"][i]
        tasks.append(get_profile_details(url, session, i))
    await asyncio.gather(*tasks)
end_time = time.time()

print(f"Time taken to get the details of profiles: {end_time - start_time} seconds")

In [None]:
# Persist the enriched table to disk, leaving the index out of the file.
doktorlar.to_csv(path_or_buf="doktorlar_details.csv", index=False)

In [None]:
# Preview the first five rows of the enriched table.
doktorlar.head(n=5)

In [None]:
# Build a narrower frame holding only the id, data_id and application_ld_json columns.
doktorlar_data_id = doktorlar.loc[:, ["id", "data_id", "application_ld_json"]]

In [None]:
# Write the three-column subset to its own CSV (no index column), then preview it.
doktorlar_data_id.to_csv(path_or_buf="doktorlar_data_id.csv", index=False)
doktorlar_data_id.head(n=5)

In [None]:
# Write the full table again, this time passing the UTF-8 encoding explicitly.
# The column drop below is commented out, so application_ld_json is still written.
# doktorlar.drop(columns=["application_ld_json"], inplace=True)
doktorlar.to_csv(path_or_buf="doktorlar_details.csv", encoding="utf-8", index=False)
doktorlar.head(n=5)

In [None]:
# Round-trip the saved CSV through pandas, using UTF-8 for both the read and the write.
# NOTE(review): this step was meant to fix unicode characters, but no explicit
# text transformation is applied here — confirm the round trip alone is sufficient.
doktorlar = pd.read_csv(filepath_or_buffer="doktorlar_details.csv", encoding="utf-8")
doktorlar.to_csv(path_or_buf="doktorlar_details.csv", encoding="utf-8", index=False)