<a href="https://colab.research.google.com/github/mdazar687/data_scraping/blob/main/working_scraping_indCareer_website.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests beautifulsoup4 aiohttp nest_asyncio pandas openpyxl



In [None]:
import requests
from bs4 import BeautifulSoup
import aiohttp
import asyncio
import logging
import nest_asyncio
import pandas as pd
import time

# Configure the root logger so INFO-level progress messages are emitted.
logging.basicConfig(level=logging.INFO)

def get_college_links(page_number):
    """Return the college-page URLs listed on one page of the index.

    Downloads page ``page_number`` of the indcareer.com "all colleges"
    listing and collects the link inside each ``<h4>`` heading of every
    ``<ul class="list-group">`` element.

    Args:
        page_number: Value substituted into the ``page`` query parameter.

    Returns:
        A list of absolute URLs in document order. The list is empty when
        the page has no matching links or the server answers with an HTTP
        error status.

    Raises:
        requests.RequestException: If the connection fails or times out.
    """
    # Imported locally so this function does not depend on a file-level import.
    from urllib.parse import urljoin

    url = f"https://www.indcareer.com/find/all-colleges?page={page_number}"
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, timeout=30)
    if not response.ok:
        # An error page contains no listing; report it instead of silently
        # parsing it into an empty result.
        logging.warning(f"Skipping page {page_number}: HTTP {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    college_links = []
    for ul in soup.find_all('ul', class_='list-group'):
        for h4 in ul.find_all('h4'):
            a_tag = h4.find('a')
            if a_tag is not None and 'href' in a_tag.attrs:
                # urljoin resolves root-relative, path-relative and already
                # absolute hrefs; plain concatenation only handled the first.
                college_links.append(urljoin(url, a_tag['href']))

    return college_links

def get_all_college_links(max_pages):
    """Gather the college links from listing pages 1 through ``max_pages``.

    Pages are visited in ascending order; each page's links are appended to
    a single flat list, and a progress message is logged before each page.

    Args:
        max_pages: Number of the last listing page to visit (inclusive).

    Returns:
        One list holding the links of every visited page, in page order.
    """
    pages = range(1, max_pages + 1)
    collected = []
    for page_number in pages:
        logging.info(f"Processing page {page_number}")
        collected += get_college_links(page_number)
    return collected

# Async function to get the university link, college name, and city from a college page
async def get_college_info(session, college_url):
    """Fetch one college page and extract its name, city and website.

    The name is read from the ``<b>`` inside ``<caption class="fn org">``.
    The website and city are read from the rows of the first ``<tbody>``
    whose ``<th>`` text contains ``Website`` / ``City``.

    Args:
        session: An open ``aiohttp.ClientSession`` used for the request.
        college_url: Absolute URL of the college page.

    Returns:
        A dict with the keys ``'College Name'``, ``'City'`` and
        ``'Website Address'``, or ``None`` when the request fails or the
        page has no website link.
    """
    try:
        async with session.get(college_url) as response:
            response.raise_for_status()
            html = await response.text()
    except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
        # One bad page must not abort every other request awaited alongside
        # it, so report the failure and return the usual "no data" value.
        logging.warning(f"Request failed for college: {college_url} ({exc})")
        return None

    soup = BeautifulSoup(html, 'html.parser')

    # Get the college name
    caption = soup.find('caption', class_='fn org')
    bold = caption.find('b') if caption is not None else None
    college_name = bold.text.strip() if bold is not None else None

    # Get the university website and city
    website_address = None
    city = None
    table = soup.find('tbody')
    rows = table.find_all('tr') if table is not None else []
    for tr in rows:
        th = tr.find('th')
        td = tr.find('td')
        if th is None or td is None:
            continue

        if 'Website' in th.text:
            # Reuse the tag that is known to carry an href; looking up the
            # first <a> again could return one without the attribute.
            link = td.find('a', href=True)
            if link is not None:
                website_address = link['href']

        if 'City' in th.text:
            city = ', '.join(a.text.strip() for a in td.find_all('a'))

    if not website_address:
        logging.warning(f"No university link found for college: {college_url}")
        return None

    logging.info(f"Found data for college: {college_url}")
    return {
        'College Name': college_name,
        'City': city,
        'Website Address': website_address
    }

# Async function to fetch all college info in batches with delay
async def fetch_all_college_info(college_links, batch_size=100, delay=2):
    """Fetch the info for every college URL, one concurrent batch at a time.

    The links are split into consecutive slices of ``batch_size``. Each
    slice is fetched concurrently over one shared ``aiohttp.ClientSession``,
    and the coroutine pauses for ``delay`` seconds between slices.

    Args:
        college_links: Sequence of college-page URLs.
        batch_size: Number of pages requested concurrently per batch.
        delay: Seconds to wait between two batches.

    Returns:
        A list with one entry per input link, in input order; each entry is
        whatever ``get_college_info`` returned for that link.
    """
    async with aiohttp.ClientSession() as session:
        all_results = []
        total = len(college_links)
        for start in range(0, total, batch_size):
            batch = college_links[start:start + batch_size]
            tasks = [get_college_info(session, link) for link in batch]
            all_results.extend(await asyncio.gather(*tasks))
            logging.info(f"Processed batch {start // batch_size + 1}")
            # Pause only when another batch follows. asyncio.sleep yields to
            # the event loop, whereas time.sleep would block it.
            if start + batch_size < total:
                await asyncio.sleep(delay)
        return all_results

def main(max_pages=1368, output_path='college_info.xlsx'):
    """Scrape the college listing and save the results to an Excel file.

    Collects the college links from the listing pages, fetches each college
    page asynchronously, drops the pages that produced no data, and writes
    the remaining records to ``output_path``.

    Args:
        max_pages: Number of the last listing page to scrape (inclusive).
        output_path: Destination of the Excel workbook.
    """
    # Get all college links
    all_college_links = get_all_college_links(max_pages)

    # nest_asyncio lets run_until_complete be called while an event loop is
    # already running, as is the case inside a notebook.
    nest_asyncio.apply()

    # Run the asynchronous fetching of college info
    loop = asyncio.get_event_loop()
    college_info_list = loop.run_until_complete(
        fetch_all_college_info(all_college_links)
    )

    # Pages without usable data are reported as None; keep only real records.
    records = [info for info in college_info_list if info is not None]

    # Create a DataFrame and save to Excel
    pd.DataFrame(records).to_excel(output_path, index=False)

# Run the scraper only when this file is executed as the main module.
if __name__ == "__main__":
    main()

Streaming output truncated to the last 5000 lines.


# New section