In [9]:
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import WebDriverException

In [10]:
# Scrape the bus listings of every route in the routes CSV and combine the
# per-route results into one DataFrame that is also written to a CSV file.

# --- Configuration: everything a reader might need to tune -------------------
ROUTES_CSV = 'P:/Capstone Guvi/Redbus Data Scrapping/all_bus_routes.csv'
OUTPUT_CSV = 'P:/Capstone Guvi/Redbus Data Scrapping/redbus_all_bus_data.csv'
PAGE_LOAD_WAIT_SECONDS = 5   # fixed wait after navigating to a route page
SCROLL_PAUSE_SECONDS = 2     # pause after each PAGE_DOWN so new content can appear
MAX_SCROLL_STEPS = 500       # safety cap so a page that keeps growing cannot loop forever

BUS_COLUMNS = ["Bus_Name", "Bus_Type", "Departure_Time", "Arrival_Time", "Duration",
               "Fare", "Rating", "Seat_Availability", "Bus_Link", "Bus_Route"]

# Locator of one bus card on the results page.
BUS_CARD_XPATH = '//div[@class="clearfix bus-item"]'

# Locators (relative to one bus card) of the plain-text fields, in the same
# order as the first seven entries of BUS_COLUMNS.
TEXT_FIELD_LOCATORS = [
    (By.XPATH, './/div[@class="travels lh-24 f-bold d-color"]'),            # Bus_Name
    (By.XPATH, './/div[@class="bus-type f-12 m-top-16 l-color evBus"]'),    # Bus_Type
    (By.XPATH, './/div[@class="dp-time f-19 d-color f-bold"]'),             # Departure_Time
    (By.XPATH, './/div[@class="bp-time f-19 d-color disp-Inline"]'),        # Arrival_Time
    (By.XPATH, './/div[@class="dur l-color lh-24"]'),                       # Duration
    (By.CSS_SELECTOR, 'span.f-19.f-bold'),                                  # Fare
    (By.XPATH, './/div[contains(@class, "rating-sec") and contains(@class, "lh-24")]'),  # Rating
]
SEAT_LOCATOR = (By.XPATH, './/div[contains(@class, "seat-left") and contains(@class, "m-top-16")]')


def _text_or_none(parent, by, locator):
    """Return the stripped text of the first child of `parent` matching the locator.

    Returns None when Selenium cannot provide the element (for example it is
    missing from this bus card). Only WebDriverException is caught, so
    KeyboardInterrupt and programming errors are no longer silently swallowed.
    """
    try:
        return parent.find_element(by, locator).text.strip()
    except WebDriverException:
        return None


def _first_token_or_none(text):
    """Return the first whitespace-separated token of `text`, or None if there is none."""
    if text is None:
        return None
    tokens = text.split()
    return tokens[0] if tokens else None


def _scroll_to_bottom(driver):
    """Press PAGE_DOWN until neither the page height nor the scroll offset changes.

    Comparing only the page height would stop after the first key press whenever
    that press does not make the page grow, even though the viewport may still
    be far from the bottom. Tracking the scroll offset as well keeps paging down
    until the viewport actually stops moving. The loop gives up after
    MAX_SCROLL_STEPS presses.
    """
    read_state = "return [document.body.scrollHeight, window.pageYOffset];"
    actions = ActionChains(driver)
    last_state = driver.execute_script(read_state)
    for _ in range(MAX_SCROLL_STEPS):
        actions.send_keys(Keys.PAGE_DOWN).perform()
        time.sleep(SCROLL_PAUSE_SECONDS)
        new_state = driver.execute_script(read_state)
        if new_state == last_state:
            break
        last_state = new_state


def _scrape_route(driver, bus_url, bus_route_name):
    """Load one route page and return a DataFrame with one row per bus card found.

    Fields that cannot be read from a card are stored as None. WebDriverException
    raised by navigation, scrolling or the card lookup propagates to the caller.
    """
    driver.get(bus_url)
    print(f"Scraping data from: {bus_url}")

    # Wait for the page to load (adjust PAGE_LOAD_WAIT_SECONDS as needed)
    time.sleep(PAGE_LOAD_WAIT_SECONDS)
    _scroll_to_bottom(driver)

    bus_data_list = []
    for bus in driver.find_elements(By.XPATH, BUS_CARD_XPATH):
        text_values = [_text_or_none(bus, by, locator) for by, locator in TEXT_FIELD_LOCATORS]
        # Only the first token of the seat text is kept.
        seat_availability = _first_token_or_none(_text_or_none(bus, *SEAT_LOCATOR))
        bus_data_list.append((*text_values, seat_availability, bus_url, bus_route_name))

    return pd.DataFrame(bus_data_list, columns=BUS_COLUMNS)


# Read the routes first: if the CSV is missing, no browser has been started yet.
urls_df = pd.read_csv(ROUTES_CSV, usecols=['route_link', 'route_name'])

# One DataFrame per successfully scraped route
all_bus_dfs = []

# Initialize Chrome WebDriver
driver = webdriver.Chrome()
try:
    for i, row in urls_df.iterrows():
        bus_url = str(row['route_link']).strip()  # Convert to string and strip whitespace
        bus_route_name = row['route_name']

        # A missing link is read as NaN and becomes the string 'nan' after str()
        if not bus_url or bus_url.lower() == 'nan':
            print(f"Skipping invalid URL at index {i}: {bus_url}")
            continue

        try:
            all_bus_dfs.append(_scrape_route(driver, bus_url, bus_route_name))
        except WebDriverException as e:
            # Best effort: report the failing route and continue with the next one
            print(f"Error navigating to URL: {bus_url}")
            print(str(e))  # Print the exception details for debugging purposes
finally:
    # Always close the browser, even if the loop is interrupted or raises
    driver.quit()

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# with the expected columns when nothing was scraped.
non_empty_dfs = [bus_df for bus_df in all_bus_dfs if not bus_df.empty]
if non_empty_dfs:
    final_bus_data = pd.concat(non_empty_dfs, ignore_index=True)
else:
    final_bus_data = pd.DataFrame(columns=BUS_COLUMNS)

# Save the concatenated DataFrame to a CSV file
final_bus_data.to_csv(OUTPUT_CSV, index=False)

# Print the final concatenated DataFrame
print(final_bus_data)


Cache folder (C:\Users\abdpa\.cache\selenium) cannot be created: Cannot create a file when that file already exists. (os error 183)
Cache folder (C:\Users\abdpa\.cache\selenium) cannot be created: Cannot create a file when that file already exists. (os error 183)
Cache folder (C:\Users\abdpa\.cache\selenium) cannot be created: Cannot create a file when that file already exists. (os error 183)


Scraping data from: https://www.redbus.in/bus-tickets/pune-to-goa
Scraping data from: https://www.redbus.in/bus-tickets/goa-to-pune
Scraping data from: https://www.redbus.in/bus-tickets/mumbai-to-goa
Scraping data from: https://www.redbus.in/bus-tickets/bangalore-to-goa
Scraping data from: https://www.redbus.in/bus-tickets/goa-to-bangalore
Scraping data from: https://www.redbus.in/bus-tickets/goa-to-mumbai
Scraping data from: https://www.redbus.in/bus-tickets/pandharpur-to-goa
Scraping data from: https://www.redbus.in/bus-tickets/goa-to-pandharpur
Scraping data from: https://www.redbus.in/bus-tickets/solapur-to-goa
Scraping data from: https://www.redbus.in/bus-tickets/calangute-goa-to-goa-airport
Scraping data from: https://www.redbus.in/bus-tickets/goa-to-kolhapur-maharashtra
Scraping data from: https://www.redbus.in/bus-tickets/goa-to-solapur
Scraping data from: https://www.redbus.in/bus-tickets/goa-to-sangli
Scraping data from: https://www.redbus.in/bus-tickets/goa-airport-to-calang

In [12]:
# Show the combined scrape results (one row per scraped bus listing) as the cell output.
final_bus_data

Unnamed: 0,Bus_Name,Bus_Type,Departure_Time,Arrival_Time,Duration,Fare,Rating,Seat_Availability,Bus_Link,Bus_Route
0,Kadamba Transport Corporation Limited (KTCL) -...,A/C Sleeper (2+1),19:00,07:10,12h 10m,1000,3.7,,https://www.redbus.in/bus-tickets/pune-to-goa,Pune to Goa
1,IntrCity SmartBus,Bharat Benz A/C Seater /Sleeper (2+1),21:00,08:05,11h 05m,399,4.4,,https://www.redbus.in/bus-tickets/pune-to-goa,Pune to Goa
2,IntrCity SmartBus,Bharat Benz A/C Sleeper (2+1),21:30,08:20,10h 50m,605,4.2,,https://www.redbus.in/bus-tickets/pune-to-goa,Pune to Goa
3,Ashray Travels,Bharat Benz A/C Sleeper (2+1),21:00,08:30,11h 30m,699,4.4,,https://www.redbus.in/bus-tickets/pune-to-goa,Pune to Goa
4,Ashray Travels,Non A/C Seater / Sleeper (2+1),21:15,07:45,10h 30m,349,4.2,,https://www.redbus.in/bus-tickets/pune-to-goa,Pune to Goa
...,...,...,...,...,...,...,...,...,...,...
1933,Jujhar Travels,Volvo A/C Semi Sleeper (2+2),14:13,18:18,04h 05m,650,3.8,30,https://www.redbus.in/bus-tickets/chandigarh-t...,Chandigarh to Firozpur
1934,Jujhar Travels,Volvo A/C Semi Sleeper (2+2),15:25,19:30,04h 05m,650,3.6,39,https://www.redbus.in/bus-tickets/chandigarh-t...,Chandigarh to Firozpur
1935,Jujhar Travels,Volvo A/C Semi Sleeper (2+2),17:25,21:20,03h 55m,650,4.3,45,https://www.redbus.in/bus-tickets/chandigarh-t...,Chandigarh to Firozpur
1936,Orbit Aviation Pvt. Ltd.,A/C Seater (2+2),17:55,21:55,04h 00m,666,3.1,45,https://www.redbus.in/bus-tickets/chandigarh-t...,Chandigarh to Firozpur
