In [1]:
import time
from datetime import datetime
from pathlib import Path

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

def setup_driver():
    """Create the Chrome WebDriver used for scraping.

    Returns:
        A ``webdriver.Chrome`` instance started with the ``--headless``,
        ``--no-sandbox`` and ``--disable-dev-shm-usage`` arguments.
    """
    opts = Options()
    # --headless runs the browser without a visible window.
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        opts.add_argument(flag)

    return webdriver.Chrome(options=opts)

def scrape_trustpilot_reviews(url, num_pages=5, output_path='redbus_reviews.csv'):
    """Scrape reviews from a Trustpilot review listing using Selenium.

    Pages 1 through ``num_pages`` are requested as ``{url}?page=N``. Every
    ``<article>`` element found on a page is treated as one review.

    Args:
        url: Base URL of the review listing. ``?page=N`` is appended as-is,
            so the URL should not already contain a query string.
        num_pages: How many consecutive pages to fetch, starting at page 1.
        output_path: Destination CSV, written only when at least one review
            was collected. Defaults to ``'redbus_reviews.csv'``; pass
            ``None`` to skip writing the file.

    Returns:
        A pandas DataFrame with one row per review and the columns
        ``reviewer_name``, ``rating`` (int parsed from the star image's alt
        text, or None when it could not be read), ``review_text`` and
        ``date`` (``YYYY-MM-DD`` string). The frame is empty when nothing
        was scraped.

    Errors are printed rather than raised: a failure inside one review skips
    that review, while a page-level failure (for example the wait timing
    out) stops the page loop and whatever was collected so far is returned.
    The browser is always closed before returning.
    """
    driver = setup_driver()
    reviews_data = []

    try:
        for page in range(1, num_pages + 1):
            page_url = f"{url}?page={page}"
            print(f"\nFetching page {page}...")

            driver.get(page_url)
            # Wait (up to 10 s) for at least one review to be present
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "article"))
            )

            # Find all review containers
            review_containers = driver.find_elements(By.TAG_NAME, "article")
            print(f"Found {len(review_containers)} review containers on page {page}")

            for container in review_containers:
                try:
                    # Get reviewer name
                    reviewer_name = container.find_element(
                        By.CSS_SELECTOR,
                        "span[class*='typography_heading-xxs']"
                    ).text.strip()

                    # Get rating (1-5 stars) from alt text like "Rated 4 out of ..."
                    try:
                        rating_img = container.find_element(
                            By.CSS_SELECTOR,
                            "img[alt*='Rated']"
                        )
                        alt_text = rating_img.get_attribute('alt')
                        rating = int(alt_text.split('Rated ')[1].split(' out')[0])
                    except Exception:
                        # The rating is optional, so any lookup/parse failure
                        # yields None. Catch Exception rather than using a bare
                        # except so KeyboardInterrupt/SystemExit still propagate
                        # and the kernel can be interrupted mid-scrape.
                        rating = None

                    # Get review text
                    review_element = container.find_element(
                        By.CSS_SELECTOR,
                        "p[class*='typography_body-l']"
                    )
                    review_text = review_element.text.strip()

                    # Get date: keep only the calendar-day part of the
                    # <time datetime="..."> attribute
                    date_element = container.find_element(By.TAG_NAME, "time")
                    date_str = date_element.get_attribute('datetime')
                    date = datetime.strptime(date_str.split('T')[0], '%Y-%m-%d').strftime('%Y-%m-%d')

                    reviews_data.append({
                        'reviewer_name': reviewer_name,
                        'rating': rating,
                        'review_text': review_text,
                        'date': date
                    })

                    print(f"Processed review - Name: {reviewer_name}, Rating: {rating}")

                except Exception as e:
                    # A review missing name/text/date is skipped, not fatal
                    print(f"Error processing review: {str(e)}")
                    continue

            print(f"Completed scraping page {page}")
            if page < num_pages:
                # Be respectful with rate limiting; no need to wait after
                # the final page
                time.sleep(2)

    except Exception as e:
        print(f"Error during scraping: {str(e)}")

    finally:
        driver.quit()

    # Convert to DataFrame
    df = pd.DataFrame(reviews_data)

    if not df.empty:
        # Save to CSV unless the caller opted out
        if output_path is not None:
            df.to_csv(output_path, index=False, encoding='utf-8')
        print(f"\nTotal reviews scraped: {len(df)}")
        print("\nSample of scraped data:")
        print(df.head())
    else:
        print("\nNo reviews were scraped.")

    return df

# Usage: scrape the first 5 review pages of redbus.in on Trustpilot.
if __name__ == "__main__":
    url = "https://www.trustpilot.com/review/redbus.in"
    # The result is bound to reviews_df, which the cells below inspect.
    reviews_df = scrape_trustpilot_reviews(url, num_pages=5)


Fetching page 1...
Found 20 review containers on page 1
Processed review - Name: khushboo Gupta, Rating: 1
Processed review - Name: Gayatri Sen, Rating: 1
Processed review - Name: Megha, Rating: 1
Processed review - Name: Trishali Mukherjee, Rating: 1
Processed review - Name: Padmashri V, Rating: 1
Processed review - Name: isak mulla, Rating: 1
Processed review - Name: gopalakrishna s, Rating: 1
Processed review - Name: Guna Captain guna, Rating: 1
Processed review - Name: Shiksha, Rating: 5
Processed review - Name: Sharing is Caring., Rating: 1
Processed review - Name: Sarjit Rathi, Rating: 1
Processed review - Name: Libin Joseph, Rating: 1
Processed review - Name: Patricia, Rating: 3
Processed review - Name: Raghesh K, Rating: 1
Processed review - Name: nanda singh, Rating: 1
Processed review - Name: Reshal Dsouza, Rating: 1
Processed review - Name: Harsha Amin, Rating: 1
Processed review - Name: Joshifnb, Rating: 3
Processed review - Name: Oorvi, Rating: 1
Processed review - Name: 

In [3]:
# (rows, columns) of the scraped reviews frame.
reviews_df.shape

(100, 4)

In [6]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from datetime import datetime

def setup_driver():
    """Create the Chrome WebDriver used for scraping.

    Returns:
        A ``webdriver.Chrome`` instance started with the ``--headless``,
        ``--no-sandbox`` and ``--disable-dev-shm-usage`` arguments.
    """
    opts = Options()
    # --headless runs the browser without a visible window.
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        opts.add_argument(flag)

    return webdriver.Chrome(options=opts)

def scrape_trustpilot_reviews(url, num_pages=5, output_path='redbus_reviews.csv'):
    """Scrape reviews from a Trustpilot review listing using Selenium.

    Pages 1 through ``num_pages`` are requested as ``{url}?page=N``. Every
    ``<article>`` element found on a page is treated as one review.

    Args:
        url: Base URL of the review listing. ``?page=N`` is appended as-is,
            so the URL should not already contain a query string.
        num_pages: How many consecutive pages to fetch, starting at page 1.
        output_path: Destination CSV, written only when at least one review
            was collected. Defaults to ``'redbus_reviews.csv'``; pass
            ``None`` to skip writing the file.

    Returns:
        A pandas DataFrame with one row per review and the columns
        ``reviewer_name``, ``rating`` (int parsed from the star image's alt
        text, or None when it could not be read), ``review_text`` and
        ``date`` (``YYYY-MM-DD`` string). The frame is empty when nothing
        was scraped.

    Errors are printed rather than raised: a failure inside one review skips
    that review, while a page-level failure (for example the wait timing
    out) stops the page loop and whatever was collected so far is returned.
    The browser is always closed before returning.
    """
    driver = setup_driver()
    reviews_data = []

    try:
        for page in range(1, num_pages + 1):
            page_url = f"{url}?page={page}"
            print(f"\nFetching page {page}...")

            driver.get(page_url)
            # Wait (up to 10 s) for at least one review to be present
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "article"))
            )

            # Find all review containers
            review_containers = driver.find_elements(By.TAG_NAME, "article")
            print(f"Found {len(review_containers)} review containers on page {page}")

            for container in review_containers:
                try:
                    # Get reviewer name
                    reviewer_name = container.find_element(
                        By.CSS_SELECTOR,
                        "span[class*='typography_heading-xxs']"
                    ).text.strip()

                    # Get rating (1-5 stars) from alt text like "Rated 4 out of ..."
                    try:
                        rating_img = container.find_element(
                            By.CSS_SELECTOR,
                            "img[alt*='Rated']"
                        )
                        alt_text = rating_img.get_attribute('alt')
                        rating = int(alt_text.split('Rated ')[1].split(' out')[0])
                    except Exception:
                        # The rating is optional, so any lookup/parse failure
                        # yields None. Catch Exception rather than using a bare
                        # except so KeyboardInterrupt/SystemExit still propagate
                        # and the kernel can be interrupted mid-scrape.
                        rating = None

                    # Get review text
                    review_element = container.find_element(
                        By.CSS_SELECTOR,
                        "p[class*='typography_body-l']"
                    )
                    review_text = review_element.text.strip()

                    # Get date: keep only the calendar-day part of the
                    # <time datetime="..."> attribute
                    date_element = container.find_element(By.TAG_NAME, "time")
                    date_str = date_element.get_attribute('datetime')
                    date = datetime.strptime(date_str.split('T')[0], '%Y-%m-%d').strftime('%Y-%m-%d')

                    reviews_data.append({
                        'reviewer_name': reviewer_name,
                        'rating': rating,
                        'review_text': review_text,
                        'date': date
                    })

                    print(f"Processed review - Name: {reviewer_name}, Rating: {rating}")

                except Exception as e:
                    # A review missing name/text/date is skipped, not fatal
                    print(f"Error processing review: {str(e)}")
                    continue

            print(f"Completed scraping page {page}")
            if page < num_pages:
                # Be respectful with rate limiting; no need to wait after
                # the final page
                time.sleep(2)

    except Exception as e:
        print(f"Error during scraping: {str(e)}")

    finally:
        driver.quit()

    # Convert to DataFrame
    df = pd.DataFrame(reviews_data)

    if not df.empty:
        # Save to CSV unless the caller opted out
        if output_path is not None:
            df.to_csv(output_path, index=False, encoding='utf-8')
        print(f"\nTotal reviews scraped: {len(df)}")
        print("\nSample of scraped data:")
        print(df.head())
    else:
        print("\nNo reviews were scraped.")

    return df

# Usage: scrape the first 12 review pages of redbus.in on Trustpilot.
if __name__ == "__main__":
    url = "https://www.trustpilot.com/review/redbus.in"
    # Rebinds reviews_df, replacing whatever an earlier cell stored under that name.
    reviews_df = scrape_trustpilot_reviews(url, num_pages=12)


Fetching page 1...
Found 20 review containers on page 1
Processed review - Name: khushboo Gupta, Rating: 1
Processed review - Name: Gayatri Sen, Rating: 1
Processed review - Name: Megha, Rating: 1
Processed review - Name: Trishali Mukherjee, Rating: 1
Processed review - Name: Padmashri V, Rating: 1
Processed review - Name: isak mulla, Rating: 1
Processed review - Name: gopalakrishna s, Rating: 1
Processed review - Name: Guna Captain guna, Rating: 1
Processed review - Name: Shiksha, Rating: 5
Processed review - Name: Sharing is Caring., Rating: 1
Processed review - Name: Sarjit Rathi, Rating: 1
Processed review - Name: Libin Joseph, Rating: 1
Processed review - Name: Patricia, Rating: 3
Processed review - Name: Raghesh K, Rating: 1
Processed review - Name: nanda singh, Rating: 1
Processed review - Name: Reshal Dsouza, Rating: 1
Processed review - Name: Harsha Amin, Rating: 1
Processed review - Name: Joshifnb, Rating: 3
Processed review - Name: Oorvi, Rating: 1
Processed review - Name: 

In [7]:
# (rows, columns) of the scraped reviews frame.
reviews_df.shape

(221, 4)

In [8]:
# Preview the first rows of the scraped reviews.
reviews_df.head()

Unnamed: 0,reviewer_name,rating,review_text,date
0,khushboo Gupta,1,You guys are lier and\nRedbus customer service...,2024-12-28
1,Gayatri Sen,1,Horrible service. Why the hell u guys are oper...,2024-12-26
2,Megha,1,I am highly disappointed with Redbus we have b...,2024-12-22
3,Trishali Mukherjee,1,We booked a bus of Ankita travels from digha t...,2024-12-22
4,Padmashri V,1,Worst experience there was many cockroaches ne...,2024-12-26


In [10]:
# Keep only the rows whose rating equals 5.
reviews_df[reviews_df['rating'] == 5]

Unnamed: 0,reviewer_name,rating,review_text,date
8,Shiksha,5,It was an excellent experience\nThankyou Red bus,2024-12-27
107,Datta Gaddamanugu,5,Yolo bus journey from Chennai to Bangalore was...,2024-03-19
110,a.rasathi avinashi,5,Its was nice journey.. Good response... Happy ...,2024-11-13
122,vandana singh,5,Very good service cooperative staff driver and...,2024-04-26
171,Lucky Balachauria,5,Very good experience,2024-12-16
173,Ajith kumar,5,I travelled in Punchiri travels from banglore ...,2024-06-02
174,Yogesh Kawale,5,Great service and efficient communication begi...,2024-03-10
177,Deepak Gupta,5,provide the good transport services,2024-05-28
180,Sachin Patil,5,"Babu travel , very good experience.",2024-06-15
181,Yeangpong Konyak,5,It was very good 😊,2024-12-12


In [11]:
# (rows, columns) of the 5-star subset; the row count is the number of 5-star reviews.
reviews_df[reviews_df['rating'] == 5].shape

(16, 4)

In [15]:
# Number of reviews per rating; sort=False means the counts are not ordered by frequency.
reviews_df['rating'].value_counts(sort=False)

rating
1    197
5     16
3      2
2      5
4      1
Name: count, dtype: int64

In [17]:
# Tally the reviews per star rating, ordered by the rating value itself
# (ascending index) instead of by how often each rating occurs.
rating_counts = (
    reviews_df['rating']
    .value_counts()
    .sort_index()
)

# Show the ordered tally.
print(rating_counts)

rating
1    197
2      5
3      2
4      1
5     16
Name: count, dtype: int64


In [18]:
# Persist the scraped reviews for later analysis.
# NOTE(review): this is an absolute, machine-specific location; change
# REVIEWS_DIR when running the notebook on another machine.
REVIEWS_DIR = Path('E:/Datasets/Redbus Review')
# to_csv does not create missing folders, so make sure the target exists first.
REVIEWS_DIR.mkdir(parents=True, exist_ok=True)
# Explicit UTF-8 matches the CSV written by scrape_trustpilot_reviews.
reviews_df.to_csv(REVIEWS_DIR / 'reviews.csv', index=False, encoding='utf-8')