### Importing Libraries

In [None]:
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

### Headless Driver

In [91]:
# Run Chrome headless so no browser window opens while scraping.
options = Options()
# The `options.headless` attribute was deprecated and later removed in
# Selenium 4.x; passing the Chrome switch directly works regardless of the
# installed Selenium version ("new" headless mode needs Chrome 109+).
options.add_argument('--headless=new')
# Chrome's window-size switch takes "width,height" (comma-separated).
options.add_argument('--window-size=1920,1080')

### Creating Driver

In [92]:
# Either listing URL can be used as the starting point:
#   'https://www.audible.in/search'       - the default used here
#   'https://www.audible.in/charts/best'  - contains only 5 pages, handy for a demo
website = "https://www.audible.in/search"

# Start Chrome with the options configured earlier and open the listing page.
driver = webdriver.Chrome(options=options)
driver.get(website)
# Optional: driver.maximize_window()

### Pagination

In [93]:
# Work out how many result pages there are from the pager at the bottom of
# the listing. find_elements returns an empty list instead of raising, so a
# listing without a pager (everything fits on one page) is handled gracefully.
pagers = driver.find_elements(By.XPATH, '//ul[contains(@class,"pagingElements")]')
pagination = pagers[0] if pagers else None
pages = pagination.find_elements(By.TAG_NAME, 'li') if pagination is not None else []

# The pager mixes page numbers with non-numeric items (previous/next arrows,
# ellipsis). Take the largest numeric label rather than relying on the last
# page number sitting at a fixed position; default to a single page.
page_labels = (item.text.strip() for item in pages)
page_numbers = [int(label) for label in page_labels if label.isdecimal()]
last_page = max(page_numbers, default=1)

### Creating Empty Lists to store scraped data

In [94]:
# One list per scraped field; every product appends exactly one entry to each,
# so the lists stay aligned by position.
heading, subtitle, author, narrator = [], [], [], []
Durations, Released_Date, language, stars = [], [], [], []

### Scraping Logic

In [None]:
# Walk through every results page and append one entry per product to each of
# the result lists. Explicit waits (WebDriverWait) are used so the listing is
# present in the DOM before it is read; the simpler alternatives would be a
# fixed time.sleep(2) or calling find_element(s) directly without waiting.


def _first_text(parent, tag, class_fragment, default):
    """Return the text of the first matching descendant, or *default*.

    Searches below *parent* for ``<tag>`` elements whose ``class`` attribute
    contains *class_fragment*. ``find_elements`` returns an empty list when
    nothing matches, so a product that lacks the field yields *default*
    instead of raising.
    """
    matches = parent.find_elements(
        By.XPATH, f'.//{tag}[contains(@class,"{class_fragment}")]'
    )
    return matches[0].text if matches else default


try:
    for current_page in range(1, last_page + 1):
        container = WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'adbl-impression-container'))
        )
        products = WebDriverWait(container, 5).until(
            EC.presence_of_all_elements_located(
                (By.XPATH, './/li[contains(@class,"productListItem")]')
            )
        )

        for product in products:
            heading.append(_first_text(product, 'h3', 'bc-heading', 'No Heading'))
            subtitle.append(_first_text(product, 'li', 'subtitle', 'No Subtitle'))

            # The label prefixes never occur in the placeholder texts, so
            # stripping them after the lookup leaves the placeholders intact.
            author.append(
                _first_text(product, 'li', 'authorLabel', 'No Author').replace("Written by: ", "")
            )
            narrator.append(
                _first_text(product, 'li', 'narratorLabel', 'No Narrator').replace("Narrated by: ", "")
            )
            Durations.append(
                _first_text(product, 'li', 'runtimeLabel', 'No Duration').replace("Length: ", "")
            )
            Released_Date.append(
                _first_text(product, 'li', 'releaseDateLabel', 'No Release Date').replace("Release Date: ", "")
            )
            language.append(
                _first_text(product, 'li', 'languageLabel', 'No Language').replace("Language: ", "")
            )
            # The ratings text spans two lines; join them with "_" so the
            # later regex can split the star value from the ratings count.
            stars.append(
                _first_text(product, 'li', 'ratingsLabel', 'No Ratings').replace("\n", "_")
            )

        # Nothing to click after the final page.
        if current_page == last_page:
            break

        try:
            driver.find_element(By.XPATH, '//span[contains(@class,"nextButton")]').click()
        except WebDriverException as exc:
            # Without a successful click the browser stays on the same page;
            # continuing would scrape it again and produce duplicate rows.
            print(
                f"Stopping after page {current_page}: "
                f"could not open the next page ({type(exc).__name__})."
            )
            break
        # NOTE(review): if the old listing can still be found right after the
        # click, the next iteration may read stale elements - consider waiting
        # for EC.staleness_of(container) here once confirmed against the site.
finally:
    # Always close the browser, even when a wait times out mid-scrape.
    driver.quit()


### **Data Cleaning**

### removing the leading list number (e.g. "1. ") from the heading

In [96]:
# Strip a leading "<digits>. " prefix from each heading; headings without
# such a prefix are left unchanged.
leading_number = re.compile(r"^\d+\. ")
heading = [leading_number.sub("", title) for title in heading]

### converting 'hr and min' format to 'mins' format

In [None]:
# Convert each scraped duration string into a total number of minutes.
# Hours and minutes are matched independently so that entries carrying only
# one of the two parts (e.g. "5 hrs" or "45 mins") are converted as well,
# not just the full "X hrs and Y mins" form.
hours_pattern = re.compile(r'(\d+) hrs?')
minutes_pattern = re.compile(r'(\d+) mins?')

Duration = []
for duration in Durations:
    hours_match = hours_pattern.search(duration)
    minutes_match = minutes_pattern.search(duration)
    if hours_match or minutes_match:
        hours = int(hours_match.group(1)) if hours_match else 0
        minutes = int(minutes_match.group(1)) if minutes_match else 0
        total_minutes = (hours * 60) + minutes
        Duration.append(total_minutes)
    else:
        # Neither part found, e.g. the 'No Duration' placeholder.
        Duration.append("Invalid Format")

### converting star column into rating and rating_count

In [None]:
# Split each "<score> out of <max> stars_<count> ratings" string into the
# score and the ratings count (both kept as the scraped text).
# - "ratings?" also accepts the singular "1 rating".
# - The count accepts any digit/comma grouping, not only groups of three.
# - Groups that are not extracted are non-capturing.
rating_pattern = re.compile(
    r'(\d+(?:\.\d+)?) out of \d+ stars_(\d[\d,]*) ratings?'
)

rating_count = []
rating = []

for star in stars:
    match = rating_pattern.search(star)
    if match:
        rating.append(match.group(1))
        rating_count.append(match.group(2))
    else:
        # No parsable rating, e.g. the 'No Ratings' placeholder.
        rating.append("No Rating")
        rating_count.append("No Ratings_count")

## Storing all lists into Dictionary

In [99]:
# Gather the cleaned columns under their output column names; the keyword
# order here becomes the column order of the exported table.
ebooks_data = dict(
    Heading=heading,
    Subtitle=subtitle,
    Author=author,
    Narrator=narrator,
    Duration=Duration,
    Released_Date=Released_Date,
    Language=language,
    Rating=rating,
    Rating_count=rating_count,
)

### Converting the Dictionary into a DataFrame and saving it as a CSV file

In [None]:
# Turn the column dictionary into a table (rebinding the same name) ...
ebooks_data = pd.DataFrame(data=ebooks_data)

# ... and write it to a CSV file in the current working directory.
ebooks_data.to_csv(path_or_buf="ebooks_data.csv")