In [None]:
import re
from pathlib import Path
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

### Define Functions to Extract Data from HTML

In [None]:
# Example product page, kept (commented out) for trying the parsers by hand:
# url = "https://www.amazon.com/Data-Science-Business-Data-Analytic-Thinking/dp/B08VL5K5ZX/ref=sr_1_13?crid=YS0ZZ9RZH5I9&keywords=data+analytics+books&qid=1673030834&sprefix=data+analytics+books%2Caps%2C307&sr=8-13"

# Browser-like HTTP headers passed to requests.get via `headers=headers`.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/108.0.0.0 Safari/537.36"
    ),
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

# Manual fetch/parse of the example page (parse once, then re-parse the prettified html):
# page = requests.get(url, headers=headers)
# soup1 = BeautifulSoup(page.content, "html.parser")
# soup2 = BeautifulSoup(soup1.prettify(), "html.parser")

In [None]:
def get_title(soup):
    """Return the book title found on a parsed product page.

    The ``<span id="productTitle">`` element is tried first; the element with
    ``id="title"`` is the fallback.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The stripped title text, or ``""`` when neither element is present.
    """
    lookups = (
        lambda: soup.find("span", attrs={"id": "productTitle"}),
        lambda: soup.find(id="title"),
    )
    for lookup in lookups:
        try:
            return lookup().get_text().strip()
        except (AttributeError, TypeError):
            # find() returned None (element missing); try the next location.
            # Only lookup failures are swallowed, so Ctrl-C still interrupts.
            continue
    return ""

def get_price(soup):
    """Return the price text found on a parsed product page.

    The element with ``id="price"`` is tried first; the first element with
    class ``a-color-price`` is the fallback.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The stripped price text, or ``""`` when neither element is present.
        (The previous version fell off the end and returned ``None`` here.)
    """
    lookups = (
        lambda: soup.find(id="price"),
        lambda: soup.find(class_="a-color-price"),
    )
    for lookup in lookups:
        try:
            return lookup().get_text().strip()
        except (AttributeError, TypeError):
            # find() returned None (element missing); try the next location.
            continue
    return ""

def get_author(soup):
    """Return the author names found on a parsed product page.

    The "follow the author" section is read first (one entry per matching
    link). If that section is missing, the Audible byline is used instead,
    with every run of whitespace collapsed to a single space.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        A list of author strings, or ``""`` when neither section is present
        (the empty string is kept for compatibility with existing callers).
    """
    try:
        author_links = soup.find(id="followTheAuthor_feature_div").find_all(
            class_="a-size-base a-link-normal a-text-normal"
        )
        return [link.get_text().strip() for link in author_links]
    except (AttributeError, TypeError):
        # Section missing: fall through to the byline.
        pass
    try:
        byline = soup.find(id="audibleProductTitle_byline").get_text()
        # Raw string: "\s" is not a valid escape in a normal string literal.
        return [re.sub(r"\s+", " ", byline)]
    except (AttributeError, TypeError):
        return ""

def get_product_details(soup):
    """Extract the language and page count from the detail-bullets section.

    The section text is flattened (newlines, double spaces and the
    ``\\u200f:\\u200e`` separator removed) and split on single spaces. The
    language is the token after ``"Language"``; the page count is the token
    before ``"pages"``.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        A ``(language, pages)`` tuple of strings; each part is ``""`` when it
        cannot be located.
    """
    try:
        tokens = (
            soup.find(id="detailBullets_feature_div")
            .get_text()
            .replace("\n", "")
            .replace("  ", "")
            .replace("\u200f:\u200e", "")
            .split(" ")
        )
    except (AttributeError, TypeError):
        # Section missing from the page.
        return "", ""

    language = ""
    if "Language" in tokens:
        after = tokens.index("Language") + 1
        if after < len(tokens):
            language = tokens[after]

    pages = ""
    if "pages" in tokens:
        at = tokens.index("pages")
        # at == 0 would make tokens[at - 1] wrap around to the last token.
        if at > 0:
            pages = tokens[at - 1]

    return language, pages

def get_costumer_ratings(soup):
    """Return the overall rating text and the histogram entries of a product page.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        ``(overall_rating, star_ratios)`` where ``overall_rating`` is the
        stripped text of the rating element and ``star_ratios`` is a list with
        the stripped text of every matching histogram cell. ``("", "")`` is
        returned when either element is missing.
    """
    try:
        overall_rating = soup.find(
            class_="a-fixed-left-grid-col aok-align-center a-col-right"
        ).get_text().strip()
        histogram_cells = soup.find(id="histogramTable").find_all(
            class_="a-text-right a-nowrap"
        )
        star_ratios = [cell.get_text().strip() for cell in histogram_cells]
    except (AttributeError, TypeError):
        # One of the elements is missing; only lookup failures are swallowed.
        return "", ""
    return overall_rating, star_ratios

def get_total_review(soup):
    """Return the review-count text found on a parsed product page.

    The ``averageStarRatingNumerical`` row is tried first; the element with
    ``id="detailBullets_averageCustomerReviews"`` is the fallback.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The stripped text of the first element found, or ``""`` when neither
        is present.
    """
    lookups = (
        lambda: soup.find(class_="a-row a-spacing-medium averageStarRatingNumerical"),
        lambda: soup.find(id="detailBullets_averageCustomerReviews"),
    )
    for lookup in lookups:
        try:
            return lookup().get_text().strip()
        except (AttributeError, TypeError):
            # find() returned None (element missing); try the next location.
            continue
    return ""

def get_costumer_comments(soup):
    """Return the text of every review body found on a parsed product page.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find_all`` method.

    Returns:
        A list with the stripped text of each ``<span data-hook="review-body">``
        element (possibly empty), or ``""`` if the page object cannot be
        searched (the empty string is kept for compatibility with callers).
    """
    try:
        review_spans = soup.find_all("span", {"data-hook": "review-body"})
        return [span.get_text().strip() for span in review_spans]
    except (AttributeError, TypeError):
        # Only lookup failures are swallowed, so Ctrl-C still interrupts.
        return ""

def get_books_links(soup):
    """Collect the product-page URLs listed on a parsed search-result page.

    Args:
        soup: Parsed search page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        A list of absolute ``https://www.amazon.com/...`` URLs, one per result
        link, in page order.

    Raises:
        AttributeError: If the result container is not on the page
            (``find`` returned ``None``).
    """
    result_links = soup.find(
        class_="s-main-slot s-result-list s-search-results sg-row"
    ).find_all(class_="a-link-normal s-no-outline")
    # urljoin avoids the "//" that plain concatenation produced for hrefs
    # starting with "/", and leaves already-absolute hrefs untouched.
    return [urljoin("https://www.amazon.com/", link["href"]) for link in result_links]

### Scraping Loops

**We want to scrape Amazon book data on the following topics**:
- Data Science           (Search word: "data science books")
- Data Analytics         (Search word: "data analytics books")
- Machine Learning       (Search word: "machine learning books")
- Data Engineering       (Search word: "data engineering books")

In [None]:
# Scrape the Amazon search results for "data science books" into `data_science`.
# Author's note: each result page lists 60 books. The loop below fetches four
# result pages, following the "Go to page N" link after each one.
data_science = {}                                                           # book_id -> scraped fields of one book
url = "https://www.amazon.com/s?k=data+science+books&crid=N6CUDUWC74ES&qid=1674379592&sprefix=data+science+books%2Caps%2C902&ref=sr_pg_1"
next_pages = range(2, 6)                                                    # N used in the "Go to page N" links
max_books = 300                                                             # stop once this many books are stored
book_id = 0
# Bar totals are derived from the loop bounds so the bars cannot overrun.
pbar_page = tqdm(position=0, desc='Page bar', total=len(next_pages))
pbar_book = tqdm(position=1, desc='Book bar', total=max_books, leave=True)
for page_number in next_pages:                                              # loop for number of pages
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, "html.parser")
    soup = BeautifulSoup(soup.prettify(), "html.parser")                    # re-parse the prettified page html
    book_links = get_books_links(soup)                                      # all book urls on this result page
    for book_link in book_links:                                            # iterate through each book
        book_page = requests.get(book_link, headers=headers)
        book_soup = BeautifulSoup(book_page.content, "html.parser")
        book_soup = BeautifulSoup(book_soup.prettify(), "html.parser")      # re-parse the prettified book html
        title = get_title(book_soup)                                        # extract the fields
        price = get_price(book_soup)
        authors = get_author(book_soup)
        language, pages = get_product_details(book_soup)
        costumer_ratings = get_costumer_ratings(book_soup)
        reviews = get_total_review(book_soup)
        comments = get_costumer_comments(book_soup)
        data_science[book_id] = {"Title": title,                            # save scraped data into dictionary
                                 "Authors": authors,
                                 "Price": price,
                                 "Language": language,
                                 "NumOfPages": pages,
                                 "CostumerRatings": costumer_ratings,
                                 "NumOfReviews": reviews,
                                 "BookLink": book_link,
                                 "Comments": comments}
        book_id += 1
        pbar_book.update(1)
        if book_id >= max_books:                                            # ">=": the old "> 300" stored 301 books
            break
    pbar_page.update(1)
    if book_id >= max_books:
        print("Scraping data from site is done!")
        break
    # Follow the pagination link. find() returns None when the link is absent,
    # so stop cleanly instead of failing on None["href"].
    next_link = soup.find(attrs={"aria-label": f"Go to page {page_number}"})
    if next_link is None or not next_link.get("href"):
        print(f"No link to page {page_number} found, stopping.")
        break
    url = urljoin("https://www.amazon.com/", next_link["href"])             # no "//" for hrefs starting with "/"
pbar_page.close()
pbar_book.close()

In [None]:
# Scrape the Amazon search results for "data analytics books" into `data_analytics`.
# The loop fetches four result pages, following the "Go to page N" link after each one.
data_analytics = {}                                                         # book_id -> scraped fields of one book
url = "https://www.amazon.com/s?k=data+analytics+books&crid=YS0ZZ9RZH5I9&qid=1674379655&sprefix=data+analytics+books%2Caps%2C307&ref=sr_pg_1"
next_pages = range(2, 6)                                                    # N used in the "Go to page N" links
max_books = 300                                                             # stop once this many books are stored
book_id = 0
# Bar totals are derived from the loop bounds so the bars cannot overrun.
pbar_page = tqdm(position=0, desc='Page bar', total=len(next_pages))
pbar_book = tqdm(position=1, desc='Book bar', total=max_books, leave=True)
for page_number in next_pages:                                              # loop for number of pages
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, "html.parser")
    soup = BeautifulSoup(soup.prettify(), "html.parser")                    # re-parse the prettified page html
    book_links = get_books_links(soup)                                      # all book urls on this result page
    for book_link in book_links:                                            # iterate through each book
        book_page = requests.get(book_link, headers=headers)
        book_soup = BeautifulSoup(book_page.content, "html.parser")
        book_soup = BeautifulSoup(book_soup.prettify(), "html.parser")      # re-parse the prettified book html
        title = get_title(book_soup)                                        # extract the fields
        price = get_price(book_soup)
        authors = get_author(book_soup)
        language, pages = get_product_details(book_soup)
        costumer_ratings = get_costumer_ratings(book_soup)
        reviews = get_total_review(book_soup)
        comments = get_costumer_comments(book_soup)
        data_analytics[book_id] = {"Title": title,                          # save scraped data into dictionary
                                   "Authors": authors,
                                   "Price": price,
                                   "Language": language,
                                   "NumOfPages": pages,
                                   "CostumerRatings": costumer_ratings,
                                   "NumOfReviews": reviews,
                                   "BookLink": book_link,
                                   "Comments": comments}
        book_id += 1
        pbar_book.update(1)
        if book_id >= max_books:                                            # ">=": the old "> 300" stored 301 books
            break
    pbar_page.update(1)
    if book_id >= max_books:
        print("Scraping data from site is done!")
        break
    # Follow the pagination link. find() returns None when the link is absent,
    # so stop cleanly instead of failing on None["href"].
    next_link = soup.find(attrs={"aria-label": f"Go to page {page_number}"})
    if next_link is None or not next_link.get("href"):
        print(f"No link to page {page_number} found, stopping.")
        break
    url = urljoin("https://www.amazon.com/", next_link["href"])             # no "//" for hrefs starting with "/"
pbar_page.close()
pbar_book.close()

In [None]:
# Scrape the Amazon search results for "machine learning books" into `machine_learning`.
# The loop fetches four result pages, following the "Go to page N" link after each one.
machine_learning = {}                                                       # book_id -> scraped fields of one book
url = "https://www.amazon.com/s?k=machine+learning+books&crid=3QIJGWAEPCFZC&qid=1674379714&sprefix=machine+learningbooks%2Caps%2C267&ref=sr_pg_1"
next_pages = range(2, 6)                                                    # N used in the "Go to page N" links
max_books = 300                                                             # stop once this many books are stored
book_id = 0
# Bar totals are derived from the loop bounds so the bars cannot overrun.
pbar_page = tqdm(position=0, desc='Page bar', total=len(next_pages))
pbar_book = tqdm(position=1, desc='Book bar', total=max_books, leave=True)
for page_number in next_pages:                                              # loop for number of pages
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, "html.parser")
    soup = BeautifulSoup(soup.prettify(), "html.parser")                    # re-parse the prettified page html
    book_links = get_books_links(soup)                                      # all book urls on this result page
    for book_link in book_links:                                            # iterate through each book
        book_page = requests.get(book_link, headers=headers)
        book_soup = BeautifulSoup(book_page.content, "html.parser")
        book_soup = BeautifulSoup(book_soup.prettify(), "html.parser")      # re-parse the prettified book html
        title = get_title(book_soup)                                        # extract the fields
        price = get_price(book_soup)
        authors = get_author(book_soup)
        language, pages = get_product_details(book_soup)
        costumer_ratings = get_costumer_ratings(book_soup)
        reviews = get_total_review(book_soup)
        comments = get_costumer_comments(book_soup)
        machine_learning[book_id] = {"Title": title,                        # save scraped data into dictionary
                                     "Authors": authors,
                                     "Price": price,
                                     "Language": language,
                                     "NumOfPages": pages,
                                     "CostumerRatings": costumer_ratings,
                                     "NumOfReviews": reviews,
                                     "BookLink": book_link,
                                     "Comments": comments}
        book_id += 1
        pbar_book.update(1)
        if book_id >= max_books:                                            # ">=": the old "> 300" stored 301 books
            break
    pbar_page.update(1)
    if book_id >= max_books:
        print("Scraping data from site is done!")
        break
    # Follow the pagination link. find() returns None when the link is absent,
    # so stop cleanly instead of failing on None["href"].
    next_link = soup.find(attrs={"aria-label": f"Go to page {page_number}"})
    if next_link is None or not next_link.get("href"):
        print(f"No link to page {page_number} found, stopping.")
        break
    url = urljoin("https://www.amazon.com/", next_link["href"])             # no "//" for hrefs starting with "/"
pbar_page.close()
pbar_book.close()

In [None]:
# Scrape the Amazon search results for "data engineering books" into `data_engineering`.
# Author's note: here each result page lists 22 books, so more result pages are
# walked than for the other topics (up to 18, following "Go to page N" links).
data_engineering = {}                                                       # book_id -> scraped fields of one book
url = "https://www.amazon.com/s?k=data+engineering+books&crid=23GGSOIXPQKBV&qid=1674379745&sprefix=data+engineering+books%2Caps%2C261&ref=sr_pg_1"
next_pages = range(2, 20)                                                   # N used in the "Go to page N" links
max_books = 300                                                             # stop once this many books are stored
book_id = 0
# Bar totals are derived from the loop bounds so the bars cannot overrun.
pbar_page = tqdm(position=0, desc='Page bar', total=len(next_pages))
pbar_book = tqdm(position=1, desc='Book bar', total=max_books, leave=True)
for page_number in next_pages:                                              # loop for number of pages
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, "html.parser")
    soup = BeautifulSoup(soup.prettify(), "html.parser")                    # re-parse the prettified page html
    book_links = get_books_links(soup)                                      # all book urls on this result page
    for book_link in book_links:                                            # iterate through each book
        book_page = requests.get(book_link, headers=headers)
        book_soup = BeautifulSoup(book_page.content, "html.parser")
        book_soup = BeautifulSoup(book_soup.prettify(), "html.parser")      # re-parse the prettified book html
        title = get_title(book_soup)                                        # extract the fields
        price = get_price(book_soup)
        authors = get_author(book_soup)
        language, pages = get_product_details(book_soup)
        costumer_ratings = get_costumer_ratings(book_soup)
        reviews = get_total_review(book_soup)
        comments = get_costumer_comments(book_soup)
        data_engineering[book_id] = {"Title": title,                        # save scraped data into dictionary
                                     "Authors": authors,
                                     "Price": price,
                                     "Language": language,
                                     "NumOfPages": pages,
                                     "CostumerRatings": costumer_ratings,
                                     "NumOfReviews": reviews,
                                     "BookLink": book_link,
                                     "Comments": comments}
        book_id += 1
        pbar_book.update(1)
        if book_id >= max_books:                                            # ">=": the old "> 300" stored 301 books
            break
    pbar_page.update(1)
    if book_id >= max_books:
        print("Scraping data from site is done!")
        break
    # Follow the pagination link. find() returns None when the link is absent,
    # so stop cleanly instead of failing on None["href"].
    next_link = soup.find(attrs={"aria-label": f"Go to page {page_number}"})
    if next_link is None or not next_link.get("href"):
        print(f"No link to page {page_number} found, stopping.")
        break
    url = urljoin("https://www.amazon.com/", next_link["href"])             # no "//" for hrefs starting with "/"
pbar_page.close()
pbar_book.close()

### Create Dataframe & Save Data

In [2]:
# Takes the dictionaries filled by the scraping loops, builds one dataframe,
# does a little cleaning and saves it.
def dataframe_saver(data_science, data_analytics, machine_learning, data_engineering):
    """Combine the four scraped dictionaries into one frame and write it to Excel.

    Each argument maps ``book_id -> {field: value}``. The resulting frame has
    one row per book, a ``search_word`` column naming the topic it came from,
    and is written to ``Data/data.xlsx`` (the ``Data`` directory is created if
    it does not exist).

    Args:
        data_science: Records scraped for "data science books".
        data_analytics: Records scraped for "data analytics books".
        machine_learning: Records scraped for "machine learning books".
        data_engineering: Records scraped for "data engineering books".

    Returns:
        None. The frame is saved as a side effect.
    """
    topics = (
        ("Data Science", data_science),
        ("Data Analytics", data_analytics),
        ("Machine Learning", machine_learning),
        ("Data Engineering", data_engineering),
    )
    # Tag each topic's own frame; the old code wrote to `df` before it existed.
    frames = [
        pd.DataFrame(records).T.assign(search_word=label)
        for label, records in topics
    ]
    df = pd.concat(frames)

    def _join_items(value):
        # Only real sequences are joined: joining a plain string such as the
        # title would insert the separator between every character.
        if isinstance(value, (list, tuple)):
            return "  ".join(map(str, value)).strip()
        return value

    for column in ("Title", "Authors", "CostumerRatings"):                  # turn lists into strings
        if column in df.columns:
            df[column] = df[column].apply(_join_items)

    for column in df.columns:                                               # replace "" with np.nan
        # A single .loc call with a positional boolean mask: chained indexing
        # (df.loc[mask][column] = ...) assigns to a temporary copy and is lost.
        is_empty = (df[column] == "").to_numpy()
        df.loc[is_empty, column] = np.nan

    if "Language" in df.columns:
        # Author's assumption: every book with a known language is English.
        df["Language"] = df["Language"].fillna("English")

    out_path = Path("Data") / "data.xlsx"                                   # portable; "Data\data.xlsx" was Windows-only
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_excel(out_path)