# Data Engineering Project

In [2]:
!pip install bs4 requests pandas




In [63]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

### Figuring out all the tags and attributes we are going to need for scraping data

In [64]:
# Amazon India search-results URL for the query "gcp professional data engineering books".
URL = "https://www.amazon.in/s?k=gcp+professional+data+engineering+books&crid=3M8UI1BF4LLQ&sprefix=data+engineering+book%2Caps%2C374&ref=nb_sb_ss_ts-doa-p_1_21"


In [65]:
# Request headers. The User-Agent value is blank in this cell; presumably a real browser
# user-agent string (as reported by https://www.whatismybrowser.com) belongs here — confirm.

Headers = {'User-Agent':'','Accept-Language':'en-US, en;q=0.5'}

In [67]:
# Fetch the search-results page with the custom headers; the bare name below displays the response.
htmlpage = requests.get(URL,headers=Headers)
htmlpage

<Response [200]>

In [68]:
# Parse the raw response body with Python's built-in 'html.parser' backend.
mySoup = BeautifulSoup(htmlpage.content,'html.parser')

In [70]:
# Collect every <a> tag whose class attribute matches the string below.
# find_all is the current Beautiful Soup method name; findAll is only a legacy alias.
links = mySoup.find_all('a',attrs={'class':'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal'})

In [71]:
# Take the href of the first matched anchor (raises IndexError if no anchor was matched).
link = links[0].get('href')

In [72]:
# Build an absolute product URL by prefixing the scraped href with a host.
# NOTE(review): the href comes from an amazon.in results page but is joined to the
# amazon.com host here — confirm that this is intended.
product_link = "https://www.amazon.com" + link
product_link

'https://www.amazon.com/Official-Google-Certified-Professional-Engineer/dp/1119618436/ref=sr_1_1?crid=3M8UI1BF4LLQ&keywords=gcp+professional+data+engineering+books&qid=1689145504&sprefix=data+engineering+book%2Caps%2C374&sr=8-1'

In [81]:
# Request the product page itself, reusing the same headers as the search request.
new_webpage = requests.get(product_link,headers=Headers)

In [115]:
# Show the HTTP status code of the product-page response.
new_webpage.status_code

200

In [173]:
# Parse the product page so individual fields can be looked up by tag and attribute.
new_soup = BeautifulSoup(new_webpage.content,'html.parser')

In [84]:
# Product title: text of the <span id="productTitle"> element, surrounding whitespace removed.
new_soup.find('span',attrs={'id':'productTitle'}).text.strip()

'Official Google Cloud Certified Professional Data Engineer Study Guide'

In [85]:
# Product price: text of the <span id="price"> element, surrounding whitespace removed.
new_soup.find('span',attrs={'id':'price'}).text.strip()

'$33.49'

In [100]:
# Star rating: text of the <span class="a-icon-alt"> nested inside <span id="acrPopover">.
new_soup.find('span',attrs={'id':'acrPopover'}).find('span',attrs = {'class':'a-icon-alt'}).text

'4.5 out of 5 stars'

In [102]:
# Review count: text of the <span id="acrCustomerReviewText"> element.
new_soup.find('span',attrs={'id':'acrCustomerReviewText'}).text

'208 ratings'

In [104]:
# Availability: inside <div id="availability">, the text of the span whose class is
# "a-size-medium a-color-success", surrounding whitespace removed.
new_soup.find('div',attrs={'id':'availability'}).find('span',attrs={'class':'a-size-medium a-color-success'}).text.strip()

'In Stock'

### Systematic code
--------------------

- ##### Utility Functions

In [105]:
def get_title(soup):
    """
    Extract the product title from a parsed product page.

    Looks up the <span> whose id is 'productTitle' and returns its text with
    surrounding whitespace removed. Returns an empty string when the lookup
    raises AttributeError (for example, when no such span is found).
    """
    try:
        title_span = soup.find('span', attrs={'id': 'productTitle'})
        return title_span.text.strip()
    except AttributeError:
        return ""

def get_price(soup):
    """
    Extract the product price from a parsed product page.

    Returns the whitespace-stripped text of the <span> whose id is 'price',
    or an empty string when the lookup raises AttributeError.
    """
    price_selector = {'id': 'price'}
    try:
        return soup.find('span', attrs=price_selector).text.strip()
    except AttributeError:
        return ""

def get_rating(soup):
    """
    Extract the star-rating label from a parsed product page.

    Finds the <span> with id 'acrPopover', then the <span> with class
    'a-icon-alt' inside it, and returns that element's text unchanged (no
    stripping). Returns an empty string when either lookup raises
    AttributeError.
    """
    try:
        popover = soup.find('span', attrs={'id': 'acrPopover'})
        star_label = popover.find('span', attrs={'class': 'a-icon-alt'})
        return star_label.text
    except AttributeError:
        return ""

def get_review_count(soup):
    """
    Extract the review-count label from a parsed product page.

    Returns the text of the <span> whose id is 'acrCustomerReviewText'
    unchanged (no stripping), or an empty string when the lookup raises
    AttributeError.
    """
    product_review_count = ''
    try:
        product_review_count = soup.find('span', attrs={'id': 'acrCustomerReviewText'}).text
    except AttributeError:
        # Leave the default empty string in place.
        pass
    return product_review_count

def get_availability(soup):
    """
    This function scrapes the availability status of the product.

    Finds the <div> with id 'availability', then the <span> with class
    'a-size-medium a-color-success' inside it, and returns that element's
    text with surrounding whitespace removed. Returns an empty string when
    either lookup raises AttributeError (for example, when an element is
    not found).
    """

    try:
        availability_block = soup.find('div', attrs={'id': 'availability'})
        status_span = availability_block.find('span', attrs={'class': 'a-size-medium a-color-success'})
        product_availability = status_span.text.strip()
    except AttributeError:
        # Only the "element missing" case is treated as best-effort, matching the
        # other get_* helpers; interrupts and unrelated errors are not swallowed.
        product_availability = ''

    return product_availability
        

- ##### Initializing Attributes

In [155]:
import os
from dotenv import load_dotenv

# Load environment variables from the project's .env file (USER_AGENT is read below).
# NOTE(review): this is an absolute, machine-specific path.
load_dotenv('/mnt/d/code/data-engineering-projects/Web-Scrapping-Project/.env')

# Headers sent with every request; the User-Agent value comes from the environment.
HEADERS = {
    'User-Agent': os.getenv('USER_AGENT'),
    'Accept-Language': 'en-US, en;q=0.5',
}

# Search-results URL prefix; a page number is appended when requesting.
URL = "https://www.amazon.in/s?k=headphones&page="

- ##### Fetching the HTML Web Pages 

In [167]:
# Request search-result pages 1 through 19 and keep only the responses that came back
# with HTTP 200; every status code is printed as it arrives.
webpages = []

for page_number in range(1, 20):
    webpage = requests.get(URL + str(page_number), headers=HEADERS)
    print(webpage.status_code)
    if webpage.status_code == 200:
        webpages.append(webpage)

webpages


200
503
503
200
200
503
503
503
503
200
200
503
200
503
503
200
503
503
503


[<Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>,
 <Response [200]>]

- ##### Creating BeautifulSoup object for parsing webpage

In [168]:
# Gather the matching anchor tags from every successfully fetched search page.
links = []

for webpage in webpages:
    # Parse the raw response body into a searchable tree.
    soup = BeautifulSoup(webpage.content, "html.parser")

    # Anchors are returned as Tag objects; hrefs are extracted in a later step.
    page_anchors = soup.find_all("a", attrs={'class': 'a-link-normal s-no-outline'})
    links.extend(page_anchors)

links

[<a class="a-link-normal s-no-outline" href="/Official-Google-Certified-Professional-Engineer/dp/1119618436/ref=sr_1_1?crid=3M8UI1BF4LLQ&amp;keywords=gcp+professional+data+engineering+books&amp;qid=1689155655&amp;sprefix=data+engineering+book%2Caps%2C374&amp;sr=8-1" target="_blank"><div class="a-section aok-relative s-image-fixed-height"><img alt="GOOGLE CLOUD CERTIFIED PROFESSIONAL DATA" class="s-image" data-image-index="1" data-image-latency="s-product-image" data-image-load="" data-image-source-density="1" src="https://m.media-amazon.com/images/I/71rBrgbl9YL._AC_UY218_.jpg" srcset="https://m.media-amazon.com/images/I/71rBrgbl9YL._AC_UY218_.jpg 1x, https://m.media-amazon.com/images/I/71rBrgbl9YL._AC_UY327_QL65_.jpg 1.5x, https://m.media-amazon.com/images/I/71rBrgbl9YL._AC_UY436_QL65_.jpg 2x, https://m.media-amazon.com/images/I/71rBrgbl9YL._AC_UY545_QL65_.jpg 2.5x, https://m.media-amazon.com/images/I/71rBrgbl9YL._AC_UY654_QL65_.jpg 3x"/></div></a>,
 <a class="a-link-normal s-no-outlin

- ##### Filtering all the product links present in webpage

In [169]:
# Store the links
links_list = []

# Loop for extracting links from Tag Objects
for link in links:
        links_list.append(link.get('href'))
links_list

['/Official-Google-Certified-Professional-Engineer/dp/1119618436/ref=sr_1_1?crid=3M8UI1BF4LLQ&keywords=gcp+professional+data+engineering+books&qid=1689155655&sprefix=data+engineering+book%2Caps%2C374&sr=8-1',
 '/Google-Cloud-Platform-Data-Engineering-ebook/dp/B07ZGB4F7T/ref=sr_1_2?crid=3M8UI1BF4LLQ&keywords=gcp+professional+data+engineering+books&qid=1689155655&sprefix=data+engineering+book%2Caps%2C374&sr=8-2',
 '/Data-Engineering-Google-Cloud-Platform/dp/1800561326/ref=sr_1_3?crid=3M8UI1BF4LLQ&keywords=gcp+professional+data+engineering+books&qid=1689155655&sprefix=data+engineering+book%2Caps%2C374&sr=8-3',
 '/Google-Professional-Engineer-Certification-Incredibly-ebook/dp/B0B4XW1Q9R/ref=sr_1_4?crid=3M8UI1BF4LLQ&keywords=gcp+professional+data+engineering+books&qid=1689155655&sprefix=data+engineering+book%2Caps%2C374&sr=8-4',
 '/professional-engineer-mondaishu-mogimondaisyu-Japanese-ebook/dp/B09RHX4NQ4/ref=sr_1_5?crid=3M8UI1BF4LLQ&keywords=gcp+professional+data+engineering+books&qid=1689

- ##### Now fetching all required data for every product

In [170]:
# Column-oriented store: one list per output column, appended to in lockstep.
d = {"title":[], "price":[], "rating":[], "reviews":[],"availability":[]}

# Loop for extracting product details from each link
for link in links_list:
    product_url = "https://www.amazon.in" + link
    new_webpage = requests.get(product_url, headers=HEADERS)

    # Re-request while the server does not answer 200, at most 10 more times.
    # The counter is decremented inside the loop so a page that keeps failing
    # cannot spin forever.
    retry_count = 10
    while retry_count and new_webpage.status_code != 200:
        new_webpage = requests.get(product_url, headers=HEADERS)
        retry_count -= 1

    # Decide on the final status rather than the counter, so a success on the
    # very last retry is still used.
    if new_webpage.status_code != 200:
        print('Unable to fetch data for {}, try after sometime!!'.format(link))
        continue

    new_soup = BeautifulSoup(new_webpage.content, "html.parser")

    # Function calls to collect all necessary product information
    d['title'].append(get_title(new_soup))
    d['price'].append(get_price(new_soup))
    d['rating'].append(get_rating(new_soup))
    d['reviews'].append(get_review_count(new_soup))
    d['availability'].append(get_availability(new_soup))


- ##### Creating Pandas Data Frame 

In [171]:
# Build the table from the scraped columns.
amazon_df = pd.DataFrame.from_dict(d)
# Mark empty titles as missing. The result is assigned back to the column: calling
# replace(..., inplace=True) on a column selection is a chained in-place operation
# that does not update the frame under pandas copy-on-write.
amazon_df['title'] = amazon_df['title'].replace('', np.nan)
# Drop products whose title could not be scraped, then persist the result.
amazon_df = amazon_df.dropna(subset=['title'])
amazon_df.to_csv("amazon_data.csv", header=True, index=False)

### Final Result

In [172]:
# Display the resulting table.
amazon_df

Unnamed: 0,title,price,rating,reviews,availability
0,GOOGLE CLOUD CERTIFIED PROFESSIONAL DATA,"₹3,136.00",4.5 out of 5 stars,204 ratings,In stock
1,Google Cloud Platform for Data Engineering: Le...,,3.3 out of 5 stars,3 ratings,
2,Data Engineering with Google Cloud Platform: A...,"₹3,228.00",4.7 out of 5 stars,13 ratings,In stock
4,google cloud professional data engineer mogi m...,,3.7 out of 5 stars,9 ratings,
6,Professional Cloud Architect – Google Cloud Ce...,"₹2,601.00",4.3 out of 5 stars,55 ratings,In stock
...,...,...,...,...,...
106,"Big Data and Analytics, 2ed | IM | BS | e",₹568.00,4.4 out of 5 stars,96 ratings,In stock
107,BigQuery for Data Warehousing: Managed Data An...,"₹4,521.00",3.2 out of 5 stars,5 ratings,In stock
109,Data Analytics with Google Cloud Platform,₹719.00,3.7 out of 5 stars,32 ratings,In stock
110,"Software Engineering, 10/e",₹689.00,4.2 out of 5 stars,169 ratings,
