# WEBSCRAPING - INTRODUCTION

In [None]:
# Installing packages - one to read and navigate HTML content easily (BeautifulSoup) and getting web pages from the internet (requests)
from bs4 import BeautifulSoup
import requests

In [None]:
# Defining the URL of the website you want to scrape
url ='https://www.scrapethissite.com/pages/ajax-javascript/#2015'

In [None]:
page = requests.get(url) 
page
# response 404 : Server cannot be found
# response 204 : No content on the web page
# repsonse 100 : bad request
# response 200 : OK – The request has succeeded

In [None]:
soup = BeautifulSoup(page.text , 'html')

In [None]:
 # Printing the raw HTML content of the page
print(soup)

# For below code - <title>: title of the webpage, <meta name="description">: brief description of the page's content, <ul class="nav nav-tabs"> - navigation bar with tabs, <table class="table"> - table in web page

In [None]:
# Printing the HTML content of the page formatted nicely
print(soup.prettify())

# WEBSCRAPING - WIKIPEDIA (THE WORLD'S BILLIONAIRES)

In [None]:
# Installing packages
from bs4 import BeautifulSoup
import requests

In [None]:
url = 'https://en.wikipedia.org/wiki/The_World%27s_Billionaires'

page = requests.get(url)
page

In [None]:
soup = BeautifulSoup(page.text, 'html')

In [None]:
print(soup)

In [None]:
# find() is used to locate particular elements in HTML doc. We are finding table in this case
# find() function gets the first match
soup.find('table')

In [None]:
# While find_all() function collects all matches, Extracts all occurrences of the third table element
soup.find_all('table')[2]

In [None]:
# Finds the first table element with the class "wikitable sortable" 
soup.find('table', class_ = 'wikitable sortable')

In [None]:
table_3rd = soup.find_all('table')[2]

In [None]:
print(table_3rd)

# In the below code, <th> stands for table header, <td>: Stands for table data, <tr> stands for table row

In [None]:
# Finding all the headings in the 3rd table
world_billionaires = table_3rd.find_all('th')

In [None]:
world_billionaires

In [None]:
# Extracting the text content with '.text' and removing whitespace with '.strip()''
world_table_billionaires = [title.text.strip() for title in world_billionaires]

print(world_table_billionaires)

In [None]:
import pandas as pd

In [None]:
df = pd.DataFrame(columns = world_table_billionaires)

df

In [None]:
column_data = table_3rd.find_all('tr')

In [None]:
# Extracting the text from each 'td' tag, striping any whitespace to compile all the rows of the table
for row in column_data[1:]:
    row_data = row.find_all('td')
    individual_rows = [data.text.strip() for data in row_data]
    print(individual_rows)

In [None]:
length = len(df)
df.loc[length] = individual_rows

In [None]:
# Saving the table in csv format - path provided
import csv

csv_file_path = r'C:\Users\14086\Downloads\World_Billionaires.csv'

# Write the first row using the list world_table_billionaires - titles, add other rows - individual_rows above
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(world_table_billionaires) 

    for row in column_data[1:]: 
        row_data = row.find_all('td')
        individual_rows = [data.text.strip() for data in row_data]
        writer.writerow(individual_rows)


# AMAZON - WEBSCRAPING + SELENIUM

In [16]:
# Installing packages - selenium - automating web browser interaction,pandas - data manipulation, beautiful soup - parsing HTML content
!pip install selenium chromedriver-autoinstaller pandas beautifulsoup4

# Importing required libraries
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
import chromedriver_autoinstaller
from IPython.display import display

# Automatically checks for the ChromeDriver version and installs it if not present or outdated.
chromedriver_autoinstaller.install()

# Chrome options - Webdriver for faster and smoother execution of tasks - based on tips 
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless') 
chrome_options.add_argument('--no-sandbox') 
chrome_options.add_argument('--disable-dev-shm-usage') 



In [17]:
# This function is created to format a search term into an Amazon search URL by replacing spaces with plus signs
def get_url(search_term):
    search_term = search_term.replace(' ', '+')
    return f"https://www.amazon.com/s?k={search_term}"

In [18]:
# We have set search_term as 'monitor', to create an Amazon search URL
search_term = 'monitor'
url = get_url(search_term)
print(url)

https://www.amazon.com/s?k=monitor


In [19]:
# Initialize Selenium WebDriver which navigates to the specified web page
driver = webdriver.Chrome()
driver.get(url)

# Reading The HTML website and finding all product listings in the Amazon search results for monitor
soup = BeautifulSoup(driver.page_source, 'html.parser')
results = soup.find_all('div', {'data-component-type': 's-search-result'})

# Printing the no. of search results found
print(len(results))

0


In [20]:
import re
from bs4 import BeautifulSoup

# The extract_record function parses each Amazon product listing and extracts key details

def extract_record(item):
    # Extracting Description -  an <h2> tag that contains an <a> tag
    description = item.h2.a.text.strip()

    # Extracting URL - finds the href attribute of the same <a> tag
    url = "https://www.amazon.com" + item.h2.a.get('href')

    # Extracting Price - finds the whole number part of the price within a span element and the fractional part within a span with the class a-price-fraction
    # item.find(name, attrs) function - name - name of the tag, attrs - eg, class, id
    price = item.find('span', 'a-price-whole')
    price_fraction = item.find('span', 'a-price-fraction')
    if price and price_fraction:
        price_whole = price.text.strip().replace(',', '')
        # Check if price whole already contains a decimal
        if '.' in price_whole:
            price = price_whole + price_fraction.text.strip()
        else:
            price = price_whole + '.' + price_fraction.text.strip()
    else:
        price = ''

    # Extracting Ratings - <i> element with a class of a-icon-star-small is rating
    rating = item.find('i', {'class': 'a-icon-star-small'}) or ''
    if rating:
        rating = rating.text.strip()

    # Extracting Review Counts - a span with the class a-size-base
    review_count = item.find('span', {'class': 'a-size-base'}) or ''
    if review_count:
        review_count = review_count.text.strip()

    # Extracting Display Size - all span elements with the class a-text-bold for a text containing 'inches'
    display_size = ''
    features = item.find_all('span', {'class': 'a-text-bold'})
    for feature in features:
        if 'inches' in feature.text:
            display_size = feature.text.strip()
            break 

    # Extract Resolution - all span elements with the class a-text-bold
    resolution = ''
    features = item.find_all('span', {'class': 'a-text-bold'})
    for feature in features:
        # a pattern for resolution (numbers (one or more digits (\d+)) followed by 'p')
        if re.search(r'\d+p', feature.text):
            resolution = feature.text.strip()
            break 

    # Return extracted details as a dictionary
    return {
        'Description': description,
        'Price(USD)': price,
        'Rating': rating,
        'Review Count': review_count,
        'Display Size': display_size,
        'Resolution': resolution,
        'Url of the Product': url
    }


In [21]:
# Main function to search a query  - opens Chrome via Selenium, navigates to the search results, parses the page with BeautifulSoup to find products, and extracts details with extract_record
def search_query(search_term):
    driver = webdriver.Chrome(options=chrome_options)
    records = []
    url = get_url(search_term)

    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    results = soup.find_all('div', {'data-component-type': 's-search-result'})

    for item in results:
        record = extract_record(item)
        if record:
            records.append(record)

    driver.close()
    
    # Found records are saved in a dataFrame, exported to CSV
    if records:
        df = pd.DataFrame(records)
        filename = f"{search_term.replace(' ', '_')}.csv"
        df.to_csv(filename, index=False)
        print(f"Data saved to {filename}")
        display(df) # Displaying the results
    else:
        print("No records found.")


In [22]:
search_query('Monitor')

Data saved to Monitor.csv


Unnamed: 0,Description,Price(USD),Rating,Review Count,Display Size,Resolution,Url of the Product
0,"LG 22MR410-B 22-inch FHD Computer Monitor, 100...",69.99,4.5 out of 5 stars,Amazon's Choice: Overall Pick,,FHD 1080p,https://www.amazon.com/LG-22MR410-B-Computer-B...
1,PHILIPS 22 inch Class Thin Full HD (1920 x 108...,69.99,4.6 out of 5 stars,2987,21.5 inches,FHD 1080p,https://www.amazon.com/PHILIPS-Computer-Monito...
2,Sceptre New 27-inch Gaming Monitor 100Hz 1ms D...,109.97,4.6 out of 5 stars,Amazon's Choice: Popular Brand Pick,27 inches,FHD 1080p,https://www.amazon.com/Sceptre-DisplayPort-Fre...
3,Portable-Monitor-for-Laptop - 15.6'' 1080P FHD...,72.15,4.6 out of 5 stars,2504,15.6 inches,FHD 1080p,https://www.amazon.com/VILVA-Portable-Monitor-...
4,"Dell S2421HS Full HD 1920 x 1080, 24-Inch 1080...",99.99,4.6 out of 5 stars,1310,24 inches,FHD 1080p,https://www.amazon.com/Dell-S2421HS-Adjustable...
5,Sceptre Curved 24-inch Gaming Monitor 1080p R1...,89.97,4.6 out of 5 stars,20916,24 inches,FHD 1080p,https://www.amazon.com/Sceptre-Curved-Monitor-...
6,Acer Nitro KG241Y Sbiip 23.8” Full HD (1920 x ...,109.99,4.5 out of 5 stars,5188,23.8 inches,FHD 1080p,https://www.amazon.com/Acer-Monitor-FreeSync-T...
7,"SANSUI Monitor 24 inch 100Hz PC Monitor, VESA,...",78.98,4.3 out of 5 stars,3023,24 inches,FHD 1080p,https://www.amazon.com/SANSUI-Monitor-24-Ultra...
8,Dell SE2722HX Monitor - 27 inch FHD (1920 x 10...,109.99,4.6 out of 5 stars,3237,27 inches,FHD 1080p,https://www.amazon.com/Dell-inch-Monitor-1920-...
9,"Acer Nitro 31.5"" FHD 1920 x 1080 1500R Curved ...",184.2,4.6 out of 5 stars,1844,31.5 inches,FHD 1080p,https://www.amazon.com/Monitor-FreeSync-Premiu...
