In [None]:
# BeautifulSoup is a library for parsing HTML text.
# It makes it easy to traverse and search an HTML document from within Python.

In [None]:
# We need a web client to fetch a page from the Internet; for this we use the standard-library package urllib.
# Inside urllib there is a module called request, and inside that module is a function called urlopen.

In [1]:
import csv
from urllib.request import urlopen as uReq

In [2]:
from bs4 import BeautifulSoup as soup

In [3]:
# Newegg category listing page that will be downloaded and parsed below.
my_url = (
    "https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38"
    "?Tpk=graphics%20card"
)

In [4]:
# Echo the URL so the exact address being scraped is visible in the notebook.
my_url

'https://www.newegg.com/Video-Cards-Video-Devices/Category/ID-38?Tpk=graphics%20card'

In [5]:
# Download the page in a single step: open a connection to my_url, read the
# whole response body into page_html (raw bytes), and close the connection.
# The `with` block closes the connection even if read() raises, so no
# separate close() cell is needed and the socket cannot be left open.

with uReq(my_url) as uClient:
    page_html = uClient.read()

In [8]:
# Parse the downloaded markup with Python's built-in "html.parser" backend
# so the page can be navigated as a tree of tags.

page_soup = soup(page_html, features="html.parser")

In [9]:
# Attribute-style access: page_soup.h1 looks up an <h1> tag in the parsed page.
# Here it is the page heading shown in the output below.

page_soup.h1

<h1 class="page-title-text">Video Cards &amp; Video Devices</h1>

In [10]:
# Same attribute-style lookup for a <p> tag; here it is the site tagline
# paragraph shown in the output below.

page_soup.p

<p>Newegg.com - A great place to buy computers, computer parts, electronics, software, accessories, and DVDs online. With great prices, fast shipping, and top-rated customer service - Newegg shopping upgraded ™</p>

In [11]:
# Grab every product listing on the page: each product lives in a
# <div class="item-container">. find_all is the current method name; the
# camelCase findAll spelling is a deprecated BeautifulSoup 3 alias.

containers = page_soup.find_all("div", class_="item-container")

In [12]:
# Sanity check: count how many product containers were matched on this page.

len(containers)

12

In [13]:
# Take the first product container as a sample, so its structure can be
# explored interactively before looping over all containers.

container = containers[0]

In [14]:
# Look up an <a> tag inside the sample container. Per the output below it is
# the product image link (class "item-img") wrapping the product <img>.

container.a

<a class="item-img" href="https://www.newegg.com/gigabyte-radeon-rx-5700-xt-gv-r57xtgaming-oc-8gd/p/N82E16814932208?Item=N82E16814932208">
<div class="item-badges">
</div>
<img alt="GIGABYTE Radeon RX 5700 XT GAMING OC 8G Graphics Card, PCIe 4.0, 8GB 256-Bit GDDR6, GV-R57XTGAMING OC-8GD Video Card" class="lazy-img" data-effect="fadeIn" data-src="//c1.neweggimages.com/NeweggImage/ProductImageCompressAll300/14-932-208-V10.jpg" src="//c1.neweggimages.com/WebResource/Themes/2005/Nest/blank.gif" title="GIGABYTE Radeon RX 5700 XT GAMING OC 8G Graphics Card, PCIe 4.0, 8GB 256-Bit GDDR6, GV-R57XTGAMING OC-8GD Video Card">
</img></a>

In [15]:
# Look up a <div> tag inside the sample container. Per the output below it is
# the (empty) "item-badges" div nested inside the image link.

container.div

<div class="item-badges">
</div>

In [16]:
# Attribute lookups can be chained: this is the <img> tag nested inside the
# <a> tag of the sample container.

container.a.img

<img alt="GIGABYTE Radeon RX 5700 XT GAMING OC 8G Graphics Card, PCIe 4.0, 8GB 256-Bit GDDR6, GV-R57XTGAMING OC-8GD Video Card" class="lazy-img" data-effect="fadeIn" data-src="//c1.neweggimages.com/NeweggImage/ProductImageCompressAll300/14-932-208-V10.jpg" src="//c1.neweggimages.com/WebResource/Themes/2005/Nest/blank.gif" title="GIGABYTE Radeon RX 5700 XT GAMING OC 8G Graphics Card, PCIe 4.0, 8GB 256-Bit GDDR6, GV-R57XTGAMING OC-8GD Video Card">
</img>

In [17]:
# Tag attributes are read with dictionary-style indexing. This returns the
# image's "title" attribute, which per the output below holds the full
# product description.

container.a.img["title"]

'GIGABYTE Radeon RX 5700 XT GAMING OC 8G Graphics Card, PCIe 4.0, 8GB 256-Bit GDDR6, GV-R57XTGAMING OC-8GD Video Card'

In [18]:
# Find every <a class="item-title"> inside the sample container; the result
# is a list of matching tags. find_all replaces the deprecated BeautifulSoup 3
# alias findAll.

title_container = container.find_all("a", class_="item-title")
title_container

[<a class="item-title" href="https://www.newegg.com/gigabyte-radeon-rx-5700-xt-gv-r57xtgaming-oc-8gd/p/N82E16814932208?Item=N82E16814932208" title="View Details">GIGABYTE Radeon RX 5700 XT GAMING OC 8G Graphics Card, GV-R57XTGAMING OC-8GD</a>]

In [20]:
# The visible text of the first matching title link, i.e. the product name.
title_container[0].text

'GIGABYTE Radeon RX 5700 XT GAMING OC 8G Graphics Card, GV-R57XTGAMING OC-8GD'

In [22]:
# Find every <li class="price-ship"> inside the sample container; the result
# is a list of matching tags. find_all replaces the deprecated BeautifulSoup 3
# alias findAll.

shipping_container = container.find_all("li", class_="price-ship")
shipping_container

[<li class="price-ship">
         Free Shipping
     </li>]

In [23]:
# Raw text of the first shipping entry. As the output below shows, it still
# carries the surrounding line breaks and indentation from the HTML source.

shipping_container[0].text

'\r\n        Free Shipping\r\n    '

In [24]:
# strip() removes the leading and trailing whitespace (including the \r\n
# line breaks), leaving only the shipping label.

shipping_container[0].text.strip()

'Free Shipping'

In [26]:
# Export brand, product name and shipping text for every product container
# to a CSV file.
#
# Everything happens in one cell so it can be re-run safely: the file is
# opened, written and closed by a single `with` block, and it is closed even
# if a row raises part-way through. csv.writer quotes any field that contains
# a comma, so values are stored as-is instead of having their commas removed,
# and the header row no longer carries stray spaces in the column names.
#
# NOTE(review): "brand" is the product image's title attribute, which holds
# the full product description in the output below rather than a manufacturer
# name -- confirm the intended selector.

filename = "products.csv"
headers = ["brand", "product_name", "shipping"]


def _tag_text(tag):
    """Return the stripped text of a tag, or "" when the tag was not found."""
    return tag.text.strip() if tag is not None else ""


# newline="" lets the csv module control line endings; an explicit utf-8
# encoding avoids depending on the platform's default encoding.
with open(filename, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(headers)

    for container in containers:
        # A container may lack an image, a title link or a shipping entry;
        # write an empty field for the missing piece instead of raising.
        image = container.a.img if container.a is not None else None
        brand = image.get("title", "") if image is not None else ""
        product_name = _tag_text(container.find("a", class_="item-title"))
        shipping = _tag_text(container.find("li", class_="price-ship"))

        print("brand: " + brand)
        print("product_name: " + product_name)
        print("shipping: " + shipping)

        writer.writerow([brand, product_name, shipping])

brand: GIGABYTE Radeon RX 5700 XT GAMING OC 8G Graphics Card, PCIe 4.0, 8GB 256-Bit GDDR6, GV-R57XTGAMING OC-8GD Video Card
product_name: GIGABYTE Radeon RX 5700 XT GAMING OC 8G Graphics Card, GV-R57XTGAMING OC-8GD
shipping: Free Shipping
brand: MSI GeForce GTX 1660 SUPER DirectX 12 GTX 1660 SUPER VENTUS XS OC 6GB 192-Bit GDDR6 PCI Express 3.0 x16 HDCP Ready Video Card
product_name: MSI GeForce GTX 1660 SUPER DirectX 12 GTX 1660 SUPER VENTUS XS OC Video Card
shipping: Free Shipping
brand: EVGA GeForce GTX 1660 Ti SC ULTRA GAMING, 06G-P4-1667-KR, 6GB GDDR6, Dual Fan, Metal Backplate
product_name: EVGA GeForce GTX 1660 Ti SC ULTRA GAMING, 06G-P4-1667-KR, 6GB GDDR6, Dual Fan, Metal Backplate
shipping: Free Shipping
brand: ASUS ROG STRIX AMD Radeon RX 5700 XT Overclocked 8G GDDR6 HDMI DisplayPort Gaming Graphics Card (ROG-STRIX-RX5700XT-O8G-GAMING)
product_name: ASUS ROG Strix Radeon RX 5700 XT ROG-STRIX-RX5700XT-O8G-GAMING Video Card
shipping: Free Shipping
brand: MSI GeForce RTX 2060 Dir