In [1]:
from bs4 import BeautifulSoup

# Getting started with bs4

In [5]:
# Small in-memory HTML document used as the parsing sample in the cells below:
# a <title>, a bold heading paragraph, and a "story" paragraph with three <a> links.
html_doc = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="title">
            <b>The Dormouse's story</b>
        </p>
        <p class="story">Once upon a time there were three little sisters; and their names were
        <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
        <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
        <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
        and they lived at the bottom of a well.</p>
        <p class="story">...</p>
    </body> 
</html>
"""

In [6]:
# Parse the sample markup into a navigable tree using the 'html.parser' parser.
soup = BeautifulSoup(html_doc, 'html.parser')

In [7]:
# Print the parsed tree re-indented, one tag or text node per line.
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
        and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>



In [8]:
# Attribute-style navigation: fetch the document's <title> tag.
soup.title

<title>The Dormouse's story</title>

In [16]:
# .name is the tag's own name as a string.
soup.title.name

'title'

In [12]:
# .string is the text contained in the tag.
soup.title.string

"The Dormouse's story"

In [11]:
# .parent walks up one level; here the <title> tag sits inside <head>.
soup.title.parent.name

'head'

In [25]:
# soup.<tag> yields only the first matching tag, even though the document has three <p>.
soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [22]:
# find_all returns every <p> tag in the document as a list.
soup.find_all('p')

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
         <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
         <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
         <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
         and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [26]:
# Dict-style access reads a tag attribute; the class attribute comes back as a list.
soup.p['class']

['title']

In [27]:
# First <a> tag in the document.
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [18]:
# All <a> tags in the document, in document order.
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [29]:
# find with a keyword filter: look up the tag whose id attribute is "link3".
soup.find(id="link3")

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [30]:
# Keep a reference to the matched tag, then read one of its attributes.
element = soup.find(id="link3")
# .get reads an attribute by name; the class attribute is returned as a list.
element.get('class')

['sister']

In [31]:
# Walk every anchor tag and print the URL held in its href attribute.
for anchor in soup.find_all('a'):
    href = anchor.get('href')
    print(href)

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [32]:
# Print only the text content of the document, with all markup removed.
print(soup.get_text())




The Dormouse's story


The Dormouse's story
Once upon a time there were three little sisters; and their names were
        Elsie,
        Lacie and
        Tillie;
        and they lived at the bottom of a well.
...





# Requests & bs4

In [41]:
import requests

# Landing page of the storefront scraped in the cells below.
url = "https://shop.9arm.co/"
# requests applies no timeout by default, so a stalled connection would block
# this cell (and the kernel) indefinitely; bound the wait explicitly.
res = requests.get(url, timeout=10)

In [42]:
# HTTP status of the response.
res.status_code

200

In [43]:
# Raw HTML body of the response as a string.
# NOTE(review): this displays the whole page; consider a slice such as
# res.text[:500] to keep the notebook output short.
res.text

'<!doctype html>\n<html class="no-js" lang="en">\n  <head>\n    <meta charset="utf-8">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge">\n    <meta name="viewport" content="width=device-width,initial-scale=1">\n    <meta name="theme-color" content="">\n    <link rel="canonical" href="https://shop.9arm.co/">\n    <link rel="preconnect" href="https://cdn.shopify.com" crossorigin><link rel="icon" type="image/png" href="//shop.9arm.co/cdn/shop/files/favicon.png?crop=center&height=32&v=1680448710&width=32"><link rel="preconnect" href="https://fonts.shopifycdn.com" crossorigin><title>\n      9ARM Merchandise\n</title>\n\n    \n      <meta name="description" content="รวมสินค้า Merchandise สุดพิเศษจากนายอาร์มเท่านั้น หาพิเศษกว่านี้ไม่มีอีกแล้ว มาร่วมกันเป็นท่อน้ำเลี้ยงกันได้ที่นี่">\n    \n\n    \n\n<meta property="og:site_name" content="9ARM Merchandise">\n<meta property="og:url" content="https://shop.9arm.co/">\n<meta property="og:title" content="9ARM Merchandise">\n<meta property="

In [44]:
# Rebind html_doc to the downloaded page source; this replaces the sample
# document assigned to the same name earlier in the notebook.
html_doc = res.text
html_doc

'<!doctype html>\n<html class="no-js" lang="en">\n  <head>\n    <meta charset="utf-8">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge">\n    <meta name="viewport" content="width=device-width,initial-scale=1">\n    <meta name="theme-color" content="">\n    <link rel="canonical" href="https://shop.9arm.co/">\n    <link rel="preconnect" href="https://cdn.shopify.com" crossorigin><link rel="icon" type="image/png" href="//shop.9arm.co/cdn/shop/files/favicon.png?crop=center&height=32&v=1680448710&width=32"><link rel="preconnect" href="https://fonts.shopifycdn.com" crossorigin><title>\n      9ARM Merchandise\n</title>\n\n    \n      <meta name="description" content="รวมสินค้า Merchandise สุดพิเศษจากนายอาร์มเท่านั้น หาพิเศษกว่านี้ไม่มีอีกแล้ว มาร่วมกันเป็นท่อน้ำเลี้ยงกันได้ที่นี่">\n    \n\n    \n\n<meta property="og:site_name" content="9ARM Merchandise">\n<meta property="og:url" content="https://shop.9arm.co/">\n<meta property="og:title" content="9ARM Merchandise">\n<meta property="

In [45]:
# Re-parse: from here on soup refers to the downloaded page, not the sample document.
soup = BeautifulSoup(html_doc, 'html.parser')

In [46]:
# Print the downloaded page re-indented to make its structure easier to read.
print(soup.prettify())

<!DOCTYPE html>
<html class="no-js" lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width,initial-scale=1" name="viewport"/>
  <meta content="" name="theme-color"/>
  <link href="https://shop.9arm.co/" rel="canonical"/>
  <link crossorigin="" href="https://cdn.shopify.com" rel="preconnect"/>
  <link href="//shop.9arm.co/cdn/shop/files/favicon.png?crop=center&amp;height=32&amp;v=1680448710&amp;width=32" rel="icon" type="image/png"/>
  <link crossorigin="" href="https://fonts.shopifycdn.com" rel="preconnect"/>
  <title>
   9ARM Merchandise
  </title>
  <meta content="รวมสินค้า Merchandise สุดพิเศษจากนายอาร์มเท่านั้น หาพิเศษกว่านี้ไม่มีอีกแล้ว มาร่วมกันเป็นท่อน้ำเลี้ยงกันได้ที่นี่" name="description"/>
  <meta content="9ARM Merchandise" property="og:site_name"/>
  <meta content="https://shop.9arm.co/" property="og:url"/>
  <meta content="9ARM Merchandise" property="og:title"/>
  <meta content="website" pro

In [47]:
# <title> tag of the downloaded page.
soup.title

<title>
      9ARM Merchandise
</title>

In [None]:
# Look up the <h3> tags with class "card__heading h5"; the product names
# collected in the following cell are extracted from these tags.
soup.find_all("h3", {"class" : "card__heading h5"})

In [59]:
# Collect the text of every product-card heading, with embedded newlines
# dropped and surrounding whitespace trimmed.
product_names = [
  heading.text.replace("\n", "").strip()
  for heading in soup.find_all("h3", {"class" : "card__heading h5"})
]

In [60]:
product_names

['Chat Shirt',
 'Fork You T-Shirt',
 'Crew Shirt',
 'Text Shirt',
 'สายแลนที่ระลึก',
 'Sticker นายอาร์มแบบเท่ๆ',
 'ถุงผ้า Emoji',
 'Mouse Pad Mark I',
 'แก้วร้อนเย็นแห่งนายทุน']

In [51]:
# Gather the raw price label of each product (e.g. "From 490.00 ฿ THB"),
# with newlines removed and surrounding whitespace trimmed.
prices = [
  price_tag.text.replace("\n", "").strip()
  for price_tag in soup.find_all("span", {"class" : "price-item price-item--regular"})
]
prices

['From 490.00 ฿ THB',
 'From 490.00 ฿ THB',
 'From 490.00 ฿ THB',
 'From 490.00 ฿ THB',
 '299.00 ฿ THB',
 '189.00 ฿ THB',
 '250.00 ฿ THB',
 '990.00 ฿ THB',
 '490.00 ฿ THB']

In [52]:
import re

# First number in a price label. Allows thousands separators ("1,290.00") and
# an optional decimal part; the previous pattern r"\d+\.\d+" would have read
# "1,290.00" as 290.00.
PRICE_PATTERN = re.compile(r"\d[\d,]*(?:\.\d+)?")

# Numeric price of each product, in the same order the price tags appear on the page.
product_prices = []
for price_tag in soup.find_all("span", {"class" : "price-item price-item--regular"}):
  label = price_tag.text.replace("\n", "").strip()
  match = PRICE_PATTERN.search(label)
  if match is None:
    # Fail with the offending label instead of an opaque IndexError.
    raise ValueError(f"no numeric price found in {label!r}")
  # Drop thousands separators before converting to float.
  product_prices.append(float(match.group().replace(",", "")))

In [53]:
product_prices

[490.0, 490.0, 490.0, 490.0, 299.0, 189.0, 250.0, 990.0, 490.0]

In [54]:
import pandas as pd
# Column-name -> values mapping handed to pd.DataFrame in the next cell.
# Names and prices come from separate find_all calls, so the two lists are
# paired purely by position.
data = {"product_name":product_names, "product_price":product_prices}
data

{'product_name': ['Chat Shirt',
  'Fork You T-Shirt',
  'Crew Shirt',
  'Text Shirt',
  'สายแลนที่ระลึก',
  'Sticker นายอาร์มแบบเท่ๆ',
  'ถุงผ้า Emoji',
  'Mouse Pad Mark I',
  'แก้วร้อนเย็นแห่งนายทุน'],
 'product_price': [490.0,
  490.0,
  490.0,
  490.0,
  299.0,
  189.0,
  250.0,
  990.0,
  490.0]}

In [55]:
# Build a two-column table (product_name, product_price) from the scraped lists.
df = pd.DataFrame(data)
df

Unnamed: 0,product_name,product_price
0,Chat Shirt,490.0
1,Fork You T-Shirt,490.0
2,Crew Shirt,490.0
3,Text Shirt,490.0
4,สายแลนที่ระลึก,299.0
5,Sticker นายอาร์มแบบเท่ๆ,189.0
6,ถุงผ้า Emoji,250.0
7,Mouse Pad Mark I,990.0
8,แก้วร้อนเย็นแห่งนายทุน,490.0


In [56]:
# df.to_csv("9arm_products.csv")

In [57]:
# Fetch a second site for comparison. requests applies no timeout by default,
# so bound the wait to keep an unresponsive server from hanging the kernel.
res = requests.get("https://shopee.co.th/", timeout=10)
res.status_code

200

In [58]:
# Raw HTML body returned for the second site.
# NOTE(review): this displays the whole page; consider a slice such as
# res.text[:500] to keep the notebook output short.
res.text

'<!doctype html>\n<html dir="ltr">\n<head>\n<link rel="preconnect" href="//down-th.img.susercontent.com/">\n<link rel="preconnect" href="//deo.shopeemobile.com/shopee/">\n<link rel="preconnect" href="//cv.shopee.co.th/">\n<meta charset="utf-8">\n<meta name="viewport" content="width=device-width,initial-scale=1,maximum-scale=1,minimum-scale=1,user-scalable=no,viewport-fit=cover">\n<meta name="shopee:git-sha" content="1c914377bf69ecd6f8e9f06f2656c93b7e70be49">\n<meta name="shopee:version" content="rw-v5.13.0">\n<link rel="icon" type="image/png" sizes="32x32" href="https://deo.shopeemobile.com/shopee/shopee-mobilemall-live-sg/assets/icon_favicon_1_32.0Wecxv.png">\n<link rel="icon" type="image/png" sizes="96x96" href="https://deo.shopeemobile.com/shopee/shopee-mobilemall-live-sg/assets/icon_favicon_1_96.wI1aMs.png">\n<link rel="preload" href="https://deo.shopeemobile.com/shopee/shopee-mobilemall-live-sg/assets/bundle.f54fd6be20250676.css" as="style" data-modern="true">\n<link rel="modulepr