# Web Scraping

In [1]:
# Web Scraping

# Web Scraping in Amazon

# I changed to Baseball Express to see if I could use it instead since the CAPTCHA on Amazon kept me from completing the challenge

In [2]:
# Import Libraries

from bs4 import BeautifulSoup
import requests
import smtplib
import time
import datetime

In [3]:
# Connect to the website

# This is just the url from Baseball Express for the Bat

URL = 'https://www.baseballexpress.com/warstic-2025-bonesaber-hybrid-3-bbcor-baseball-bat'

# Request headers that make the request look like it comes from a regular browser
#     The "User-Agent" value is the important one
#     To look up your own, open httpbin.org/get in your browser

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

# Always pass a timeout: without one, requests waits forever on a stalled connection
page = requests.get(URL, headers = headers, timeout = 10)

# Stop here on a 4xx/5xx response instead of silently parsing an error page
page.raise_for_status()

soup1 = BeautifulSoup(page.content, "html.parser")

print(soup1)

# Printed the webpage out to make sure the current code worked

<!DOCTYPE html>

<html lang="en">
<head prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# product: http://ogp.me/ns/product#">
<script>
    var BASE_URL = 'https\u003A\u002F\u002Fwww.baseballexpress.com\u002F';
    var require = {
        'baseUrl': 'https\u003A\u002F\u002Fcdn.baseballexpress.com\u002Fstatic\u002Fversion1741164158\u002Ffrontend\u002FTeamExpress\u002Fluma_child\u002Fen_US'
    };</script> <meta charset="utf-8"/><script type="text/javascript">(window.NREUM||(NREUM={})).init={privacy:{cookies_enabled:true},ajax:{deny_list:["bam.nr-data.net"]},distributed_tracing:{enabled:true}};(window.NREUM||(NREUM={})).loader_config={agentID:"1588779945",accountID:"2926140",trustKey:"2926140",xpid:"Vg8FV1dXCBABUVFQBQAEVlUJ",licenseKey:"NRJS-e31c9772e38cc743199",applicationID:"1502213018"};;/*! For license information please see nr-loader-spa-1.287.0.min.js.LICENSE.txt */
<meta content="Warstic 2025 Bonesaber Hybrid -3 BBCOR Baseball Bat" name="title"/>
<meta content="Get Warstic 20

In [4]:
# Re-parse the prettified markup so the printed HTML is laid out (indented) like hand-written HTML

pretty_markup = soup1.prettify()
soup2 = BeautifulSoup(pretty_markup, "html.parser")

print(soup2)

<!DOCTYPE html>

<html lang="en">
<head prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb# product: http://ogp.me/ns/product#">
<script>
   var BASE_URL = 'https\u003A\u002F\u002Fwww.baseballexpress.com\u002F';
    var require = {
        'baseUrl': 'https\u003A\u002F\u002Fcdn.baseballexpress.com\u002Fstatic\u002Fversion1741164158\u002Ffrontend\u002FTeamExpress\u002Fluma_child\u002Fen_US'
    };
  </script>
<meta charset="utf-8"/>
<script type="text/javascript">
   (window.NREUM||(NREUM={})).init={privacy:{cookies_enabled:true},ajax:{deny_list:["bam.nr-data.net"]},distributed_tracing:{enabled:true}};(window.NREUM||(NREUM={})).loader_config={agentID:"1588779945",accountID:"2926140",trustKey:"2926140",xpid:"Vg8FV1dXCBABUVFQBQAEVlUJ",licenseKey:"NRJS-e31c9772e38cc743199",applicationID:"1502213018"};;/*! For license information please see nr-loader-spa-1.287.0.min.js.LICENSE.txt */
  </script>
<meta content="Warstic 2025 Bonesaber Hybrid -3 BBCOR Baseball Bat" name="title"/>
<meta cont

In [10]:
# Next, pull the specific data we want out of the parsed page

# Start with the product title, which is "Warstic 2025 Bonesaber Hybrid -3 BBCOR Baseball Bat"
# It lives in the first <span> whose class is "base"

title_span = soup2.find('span', attrs={'class': 'base'})
title = title_span.get_text()

print(title)


          Warstic 2025 Bonesaber Hybrid -3 BBCOR Baseball Bat
         


In [12]:
# Now we want both the title and the price

title_span = soup2.find('span', attrs={'class': 'base'})
price_span = soup2.find('span', attrs={'class': 'price'})

title = title_span.get_text()
price = price_span.get_text()

print(title, price, sep='\n')


          Warstic 2025 Bonesaber Hybrid -3 BBCOR Baseball Bat
         

             $399.00
            


In [15]:
# Now we want to remove the additional whitespace that comes through in the span tag, and drop the dollar sign for the data export later

# Note:  We slice with [1:] to remove the dollar sign.  The index starts at 0, so we keep everything from position 1 on

title_span = soup2.find('span', attrs={'class': 'base'})
price_span = soup2.find('span', attrs={'class': 'price'})

title = title_span.get_text().strip()
price = price_span.get_text().strip()[1:]

print(title)
print(price)

Warstic 2025 Bonesaber Hybrid -3 BBCOR Baseball Bat
399.00


In [16]:
# Create the csv file (mode 'w' starts a fresh file each time) with a header row and one data row

import csv

header = ['Product Title', 'Price']
data = [title, price]

with open('BaseballExpressWebScraperDataset.csv', 'w', newline='', encoding='UTF8') as csv_file:
    # Header first, then the scraped row
    csv.writer(csv_file).writerows([header, data])

# Ran this and it worked - here's the file path on my computer (C:\Users\mhudd)

In [17]:
# Very useful to add a timestamp to the data so you can see how recent it is
# datetime.date.today() gives the current local date (no time of day)

import datetime

today = datetime.date.today()

print(today)

2025-04-17


In [28]:
# Same csv-writing code as above, now with the date added as a third column

import csv
import datetime

today = datetime.date.today()

header = ['Product Title', 'Price', 'Date']
data = [title, price, today]

with open('BaseballExpressWebScraperDataset.csv', 'w', newline='', encoding='UTF8') as csv_file:
    # Header first, then the scraped row
    csv.writer(csv_file).writerows([header, data])

# I ran this and it worked; the file is written to the same file path as above

In [29]:
# We can also load the csv here in the notebook so we don't have to open the file every time

import pandas as pd

# Read the same relative path the writer cells use rather than a user-specific absolute path,
# so this cell always loads the file that was just written and works on any machine
df = pd.read_csv('BaseballExpressWebScraperDataset.csv')

print(df)

# You can see the output is the same as it would be if we opened the csv file; the data is in row 0

                                       Product Title  Price        Date
0  Warstic 2025 Bonesaber Hybrid -3 BBCOR Basebal...  399.0  2025-04-17


In [33]:
# From now on we append to the file, so we can see the data over time

# The mode in the with statement is now 'a+' (append) instead of 'w' (overwrite)
# The header row is no longer written, since we are adding rows to the existing file rather than replacing it

with open('BaseballExpressWebScraperDataset.csv', 'a+', newline='', encoding='UTF8') as csv_file:
    csv.writer(csv_file).writerow(data)

# I ran this a few more times and can see in the csv file that it works

In [34]:
# Here I'll print the results in the notebook so we can see them without opening the file

# It is the same data row after row because we ran it multiple times back to back on the same day and nothing has changed
# If we run this in 24 hours the date will have changed, and if the price has changed we would see that in the output as well

import pandas as pd

# Read the same relative path the writer cells use rather than a user-specific absolute path,
# so this cell always loads the file that was just appended to and works on any machine
df = pd.read_csv('BaseballExpressWebScraperDataset.csv')

print(df)

                                       Product Title  Price        Date
0  Warstic 2025 Bonesaber Hybrid -3 BBCOR Basebal...  399.0  2025-04-17
1  Warstic 2025 Bonesaber Hybrid -3 BBCOR Basebal...  399.0  2025-04-17
2  Warstic 2025 Bonesaber Hybrid -3 BBCOR Basebal...  399.0  2025-04-17
3  Warstic 2025 Bonesaber Hybrid -3 BBCOR Basebal...  399.0  2025-04-17
4  Warstic 2025 Bonesaber Hybrid -3 BBCOR Basebal...  399.0  2025-04-17


In [39]:
# We don't want to come run this manually everyday so we want a way to run this automatically

# We are combining everything we've done above

from bs4 import BeautifulSoup
import requests
import smtplib
import time
import datetime

def _clean_price(raw_price):
    """Turn scraped price text such as '  $1,299.00 ' into a plain number string ('1299.00')."""
    # lstrip('$') only removes a dollar sign that is really there; slicing with [1:]
    # would chop off the first digit whenever the text has no leading '$'.
    return raw_price.strip().lstrip('$').replace(',', '')


def check_price(
    url='https://www.baseballexpress.com/warstic-2025-bonesaber-hybrid-3-bbcor-baseball-bat',
    csv_path='BaseballExpressWebScraperDataset.csv',
    timeout=10,
):
    """Scrape the product title and price once and append them, with today's date, to a csv.

    Calling check_price() with no arguments scrapes the Baseball Express bat page and
    appends one row to 'BaseballExpressWebScraperDataset.csv' in the working directory.

    Args:
        url: Product page to scrape.
        csv_path: csv file to append to. It is created, with a header row, if it does
            not exist yet or is empty.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.exceptions.RequestException: On a network error, a timeout, or an
            HTTP 4xx/5xx response.
        ValueError: If the title or price <span> cannot be found on the page
            (for example when the site serves a CAPTCHA page or changes its layout).
    """
    # Imported here so the function still works if the earlier csv/date cells were not run
    import csv
    import datetime

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0",
        "Accept-Encoding": "gzip, deflate",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "DNT": "1",
        "Connection": "close",
        "Upgrade-Insecure-Requests": "1",
    }

    # Without a timeout a stalled connection would block the polling loop forever
    page = requests.get(url, headers=headers, timeout=timeout)
    # Do not record anything from an error page
    page.raise_for_status()

    # One parse is enough: prettify() only changes the whitespace around the text,
    # and that whitespace is stripped below anyway
    soup = BeautifulSoup(page.content, "html.parser")

    title_tag = soup.find('span', {'class': 'base'})
    price_tag = soup.find('span', {'class': 'price'})
    if title_tag is None or price_tag is None:
        # find() returns None when nothing matches; say so clearly instead of
        # failing with "'NoneType' object has no attribute 'get_text'"
        raise ValueError('Could not find the product title/price on ' + url)

    title = title_tag.get_text().strip()
    price = _clean_price(price_tag.get_text())
    today = datetime.date.today()

    with open(csv_path, 'a', newline='', encoding='UTF8') as f:
        writer = csv.writer(f)
        # In append mode the position starts at the end of the file, so 0 means the
        # file is new or empty and still needs its header row
        if f.tell() == 0:
            writer.writerow(['Product Title', 'Price', 'Date'])
        writer.writerow([title, price, today])

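# Ran this and it did not fail, but no new data was appended - that is expected: this cell only defines check_price(), and nothing is written until the function is called (see the next cell)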

In [41]:
# Now we want to set this up to run daily so we don't have to run manually

# This initial code will run every 5 seconds.  Modify the 5 in the time.sleep call to change the interval
# while(True):
#     check_price()
#    time.sleep(5)

poll_interval_seconds = 5  # pause between two price checks

# Runs until the kernel is interrupted
while True:
    check_price()
    time.sleep(poll_interval_seconds)

# This worked and appended new data at 5 second intervals to the file.  This will run in the background as long as the kernel is connected.

# time.sleep() takes its argument in seconds, so to run every 24 hours the formula is 60 seconds/minute * 60 minutes/hour * 24 hours/day = 86400 seconds/day

# If we wanted it to run every hour it would be 3600 seconds
# If we wanted it to run every 12 hours it would be 43200 seconds

# Ran this again successfully

KeyboardInterrupt: 

In [42]:
# The error shows because I had to interrupt the kernel to stop the loop from running every 5 seconds

In [43]:
# I'll load the file here in the notebook to show that the scheduled runs worked

import pandas as pd

# Read the same relative path check_price() appends to rather than a user-specific absolute path,
# so this cell always loads the file that was just updated and works on any machine
df = pd.read_csv('BaseballExpressWebScraperDataset.csv')

print(df)

                                        Product Title  Price        Date
0   Warstic 2025 Bonesaber Hybrid -3 BBCOR Basebal...  399.0  2025-04-17
1   Warstic 2025 Bonesaber Hybrid -3 BBCOR Basebal...  399.0  2025-04-17
2   Warstic 2025 Bonesaber Hybrid -3 BBCOR Basebal...  399.0  2025-04-17
3   Warstic 2025 Bonesaber Hybrid -3 BBCOR Basebal...  399.0  2025-04-17
4   Warstic 2025 Bonesaber Hybrid -3 BBCOR Basebal...  399.0  2025-04-17
5   Warstic 2025 Bonesaber Hybrid -3 BBCOR Basebal...  399.0  2025-04-17
6   Warstic 2025 Bonesaber Hybrid -3 BBCOR Basebal...  399.0  2025-04-17
7   Warstic 2025 Bonesaber Hybrid -3 BBCOR Basebal...  399.0  2025-04-17
8   Warstic 2025 Bonesaber Hybrid -3 BBCOR Basebal...  399.0  2025-04-17
9   Warstic 2025 Bonesaber Hybrid -3 BBCOR Basebal...  399.0  2025-04-17
10  Warstic 2025 Bonesaber Hybrid -3 BBCOR Basebal...  399.0  2025-04-17
11  Warstic 2025 Bonesaber Hybrid -3 BBCOR Basebal...  399.0  2025-04-17
12  Warstic 2025 Bonesaber Hybrid -3 BBCOR Basebal.

In [44]:
# Now the data is in a csv and can be analysed in excel or imported into SQL, Tableau, PowerBI for analysis and visualization.