<b><h3>Webscraping data about watches with BeautifulSoup</b></h3>

<h3>Importing libraries</h3>

In [53]:
import time
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from IPython.display import Image

<h3>Initializing connection with the website</h3>

In [3]:
url = 'https://zegarownia.pl/zegarki-meskie?gclid=Cj0KCQjwz6ShBhCMARIsAH9A0qVz9m-gtxikJUcJ5kSEf7vmqbuyHQsyRptXkNUZhSwMc2OuuU7gB1gaAjcjEALw_wcB'
response = requests.get(url)
response.status_code

200

<h3>Creating soup and watch objects</h3>
1. soup object parses the data</br>
2. watch object finds all div tags with a specific class so as to find all watches on the page and later use that to extract detailed data</br>

In [4]:
soup = BeautifulSoup(response.content,'html.parser')
watch = soup.find_all('div',class_='product details product-item-details')

<h3>The source code:</h3>

In [66]:
Image(url="watch2.png",width=1500, height=500)

In [12]:
u = []
for page in range(1,10):
    # url is an f-string because we want to loop through pages from 1 to 9, so the url has to change
    url = f'https://zegarownia.pl/zegarki-meskie?p={page}'
    response = requests.get(url)
    soup = BeautifulSoup(response.content,'html.parser')
    watch = soup.find_all('div',class_='product details product-item-details')
    for item in watch:
        manufacturer = item.find('span',class_='product-item-link__manufacturer').text.strip()
        product_item_name = item.find('span',class_='product-item-link__name').text.strip()
        product_code = item.find('span',class_='product-item-link__sku').text.strip()
        shipment = item.find('div',class_='product-item__free-delivery-mobile').text.strip()
        # not all items are on sale and one solution to that is to try to find final_price and old_price at the same time,
        # and if that can't be done (because there is no sale for a particular item)
        # then we obtain final_price from another class (products with no sale store price in a different class)
        try:
            final_price = item.find('span',class_='special-price').text.strip()
            old_price = item.find('span',class_='old-price').text.strip()
        except:
            final_price = item.find('span',class_='price-container price-final_price tax weee').text.strip()
            old_price = np.NaN
        # finally the data can be appended and used to make a dataframe out of it
        u.append({'manufacturer':manufacturer,
                  'product_item_name':product_item_name,
                  'product_code':product_code,
                  'final_price':final_price,
                  'old_price':old_price,
                  'shipment':shipment})
    time.sleep(np.random.randint(3,10))
    print(f'Getting page {page}. Waiting...')

Getting page 1. Waiting...
Getting page 2. Waiting...
Getting page 3. Waiting...
Getting page 4. Waiting...
Getting page 5. Waiting...
Getting page 6. Waiting...
Getting page 7. Waiting...
Getting page 8. Waiting...
Getting page 9. Waiting...


<h3>Data cleaning</h3>

In [14]:
watch_df = pd.DataFrame(u)

In [15]:
watch_df.head(5)

Unnamed: 0,manufacturer,product_item_name,product_code,final_price,old_price,shipment
0,Suunto,9 Peak All Black,SS050522000,"1 564,00 zł","1 869,00 zł",Darmowa dostawa 24H
1,Timex,Expedition Military Allied,T2N721,"721,00 zł",,Darmowa dostawa 24H
2,Suunto,9 Baro All Black Wrist HR GPS,SS050019000,"1 225,00 zł","1 399,00 zł",Darmowa dostawa 24H
3,Timex,Chrono FullBlack,TW2T21200SS,"289,00 zł","549,00 zł",Darmowa dostawa 24H
4,Suunto,9 Peak Granite Blue Titanium,SS050520000,"1 847,00 zł","2 339,00 zł",Darmowa dostawa 24H


In [16]:
watch_df['shipment'].unique()

array(['Darmowa dostawa 24H'], dtype=object)

In [18]:
watch_df['Free_24h_shipment'] = np.where(watch_df['shipment']=='Darmowa dostawa 24H',1,0)

In [21]:
watch_df = watch_df.iloc[:,[0,1,2,3,4,6]]

In [22]:
watch_df.head(5)

Unnamed: 0,manufacturer,product_item_name,product_code,final_price,old_price,Free_24h_shipment
0,Suunto,9 Peak All Black,SS050522000,"1 564,00 zł","1 869,00 zł",1
1,Timex,Expedition Military Allied,T2N721,"721,00 zł",,1
2,Suunto,9 Baro All Black Wrist HR GPS,SS050019000,"1 225,00 zł","1 399,00 zł",1
3,Timex,Chrono FullBlack,TW2T21200SS,"289,00 zł","549,00 zł",1
4,Suunto,9 Peak Granite Blue Titanium,SS050520000,"1 847,00 zł","2 339,00 zł",1


In [23]:
watch_df['final_price'].to_list()

['1\xa0564,00\xa0zł',
 '721,00\xa0zł',
 '1\xa0225,00\xa0zł',
 '289,00\xa0zł',
 '1\xa0847,00\xa0zł',
 '1\xa0299,00\xa0zł',
 '2\xa0777,00\xa0zł',
 '2\xa0949,00\xa0zł',
 '21\xa0490,00\xa0zł',
 '14\xa0670,00\xa0zł',
 '4\xa0790,00\xa0zł',
 '4\xa0790,00\xa0zł',
 '4\xa0800,00\xa0zł',
 '3\xa0573,00\xa0zł',
 '6\xa0700,00\xa0zł',
 '13\xa0000,00\xa0zł',
 '13\xa0000,00\xa0zł',
 '5\xa0700,00\xa0zł',
 '1\xa0499,00\xa0zł',
 '6\xa0100,00\xa0zł',
 '1\xa0831,00\xa0zł',
 '1\xa0680,00\xa0zł',
 '1\xa0580,00\xa0zł',
 '2\xa0680,00\xa0zł',
 '1\xa0422,00\xa0zł',
 '1\xa0206,00\xa0zł',
 '749,00\xa0zł',
 '5\xa0190,00\xa0zł',
 '5\xa0020,00\xa0zł',
 '4\xa0600,00\xa0zł',
 '899,00\xa0zł',
 '3\xa0600,00\xa0zł',
 '615,00\xa0zł',
 '475,00\xa0zł',
 '439,00\xa0zł',
 '5\xa0690,00\xa0zł',
 '6\xa0990,00\xa0zł',
 '453,00\xa0zł',
 '2\xa0300,00\xa0zł',
 '2\xa0239,00\xa0zł',
 '2\xa0155,00\xa0zł',
 '1\xa0466,00\xa0zł',
 '2\xa0519,00\xa0zł',
 '792,00\xa0zł',
 '1\xa0799,00\xa0zł',
 '839,00\xa0zł',
 '690,00\xa0zł',
 '169,00\xa0zł',


In [41]:
watch_df[['final_price','old_price']] = watch_df[['final_price','old_price']].apply(lambda x: x.str.replace('\xa0','').str.replace('zł','').str.replace(',','.'))

In [42]:
watch_df.dtypes

manufacturer         object
product_item_name    object
product_code         object
final_price          object
old_price            object
Free_24h_shipment     int32
dtype: object

In [44]:
watch_df[['final_price','old_price']] = watch_df[['final_price','old_price']].apply(pd.to_numeric)

In [45]:
watch_df.dtypes

manufacturer          object
product_item_name     object
product_code          object
final_price          float64
old_price            float64
Free_24h_shipment      int32
dtype: object

In [46]:
watch_df.head(5)

Unnamed: 0,manufacturer,product_item_name,product_code,final_price,old_price,Free_24h_shipment
0,Suunto,9 Peak All Black,SS050522000,1564.0,1869.0,1
1,Timex,Expedition Military Allied,T2N721,721.0,,1
2,Suunto,9 Baro All Black Wrist HR GPS,SS050019000,1225.0,1399.0,1
3,Timex,Chrono FullBlack,TW2T21200SS,289.0,549.0,1
4,Suunto,9 Peak Granite Blue Titanium,SS050520000,1847.0,2339.0,1


In [52]:
watch_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   manufacturer       891 non-null    object 
 1   product_item_name  891 non-null    object 
 2   product_code       891 non-null    object 
 3   final_price        891 non-null    float64
 4   old_price          226 non-null    float64
 5   Free_24h_shipment  891 non-null    int32  
dtypes: float64(2), int32(1), object(3)
memory usage: 38.4+ KB


### <br>Check whether or not a particular item is currently on sale</br>

In [80]:
watch_df['On_sale'] = np.where(watch_df['old_price'].isna()==True,0,1)

In [81]:
watch_df.head(5)

Unnamed: 0,manufacturer,product_item_name,product_code,final_price,old_price,Free_24h_shipment,On_sale
0,Suunto,9 Peak All Black,SS050522000,1564.0,1869.0,1,1
1,Timex,Expedition Military Allied,T2N721,721.0,,1,0
2,Suunto,9 Baro All Black Wrist HR GPS,SS050019000,1225.0,1399.0,1,1
3,Timex,Chrono FullBlack,TW2T21200SS,289.0,549.0,1,1
4,Suunto,9 Peak Granite Blue Titanium,SS050520000,1847.0,2339.0,1,1


### <br>Calculating the discount</br>

In [88]:
watch_df['discount'] = round(np.abs((watch_df['final_price']/watch_df['old_price'])-1),4)

In [89]:
watch_df.head(10)

Unnamed: 0,manufacturer,product_item_name,product_code,final_price,old_price,Free_24h_shipment,On_sale,discount
0,Suunto,9 Peak All Black,SS050522000,1564.0,1869.0,1,1,0.1632
1,Timex,Expedition Military Allied,T2N721,721.0,,1,0,
2,Suunto,9 Baro All Black Wrist HR GPS,SS050019000,1225.0,1399.0,1,1,0.1244
3,Timex,Chrono FullBlack,TW2T21200SS,289.0,549.0,1,1,0.4736
4,Suunto,9 Peak Granite Blue Titanium,SS050520000,1847.0,2339.0,1,1,0.2103
5,Suunto,7 Titanium Matte Black,SS050568000,1299.0,2199.0,1,1,0.4093
6,Błonie,Cyberpunk 2077 Limited Edition,CYBERPUNK-T-2077,2777.0,,1,0,
7,Coros,Apex 2 Pro Kilian Jornet Limited Edition,WAPX2P-KJ,2949.0,,1,0,
8,Oris,Big Crown Calibre 473,01 473 7786 4065-07 5 19 22FC,21490.0,,1,0,
9,Orient Star,Contemporary Full Skeleton,RE-AZ0101N00B (RE-AZ0101N),14670.0,16300.0,1,1,0.1


<h3>The data can be now exported e.g. to an xlsx file:</h3>

In [69]:
watch_df.to_excel('watches_data.xlsx',index=False)