In [5]:
import re
import json

import requests
from IPython.core.display import display, HTML
from bs4 import BeautifulSoup
from requests.compat import urljoin, quote_plus
import pandas as pd
from typing import List
import numpy as np
import time
import logging

In [6]:
def euro_redirect(page_number: str):
    """
    Return the URL of one page of the euro.com.pl ``telefony-komorkowe`` listing.

    The page number is embedded in the path itself (``strona-<page_number>``),
    so every page of the listing has to be requested through its own address.
    """
    page_url = f'https://www.euro.com.pl/telefony-komorkowe,strona-{page_number}.bhtml'
    return page_url

In [116]:
# Columns every product has; all further columns come from the attribute list
# found inside each product row.
_BASE_COLUMNS = ['Nazwa', 'Marka', 'Cena']


def _clean_text(text: str) -> str:
    """Collapse every run of whitespace into a single space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()


def _parse_price(text: str) -> str:
    """
    Reduce a price label to a string that ``pd.to_numeric`` can parse.

    Only digits and the decimal comma are kept (spaces between digit groups
    are dropped) and the comma is replaced by a dot.
    """
    kept = ''.join(re.findall(r'[0-9, ]+', _clean_text(text)))
    return kept.replace(' ', '').replace(',', '.')


def _parse_product_row(row) -> dict:
    """
    Convert one ``div.product-row`` element into a ``{column: text}`` record.

    Raises ``AttributeError`` when the name, brand or price element is missing
    (``find`` returns ``None``) and ``ValueError`` when attribute names and
    attribute values cannot be paired one-to-one.
    """
    record = {
        'Nazwa': _clean_text(row.find('h2', {'class': 'product-name'}).find('a').text),
        'Marka': _clean_text(row.find('a', {'class': 'product-brand'}).text),
        'Cena': _parse_price(row.find('div', {'class': 'price-normal selenium-price-normal'}).text),
    }

    names = [_clean_text(tag.text) for tag in row.find_all('span', {'class': 'attribute-name'})]
    values = [_clean_text(tag.text) for tag in row.find_all('span', {'class': 'attribute-value'})]
    if len(names) != len(values):
        raise ValueError(f'{len(names)} attribute names but {len(values)} attribute values')

    for name, value in zip(names, values):
        # setdefault: a scraped attribute must never overwrite a base column
        # (or an earlier attribute carrying the same label).
        record.setdefault(name, value)
    return record


def get_items(page_number: str, timeout: float = 30) -> pd.DataFrame:
    """
    Load a single listing page and extract the information about its products.

    Parameters
    ----------
    page_number : str
        Page of the listing to download; forwarded to ``euro_redirect``.
    timeout : float, optional
        Seconds to wait for the server before giving up (passed to
        ``requests.get``). Without it a stalled connection blocks forever.

    Returns
    -------
    pandas.DataFrame
        One row per product with the columns ``Nazwa``, ``Marka``, ``Cena``
        (numeric) followed by one column per attribute label found on the
        page. Products lacking an attribute hold NaN in that column. The
        index is a plain 0..n-1 range. A page without any parsable product
        yields an empty frame that still has the three base columns.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts or a non-2xx HTTP status.
    ValueError
        If a cleaned price cannot be converted to a number.
    """
    logger = logging.getLogger(__name__)

    # load structure
    response = requests.get(euro_redirect(page_number), timeout=timeout)
    response.raise_for_status()
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever one
    # happens to be installed and the parse tree may differ between machines.
    soup = BeautifulSoup(response.content.decode("utf8"), 'html.parser')

    # Collect plain records and build the frame once at the end instead of
    # growing a DataFrame row by row (DataFrame.append no longer exists in
    # pandas 2.x and was quadratic before that).
    records = []
    skipped = 0
    for row in soup.find_all('div', {'class': 'product-row'}):
        try:
            records.append(_parse_product_row(row))
        except (AttributeError, ValueError) as error:
            # Rows that miss a mandatory element are skipped on purpose, but
            # the skip is recorded instead of being swallowed silently.
            skipped += 1
            logger.debug('Skipping product row on page %s: %s', page_number, error)

    if skipped:
        logger.warning('Skipped %d product row(s) on page %s that could not be parsed',
                       skipped, page_number)

    if not records:
        # Keep the price column numeric so that concatenating this frame with
        # other pages does not change the dtype of ``Cena``.
        return pd.DataFrame(columns=_BASE_COLUMNS).astype({'Cena': 'float64'})

    final_elements = pd.DataFrame(records)
    final_elements['Cena'] = pd.to_numeric(final_elements['Cena'])

    return final_elements

In [117]:
def get_full_data(pages, delay: float = 0.0) -> pd.DataFrame:
    """
    Scrape every requested listing page and stack the results into one table.

    Parameters
    ----------
    pages : iterable
        Page numbers; each one is forwarded unchanged to ``get_items``.
    delay : float, optional
        Seconds to pause between two consecutive page requests. The default
        of 0 issues the requests back to back.

    Returns
    -------
    pandas.DataFrame
        Rows of all pages in request order with a fresh 0..n-1 index. Columns
        are the union of the per-page columns in order of first appearance.
        An empty ``pages`` yields an empty DataFrame.
    """
    page_frames = []
    for position, page_number in enumerate(pages):
        if delay > 0 and position > 0:
            time.sleep(delay)
        page_frames.append(get_items(page_number))

    if not page_frames:
        # pd.concat refuses an empty list, so mirror the empty result directly.
        return pd.DataFrame()

    # One concat at the end replaces the removed (and quadratic)
    # DataFrame.append-in-a-loop; ignore_index renumbers the rows.
    return pd.concat(page_frames, sort=False, ignore_index=True)

In [124]:
# Page numbers 1 through 24, as strings, of the listing pages to scrape.
pages = list(map(str, range(1, 25)))

In [125]:
# Show the generated list of page numbers.
pages

['1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24']

In [126]:
# Scrape every page listed in `pages` (one HTTP request per page) and keep the
# combined product table.
full_df = get_full_data(pages)

In [127]:
# Display the combined product table.
full_df

Unnamed: 0,Nazwa,Marka,Cena,Aparaty tylny/przedni,Pojemność baterii,Pamięć,Wyświetlacz,System operacyjny,Procesor
0,Xiaomi Mi 10T 6+128GB (czarny),Xiaomi,1999.00,64 Mpix + 13 Mpix + 5 Mpix / 20 Mpix,5000 mAh,6 GB / 128 GB,"6,67 "", 2400 x 1080 pikseli, IPS",Android 10,8-rdzeniowy Qualcomm Snapdragon 865
1,Samsung Galaxy M21 (czarny),Samsung,899.00,48 Mpix + 5 Mpix + 8 Mpix / 20 Mpix,6000 mAh,4 GB / 64 GB,"6,4 "", 2340 x 1080 pikseli, 16 mln kolorów SUP...",Android 10,8-rdzeniowy Samsung Exynos 9611
2,Samsung Galaxy Note20 (szary),Samsung,3749.00,64 Mpix + 12 Mpix + 12 Mpix / 10 Mpix,4300 mAh,8 GB / 256 GB,"6,7 "", 2400 x 1080 pikseli, Super AMOLED Plus",Android 10,8-rdzeniowy Samsung Exynos 990
3,Xiaomi Redmi 9 4+64 (szary),Xiaomi,619.00,13 Mpix + 8 Mpix + 2 Mpix + 2 Mpix / 8 Mpix,5020 mAh,4 GB / 64 GB,"6,53 "", 2340 x 1080 pikseli, Full HD+",Android 10,8-rdzeniowy MediaTek Helio G80
4,Xiaomi Redmi Note 8 Pro 6/64GB (zielony),Xiaomi,769.00,64 Mpix + 8 Mpix + 2 Mpix + 2 Mpix / 20 Mpix,4500 mAh,6 GB / 64 GB,"6,53 "", 2340 x 1080 pikseli, Full HD+",Android 9 Pie,8-rdzeniowy MediaTek Helio G90T
5,Samsung Galaxy M31s (czarny),Samsung,1399.00,64 Mpix + 12 Mpix + 5 Mpix + 5 Mpix / 32 Mpix,6000 mAh,6 GB / 128 GB,"6,5 "", 2340 x 1080 pikseli, 16 mln kolorów SUP...",Android 10,8-rdzeniowy Samsung Exynos 9611
6,Samsung Galaxy A21s (czarny),Samsung,699.00,48 Mpix + 8 Mpix + 2 Mpix + 2 Mpix / 13 Mpix,5000 mAh,3 GB / 32 GB,"6,5 "", 1600 x 720 pikseli, 16 mln kolorów","Android 10 + One UI 2,0",8-rdzeniowy Samsung Exynos 850
7,Xiaomi Redmi Note 9 4+128 (szary),Xiaomi,799.00,48 Mpix + 8 Mpix + 2 Mpix + 2 Mpix / 13 Mpix,5020 mAh,4 GB / 128 GB,"6,53 "", 2340 x 1080 pikseli, Full HD+",Android 10,8-rdzeniowy MediaTek Helio G85
8,Motorola Moto E7 Plus 4/64GB (niebieski),Motorola,599.00,48 Mpix + 2 Mpix / 8 Mpix,5000 mAh,4 GB / 64 GB,"6,5 "", 1600 x 720 pikseli, HD+",Android 10,8-rdzeniowy Qualcomm Snapdragon 460
9,Xiaomi Redmi 9C 2/32GB (szary),Xiaomi,480.00,13 Mpix + 2 Mpix / 5 Mpix,5000 mAh,2 GB / 32 GB,"6,53 "", 1600 x 720 pikseli, HD+",Android 10,8-rdzeniowy MediaTek Helio G35


In [129]:
# Write the scraped table to sample.csv in the current working directory.
# `index=False` is not passed, so the row index is written as the first
# (unnamed) column.
full_df.to_csv('sample.csv')

In [130]:
# Inspect the price column, which get_items converts to numeric values.
full_df.Cena

0      1999.00
1       899.00
2      3749.00
3       619.00
4       769.00
5      1399.00
6       699.00
7       799.00
8       599.00
9       480.00
10     3199.00
11     1980.00
12     1699.00
13     1099.00
14     1969.00
15      799.00
16      699.00
17     2599.00
18      899.00
19     3999.00
20      899.00
21     1369.00
22      499.00
23     1999.00
24     2969.00
25     1199.00
26     3949.00
27     2999.00
28     2599.00
29     8799.00
        ...   
616     199.00
617     639.00
618     139.00
619     199.00
620    1799.00
621     149.00
622     899.00
623    3249.00
624     139.99
625     379.00
626     699.00
627     399.00
628     229.00
629      44.99
630    2329.00
631     196.99
632     115.19
633     799.00
634     699.00
635      69.00
636     769.00
637      49.00
638     949.00
639    2099.00
640    3149.00
641     319.00
642     169.99
643     669.00
644     699.00
645     899.00
Name: Cena, Length: 646, dtype: float64