# Web Scraping

In [1]:
import requests
from bs4 import BeautifulSoup as soup # serve per analizzare il contenuto della pagina web

# Search-results page that the notebook scrapes (query: "occhiali da sole").
url = "https://www.amazon.it/s?k=occhiali+da+sole&crid=2X6UPH6E2SOS9&sprefix=%2Caps%2C584&ref=nb_sb_ss_recent_1_0_recent"

# HTTP headers sent with the page request.
HEADERS = {
    # Adjacent string literals are joined with no separator, so every fragment
    # except the last carries its own trailing space; without them the product
    # tokens run together ("...x86_64)AppleWebKit/...").
    'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/44.0.2403.157 Safari/537.36'),
    # NOTE(review): the rating parser in scrape_page strips the Italian text
    # ' su 5 stelle'; confirm the site still answers in Italian with this value.
    'Accept-Language': 'en-US, en;q=0.5'
}

In [2]:
def _first_text(container, tag, css_class):
    """Return the text of the first ``tag`` element whose class attribute is ``css_class``.

    Raises ``IndexError`` when ``container`` holds no such element.
    """
    return container.find_all(tag, {'class': css_class})[0].text


def _parse_price(text):
    """Convert a price string such as ``"19,99"`` or ``"1.299,00"`` to ``float``.

    Raises ``ValueError`` when the text is not a number.
    """
    if '.' in text and ',' in text:
        # Both separators present: the one that appears last is the decimal
        # mark, the other one only groups thousands and must be dropped.
        if text.rfind(',') > text.rfind('.'):
            text = text.replace('.', '')
        else:
            text = text.replace(',', '')
    return float(text.replace(',', '.'))


def scrape_page(page_url, _records=None, timeout=30):
    """Download one search-results page and extract a record per product card.

    Parameters
    ----------
    page_url : str
        Address of the page to download.
    _records : list, optional
        Existing list to append to; a new list is created when omitted.
    timeout : float, optional
        Seconds to wait for the server before giving up (passed to
        ``requests.get``).

    Returns
    -------
    list
        ``_records`` (or the new list) with one
        ``[manufacturer, model, img_url, price, review_count, rating]`` entry
        appended per product card found. ``price``, ``review_count`` and
        ``rating`` are ``0`` when the card lacks the element or its text
        cannot be parsed.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If a product card has no manufacturer, model or image element.
    """
    if _records is None:
        _records = []

    # Fetch the page that was asked for (not the notebook-level ``url``).
    response = requests.get(page_url, headers=HEADERS, timeout=timeout)
    # Fail loudly on an error response instead of returning an empty result.
    response.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    page_soup = soup(response.text, 'html.parser')

    # A multi-class string only matches elements whose class attribute is
    # exactly this string. find_all returns a ResultSet of matching elements.
    containers = page_soup.find_all('div', {'class': 'puis-card-container s-card-container s-overflow-hidden aok-relative puis-expand-height puis-include-content-margin puis puis-vwvhvgkypx2z322f29wcc4lx0s s-latency-cf-section puis-card-border'})

    for container in containers:
        # Mandatory fields: a missing element raises IndexError.
        manufacturer = _first_text(container, 'span', 'a-size-base-plus a-color-base')   # name
        model = _first_text(container, 'span', 'a-size-base-plus a-color-base a-text-normal')   # model
        img_url = container.find_all('div', {'class': 'a-section aok-relative s-image-tall-aspect'})[0].img['src']   # image

        # Optional fields: fall back to 0 when absent or unparseable.
        try:
            price = _parse_price(_first_text(container, 'span', 'a-price-whole'))   # price
        except (IndexError, ValueError):
            price = 0
        try:
            review_text = _first_text(container, 'span', 'a-size-base s-underline-text')   # number of reviews
            review_count = int(review_text.replace('.', ''))   # "3.863" -> 3863
        except (IndexError, ValueError):
            review_count = 0
        try:
            rating_text = _first_text(container, 'span', 'a-icon-alt')   # rating
            rating = float(rating_text.replace(' su 5 stelle', '').replace(',', '.'))
        except (IndexError, ValueError):
            rating = 0

        _records.append([manufacturer, model, img_url, price, review_count, rating])

    return _records

In [3]:
import pandas as pd

# Scrape the search-results page, then tabulate the records: one row per
# product, one named column per extracted field.
records = scrape_page(url)
data = pd.DataFrame(
    data=records,
    columns=[
        'manufacturer', 'model', 'img_url',
        'price', 'review_count', 'rating',
    ],
)

In [4]:
# Print column dtypes and non-null counts for the scraped table.
data.info()
# Blank lines separating the info() text from the table rendered below.
print("\n\n")
# Last expression of the cell, so the notebook displays the first ten rows.
data.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   manufacturer  60 non-null     object 
 1   model         60 non-null     object 
 2   img_url       60 non-null     object 
 3   price         60 non-null     float64
 4   review_count  60 non-null     int64  
 5   rating        60 non-null     float64
dtypes: float64(2), int64(1), object(3)
memory usage: 2.9+ KB





Unnamed: 0,manufacturer,model,img_url,price,review_count,rating
0,grinderPUNCH,Occhiali da sole XL da uomo con montatura larg...,https://m.media-amazon.com/images/I/511UwgCobA...,19.99,3863,4.1
1,grinderPUNCH,Occhiali da sole da uomo Super Dark Black Lens...,https://m.media-amazon.com/images/I/71yZOxJWmK...,19.99,263,4.5
2,grinderPUNCH,Occhiali da sole neri super scuri da uomo - st...,https://m.media-amazon.com/images/I/712I90-woi...,19.99,1367,4.1
3,grinderPUNCH,Confezione da 2 occhiali da sole polarizzati d...,https://m.media-amazon.com/images/I/71D7cexzDU...,19.99,1533,4.1
4,Polaroid,Occhiali da Sole Uomo,https://m.media-amazon.com/images/I/61j9TzijGC...,29.0,771,4.2
5,LINVO,Occhiali da Sole Polarizzati per Uomo Donna Pr...,https://m.media-amazon.com/images/I/41nt78vZ4o...,18.99,2084,4.3
6,Long Keeper,Occhiali da Sole Rettangolari Vintage Donna Ae...,https://m.media-amazon.com/images/I/51Y2bPPOEK...,13.97,3902,4.2
7,Vans,"Spicoli 4 Shades, Occhiali da sole",https://m.media-amazon.com/images/I/61OHQPhKf9...,18.0,8646,4.3
8,Hawkers,One Occhiali da Sole Uomo,https://m.media-amazon.com/images/I/61+cN8Fw2P...,24.95,18541,4.4
9,Polaroid,PLD 2050/S C55,https://m.media-amazon.com/images/I/61djOnMdkt...,30.74,913,4.3


In [5]:
import os

# Folder name for the downloaded images (the same literal is passed to
# download_images in a later cell).
dest_dir = 'amazon_img'
# exist_ok=True keeps the cell re-runnable: no error if the folder already exists.
os.makedirs(dest_dir, exist_ok=True)

In [6]:
import re  # libreria per le espressioni regolari
from os.path import join  # funzione per unire percorsi di directory
from urllib.request import urlretrieve as retrieve  # funzione per scaricare file da un URL

def download_images(data, dest_dir, fname="img_{id:05d}.{ext:s}", default_ext="jpg"):
    """Download every image listed in ``data['img_url']`` into ``dest_dir``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with an ``img_url`` column. Its index labels must be convertible
        to ``int`` because they are used to number the files.
    dest_dir : str
        Folder that receives the files; created if it does not exist.
    fname : str, optional
        Filename template with the fields ``id`` (row index label) and ``ext``
        (file extension without the dot).
    default_ext : str, optional
        Extension used when the URL path has none.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with an added ``img_path`` column holding the local
        path of each downloaded file. The input frame is not modified.

    Raises
    ------
    Any exception raised by ``urllib.request.urlretrieve`` is propagated, so a
    failed download aborts the whole run.
    """
    # Local imports keep the function self-contained within its cell.
    import os
    from urllib.parse import urlsplit

    os.makedirs(dest_dir, exist_ok=True)

    data = data.copy()
    data['img_path'] = None  # new column, filled row by row below

    for i, row in data.iterrows():
        # Take the extension from the URL *path* only, so a query string or
        # fragment ("...jpg?size=large") cannot leak into the filename.
        url_path = urlsplit(row['img_url']).path
        ext = os.path.splitext(url_path)[1].lstrip('.') or default_ext
        fullpath = join(dest_dir, fname.format(id=int(row.name), ext=ext))  # target path
        retrieve(row['img_url'], fullpath)  # download the image
        data.at[i, 'img_path'] = fullpath  # record where the file was saved

    return data

In [7]:
# Download every product image and keep the returned copy, which has the extra
# 'img_path' column; `data` itself is left unchanged.
# NOTE(review): 'amazon_img' repeats the value assigned to dest_dir in an
# earlier cell; keep the two in sync.
data2 = download_images(data, 'amazon_img')
# Last expression of the cell, so the notebook renders the resulting table.
data2

Unnamed: 0,manufacturer,model,img_url,price,review_count,rating,img_path
0,grinderPUNCH,Occhiali da sole XL da uomo con montatura larg...,https://m.media-amazon.com/images/I/511UwgCobA...,19.99,3863,4.1,amazon_img/img_00000.jpg
1,grinderPUNCH,Occhiali da sole da uomo Super Dark Black Lens...,https://m.media-amazon.com/images/I/71yZOxJWmK...,19.99,263,4.5,amazon_img/img_00001.jpg
2,grinderPUNCH,Occhiali da sole neri super scuri da uomo - st...,https://m.media-amazon.com/images/I/712I90-woi...,19.99,1367,4.1,amazon_img/img_00002.jpg
3,grinderPUNCH,Confezione da 2 occhiali da sole polarizzati d...,https://m.media-amazon.com/images/I/71D7cexzDU...,19.99,1533,4.1,amazon_img/img_00003.jpg
4,Polaroid,Occhiali da Sole Uomo,https://m.media-amazon.com/images/I/61j9TzijGC...,29.0,771,4.2,amazon_img/img_00004.jpg
5,LINVO,Occhiali da Sole Polarizzati per Uomo Donna Pr...,https://m.media-amazon.com/images/I/41nt78vZ4o...,18.99,2084,4.3,amazon_img/img_00005.jpg
6,Long Keeper,Occhiali da Sole Rettangolari Vintage Donna Ae...,https://m.media-amazon.com/images/I/51Y2bPPOEK...,13.97,3902,4.2,amazon_img/img_00006.jpg
7,Vans,"Spicoli 4 Shades, Occhiali da sole",https://m.media-amazon.com/images/I/61OHQPhKf9...,18.0,8646,4.3,amazon_img/img_00007.jpg
8,Hawkers,One Occhiali da Sole Uomo,https://m.media-amazon.com/images/I/61+cN8Fw2P...,24.95,18541,4.4,amazon_img/img_00008.jpg
9,Polaroid,PLD 2050/S C55,https://m.media-amazon.com/images/I/61djOnMdkt...,30.74,913,4.3,amazon_img/img_00009.jpg
