# Web Scraping

### HTTP
- HyperText Transfer Protocol
- http requests
    - GET
    - POST
    
    
![url](https://doepud.co.uk/images/blogs/complex_url.png)

In [None]:
# pip3 install requests
# pip3 install beautifulsoup4

In [122]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

In [2]:
data = requests.get("https://en.wikipedia.org/wiki/Top-level_domain")

In [3]:
data.status_code

200

## Status code 200 = OK

In [4]:
data.text[:200]

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Top-level domain - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF='

### Bonus: Downloading Images via Python

In [5]:
r2d2 = requests.get("https://www.sideshow.com/storage/product-images/2172/r2-d2-deluxe_star-wars_gallery_5c4fb7e7e5e21.jpg")

In [6]:
img_bytes = r2d2.content

In [7]:
with open("data/r2d2.jpg","wb+") as file:
    file.write(img_bytes)

### Back to our usual

In [10]:
url = "https://www.zalando.es/zapatillas-mujer/"

In [19]:
# We have found out Zalando is "protecting" it's data
# We are going undercover, making it think, out python module requests is a browser
disguise = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"}

In [20]:
data = requests.get(url, headers=disguise)

In [21]:
data

<Response [200]>

In [24]:
soup = BeautifulSoup(data.text)

In [27]:
a = soup.select("div")

In [30]:
len(a)

579

### CSS Selectors
Lets use some [selectors](https://www.w3schools.com/cssref/css_selectors.asp) to make our searches mores specific

In [83]:
products = soup.select("div.qMZa55.SQGpu8.iOzucJ.JT3_zV.DvypSJ")

In [84]:
len(products)

32

## Extra tool
Type this on the console of the ChromeDevTools to check what your selector is acctually selecting:

`document.querySelectorAll('a').forEach(elm => elm.style.background = 'red')`

In [85]:
products = [p.select("div._0xLoFW._78xIQ-.EJ4MLB.JT3_zV") for p in products]

In [100]:

data_scraped = []
for p in products:
    manufacturer, price = p[0].select("span")[:2]
    manufacturer = manufacturer.text, 
    price = float(re.findall(r"[\d,]+",price.text)[0].replace(",","."))
    model = p[0].select("h3")[0].text
    element = {
        "manufacturer":manufacturer,
        "price":price,
        "model":model
              }
    data_scraped.append(element)

In [102]:
data_scraped[0]

{'manufacturer': ('Liu Jo Jeans',),
 'price': 144.95,
 'model': 'MAXI  - Zapatillas - black'}

In [116]:
def get_products_page(endpoint,pg_number):
    url = "https://www.zalando.es/"
    disguise = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"}
    params = {"p":pg_number}
    data = requests.get(url+endpoint, headers=disguise,params=params)
    soup = BeautifulSoup(data.text)
    products = soup.select("div.qMZa55.SQGpu8.iOzucJ.JT3_zV.DvypSJ")
    products = [p.select("div._0xLoFW._78xIQ-.EJ4MLB.JT3_zV") for p in products]
    data_scraped = []
    for p in products:
        manufacturer, price = p[0].select("span")[:2]
        manufacturer = manufacturer.text 
        price = float(re.findall(r"[\d,]+",price.text)[0].replace(",","."))
        model = p[0].select("h3")[0].text
        element = {
            "manufacturer":manufacturer,
            "price":price,
            "model":model
                  }
        data_scraped.append(element)
    return data_scraped

In [120]:
endpoint = "zapatillas-mujer/"
all_womens_shoes = [] 
for i in range(1,83):
    all_womens_shoes += get_products_page(endpoint,i)

In [123]:
df = pd.DataFrame(all_womens_shoes)

In [124]:
df

Unnamed: 0,manufacturer,price,model
0,G-Star,109.95,ROVIC II - Zapatillas - light liquid pink/bisque
1,Anna Field,37.99,Zapatillas - black
2,Converse,89.95,CHUCK TAYLOR MOVE PLATFORM - Zapatillas altas ...
3,Converse,81.95,RUN STAR HIKE - Zapatillas altas - white/black
4,Nike Sportswear,84.95,BLAZER MID '77 - Zapatillas altas - white/blac...
...,...,...,...
2619,Tommy Hilfiger,69.95,ICONIC KESHA SLIP ON - Mocasines - white
2620,New Look,16.99,MARKED TOE CAP TOP UP - Zapatillas - white
2621,Next,30.00,WITH LEOPARD BACK SIGNATURE - Zapatillas - white
2622,Tamaris,49.95,Zapatillas - olive


In [None]:
"div.cat_catalog-ec2An > div > div > div.qMZa55._8P5KBX.pbIgly > div > div > div > div > div.adFHlH._0xLoFW._7ckuOK.mROyo1 > div > article"