# Web scraping

In [1]:
from pathlib import Path
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import pandas as pd
import requests
import scrapy
from scrapy.crawler import CrawlerProcess
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Exercici 1

Realitza web scraping d'una pàgina de la borsa de Madrid (https://www.bolsamadrid.es) utilitzant BeautifulSoup i Selenium.

Buscarem els valors de les accions de les diferents companyies de l'IBEX 35 amb BeautifulSoup i Selenium.

## BeautifulSoup

In [2]:
# Bolsa de Madrid base URL; the requests and Selenium sections both start from it
url = 'https://www.bolsamadrid.es'

In [3]:
# Download the HTML of the home page.
# The timeout keeps the cell from hanging forever on an unresponsive server,
# and raise_for_status() stops on an HTTP 4xx/5xx answer instead of letting
# an error page be parsed as if it were the real content.
html = requests.get(url, timeout=30)
html.raise_for_status()

In [4]:
# Parse the response body (html.content) into a BeautifulSoup tree using the
# 'html.parser' parser
soup = BeautifulSoup(html.content, 'html.parser')

In [5]:
# Get the Acciones links: the href of every anchor whose text is exactly
# 'Acciones'. href=True skips anchors that carry no href attribute, so the
# list never contains None.
links = [link['href'] for link in soup.find_all('a', string='Acciones', href=True)]

In [6]:
# Acciones link
if not links:
    raise ValueError("No 'Acciones' link found on the home page")
# urljoin resolves root-relative, relative and absolute hrefs alike, which
# plain string concatenation does not.
url_acciones = urljoin(url, links[0])

In [7]:
# Download the HTML of the Acciones page.
# Same safeguards as any other request: bounded wait and an explicit failure
# on an HTTP error status.
html_acciones = requests.get(url_acciones, timeout=30)
html_acciones.raise_for_status()

In [8]:
# Parse the Acciones response body into a BeautifulSoup tree using the
# 'html.parser' parser
soup_acciones = BeautifulSoup(html_acciones.content, 'html.parser')

In [9]:
# Get the table of Acciones. find() returns None when no element carries this
# id, so fail here with a clear message rather than later with an
# AttributeError on None.
table_acciones = soup_acciones.find(id='ctl00_Contenido_tblAcciones')
if table_acciones is None:
    raise ValueError("Table 'ctl00_Contenido_tblAcciones' not found in the Acciones page")

In [10]:
# Check table code: print only the first 1500 characters of the pretty-printed
# markup, which is enough to inspect the row/cell structure without dumping
# the whole table into the notebook output.
print(table_acciones.prettify()[:1500])

<table cellpadding="3" cellspacing="0" class="TblPort" id="ctl00_Contenido_tblAcciones" width="100%">
 <tr align="center">
  <th scope="col">
   Nombre
  </th>
  <th scope="col">
   Últ.
  </th>
  <th scope="col">
   % Dif.
  </th>
  <th scope="col">
   Máx.
  </th>
  <th scope="col">
   Mín.
  </th>
  <th scope="col">
   Volumen
  </th>
  <th scope="col">
   Efectivo (miles €)
  </th>
  <th scope="col">
   Fecha
  </th>
  <th class="Ult" scope="col">
   Hora
  </th>
 </tr>
 <tr align="right">
  <td align="left" class="DifFlBj">
   <a href="/esp/aspx/Empresas/FichaValor.aspx?ISIN=ES0125220311">
    ACCIONA
   </a>
  </td>
  <td>
   148,3000
  </td>
  <td class="DifClBj">
   -2,75
  </td>
  <td>
   152,8000
  </td>
  <td>
   148,1000
  </td>
  <td>
   51.504
  </td>
  <td>
   7.746,16
  </td>
  <td align="center">
   24/01/2022
  </td>
  <td align="center" class="Ult">
   15:39:29
  </td>
 </tr>
 <tr align="right">
  <td align="left" class="DifFlBj">
   <a href="/esp/aspx/Empresas/Ficha

In [11]:
# Parse columns and row values.
# columns starts as None so that a table without a header row does not end in
# a NameError (or silently reuse labels left over from an earlier run);
# pd.DataFrame(..., columns=None) then falls back to default labels.
columns = None
values = []
for row in table_acciones.find_all('tr'):
    # Look each cell type up once per row instead of once for the test and
    # again for the extraction.
    header_cells = row.find_all('th')
    data_cells = row.find_all('td')
    if header_cells:
        # strip() drops whitespace/newlines surrounding the cell text
        columns = [cell.get_text().strip() for cell in header_cells]
    elif data_cells:
        values.append([cell.get_text().strip() for cell in data_cells])

In [12]:
# Save values into a dataframe: one row per parsed table row, labelled with
# the parsed header cells
acciones = pd.DataFrame(values, columns=columns)

In [13]:
# View the dataframe: show the first 10 rows rather than the whole frame
acciones.head(10)

Unnamed: 0,Nombre,Últ.,% Dif.,Máx.,Mín.,Volumen,Efectivo (miles €),Fecha,Hora
0,ACCIONA,1483000,-275,1528000,1481000,51.504,"7.746,16",24/01/2022,15:39:29
1,ACERINOX,111500,-724,119650,111100,3.513.907,"41.351,62",24/01/2022,15:39:36
2,ACS,224900,-268,230500,223000,815.730,"18.499,06",24/01/2022,15:39:37
3,AENA,1447000,-213,1482500,1443500,66.552,"9.713,73",24/01/2022,15:39:16
4,ALMIRALL,110600,128,111700,107400,412.317,"4.543,95",24/01/2022,15:38:25
5,AMADEUS,602400,-331,619600,593800,592.084,"35.944,00",24/01/2022,15:39:52
6,ARCELORMIT.,272500,-561,283000,266100,1.158.652,"31.739,77",24/01/2022,15:39:40
7,B.SANTANDER,29815,-328,30760,29695,41.001.024,"124.490,51",24/01/2022,15:39:52
8,BA.SABADELL,6200,-337,6418,6172,16.864.347,"10.585,88",24/01/2022,15:39:29
9,BANKINTER,50700,-209,52180,50520,2.213.163,"11.383,68",24/01/2022,15:39:47


## Selenium

In [14]:
# Open browser: create the Selenium Chrome driver used by the following cells
browser = Chrome()

In [15]:
# Get the page: load the site's base URL in the browser
browser.get(url)

In [16]:
# Get the links: all elements located by the link text 'Acciones'
# (find_elements returns a list, which is indexed in the next cell)
links = browser.find_elements(By.LINK_TEXT, 'Acciones')

In [17]:
# Open the acciones webpage by clicking the first matching link
# (indexing with [0] raises IndexError if no link was found)
links[0].click()

In [18]:
# Get the table of Acciones. The preceding click starts a navigation, so wait
# (up to 10 s) until the table is present in the DOM instead of looking it up
# immediately and racing the page load; a TimeoutException is raised if it
# never appears.
table_acciones = WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.ID, 'ctl00_Contenido_tblAcciones'))
)

In [19]:
# Parse columns and row values, then close the browser.
# columns starts as None so that a table without a header row does not end in
# a NameError (or silently reuse labels left over from an earlier section);
# pd.DataFrame(..., columns=None) then falls back to default labels.
columns = None
values = []
try:
    for row in table_acciones.find_elements(By.TAG_NAME, 'tr'):
        # Every find_elements call is a round-trip to the browser, so look
        # each cell type up at most once per row.
        header_cells = row.find_elements(By.TAG_NAME, 'th')
        if header_cells:
            columns = [cell.text for cell in header_cells]
            continue
        data_cells = row.find_elements(By.TAG_NAME, 'td')
        if data_cells:
            values.append([cell.text for cell in data_cells])
finally:
    # The cell text has been copied into plain Python strings, so the Chrome
    # session is no longer needed; quit() ends it even if parsing failed.
    browser.quit()

In [20]:
# Save values into a dataframe: one row per parsed table row, labelled with
# the parsed header cells
acciones = pd.DataFrame(values, columns=columns)

In [21]:
# View the dataframe: show the first 10 rows rather than the whole frame
acciones.head(10)

Unnamed: 0,Nombre,Últ.,% Dif.,Máx.,Mín.,Volumen,Efectivo (miles €),Fecha,Hora
0,ACCIONA,1483000,-275,1528000,1481000,51.504,"7.746,16",24/01/2022,15:39:29
1,ACERINOX,111500,-724,119650,111100,3.513.907,"41.351,62",24/01/2022,15:39:36
2,ACS,224900,-268,230500,223000,815.730,"18.499,06",24/01/2022,15:39:37
3,AENA,1447000,-213,1482500,1443500,66.552,"9.713,73",24/01/2022,15:39:16
4,ALMIRALL,110600,128,111700,107400,412.317,"4.543,95",24/01/2022,15:38:25
5,AMADEUS,602400,-331,619600,593800,592.084,"35.944,00",24/01/2022,15:39:52
6,ARCELORMIT.,272500,-561,283000,266100,1.158.652,"31.739,77",24/01/2022,15:39:40
7,B.SANTANDER,29815,-328,30760,29695,41.001.024,"124.490,51",24/01/2022,15:39:52
8,BA.SABADELL,6200,-337,6418,6172,16.864.347,"10.585,88",24/01/2022,15:39:29
9,BANKINTER,50700,-209,52180,50520,2.213.163,"11.383,68",24/01/2022,15:39:47


# Exercici 2

Documenta en un word el teu conjunt de dades generat amb la informació que tenen els diferents arxius de Kaggle.

### Títol

Accions del IBEX-35 24/01/2022

### Descripció

Accions de les 35 companyies del IBEX-35 del dia 24/01/2022

### Llicència

[CC0: Public Domain](https://creativecommons.org/publicdomain/zero/1.0/)

### Context

L'IBEX-35 (Índice Bursátil Español) és l'índex borsari de referència de la Borsa de Madrid i està format per 35 empreses. Aquest conjunt de dades conté les cotitzacions del 24 de gener del 2022.

### Content
Head of the dataset:
<img src="acciones_head.png" width="700px">

### Acknowledgements

Origen: https://www.bolsamadrid.es/esp/aspx/Mercados/Precios.aspx?indice=ESI100000000

# Exercici 3

Tria una pàgina web que tu vulguis i realitza web scraping mitjançant la llibreria Scrapy.

In [22]:
import scrapy
from scrapy.crawler import CrawlerProcess

In [23]:
class QuotesSpider(scrapy.Spider):
    """Crawl quotes.toscrape.com and save each page's quotes to a text file.

    Every listing page is written to ``quotes/quotes-<page>.txt`` with one
    ``author: quote`` line per quote, and the "next" pagination link is
    followed until a page no longer has one.
    """

    name = "quotes"

    # Scrapy's default start logic requests each URL listed here and sends the
    # response to ``parse``, so no custom ``start_requests`` is needed.
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]

    def parse(self, response):
        """Write the quotes of one listing page to disk and follow pagination.

        Args:
            response: the Scrapy response for a ``/page/<n>/`` listing URL.

        Yields:
            A request for the next listing page, when the page links to one.
        """
        # For '.../page/1/' the second-to-last path segment is the page number.
        page = response.url.split('/')[-2]
        filename = f'quotes/quotes-{page}.txt'
        # Create the output directory on first use; open() would otherwise
        # fail with FileNotFoundError on a fresh checkout.
        Path(filename).parent.mkdir(parents=True, exist_ok=True)
        # Explicit UTF-8: the scraped text is not limited to ASCII and the
        # platform default encoding is not guaranteed to represent it.
        with open(filename, 'w', encoding='utf-8') as fw:
            for quote_block in response.css('div.quote'):
                author = quote_block.xpath('span/small/text()').get()
                text = quote_block.css('span.text::text').get()
                fw.write(f"{author}: {text}\n")
        self.log(f'Saved file {filename}')

        next_page = response.css('li.next a::attr("href")').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)
            

In [24]:
# Initialize the crawler (no settings passed, so Scrapy defaults apply)
process = CrawlerProcess()

# Specify the spider to use: schedule QuotesSpider on this process
process.crawl(QuotesSpider)

# Start the crawling process
# NOTE(review): start() presumably blocks until the crawl finishes, and the
# underlying Twisted reactor normally cannot be restarted, so re-running this
# cell likely requires a kernel restart — confirm.
process.start()

2022-01-24 15:55:22 [scrapy.utils.log] INFO: Scrapy 2.5.1 started (bot: scrapybot)
2022-01-24 15:55:22 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.9.6 (v3.9.6:db3ff76da1, Jun 28 2021, 11:49:53) - [Clang 6.0 (clang-600.0.57)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1m  14 Dec 2021), cryptography 36.0.1, Platform macOS-10.16-x86_64-i386-64bit
2022-01-24 15:55:22 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-24 15:55:22 [scrapy.crawler] INFO: Overridden settings:
{}
2022-01-24 15:55:22 [scrapy.extensions.telnet] INFO: Telnet Password: e2918dbdfc318406
2022-01-24 15:55:22 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2022-01-24 15:55:22 [scrapy.middleware] INFO: Enabled downloader middlewares:
['s