Scrape the Spanish Stock Exchange website (https://www.bolsamadrid.es) using BeautifulSoup or Selenium.

In [1]:
# Import relevant libraries:

import requests
from bs4 import BeautifulSoup as bs
import selenium
import numpy as np
import pandas as pd


In [2]:
# Download the page's HTML with the requests library:

URL = "https://www.bolsamadrid.es/esp/aspx/Mercados/Precios.aspx?indice=ESI100000000"
# A timeout keeps this cell from hanging indefinitely if the server never answers.
page = requests.get(URL, timeout=30)
# Stop here with a clear HTTP error instead of silently parsing an error page.
page.raise_for_status()

# BeautifulSoup needs an HTML parser; "html.parser" is the one bundled with Python.
soup = bs(page.content, "html.parser")


In [3]:
# Locate the HTML element that holds the information we want:

table = soup.find(id="ctl00_Contenido_tblAcciones")

# soup.find returns None when nothing matches; fail here with a clear message
# rather than with an opaque TypeError when the table is iterated later on.
if table is None:
    raise ValueError(
        "Element with id 'ctl00_Contenido_tblAcciones' was not found in the downloaded page"
    )

In [4]:
# Start from an empty 1-D numpy array; the scraped content is appended to it later:

general = np.empty(0)


In [5]:
# Walk three levels of nesting under `table` and append every innermost item
# to `general`.
# NOTE(review): presumably the three levels are rows -> cells -> cell contents;
# confirm against the page's HTML.
# np.append returns a new array instead of modifying its argument, which is why
# `general` is rebound on every iteration.

for company in table:
    for elements in company:
        for content in elements:
            # Rebind `general` to the array that now also contains `content`.
            general = np.append(general, content)


In [6]:
# Keep only the entries that are not a bare newline string ("\n"):

arr_new = general[general != "\n"]



In [7]:
# Reshape the flat array back into table form: 9 characteristics per row, with the
# column names as the first row. The number of rows is inferred (-1) instead of
# being hard-coded to 36, so this cell keeps working when the number of listed
# companies changes. reshape still raises ValueError if the element count is not
# a multiple of N_COLUMNS.
N_COLUMNS = 9
arr_new = np.reshape(arr_new, (-1, N_COLUMNS))

# Convert into DataFrame and make the first row into column names:
df = pd.DataFrame(arr_new)

new_header = df.iloc[0]
df = df[1:]
df.columns = new_header

# Visualise:
df.head(10)

Unnamed: 0,Nombre,Últ.,% Dif.,Máx.,Mín.,Volumen,Efectivo (miles €),Fecha,Hora
1,ACCIONA,1387000,124,1391000,1376000,4.833,66993,30/08/2021,10:08:01
2,ACERINOX,116950,86,117250,116000,44.853,52361,30/08/2021,10:07:41
3,ACS,229400,-39,231600,229300,49.826,"1.148,62",30/08/2021,10:08:50
4,AENA,1362500,-4,1367000,1359000,1.764,24051,30/08/2021,10:05:15
5,ALMIRALL,140300,-57,141800,139900,20.049,28172,30/08/2021,10:08:52
6,AMADEUS,512000,-12,516000,510600,23.203,"1.189,44",30/08/2021,10:08:10
7,ARCELORMIT.,297550,110,298200,293800,60.258,"1.787,10",30/08/2021,10:08:00
8,B.SANTANDER,31425,-74,31665,31425,791.22,"2.496,94",30/08/2021,10:08:32
9,BA.SABADELL,6090,-65,6162,6082,815.856,49954,30/08/2021,10:08:39
10,BANKINTER,49630,-94,50060,49630,151.694,75560,30/08/2021,10:07:27


Explain the information you have gathered in a Word document that follows Kaggle standards:

In [8]:
# The Word document can be found in the same directory as this file

Choose a website and scrape it using Scrapy:

In [9]:

# We will try to scrape basic data from all countries of the world:

import scrapy

# We instantiate the Spider that will scrape the website:

class Globetrotter(scrapy.Spider):
    """Spider that yields one record per country block found on the start page.

    Each record is a dict with the keys ``name``, ``capital``, ``area`` and
    ``population``. A field that cannot be found is emitted as an empty string.
    """

    name = 'countries'
    start_urls = [
        'https://www.scrapethissite.com/pages/simple/',
    ]

    def parse(self, response):
        """Yield a dict of basic data for every ``div.col-md-4`` block in ``response``."""
        for country in response.css('div.col-md-4'):
            # The heading can hold several text nodes (presumably whitespace around
            # an icon element -- TODO confirm against the page). Take the first
            # non-blank one, stripped, instead of indexing a fixed position: a fixed
            # index raises IndexError when the heading has fewer text nodes.
            heading_texts = (
                text.strip() for text in country.xpath('.//h3/text()').getall()
            )
            yield {
                'name': next((text for text in heading_texts if text), ""),
                'capital': country.css('.country-capital::text').get(default = ""),
                'area': country.css('.country-area::text').get(default = ""),
                'population': country.css('.country-population::text').get(default = ""),
            }
            
# We move this code cell into its own pure Python file and we run it in the terminal with the command:

    # scrapy runspider countriesSpider.py -o countries.csv
    
# The results are saved into the countries CSV file
        