# Código Scraping TFM

A continuación se muestra el código python utilizado para la recolección de datos de barcos en venta en internet. La página web de la que serán extraídos los datos es https://www.boatinternational.com/yachts-for-sale.

### 1. Abrimos la página web y rechazamos cookies

In [1]:
#Importamos todas las librerias que necesitamos 

import pandas as pd
import sys
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
 
driver = webdriver.Firefox()
driver.get("https://www.boatinternational.com/yachts-for-sale")
time.sleep(3)
driver.maximize_window()

In [None]:
# Rechazamos cookies
cookies = driver.find_element(By.ID, "onetrust-pc-btn-handler")
time.sleep(1)
cookies.click()
time.sleep(3)
cookies = driver.find_element(By.XPATH,"/html/body/div[5]/div[3]/div/div[3]/div[1]/button")
cookies.click()
time.sleep(1)

### 2. Desde la página principal identificamos todos los anuncios de barcos disponibles

In [3]:
# Encontramos todos los elementos
boat_elements = driver.find_elements(By.XPATH, '//div[contains(@class, "card__title card__title--landscape")]')

In [4]:
len(boat_elements)

786

In [5]:
type(boat_elements)

list

### 3. Creamos un bucle para abrir cada anuncio de venta de barcos en una pestaña nueva, recopilar la información contenida y finalmente cerrarlo para acto seguido repetir el proceso con el siguiente

In [6]:
# Creamos un diccionario y un dataframe vacíos
boat_data = {}
df = pd.DataFrame([boat_data])

In [7]:
# Especificar si queremos empezar desde algún punto en concreto
# start_index = 128
# for element in boat_elements[start_index:]:

# Hacer un bucle de todos los barcos
for element in boat_elements:
    # Extraer el nombre del barco y el enlace a su anuncio
    try:
        boat_name = element.find_element(By.TAG_NAME, 'a').text.strip()
        boat_link = element.find_element(By.TAG_NAME, 'a').get_attribute('href')
        print("Visiting:", boat_name)

        # Abrir el enlace en una pestaña nueva usando JavaScript
        driver.execute_script("window.open(arguments[0], '_blank');", boat_link)

        # Esperar a que se abra la nueva pestaña para seleccionarla como pestaña activa
        WebDriverWait(driver, 10).until(lambda driver: len(driver.window_handles) == 2)
        driver.switch_to.window(driver.window_handles[1])

        ##################################################################################        
        ##################################################################################
        ####### Extraer datos de la nueva pestaña (recopilación para el dataframe) #######
        ##################################################################################
        ################################################################################## 

        
        # Usamos try-except para evitar errores en el programa por falta de elementos en algunos de los anuncios
        
        try:
            wait = WebDriverWait(driver, 5)
            wait.until(EC.presence_of_all_elements_located((By.XPATH, '//div[contains(@class, "spec-block")]' and "/html/body/div[1]/div/main/div/div[8]/div[2]/div[2]/div/div/a")))
            wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, "spec-block")))

            stats_elements = driver.find_elements(By.XPATH, '//div[contains(@class, "stats__text")]')
            
            for stats_element in stats_elements:
                # Extraemos nombre de la variable y valor
                title = stats_element.find_element(By.CLASS_NAME, 'stats__title').text.strip()
                value = stats_element.find_element(By.CLASS_NAME, 'stats__value').text.strip()
    
                # Lo añadimos al diccionario
                boat_data[title] = value
    
            # Copiamos la referencia del broker
            broker = driver.find_element(By.XPATH, "/html/body/div[1]/div/main/div/div[8]/div[2]/div[2]/div/div/a").get_attribute('href')
            boat_data["BROKER"] = broker
    
            # Copiamos el precio de charter (si existe)
            try:
                charter = driver.find_element(By.XPATH, "/html/body/div[1]/div/main/div/div[8]/div[2]/div[3]/div/div/div[2]/div[1]/div[2]/span")
                boat_data["CHARTER"] = charter.text.strip()
            except NoSuchElementException:
                boat_data["CHARTER"] = None  # Si no existe precio de charter, lo dejamos como ausente
    
            
            
            other_stats_elements = driver.find_elements(By.CLASS_NAME, "spec-block")
            for other_stats_element in other_stats_elements:
                # Scroll the element into view
                # driver.execute_script("arguments[0].scrollIntoView();", other_stats_element)
    
                try:
                    other_stats_element.click()
                    time.sleep(1)
                except ElementClickInterceptedException:
                    print("Element is not clickable. Skipping to the next element.")
            
            elementos = driver.find_elements(By.CLASS_NAME, "spec-block__list-item")
            for elemento in elementos:
                # Extraemos nombre de la variable y valor
                title = elemento.find_element(By.CLASS_NAME, 'spec-block__title ').text.strip()
                value = elemento.find_element(By.CLASS_NAME, 'spec-block__data ').text.strip()
                # Lo añadimos al diccionario
                boat_data[title] = value
            
            
            # Añadimos los datos recopilados en el diccionario a una nueva fila del dataframe
            df = pd.concat([df, pd.DataFrame([boat_data])], ignore_index = True)
            
            # Finalmente vaciamos el diccionario
            boat_data = {}

        except TimeoutException:
            print("Timed out waiting for elements. Skipping to the next element.")

        except StaleElementReferenceException:
            print("Element is stale. Skipping to the next element.")

        ##################################################################################
        ##################################################################################
        ##################################################################################

        # Cerramos la pestaña del anuncio y volvemos a la pestaña principal para repetir el proceso hasta que se acabe el bucle
        driver.close()
        driver.switch_to.window(driver.window_handles[0])

    except StaleElementReferenceException:
        print("Element is stale. Skipping to the next element.")

# Reubicamos los anuncios de barcos después del bucle para evitar la excepción "StaleElementReferenceException"
boat_elements = driver.find_elements(By.XPATH, '//div[contains(@class, "card__title card__title--landscape")]')

Visiting: SHAMANNA
Visiting: MANA
Visiting: AMARYLLIS
Visiting: GRATEFUL
Visiting: SEAHAWK
Visiting: WIDER 210
Visiting: KAHALANI
Visiting: MARGUERITE
Visiting: LIBERTY
Visiting: POLAR STAR
Visiting: INCENTIVIZED
Visiting: GATSBY
Visiting: ILLUSION I
Visiting: WALINDI
Visiting: MARY A
Visiting: STATE OF GRACE
Element is not clickable. Skipping to the next element.
Visiting: BOOK ENDS
Visiting: MIMI
Visiting: PHOENIX 2
Visiting: SUNSEEKER 40
Visiting: BERILDA
Visiting: DOKINHA V
Element is not clickable. Skipping to the next element.
Visiting: DALOLI
Visiting: BARON TRENCK
Visiting: INSIGNIA
Visiting: THE GREAT ESCAPE
Visiting: UTOPIA IV
Element is not clickable. Skipping to the next element.
Visiting: TIARA II
Visiting: NL 50 PLUS
Element is not clickable. Skipping to the next element.
Visiting: SPLENDIDA
Element is not clickable. Skipping to the next element.
Visiting: 80 VELOCE
Visiting: JICJ
Visiting: SERAFIM
Visiting: DRINKABILITY
Visiting: AVANTE V
Visiting: OCEAN DRIVE
Visiting: 

Visiting: AQUABELLA
Visiting: ALDEBARAN III
Visiting: BUNKER
Visiting: ONE MORE DAY
Visiting: MORNINGSTAR
Visiting: BLACK ROCK
Visiting: BLACK HAWK
Visiting: SAFAD
Visiting: DOLCE VITA
Visiting: OLYMPIC MARITIME
Visiting: DAYLAMI
Visiting: SANSSOUCI STAR
Visiting: EVEREAST
Visiting: CADET V
Visiting: ROCHADE
Visiting: OFF THE GRID
Element is not clickable. Skipping to the next element.
Visiting: FLAG
Visiting: SILVER EDGE
Element is not clickable. Skipping to the next element.
Visiting: LUCKY ME
Element is not clickable. Skipping to the next element.
Visiting: BOURBON LEGEND
Element is not clickable. Skipping to the next element.
Visiting: KIRIBATI
Element is not clickable. Skipping to the next element.
Visiting: 4* PUPPIES
Element is not clickable. Skipping to the next element.
Visiting: FAMIGLIA
Element is not clickable. Skipping to the next element.
Visiting: JULIE II
Element is not clickable. Skipping to the next element.
Visiting: SELENA
Visiting: BUNTY
Element is not clickable. S

Visiting: SANLORENZO SD122
Element is not clickable. Skipping to the next element.
Visiting: DOGE 400
Element is not clickable. Skipping to the next element.
Visiting: SUPERSTAR
Element is not clickable. Skipping to the next element.
Visiting: SYNTHESIS 66
Element is not clickable. Skipping to the next element.
Visiting: STERN
Element is not clickable. Skipping to the next element.
Visiting: WERE DREAMS
Element is not clickable. Skipping to the next element.
Visiting: ICHTUS
Element is not clickable. Skipping to the next element.
Visiting: IKIGAI
Element is not clickable. Skipping to the next element.
Visiting: ALFA
Element is not clickable. Skipping to the next element.
Visiting: SUNSEEKER 94
Element is not clickable. Skipping to the next element.
Visiting: JANABANANA
Element is not clickable. Skipping to the next element.
Visiting: VOYAGER
Element is not clickable. Skipping to the next element.
Visiting: SEYCHELLE
Element is not clickable. Skipping to the next element.
Visiting: MAHA

Element is not clickable. Skipping to the next element.
Visiting: MR MOUSE
Element is not clickable. Skipping to the next element.
Visiting: SPECTRAL 50
Element is not clickable. Skipping to the next element.
Visiting: SOLSTICE
Element is not clickable. Skipping to the next element.
Visiting: GRAND ILLUSION
Element is not clickable. Skipping to the next element.
Visiting: GIAVA
Element is not clickable. Skipping to the next element.
Visiting: DESTINY
Element is not clickable. Skipping to the next element.
Visiting: MEIRA
Element is not clickable. Skipping to the next element.
Visiting: OTAM SD35
Element is not clickable. Skipping to the next element.
Visiting: LUXI 95
Element is not clickable. Skipping to the next element.
Visiting: TAMTEEN
Element is not clickable. Skipping to the next element.
Visiting: LUC-AN
Element is not clickable. Skipping to the next element.
Visiting: SILVER WIND
Element is not clickable. Skipping to the next element.
Visiting: NORTHERN SUN
Element is not clic

In [9]:
df

Unnamed: 0,LENGTH,BUILD YEAR,TOP SPEED,BEAM,GT,GUESTS,CREW,BROKER,CHARTER,Name:,...,Refits:,Single Rooms:,Triple Rooms:,Yacht Subtype:,Convertible:,Crew bunk beds:,Pullman Beds:,MCA Compliant:,Engine Model,Build time:
0,,,,,,,,,,,...,,,,,,,,,,
1,35.05 m,2016,12.8 kn,8.23 m,145,8,5,https://www.superyachtpartners.com/yachts/sham...,"Price from €80,000 p/w •",SHAMANNA,...,,,,,,,,,,
2,24.99 m,2021,,6 m,,,,https://www.galatiyachts.com/yachts/details/28...,,MANA,...,,,,,,,,,,
3,78.43 m,2011,16 kn,12.4 m,2108,12,23,https://moraviayachting.mc/buy/charter-motor-y...,"Price from €770,000 p/w •",AMARYLLIS,...,,,,,,,,,,
4,34.36 m,2023,16 kn,7.9 m,275,10,7,https://www.fraseryachts.com/,,GRATEFUL,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
756,49.3 m,2004,14 kn,9.25 m,659,12,12,https://www.fraseryachts.com/,"Price from $199,000 p/w •",VIBRANCE,...,2019-01-01,,,,,,,,,
757,31.71 m,2016,11 kn,6.9 m,99,8,4,https://www.edmiston.com/821/crossbow-yacht-fo...,"Price from €62,000 p/w •",CROSSBOW,...,,,,,,,,,,
758,34.14 m,2002,26 kn,7 m,212,8,4,https://www.galatiyachts.com/yachts/details/28...,,ECLIPSE,...,,,,,,,,,,
759,27.04 m,2005,31 kn,6.6 m,117,6,3,https://www.moranyachts.com/,,LADY VICTORIA,...,"2013-01-01,2022-05-21",,,,,,,,,


### 4. Guardamos como CSV

In [8]:
import csv

print("Introduzca la ruta en la que quiere guardar el dataframe")
ruta_csv = input("Ruta: ")
# C:/Users/Usuario/Documents/Python Anaconda Archivos/Almacenes de Datos/Scripts/Scripts parte 2
# Barcos_definitivo_total
print()
print("Ahora escoja un nombre para el archivo")
nombre_archivo_csv = input("Nombre del archivo: ")
print()

df.to_csv(ruta_csv + "/" + nombre_archivo_csv + ".csv", sep = ';')
print("Se grabó la información correctamente")

Introduzca la ruta en la que quiere guardar el dataframe
Ruta: C:/Users/Usuario/Documents/Python Anaconda Archivos/Almacenes de Datos/Scripts/Scripts parte 2

Ahora escoja un nombre para el archivo
Nombre del archivo: Barcos_definitivo_total

Se grabó la información correctamente


### 5. Lo cargamos desde el CSV para comprobar que funciona

In [None]:
df = pd.read_csv(ruta_csv + "/" + nombre_archivo_csv + ".csv", encoding = "utf-8",
     sep = ";", engine = "python", index_col = 0)
df

### 6. Lo guardamos en formato excel

In [None]:
print("Introduzca la ruta en la que quiere guardar el dataframe")
ruta_excel = input("Ruta: ")
# C:/Users/Usuario/Documents/Python Anaconda Archivos/Almacenes de Datos/Scripts/Scripts parte 2
# Barcos_definitivo_total
print()
print("Ahora escoja un nombre para el archivo")
nombre_archivo_excel = input("Nombre del archivo: ")
print()
escritor = pd.ExcelWriter(ruta_excel + "/" + nombre_archivo_excel + ".xlsx", engine = 'xlsxwriter')
df.to_excel(escritor, sheet_name="hoja1", index=True)
escritor.save()
print("Se grabó la información correctamente")

### 7. Lo volvemos a cargar para comprobar que funciona

In [None]:
df = pd.read_excel(ruta_excel + "/" + nombre_archivo_excel + ".xlsx", index_col = 0)
print(df)

### Código de prueba

In [None]:
df = df.append(dictionary, ignore_index=True)
df

In [None]:
dictionary = {"Column_1" : 107,
             "Column_2" : "depth",
             "Column_3" : 16.44,
             "Column_4" : "special data",
             "Column_5" : "another item"}
dictionary

In [None]:
# Otra forma con el método concat
df = pd.concat([df, pd.DataFrame([dictionary])], ignore_index = True)

In [None]:
df

In [None]:
df = []
df