# PRÁCTICA 2: **DATOS NO ESTRUCTURADOS**

**GRUPO:** GESTDB_2

**MIEMBROS:** 
- JAIME ALVAREZ URUEÑA
- ÁLVARO FRAILE CARMENA 
- ALEJANDRO MENDOZA MEDINA
- JAVIER QUESADA PAJARES
  
----

In [1]:
import pandas as pd
import requests
import re
from bs4.element import ResultSet, Tag
from bs4 import BeautifulSoup
from datetime import datetime

## **FUNCIONES AUXILIARES**

In [61]:
def decode_driver_name(string: str) -> str:
    """Translate a text containing a driver code into the driver's full name.

    The text is lower-cased and tested against every code in the table below,
    in the table's insertion order; the first code that occurs anywhere in the
    text decides the result.

    Args:
        string: Text to decode. The caller in this file passes the ``alt``
            attribute of a driver image.

    Returns:
        The driver's full name, or ``'Unknown driver'`` when no code occurs
        in the text.
    """
    # NOTE(review): matching is by substring, so a code can also hit inside a
    # longer word (for example 'ber' is contained in 'robert' and 'ver' in
    # 'oliver'), and earlier entries win. The order of this table is therefore
    # part of the behaviour; do not sort or regroup it casually.
    driver_dictionary = {
        'alb': 'Alex Albon',
        'alo': 'Fernando Alonso',
        'bea': 'Oliver Bearman',
        'ber': 'Oliver Bearman',  # alias
        'bot': 'Valtteri Bottas',
        'cha': 'Charles Leclerc',  # alias
        'col': 'Franco Colapinto',
        # Both de Vries codes must produce the same spelling, otherwise the
        # same driver shows up as two different names in the output.
        'dev': 'Nyck de Vries',
        'gas': 'Pierre Gasly',
        'gio': 'Antonio Giovinazzi',
        'ham': 'Lewis Hamilton',
        'hul': 'Nico Hulkenberg',
        'kub': 'Robert Kubica',
        'kvy': 'Daniil Kvyat',
        'lat': 'Nicholas Latifi',
        'law': 'Liam Lawson',
        'lec': 'Charles Leclerc',
        'mag': 'Kevin Magnussen',
        'max': 'Max Verstappen',  # alias
        'maz': 'Nikita Mazepin',
        'msc': 'Mick Schumacher',  # alias
        'ndv': 'Nyck de Vries',  # newer code for the same driver as 'dev'
        'nor': 'Lando Norris',
        'oco': 'Esteban Ocon',
        'per': 'Sergio Perez',
        'pia': 'Oscar Piastri',
        'rai': 'Kimi Raikkonen',
        'ric': 'Daniel Ricciardo',
        'rus': 'George Russell',
        'sai': 'Carlos Sainz',
        'sar': 'Logan Sargeant',
        'sch': 'Mick Schumacher',
        'str': 'Lance Stroll',
        'tsu': 'Yuki Tsunoda',
        'ver': 'Max Verstappen',
        'vet': 'Sebastian Vettel',
        'zho': 'Guanyu Zhou',
    }

    lowers = string.lower()
    for key, value in driver_dictionary.items():
        if key in lowers:
            return value
    return 'Unknown driver'

In [62]:
def get_drivers_review(article_soup: BeautifulSoup):
    """Extract the review text of each ranked driver from an article.

    Looks up every ``<div>`` whose class starts with
    ``prose max-w-none mb-l tablet``, discards the first match (the original
    author marks it as the article intro) and reads at most the next ten.

    Args:
        article_soup: Parsed HTML of one ranking article.

    Returns:
        A list with one string per text box read. Each string is the text of
        the box's child tags, every piece followed by a single space.
    """
    box_class = re.compile(r'^prose max-w-none mb-l tablet')
    # Skip the first box and keep no more than ten after it.
    review_boxes = article_soup.find_all('div', attrs={'class': box_class})[1:11]

    collected = []
    for box in review_boxes:
        pieces = []
        for child in box.contents:
            # Only tags carry review text; bare strings such as '\n' that sit
            # between them are ignored.
            if not isinstance(child, Tag):
                continue
            # An <h3>, or any tag containing <strong>, ends the review (the
            # original author identifies these as the "missing out" and
            # "read more" sections).
            if child.name == 'h3' or child.find('strong'):
                break
            pieces.append(child.text + " ")
        collected.append("".join(pieces))

    return collected

In [63]:
def get_drivers_names_and_positions(article_soup: BeautifulSoup):
    """Read the ranked drivers' names, in page order, and their positions.

    Looks up every ``<div>`` whose class starts with ``border-t-0``, drops the
    last match (the original author marks it as the overall image) and reads
    at most the first ten of the rest. Each name is decoded from the ``alt``
    attribute of the ``<img>`` inside the box.

    Args:
        article_soup: Parsed HTML of one ranking article.

    Returns:
        A tuple ``(names, positions)``: the decoded names in document order
        and the list ``[1, 2, ..., 10]``. The positions list always has ten
        entries, regardless of how many names were found.
    """
    image_class = re.compile(r'^border-t-0')
    # Everything but the last match, capped at ten boxes.
    image_boxes = article_soup.find_all('div', attrs={'class': image_class})[:-1][:10]

    names = [decode_driver_name(box.find('img')['alt']) for box in image_boxes]
    positions = list(range(1, 11))
    return names, positions

In [64]:
def get_ranking_date(article_soup: BeautifulSoup) -> str:
    """Return the date shown in an article's first ``<time>`` element.

    The element's text is parsed with the format ``"%d %B %Y"`` and
    re-formatted as ``YYYY-MM-DD``.

    Args:
        article_soup: Parsed HTML of one ranking article.

    Returns:
        The date as a ``YYYY-MM-DD`` string.

    Raises:
        ValueError: If the article has no ``<time>`` element, or its text
            does not match ``"%d %B %Y"``.
    """
    date_item = article_soup.find('time')
    if date_item is None:
        raise ValueError("Article has no <time> element to read the date from")

    # `.string` is None as soon as the tag holds more than one child, and it
    # keeps any surrounding whitespace; get_text() copes with both cases.
    date_text = date_item.get_text(" ", strip=True)

    # NOTE(review): %B parses the month name using the current locale; this
    # assumes the site's English month names match it. Confirm if the locale
    # is ever changed.
    date_object = datetime.strptime(date_text, "%d %B %Y")
    return date_object.strftime("%Y-%m-%d")

In [65]:
def get_ranking_page_data(url: str, timeout: float = 30):
    """Scrape every ranking article linked from one listing page.

    Downloads the listing page, collects the ``<a>`` elements whose class
    starts with ``group group-hover:``, and for each linked article downloads
    it once and extracts its date, driver names, positions and reviews.
    Links containing ``'fan'`` or ``'pre-season'`` are skipped.

    Args:
        url: Address of the listing page.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        Five parallel lists with one entry per scraped article:
        ``(dates, names, positions, reviews, links)``. ``names``,
        ``positions`` and ``reviews`` hold one inner list per article.

    Raises:
        requests.exceptions.RequestException: If a request times out, fails,
            or the server answers with an HTTP error status.
    """
    # Without a timeout a stalled connection would block forever, and without
    # the status check an error page would be parsed as if it were valid.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    page_soup = BeautifulSoup(response.text, "html.parser")

    pattern = re.compile(r'^group group-hover:')  # class prefix of the article links
    articles_list = page_soup.find_all('a', attrs={'class': pattern})

    dates_megalist = []
    names_megalist = []
    positions_megalist = []
    reviews_megalist = []
    links_megalist = []

    for articulo in articles_list:
        ranking_link_output = articulo['href']  # kept as an extra output column

        # Articles of these kinds are not regular rankings; skip them.
        if 'fan' in ranking_link_output or 'pre-season' in ranking_link_output:
            continue

        # One download per article; the same parsed HTML feeds all three scrapers.
        article_response = requests.get(ranking_link_output, timeout=timeout)
        article_response.raise_for_status()
        article_soup = BeautifulSoup(article_response.text, "html.parser")

        date_output = get_ranking_date(article_soup)
        drivers_names_output, positions_output = get_drivers_names_and_positions(article_soup)
        reviews_output = get_drivers_review(article_soup)

        # Append to all five lists together so they stay index-aligned.
        links_megalist.append(ranking_link_output)
        dates_megalist.append(date_output)
        names_megalist.append(drivers_names_output)
        positions_megalist.append(positions_output)
        reviews_megalist.append(reviews_output)

    return dates_megalist, names_megalist, positions_megalist, reviews_megalist, links_megalist

## **Insertar una página entera de rankings en CSV**

In [45]:
# Load the rankings stored so far; the next cell appends rows to this frame.
df = pd.read_csv('data/unstructured/rankings_info_pos.csv')

In [47]:
# Scrape one listing page and append one row per ranked driver to df.
URL = "https://www.formula1.com/en/latest/tags/power-rankings.699Peq5SC9zNGvwCkb1ln6?page=5"
dates, drivers_names, positions, reviews, ranking_links = get_ranking_page_data(URL)

# The five lists are parallel: one entry per article. Within an article the
# drivers, positions and reviews are parallel too and share its date and link.
per_article = zip(dates, drivers_names, positions, reviews, ranking_links)
for article_date, article_drivers, article_positions, article_reviews, article_link in per_article:
    per_driver = zip(article_drivers, article_positions, article_reviews)
    for driver, position, review in per_driver:
        df.loc[len(df)] = [article_date, driver, position, review, article_link]

# Write the enlarged table back to the same file.
df.to_csv('data/unstructured/rankings_info_pos.csv', index=False)

In [68]:
# Reset: overwrite the CSV with only its header row, then reload the empty table.
pd.DataFrame(columns=df.columns).to_csv('data/unstructured/rankings_info_pos.csv', index=False)
df = pd.read_csv('data/unstructured/rankings_info_pos.csv')

## **Insertar varias páginas enteras de rankings en CSV**

In [69]:
# Load the rankings stored so far; the next cell appends rows to this frame.
df = pd.read_csv('data/unstructured/rankings_info_pos.csv')

In [70]:
# Scrape listing pages 1 through 6 and append one row per ranked driver to df.
URL = "https://www.formula1.com/en/latest/tags/power-rankings.699Peq5SC9zNGvwCkb1ln6?page="

for num_pagina in range(1, 7):
    url_pagina = f"{URL}{num_pagina}"
    dates, drivers_names, positions, reviews, ranking_links = get_ranking_page_data(url_pagina)

    # The five lists are parallel: one entry per article. Within an article
    # the drivers, positions and reviews are parallel too.
    per_article = zip(dates, drivers_names, positions, reviews, ranking_links)
    for article_date, article_drivers, article_positions, article_reviews, article_link in per_article:
        per_driver = zip(article_drivers, article_positions, article_reviews)
        for driver, position, review in per_driver:
            df.loc[len(df)] = [article_date, driver, position, review, article_link]

    # Progress message, printed once per finished page.
    print(f'Página [{num_pagina}] hecha')

# The file is written once, after every page has been processed.
df.to_csv('data/unstructured/rankings_info_pos.csv', index=False)

Página [1] hecha
Página [2] hecha
Página [3] hecha
Página [4] hecha
Página [5] hecha
Página [6] hecha


In [105]:
# Reset: overwrite the CSV with only its header row, then reload the empty table.
pd.DataFrame(columns=df.columns).to_csv('data/unstructured/rankings_info_pos.csv', index=False)
df = pd.read_csv('data/unstructured/rankings_info_pos.csv')