# PRÁCTICA 2: **DATOS NO ESTRUCTURADOS**

**GRUPO:** GESTDB_2

**MIEMBROS:** 
- JAIME ALVAREZ URUEÑA
- ÁLVARO FRAILE CARMENA 
- ALEJANDRO MENDOZA MEDINA
- JAVIER QUESADA PAJARES
  
----

In [1]:
import pandas as pd
import requests
import re
from bs4.element import ResultSet, Tag
from bs4 import BeautifulSoup
from datetime import datetime

## **FUNCIONES AUXILIARES**

In [116]:
def decode_driver_name(string: str):
    """Map a text containing a three-letter driver code to the driver's name.

    The input is lower-cased and each known code is searched for as a
    substring, in the order the table below is written; the first code found
    wins.

    Args:
        string: Text that contains a driver code (for example an image
            ``alt`` attribute).

    Returns:
        The driver's full name, or ``'Unknown driver'`` when no known code
        occurs in ``string``.
    """
    driver_dictionary = {
        'lec': 'Charles Leclerc',
        'sai': 'Carlos Sainz',
        'per': 'Sergio Perez',
        'ver': 'Max Verstappen',
        'pia': 'Oscar Piastri',
        'nor': 'Lando Norris',
        'law': 'Liam Lawson',
        'ric': 'Daniel Ricciardo',
        'alo': 'Fernando Alonso',
        'hul': 'Nico Hulkenberg',
        'col': 'Franco Colapinto',
        'rus': 'George Russell',
        'ham': 'Lewis Hamilton',
        'alb': 'Alex Albon',
        # Fixed: the name used to carry a stray leading space.
        'bea': 'Oliver Bearman',
        'mag': 'Kevin Magnussen',
        'gas': 'Pierre Gasly',
        'oco': 'Esteban Ocon',
        'sar': 'Logan Sargeant',
        'zho': 'Guanyu Zhou',
        'bot': 'Valtteri Bottas',
        # NOTE(review): usually written "Nyck de Vries"; left unchanged in case
        # already-stored rows are matched on this exact spelling - confirm.
        'dev': 'Nyck Devries',
        'str': 'Lance Stroll',
        # Fixed: was misspelled "Latiffi".
        'lat': 'Nicholas Latifi',
        'tsu': 'Yuki Tsunoda',
        'sch': 'Mick Schumacher',
        'gio': 'Antonio Giovinazzi',
        'kub': 'Robert Kubica',
        'vet': 'Sebastian Vettel',
        'rai': 'Kimi Raikkonen',
        'maz': 'Nikita Mazepin',
    }

    lowers = string.lower()
    # Substring match, first hit in table order (not an exact-key lookup).
    for key, value in driver_dictionary.items():
        if key in lowers:
            return value
    return 'Unknown driver'

In [117]:
def get_drivers_review(article_soup: BeautifulSoup):
    """Collect the review text written for each ranked driver in an article.

    Every ``div`` whose class starts with ``prose max-w-none mb-l tablet`` is
    treated as a text box. The first box is the article intro and is skipped;
    at most the next ten boxes are read. Inside a box, children are
    concatenated until an ``h3`` or an element containing ``<strong>`` is
    reached (the "read more" / "missing out" teasers).

    Args:
        article_soup: Parsed HTML of one ranking article.

    Returns:
        A list with one string per text box; each kept fragment is followed
        by a single space.
    """
    pattern = re.compile(r'^prose max-w-none mb-l tablet')
    # [1:11] -> skip the intro box, keep at most ten review boxes.
    text_items = article_soup.find_all('div', attrs={'class': pattern})[1:11]

    drivers_reviews = []
    for text_box in text_items:
        fragments = []
        for node in text_box.contents:
            if getattr(node, 'name', None) is None:
                # Bare text node (typically the "\n" between paragraphs).
                # It must not reach the ``find('strong')`` test below: on a
                # string that call is ``str.find`` and returns -1, which is
                # truthy and used to cut every review after its first
                # paragraph.
                if node.strip():
                    fragments.append(str(node))
                continue
            if node.name == 'h3' or node.find('strong') is not None:
                break  # teaser section reached: the review text is over
            fragments.append(node.text)

        drivers_reviews.append("".join(f"{fragment} " for fragment in fragments))

    return drivers_reviews

In [118]:
def get_drivers_names_and_positions(article_soup: BeautifulSoup):
    """Read the ranked drivers' names from the image cards of an article.

    Each ``div`` whose class starts with ``border-t-0`` is expected to hold an
    ``img`` whose ``alt`` text is decoded with ``decode_driver_name``.

    Args:
        article_soup: Parsed HTML of one ranking article.

    Returns:
        A tuple ``(names, positions)``: the decoded names in page order (ten
        at most) and the fixed list ``[1, ..., 10]``.
    """
    card_class = re.compile(r'^border-t-0')
    # Drop the last match (the overall-standings card), then cap at ten.
    cards = article_soup.find_all('div', attrs={'class': card_class})[:-1][:10]

    names = [decode_driver_name(card.find('img')['alt']) for card in cards]

    return names, list(range(1, 11))

In [119]:
def get_ranking_date(article_soup: BeautifulSoup):
    """Return the date shown in the article's first ``<time>`` element.

    The element's text is parsed as ``"<day> <full month name> <year>"`` and
    re-formatted as ``YYYY-MM-DD``.

    Args:
        article_soup: Parsed HTML of one ranking article.

    Returns:
        The date as a ``YYYY-MM-DD`` string.
    """
    raw_date = article_soup.find('time').string
    return datetime.strptime(raw_date, "%d %B %Y").strftime("%Y-%m-%d")

In [120]:
def get_ranking_page_data(url: str, timeout: float = 30):
    """Scrape every ranking article linked from one listing page.

    The listing page is downloaded, its article links (``a`` tags whose class
    starts with ``group group-hover:``) are followed, and for each article the
    date, driver names, positions and review texts are extracted. Links
    containing ``fan`` or ``pre-season`` are skipped.

    Args:
        url: Address of the listing page.
        timeout: Seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot hang the whole scrape.

    Returns:
        Five parallel lists with one entry per scraped article:
        ``(dates, names, positions, reviews, links)``. ``names``,
        ``positions`` and ``reviews`` hold one inner list per article.

    Raises:
        requests.HTTPError: If the listing page or an article answers with an
            error status (instead of parsing the error page as an article).
        requests.Timeout: If a request exceeds ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    page_soup = BeautifulSoup(response.text, "html.parser")

    pattern = re.compile(r'^group group-hover:')  # class prefix of the article links
    articles_list = page_soup.find_all('a', attrs={'class': pattern})

    dates_megalist = []
    names_megalist = []
    positions_megalist = []
    reviews_megalist = []
    links_megalist = []

    for articulo in articles_list:
        ranking_link_output = articulo['href']  # kept as an extra output column

        # Fan votes and pre-season pieces are not regular rankings.
        if 'fan' in ranking_link_output or 'pre-season' in ranking_link_output:
            continue

        # One download per article; the same soup feeds the three extractors.
        article_response = requests.get(ranking_link_output, timeout=timeout)
        article_response.raise_for_status()
        article_soup = BeautifulSoup(article_response.text, "html.parser")

        date_output = get_ranking_date(article_soup)
        drivers_names_output, positions_output = get_drivers_names_and_positions(article_soup)
        reviews_output = get_drivers_review(article_soup)

        links_megalist.append(ranking_link_output)
        dates_megalist.append(date_output)
        names_megalist.append(drivers_names_output)
        positions_megalist.append(positions_output)
        reviews_megalist.append(reviews_output)

    return dates_megalist, names_megalist, positions_megalist, reviews_megalist, links_megalist

## **Insertar una página entera de rankings en CSV**

In [121]:
# read_csv already returns a DataFrame; no extra wrapping is needed.
df = pd.read_csv('data/unstructured/rankings_info_pos.csv')

In [122]:
# Scrape one listing page and append one row per (article, driver) to the CSV.
URL = "https://www.formula1.com/en/latest/tags/power-rankings.699Peq5SC9zNGvwCkb1ln6?page=5"
dates, drivers_names, positions, reviews, ranking_links = get_ranking_page_data(URL)

articles = zip(dates, drivers_names, positions, reviews, ranking_links)
for fecha, tanda_pilotos, posiciones, tanda_reviews, link in articles:
    # The per-article lists are parallel: i-th name, i-th position, i-th review.
    for piloto, posicion, review in zip(tanda_pilotos, posiciones, tanda_reviews):
        # Row layout: date, driver, position, review, link
        # (assumed to match the CSV's column order - confirm).
        df.loc[len(df)] = [fecha, piloto, posicion, review, link]
df.to_csv('data/unstructured/rankings_info_pos.csv', index=False)

In [9]:
# Limpiar
df = pd.DataFrame(columns=df.columns)
df.to_csv('data/unstructured/rankings_info_pos.csv', index=False)
df = pd.DataFrame(pd.read_csv('data/unstructured/rankings_info_pos.csv'))

## **Insertar varias páginas enteras de rankings en CSV**

In [104]:
# read_csv already returns a DataFrame; no extra wrapping is needed.
df = pd.read_csv('data/unstructured/rankings_info_pos.csv')

In [98]:
# Scrape listing pages 1-6 and append one row per (article, driver) to the CSV.
URL = "https://www.formula1.com/en/latest/tags/power-rankings.699Peq5SC9zNGvwCkb1ln6?page="

for num_pagina in range(1, 7):
    url_pagina = URL + str(num_pagina)
    # get_ranking_page_data returns five lists (positions included); unpacking
    # only four raised "too many values to unpack".
    dates, drivers_names, positions, reviews, ranking_links = get_ranking_page_data(url_pagina)

    for fecha, tanda_pilotos, posiciones, tanda_reviews, link in zip(dates, drivers_names, positions, reviews, ranking_links):
        for piloto, posicion, review in zip(tanda_pilotos, posiciones, tanda_reviews):
            # Same five-field row as the single-page cell writes to this CSV.
            new_row = [fecha, piloto, posicion, review, link]

            df.loc[len(df)] = new_row
    print(f'Página [{num_pagina}] hecha')
df.to_csv('data/unstructured/rankings_info_pos.csv', index=False)

Página [1] hecha
Página [2] hecha
Página [3] hecha
Página [4] hecha
Página [5] hecha
Página [6] hecha


In [105]:
# Limpiar
df = pd.DataFrame(columns=df.columns)
df.to_csv('data/unstructured/rankings_info_pos.csv', index=False)
df = pd.DataFrame(pd.read_csv('data/unstructured/rankings_info_pos.csv'))

---

In [173]:
def get_drivers_review2(article_soup: BeautifulSoup):
    """Debugging variant of the review scraper; prints, collects nothing.

    For the 8th, 9th and 10th review boxes (the intro box is skipped first)
    it prints the box's raw children, prints ``SALTO`` if one of the children
    has the text ``"\\n"``, and then prints an empty line.

    Args:
        article_soup: Parsed HTML of one ranking article.

    Returns:
        Always an empty list.
    """
    drivers_reviews = []
    box_class = re.compile(r'^prose max-w-none mb-l tablet')
    # The first matching box is the article intro, not a driver review.
    review_boxes = article_soup.find_all('div', attrs={'class': box_class})[1:]

    # Only boxes with index 7, 8 and 9 are inspected.
    for box in review_boxes[7:10]:
        review_text = ""
        print(box.contents)
        # Scanning stops at the first newline-only child.
        if any(child.text == '\n' for child in box.contents):
            print('SALTO')
        print(review_text)

    # TODO: handle reviews that are split across several paragraphs

    return drivers_reviews

In [174]:
# Download one article and run the debugging scraper on it.
hola="https://www.formula1.com/en/latest/article/f1-power-rankings-who-topped-the-standings-after-another-monza-thriller.R92cNHwJCuUmCEmrY4K1B"
response = requests.get(hola, timeout=30)  # bounded wait: never hang on a stalled connection
response.raise_for_status()  # fail loudly rather than parse an error page
article_soup = BeautifulSoup(response.text, "html.parser")

gg = get_drivers_review2(article_soup)

[<p>Hot on the heels of his confirmation at Williams for a third season in 2022 – where he’ll <a href="https://www.formula1.com/en/latest/article.breaking-alex-albon-returns-to-f1-race-seat-with-williams-in-2022-alongside.46oM1Ffk7ngYX3yIpyciXC.html">partner Alex Albon</a> – Monza was yet another indication that Nicholas Latifi is moving the dial in the right direction.</p>, '\n', <p>The Canadian out-raced team mate Russell in the Sprint to finish P14, and ran in the top 10 at points only to get unlucky with the timing of the Safety Car, which allowed the fresher-tyred Esteban Ocon to pass him for P10 as Latifi finished just outside the points.</p>, '\n', <p><strong><a href="https://www.formula1.com/en/latest/article.ross-brawn-on-mclarens-monza-magic-and-what-the-verstappen-hamilton-crash.6MP4LCgVqrlaj8X9h6fMNR.html">READ MORE: Ross Brawn on McLaren’s Monza magic, and what the Verstappen-Hamilton crash means for the title battle</a></strong></p>]
SALTO

[<p>And so to the other protago