## Web scraping de IMDb

Descarga la información correspondiente y guarda en un CSV el top de las 250 películas mediante web scraping. Encapsúlalo en un script.

Obtén:
* Título
* Año
* Duración
* Posición
* Rating

In [None]:
# Si la petición te devuelve un 403, puedes probar con:
# pip install fake-useragent
# from fake_useragent import UserAgent
# ua = UserAgent()
# headers = {'User-Agent': ua.random}
# response = requests.get(url, headers=headers)

In [None]:
from bs4 import BeautifulSoup as bs
import requests
import pandas as pd

In [None]:
# Target page: the IMDb top chart.
url = "https://www.imdb.com/chart/top/"
# Plain request without custom headers. A timeout is passed explicitly
# because requests otherwise waits indefinitely on an unresponsive server.
response = requests.get(url, timeout=10)
# Last expression of the cell: shows the response object (status code).
response

In [None]:
from fake_useragent import UserAgent

# Repeat the request sending a randomly chosen User-Agent header, the
# workaround suggested for the case where the plain request returns a 403.
ua = UserAgent()
headers = {'User-Agent': ua.random}
# Explicit timeout so the cell cannot hang forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=10)
# Last expression of the cell: shows the response object (status code).
response

In [None]:
# Parse the raw response body with the built-in 'html.parser' backend.
html_bytes = response.content
soup = bs(html_bytes, 'html.parser')
# Last expression of the cell: displays the parsed document.
soup

In [None]:
# Print rank and title for every <h3> except the first and the last one on
# the page. Each heading text is expected to look like "<rank>. <title>".
for x in soup.find_all('h3')[1:-1]:
    # Split only on the FIRST ". " so that a title which itself contains
    # ". " (for example one starting with "Dr. ") is not cut short.
    rank, title = x.get_text().split(". ", 1)
    print(rank)
    print(title)

In [None]:
# Walk every title-metadata <span> and label the ones that look like a
# duration or a release year.
for x in soup.find_all('span', class_='sc-f30335b4-7 jhjEEd cli-title-metadata-item'):
    text = x.get_text()
    # Durations are recognised by the presence of an "h" or "m" unit letter.
    if "h" in text or "m" in text:
        print("Duración:", text)
    # A year is exactly four digits; checking the length alone would also
    # accept any other four-character metadata value.
    if len(text) == 4 and text.isdigit():
        print("Año:", text)

In [None]:
# Print the text of every rating <span> found in the page.
rating_spans = soup.find_all('span', class_='ipc-rating-star--rating')
for rating_span in rating_spans:
    rating_text = rating_span.get_text()
    print(rating_text)

In [None]:
# Print the text of every rating <span> found in the page.
rating_spans = soup.find_all('span', class_='ipc-rating-star--rating')
for rating_span in rating_spans:
    rating_text = rating_span.get_text()
    print(rating_text)

In [None]:
# Build the table column by column from the parsed HTML.
# Heading texts look like "<rank>. <title>"; split only on the first ". " so
# titles that contain ". " themselves are kept whole.
heading_parts = [h.get_text().split(". ", 1) for h in soup.find_all('h3')[1:-1]]
# Texts of all title-metadata spans, fetched once and filtered per column.
metadata_texts = [
    s.get_text()
    for s in soup.find_all('span', class_='sc-f30335b4-7 jhjEEd cli-title-metadata-item')
]

my_top25 = {
    "Titulo": [parts[1] for parts in heading_parts],
    "Ranking": [parts[0] for parts in heading_parts],
    # A year is exactly four digits (length alone would also match other
    # four-character metadata values).
    "Año": [t for t in metadata_texts if len(t) == 4 and t.isdigit()],
    # Durations are recognised by an "h" or "m" unit letter.
    "Duración": [t for t in metadata_texts if "h" in t or "m" in t],
    "Rating": [s.get_text() for s in soup.find_all('span', class_='ipc-rating-star--rating')],
}
# pandas raises if the columns end up with different lengths.
pd.DataFrame(my_top25)

In [None]:
# Column buffers, filled one chart row at a time so that all fields of a
# movie are read from the same container element.
my_top25 = {
    "Ranking": [],
    "Titulo": [],
    "Año": [],
    "Duración": [],
    "Rating": [],
}

for p in soup.find_all("div", class_="sc-f30335b4-0 eefKuM cli-children"):
    # Heading text looks like "<rank>. <title>". Split only on the FIRST
    # ". " so a title that itself contains ". " is not truncated.
    rank, title = p.find('div').find('a').find('h3').get_text().split(". ", 1)
    # Metadata container: its first <span> is read as the year and its
    # second <span> as the duration.
    metadata = p.find("div", class_='sc-f30335b4-6 kGhnhC cli-title-metadata')

    my_top25['Ranking'].append(rank)
    my_top25['Titulo'].append(title)
    my_top25['Año'].append(metadata.find('span').get_text())
    my_top25['Duración'].append(metadata.find_all('span')[1].get_text())
    # Only the first three characters of the rating element's text are kept.
    my_top25['Rating'].append(p.find("span", class_="sc-f30335b4-1 kSqvWq").get_text()[:3])


df_25 = pd.DataFrame(my_top25)
df_25.to_csv("./data/top25.csv")
df_25

In [None]:
# Check whether the text of the __NEXT_DATA__ <script> tag mentions the title.
next_data_text = soup.find("script", id="__NEXT_DATA__").get_text()
"milla verde" in next_data_text

In [None]:
import json

In [None]:
# Decode the JSON embedded in the __NEXT_DATA__ <script> tag and display it.
next_data_script = soup.find("script", id="__NEXT_DATA__")
json.loads(next_data_script.get_text())

In [None]:
# Reminder: iterating over a dict yields its keys, e.g.
# for x in {"1": "Hola", "2": "Mundo"}:
#     print(x)

# Decode the first JSON-LD <script> block of the page and display it.
ld_json_script = soup.find("script", type="application/ld+json")
json.loads(ld_json_script.get_text())

In [None]:
# Print the duration of every entry listed in the JSON-LD block.
ld_json = json.loads(soup.find("script", type="application/ld+json").get_text())
for entry in ld_json['itemListElement']:
    movie = entry['item']
    # Other fields explored here: the title via
    # movie.get('alternateName', movie.get('name')) and the rating via
    # movie['aggregateRating']['ratingValue'].
    # The first two characters of the duration string are dropped
    # (presumably the "PT" prefix of an ISO 8601 duration -- confirm).
    print(movie['duration'][2:])
    

In [None]:
# Print every chart edge stored in the __NEXT_DATA__ JSON payload.
next_data = json.loads(soup.find("script", id="__NEXT_DATA__").get_text())
chart_edges = next_data['props']['pageProps']['pageData']['chartTitles']['edges']
for edge in chart_edges:
    print(edge)

In [None]:
# Show the release year and the current rank of the first chart edge.
next_data = json.loads(soup.find("script", id="__NEXT_DATA__").get_text())
first_edge = next_data['props']['pageProps']['pageData']['chartTitles']['edges'][0]
print(first_edge['node']['releaseYear']['year'])
print(first_edge['currentRank'])

In [None]:
# Column buffers. They are filled in two passes because the fields come from
# two different JSON payloads embedded in the page.
rankings, titles, years, durations, ratings = [], [], [], [], []

# Pass 1: title, duration and rating from the JSON-LD block.
ld_json = json.loads(soup.find("script", type="application/ld+json").get_text())
for entry in ld_json['itemListElement']:
    movie = entry['item']
    # Prefer 'alternateName' and fall back to 'name' when it is absent.
    titles.append(movie.get('alternateName', movie.get('name')))
    # Drop the first two characters of the duration string
    # (presumably the "PT" prefix of an ISO 8601 duration -- confirm).
    durations.append(movie['duration'][2:])
    ratings.append(movie['aggregateRating']['ratingValue'])

# Pass 2: release year and rank from the __NEXT_DATA__ payload.
# NOTE(review): rows are paired purely by position, which assumes both
# payloads list the movies in the same order -- confirm.
next_data = json.loads(soup.find("script", id="__NEXT_DATA__").get_text())
for edge in next_data['props']['pageProps']['pageData']['chartTitles']['edges']:
    years.append(edge['node']['releaseYear']['year'])
    rankings.append(edge['currentRank'])

my_top250 = pd.DataFrame({
    "Ranking": rankings,
    "Titulo": titles,
    "Año": years,
    "Duración": durations,
    "Rating": ratings,
})
my_top250.to_csv("./data/top250.csv")
my_top250

In [None]:
# {"data":[{"BootcampDS2503": {"profesores":[{"rol":"LI",
#                                         "name":"Miguel"},
#                                        {"rol":"TA",
#                                         "name":"Hugo"}],
#                          "alumnos": []}}]}

In [None]:
def find_paths(data, target_value, current_path=""):
    """Print the path of every occurrence of a value in nested JSON-like data.

    Dicts and lists are walked recursively. Each time a dict value or a list
    item equals ``target_value`` a line ``Valor encontrado en: <path>`` is
    printed, where the path joins dict keys with ``.`` and marks list
    positions as ``[index]``.

    Args:
        data: The structure to search (nested dicts/lists; any other type is
            treated as a leaf and ignored).
        target_value: The value to look for, compared with ``==``.
        current_path: Path prefix of ``data``; callers normally leave the
            default and it is filled in during recursion.

    Returns:
        None. Matches are reported through ``print`` only.
    """
    if isinstance(data, dict):
        children = (
            (f"{current_path}.{key}" if current_path else key, value)
            for key, value in data.items()
        )
    elif isinstance(data, list):
        children = (
            (f"{current_path}[{index}]", item)
            for index, item in enumerate(data)
        )
    else:
        # Scalars have no children to inspect.
        return

    for child_path, child in children:
        # Compare every child, whether it came from a dict or from a list,
        # so that values stored directly inside a list are reported too.
        if child == target_value:
            print(f"Valor encontrado en: {child_path}")
        find_paths(child, target_value, child_path)

# JSON de ejemplo
# Small nested example document used to try out find_paths.
json_data = {
    "usuario": {
        "nombre": "Carlos",
        "edad": 30,
        "direccion": {"ciudad": "Madrid", "codigo_postal": "28001"},
    },
    "pedidos": [
        {"id": 101, "producto": "Laptop", "precio": 1200},
        {"id": 102, "producto": "Teléfono", "precio": 800},
    ],
}

# Report every path whose value is "Madrid".
find_paths(json_data, target_value="Madrid")

In [None]:
# Replace json_data with the decoded content of the __NEXT_DATA__ <script> tag.
next_data_script = soup.find("script", id="__NEXT_DATA__")
json_data = json.loads(next_data_script.get_text())

In [None]:
# Report where the title "La milla verde" is stored inside json_data.
find_paths(data=json_data, target_value="La milla verde")

In [None]:
# Display the title text stored in the chart edge at index 25.
chart_edges = json_data['props']['pageProps']['pageData']['chartTitles']['edges']
chart_edges[25]['node']['titleText']['text']

In [None]:
# Serialise json_data to a JSON file on disk for later inspection.
with open('./data/json_ejemplo.json', mode='w') as out_file:
    json.dump(json_data, fp=out_file)