In [39]:
# Importar librerías para tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd  # Pandas es una poderosa librería para manipulación y análisis de datos en Python.

# Importar librerías para web scraping y manipulación de datos
# -----------------------------------------------------------------------
import requests

# Importar librerías para automatización de navegadores web con Selenium
# -----------------------------------------------------------------------
from selenium import webdriver  # Selenium es una herramienta para automatizar la interacción con navegadores web.
from webdriver_manager.chrome import ChromeDriverManager  # ChromeDriverManager gestiona la instalación del controlador de Chrome.
from selenium.webdriver.common.keys import Keys  # Keys es útil para simular eventos de teclado en Selenium.
from selenium.webdriver.support.ui import Select  # Select se utiliza para interactuar con elementos <select> en páginas web.

# Importar librerías para pausar la ejecución
# -----------------------------------------------------------------------
from time import sleep  # Sleep se utiliza para pausar la ejecución del programa por un número de segundos.

# Importar tqdm para ver el progreso de la ejecución
from tqdm import tqdm

# Configuration
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None)  # Remove pandas' column display limit so every DataFrame column is shown.


In [40]:
# Module-level containers: an empty list (filled by add_movie_tuple) and an empty tuple

movie_list, movie_tuple = [], ()

# Helper that reads the CSV file and returns the filtered rows as a DataFrame
def import_csv(csv):
    """Read ``csv`` with pandas and keep only the rows typed as movies or shorts.

    Parameters
    ----------
    csv : anything ``pd.read_csv`` accepts (the notebook passes a file name).
        The data must contain a "type" column.

    Returns
    -------
    pandas.DataFrame
        The rows whose "type" value is "Movie" or "Short", with their
        original index labels.
    """
    wanted_types = ["Movie", "Short"]
    full_frame = pd.read_csv(csv)
    # Boolean mask selecting the rows we want to keep
    is_wanted = full_frame["type"].isin(wanted_types)
    selected_rows = full_frame[is_wanted]
    print("We have imported the movies and shorts of the csv file")
    return selected_rows

# Start the browser, open the page and reject the cookie banner
def init_driver(url):
    """Open ``url`` in a new Chrome session and try to reject the cookie banner.

    Parameters
    ----------
    url : address passed to ``driver.get``.

    Returns
    -------
    The ``webdriver.Chrome`` instance, positioned on ``url``.

    Notes
    -----
    * If loading the page fails, the browser is closed before the error is
      re-raised, so no orphan Chrome session is left behind.
    * If the reject button cannot be found or clicked, a message is printed
      and the driver is still returned instead of aborting the run.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
    except BaseException:
        # Nobody else holds a reference to the driver yet: close it here or it leaks.
        driver.quit()
        raise

    sleep(2)  # fixed pause before looking for the banner (presumably lets it render)

    try:
        driver.find_element("css selector", '#__next > div > div > div.sc-jrcTuL.bPmWiM > div > button.icb-btn.sc-bcXHqe.sc-hLBbgP.sc-ftTHYK.dcvrLS.dufgkr.ecppKW').click()
    except Exception:
        # Best effort: the selector relies on generated class names, so a miss is not fatal.
        print("No se ha podido rechazar las cookies")
    else:
        print("Hemos rechazado las cookies")
    return driver

# Collect every field for one title and return it as a tuple
id_list = []  # ids that have already been scraped; collect_data skips them


def _element_text(driver, xpath):
    """Return ``.text`` of the first element matching ``xpath``, or None if the lookup fails."""
    try:
        return driver.find_element("xpath", xpath).text
    except Exception:
        # ``Exception`` (not a bare except) so Ctrl-C still interrupts the run.
        return None


def _inner_html_list(driver, xpath):
    """Return the ``innerHTML`` of every element matching ``xpath`` (empty list when none match)."""
    return [element.get_attribute('innerHTML') for element in driver.find_elements("xpath", xpath)]


def collect_data(driver, id):
    """Search for ``id`` through the page's search box and scrape the page that opens.

    Parameters
    ----------
    driver : browser driver exposing ``find_element``, ``find_elements`` and
        ``maximize_window``.
    id : value typed into the ``suggestion-search`` box (followed by ENTER) and
        recorded in ``id_list`` once scraped.

    Returns
    -------
    tuple or None
        ``(id, IMDB_rating, directors, writers, plot, length, name, year)``.
        Text fields that cannot be located are None, and ``directors`` /
        ``writers`` are empty lists when nothing matches.
        None is returned, without touching the browser, when ``id`` is
        already in ``id_list``.
    """
    # Already collected: ignore duplicates
    if id in id_list:
        return None

    driver.find_element("xpath", '//*[@id="suggestion-search"]').send_keys(id, Keys.ENTER)
    print("We have accessed the movie")
    sleep(3)

    # Maximise the browser window
    driver.maximize_window()

    # Rating
    IMDB_rating = _element_text(driver, '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[2]/div[2]/div/div[1]/a/span/div/div[2]/div[1]/span[1]')
    if IMDB_rating is None:
        print("We could not access the rating")
    else:
        print("We have accessed to the IMDB_rating")
        sleep(2)

    # Directors
    directors = _inner_html_list(driver, '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[1]/section/div[2]/div/ul/li[1]/div/ul/li/a')

    # Writers
    writers = _inner_html_list(driver, '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[2]/div[2]/div/ul/li[2]/div/ul/li/a')

    # Plot
    plot = _element_text(driver, '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[1]/section/p')
    if plot is None:
        print("We could not access the plot")
    else:
        print("We have accessed to the plot")
        sleep(2)
        # This check used to sit inside the except handler, where it could never run.
        if plot == "Add a plot":
            print("There was no plot described")

    # Year and running time, read from the list of items under the title
    year = None
    length = None
    all_others_data = driver.find_elements('xpath', '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[2]/div[1]/ul/li')
    if len(all_others_data) in (2, 3):
        # First item holds the year, last item the running time. With three
        # items the middle one was previously read into ``rate_age`` but never
        # returned, so it is no longer fetched.
        year = all_others_data[0].find_element('css selector','a').get_attribute('innerHTML')
        length = all_others_data[-1].get_attribute('innerHTML')
    else:
        print(f'REVISAR - La pelicula con id {id} tiene {len(all_others_data)} variables')

    # Title. The helper yields None on failure, so ``name`` is always bound
    # (previously a failed lookup raised UnboundLocalError in the handler).
    name = _element_text(driver, '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[2]/div[1]/h1/span')
    if name is None:
        print("We could not access the movie name")
    else:
        print("We have accessed to the movie name")
        sleep(5)

    # Remember this id so it is not scraped again
    id_list.append(id)

    movie_tuple = (id, IMDB_rating, directors, writers, plot, length, name, year)
    sleep(2)
    return movie_tuple
    

# Append each scraped tuple to the shared list of movies
def add_movie_tuple(movie_tuple):
    """Append ``movie_tuple`` to the module-level ``movie_list`` unless it should be skipped.

    Parameters
    ----------
    movie_tuple : record produced by ``collect_data``, or None.
        None is what ``collect_data`` returns for an id it has already
        processed; it is ignored so no empty entry ends up in ``movie_list``.
        Records already present in ``movie_list`` are ignored as well.

    Returns
    -------
    list
        The ``movie_list`` object itself (not a copy).
    """
    if movie_tuple is not None and movie_tuple not in movie_list:
        movie_list.append(movie_tuple)
    return movie_list

# Walk through the idOwner values of df_movies
def loop_movies(driver, df_movies):
    """Scrape every distinct ``idOwner`` in ``df_movies`` and store each result.

    For each unique value of the "idOwner" column, in order of first
    appearance, a progress line ``"<position> - <id>"`` is printed, then
    ``collect_data(driver, id)`` is called and its result handed to
    ``add_movie_tuple``. Returns None.
    """
    unique_ids = df_movies["idOwner"].unique()
    position = 0
    for movie_id in unique_ids:
        print(f"{position} - {movie_id}")
        add_movie_tuple(collect_data(driver, movie_id))
        position += 1
 
    


In [43]:
# Run all the functions defined above
url = "https://www.imdb.com/?ref_=nv_home"
csv = "Team1_API_extraction.csv"
driver = init_driver(url)
try:
    df_movies = import_csv(csv)
    loop_movies(driver, df_movies)
finally:
    # Always close the browser, even when the CSV cannot be read or the
    # scraping stops with an error; records gathered so far stay in movie_list.
    driver.quit()

Hemos rechazado las cookies
We have imported the movies and shorts of the csv file
0 - tt0052698
We have accessed the movie
We have accessed to the IMDB_rating
We have accessed to the plot
We have accessed to the movie name
1 - tt0053357
We have accessed the movie
We have accessed to the IMDB_rating
We have accessed to the plot
We have accessed to the movie name
2 - tt0052661
We have accessed the movie
We have accessed to the IMDB_rating
We have accessed to the plot
We have accessed to the movie name
3 - tt0053228
We have accessed the movie
We have accessed to the IMDB_rating
We have accessed to the plot
We have accessed to the movie name
4 - tt0053752
We have accessed the movie
We have accessed to the IMDB_rating
We have accessed to the plot
We have accessed to the movie name
5 - tt0051874
We have accessed the movie
We have accessed to the IMDB_rating
We have accessed to the plot
We have accessed to the movie name
6 - tt0053063
We have accessed the movie
We have accessed to the IMDB_r

UnboundLocalError: cannot access local variable 'plot' where it is not associated with a value

In [44]:
# Number of records currently stored in movie_list
print(len(movie_list))

224


In [45]:
# Check which information was collected
# The loop variable is not called ``tuple``: at module level that would rebind
# the builtin ``tuple`` for the rest of the kernel session.
for movie_record in movie_list:
    print(movie_record)

('tt0052698', '7,5', ['Claude Sautet'], ['José Giovanni', 'Claude Sautet', 'Pascal Jardin'], 'Un despiadado criminal huye de la persecución, lo que implica cada vez más víctimas.', '1h 50min', 'A todo riesgo', '1960')
('tt0053357', '5,5', ['Edward L. Cahn'], ['Robert E. Kent', 'Orville H. Hampton'], 'Tres asesinos profesionales irrumpen en la casa de un empleado del aeropuerto para disparar al avión en el que un primer ministro asiático debe abandonar Estados Unidos.', '1h 11min', 'Three Came to Kill', '1960')
('tt0052661', '6,1', ['Alfred Weidenmann'], ['Herbert Reinecker', 'Igor Sentjurc'], 'Añade un argumento en tu idioma', '1h 32min', 'Las dos caras del destino', '1960')
('tt0053228', '6,2', ['Jürgen Roland'], ['Edgar Wallace', 'Egon Eis', 'Wolfgang Menge'], 'Añade un argumento en tu idioma\nA strange, red circle appears on the neck of a man saved from the guillotine. What is its mysterious meaning? Tragically, it turns out to be something of a family curse, as each generation ther

In [46]:
# Load the collected records into a DataFrame, one column per tuple position
df_details = pd.DataFrame(
    data=movie_list,
    columns=[
        "idOwner",
        "imdb_rating",
        "direction",
        "writers",
        "plot",
        "length",
        "name",
        "year",
    ],
)


In [47]:
# Inspect the information now held in the DataFrame
df_details

Unnamed: 0,idOwner,imdb_rating,direction,writers,plot,length,name,year
0,tt0052698,75,[Claude Sautet],"[José Giovanni, Claude Sautet, Pascal Jardin]","Un despiadado criminal huye de la persecución,...",1h 50min,A todo riesgo,1960
1,tt0053357,55,[Edward L. Cahn],"[Robert E. Kent, Orville H. Hampton]",Tres asesinos profesionales irrumpen en la cas...,1h 11min,Three Came to Kill,1960
2,tt0052661,61,[Alfred Weidenmann],"[Herbert Reinecker, Igor Sentjurc]",Añade un argumento en tu idioma,1h 32min,Las dos caras del destino,1960
3,tt0053228,62,[Jürgen Roland],"[Edgar Wallace, Egon Eis, Wolfgang Menge]","Añade un argumento en tu idioma\nA strange, re...",1h 32min,El círculo rojo,1960
4,tt0053752,66,[John Guillermin],"[John Brophy, Howard Clewes, Richard Maibaum]","Añade un argumento en tu idioma\nIn 1901, a gr...",1h 25min,El robo al banco de Inglaterra,1960
...,...,...,...,...,...,...,...,...
219,tt0065050,85,[Murray Woroner],"[Rocky Marciano, Muhammad Ali, Jimmy Braddock]",Añade un argumento en tu idioma\nFictional box...,1h 10min,The Super Fight,1970
220,tt0064806,67,"[Chuck Jones, Abe Levitow, Dave Monahan]","[Chuck Jones, Sam Rosen, Norton Juster]",Añade un argumento en tu idioma\nMilo is a boy...,1h 30min,La puerta mágica,1970
221,tt0064932,60,[Gilberto Martínez Solares],"[Rafael García Travesi, Jesús Sotomayor Martínez]",Añade un argumento en tu idioma\nTo foil his p...,1h 25min,Santo el enmascarado de plata y Blue Demon con...,1970
222,tt0064554,66,[Pavel Arsenov],"[Carlo Gozzi, Vadim Korostylyov]","Añade un argumento en tu idioma\nDeramo, kind,...",1h 17min,Korol-olen,1970


In [48]:
# Export everything to a CSV file (without the index column) so the data can be reused
df_details.to_csv(path_or_buf="film_details.csv", index=False)