In [1]:
import pandas as pd
import numpy as np
import ast

In [2]:
# Cargar los datasets
movies_path = r"C:\Users\paula\OneDrive\Curso\Henry\Proyecto Individual\Movies\movies_dataset.csv"
credits_path = r"C:\Users\paula\OneDrive\Curso\Henry\Proyecto Individual\Movies\credits.csv"

movies = pd.read_csv(movies_path, low_memory=False)
credits = pd.read_csv(credits_path, low_memory=False)

In [3]:
# Asegurar que 'revenue' y 'budget' son numéricos y reemplazar nulos por 0
movies['revenue'] = pd.to_numeric(movies['revenue'], errors='coerce').fillna(0)
movies['budget'] = pd.to_numeric(movies['budget'], errors='coerce').fillna(0)

# Calcular 'return'
movies['return'] = np.where(movies['budget'] == 0, 0, movies['revenue'] / movies['budget'])

# Reemplazar divisiones por cero o valores nulos directamente
movies['return'] = movies['return'].replace([np.inf, np.nan], 0)


In [4]:
# Convertir release_date al formato datetime y manejar errores
movies['release_date'] = pd.to_datetime(movies['release_date'], errors='coerce')

# Crear la columna release_year extrayendo el año
movies['release_year'] = movies['release_date'].dt.year

# Crear la columna release_month extrayendo el mes
movies['release_month'] = movies['release_date'].dt.month

# Crear la columna release_dow extrayendo el dia de la semana
movies['release_dow'] = movies['release_date'].dt.day_of_week

# Eliminar aquellos registros que quedaron nulos
movies = movies[movies['release_year'].notnull()]

In [5]:
movies = movies.drop(columns=['video', 'imdb_id', 'adult', 'original_title', 'poster_path', 'homepage', 'spoken_languages', 'production_countries', 'belongs_to_collection', 'production_companies', 'original_language','tagline', 'genres', 'overview', 'runtime', 'status'], errors='ignore')

In [6]:
# Eliminar duplicados de "title" conservando el registro con mayor popularidad
movies = movies.sort_values(by='popularity', ascending=False)  # Ordenar por popularidad
movies = movies.drop_duplicates(subset=['title'], keep='first')  # Eliminar duplicados conservando el primero (el de mayor popularidad)

In [7]:
# Convertir la columna `crew` en listas de diccionarios
credits['crew'] = credits['crew'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Extraer el nombre de los directores
credits['director_name'] = credits['crew'].apply(
    lambda x: next((person['name'] for person in x if person.get('job') == 'Director'), None)
)

In [8]:
# Convertir la columna `cast` en listas de diccionarios
credits['cast'] = credits['cast'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Extraer los nombres de todos los actores en una lista
credits['actors'] = credits['cast'].apply(
    lambda x: [actor['name'] for actor in x] if isinstance(x, list) else []
)

In [9]:
# Crear un nuevo DataFrame reducido
credits = credits[['id', 'director_name', 'actors']]

In [10]:
# Tamaño de los datasets
print("Dimensiones de movies_dataset:", movies.shape)
print("Dimensiones de credits:", credits.shape)

Dimensiones de movies_dataset: (42196, 12)
Dimensiones de credits: (45476, 3)


In [11]:
# Nombres de las columnas
print("Columnas de movies_dataset:")
print(movies.columns.tolist())
print("\nColumnas de credits:")
print(credits.columns.tolist())

Columnas de movies_dataset:
['budget', 'id', 'popularity', 'release_date', 'revenue', 'title', 'vote_average', 'vote_count', 'return', 'release_year', 'release_month', 'release_dow']

Columnas de credits:
['id', 'director_name', 'actors']


In [12]:
# Tipos de datos
print("Tipos de datos de movies_dataset:")
print(movies.dtypes)
print("\nTipos de datos de credits:")
print(credits.dtypes)

Tipos de datos de movies_dataset:
budget                  float64
id                       object
popularity               object
release_date     datetime64[ns]
revenue                 float64
title                    object
vote_average            float64
vote_count              float64
return                  float64
release_year            float64
release_month           float64
release_dow             float64
dtype: object

Tipos de datos de credits:
id                int64
director_name    object
actors           object
dtype: object


In [13]:
# Valores nulos
print("Valores nulos en movies_dataset:")
print(movies.isnull().sum())
print("\nValores nulos en credits:")
print(credits.isnull().sum())

Valores nulos en movies_dataset:
budget           0
id               0
popularity       0
release_date     0
revenue          0
title            0
vote_average     0
vote_count       0
return           0
release_year     0
release_month    0
release_dow      0
dtype: int64

Valores nulos en credits:
id                 0
director_name    887
actors             0
dtype: int64


In [14]:
# Duplicados
print("Registros duplicados en movies_dataset:", movies.duplicated().sum())
print("Registros duplicados en credits:", credits.duplicated().sum())

Registros duplicados en movies_dataset: 0


TypeError: unhashable type: 'list'