## Comenzamos con la importancion de las librerias necesarias para este proyecto

In [38]:
import numpy as np
import pandas as pd
import pyarrow
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

## Carga de los archivos CSV

In [39]:
data_movies = pd.read_csv("../datasets_originales/Movies/movies_dataset.csv", low_memory=False)
data_credits = pd.read_csv("../datasets_originales/Movies/credits.csv", low_memory=False)

## Comenzamos con el ETL para el datasets data_movies

* Reemplazo de valores nulero de los campos revenue por 0
* Reemplazo de valores nulero de los campos budget por 0
* Eliminacion de los campos nulos de release_date
* Cambiar formato AAAA-mm-dd de la columna release_date
* Creacion de la columna release_year

In [40]:
# Conversion de las columnas revenue, budget y id a numericos
# Intercambio nulos por Ceros
data_movies['revenue'] = pd.to_numeric(data_movies['revenue'], errors='coerce').fillna(0)
data_movies['budget'] = pd.to_numeric(data_movies['budget'], errors='coerce').fillna(0)

# Eliminacion de nulos de la columna release_date y
# paso todo al tipo de dato date y el formato YYYY-mm-dd
data_movies = data_movies.dropna(subset="release_date")
data_movies["release_date"] = pd.to_datetime(data_movies["release_date"], format='%Y-%m-%d', errors='coerce')

# Creando la columna release_year
data_movies['release_year'] = data_movies['release_date'].dt.year

# Creando la columna return
data_movies['return'] = data_movies.apply(lambda row: row['revenue'] / row['budget'] if row['budget'] > 0 else 0, axis=1)

In [41]:
# Convertir la columna 'id' a numérico, los valores no numéricos se convertirán a NaN
data_movies['id'] = pd.to_numeric(data_movies['id'], errors='coerce')

# Eliminar las filas que tengan NaN en la columna 'id'
data_movies = data_movies.dropna(subset=['id'])

# Convertir la columna 'id' a tipo entero
data_movies['id'] = data_movies['id'].astype(int)

In [42]:
data_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45376 entries, 0 to 45465
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   adult                  45376 non-null  object        
 1   belongs_to_collection  4488 non-null   object        
 2   budget                 45376 non-null  float64       
 3   genres                 45376 non-null  object        
 4   homepage               7766 non-null   object        
 5   id                     45376 non-null  int64         
 6   imdb_id                45362 non-null  object        
 7   original_language      45365 non-null  object        
 8   original_title         45376 non-null  object        
 9   overview               44435 non-null  object        
 10  popularity             45376 non-null  object        
 11  poster_path            45037 non-null  object        
 12  production_companies   45376 non-null  object        
 13  produc

### Eliminacion de algunas columnas que no seran utilizadas

* video
* imdb
* adult
* original_title
* poster_path
* homepage
* tagline
* runtime
* belongs_to_collection

In [43]:
# Elimino las columnas que no se van a utilizar antes de seguir
# con cambio de valores nulos o tipos de datos
columnas_eliminar = ["video","imdb_id","adult","original_title",
                     "tagline","runtime","poster_path","homepage",
                     "belongs_to_collection"]
data_movies = data_movies.drop(columns=columnas_eliminar)

### Desanidando la columna genres

In [44]:
data_movies['genres'] = data_movies['genres'].apply(ast.literal_eval)
data_movies = data_movies.explode('genres')
data_movies['genre_name'] = data_movies['genres'].apply(lambda x: x['name'] if isinstance(x, dict) else None)
data_movies = data_movies.drop(columns=['genres'])

### Teniendo en cuenta el modelo final (MPV) elimino otras columnas que luego no voy a tener en cuenta. 

In [45]:
columnas_eliminar = ['production_companies','production_countries',
                     'spoken_languages']
data_movies = data_movies.drop(columns=columnas_eliminar)

### Verificamos tipos de datos

In [None]:
data_movies.info()

### Cambiando algunos tipos de datos

* Columna release_year
* Columna genre_name
* Columna title
* Columna overviwe

In [46]:
data_movies['release_year'] = data_movies['release_year'].astype('datetime64[ns]')
data_movies['genre_name'] = data_movies['genre_name'].astype(str)
data_movies['title'] = data_movies['title'].astype(str)
data_movies['overview'] = data_movies['overview'].astype(str)

In [37]:
data_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 93420 entries, 0 to 45465
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   budget             93420 non-null  float64       
 1   id                 93420 non-null  int64         
 2   original_language  93403 non-null  object        
 3   overview           93420 non-null  object        
 4   popularity         93420 non-null  object        
 5   release_date       93420 non-null  datetime64[ns]
 6   revenue            93420 non-null  float64       
 7   status             93296 non-null  object        
 8   title              93420 non-null  object        
 9   vote_average       93420 non-null  float64       
 10  vote_count         93420 non-null  float64       
 11  release_year       93420 non-null  datetime64[ns]
 12  return             93420 non-null  float64       
 13  genre_name         93420 non-null  object        
dtypes: datetime

In [48]:
data_movies.columns

Index(['budget', 'id', 'original_language', 'overview', 'popularity',
       'release_date', 'revenue', 'status', 'title', 'vote_average',
       'vote_count', 'release_year', 'return', 'genre_name'],
      dtype='object')

### Guardamos el archivo en parquet y pasamos al EDA

In [47]:
data_movies.to_parquet('../Source/data_movies.parquet')