## Comenzamos con la importancion de las librerias necesarias para este proyecto

In [None]:
import numpy as np
import pandas as pd
import pyarrow
import ast
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

## Carga de los archivos CSV

In [None]:
data_movies = pd.read_csv("../datasets_originales/Movies/movies_dataset.csv", low_memory=False)
data_credits = pd.read_csv("../datasets_originales/Movies/credits.csv", low_memory=False)

## Comenzamos con el ETL para el datasets data_movies

* Reemplazo de valores nulero de los campos revenue por 0
* Reemplazo de valores nulero de los campos budget por 0
* Eliminacion de los campos nulos de release_date
* Cambiar formato AAAA-mm-dd de la columna release_date
* Creacion de la columna release_year

In [None]:
# Conversion de las columnas revenue, budget y id a numericos
# Intercambio nulos por Ceros
data_movies['revenue'] = pd.to_numeric(data_movies['revenue'], errors='coerce').fillna(0)
data_movies['budget'] = pd.to_numeric(data_movies['budget'], errors='coerce').fillna(0)

# Eliminacion de nulos de la columna release_date y
# paso todo al tipo de dato date y el formato YYYY-mm-dd
data_movies = data_movies.dropna(subset="release_date")
data_movies["release_date"] = pd.to_datetime(data_movies["release_date"], format='%Y-%m-%d', errors='coerce')

# Creando la columna release_year
data_movies['release_year'] = data_movies['release_date'].dt.year

# Creando la columna return
data_movies['return'] = data_movies.apply(lambda row: row['revenue'] / row['budget'] if row['budget'] > 0 else 0, axis=1)

In [None]:
# Convertir la columna 'id' a numérico, los valores no numéricos se convertirán a NaN
data_movies['id'] = pd.to_numeric(data_movies['id'], errors='coerce')

# Eliminar las filas que tengan NaN en la columna 'id'
data_movies = data_movies.dropna(subset=['id'])

# Convertir la columna 'id' a tipo entero
data_movies['id'] = data_movies['id'].astype(int)

In [None]:
data_movies.info()

### Eliminacion de algunas columnas que no seran utilizadas

* video
* imdb
* adult
* original_title
* poster_path
* homepage
* tagline
* runtime
* belongs_to_collection

In [12]:
# Elimino las columnas que no se van a utilizar antes de seguir
# con cambio de valores nulos o tipos de datos
columnas_eliminar = ["video","imdb_id","adult","original_title",
                     "tagline","runtime","poster_path","homepage",
                     "belongs_to_collection"]
data_movies = data_movies.drop(columns=columnas_eliminar)

### Desanidando la columna genres

In [13]:
data_movies['genres'] = data_movies['genres'].apply(ast.literal_eval)
data_movies = data_movies.explode('genres')
data_movies['genre_name'] = data_movies['genres'].apply(lambda x: x['name'] if isinstance(x, dict) else None)
data_movies = data_movies.drop(columns=['genres'])

### Teniendo en cuenta el modelo final (MPV) elimino otras columnas que luego no voy a tener en cuenta. 

In [14]:
columnas_eliminar = ['production_companies','production_countries',
                     'spoken_languages']
data_movies = data_movies.drop(columns=columnas_eliminar)

### Verificamos tipos de datos

In [16]:
data_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 93420 entries, 0 to 45465
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   budget             93420 non-null  float64       
 1   id                 93420 non-null  int64         
 2   original_language  93403 non-null  object        
 3   overview           92104 non-null  object        
 4   popularity         93420 non-null  object        
 5   release_date       93420 non-null  datetime64[ns]
 6   revenue            93420 non-null  float64       
 7   status             93296 non-null  object        
 8   title              93420 non-null  object        
 9   vote_average       93420 non-null  float64       
 10  vote_count         93420 non-null  float64       
 11  release_year       93420 non-null  float64       
 12  return             93420 non-null  float64       
 13  genre_name         91036 non-null  object        
dtypes: datetime

### Cambiando algunos tipos de datos

* Columna release_year
* Columna genre_name
* Columna title
* Columna overviwe

In [22]:
data_movies['release_year'] = data_movies['release_year'].astype('datetime64[ns]')
data_movies['genre_name'] = data_movies['genre_name'].astype(str)
data_movies['title'] = data_movies['title'].astype(str)
data_movies['overview'] = data_movies['overview'].astype(str)

In [24]:
data_movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 93420 entries, 0 to 45465
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   budget             93420 non-null  float64       
 1   id                 93420 non-null  int64         
 2   original_language  93403 non-null  object        
 3   overview           93420 non-null  object        
 4   popularity         93420 non-null  object        
 5   release_date       93420 non-null  datetime64[ns]
 6   revenue            93420 non-null  float64       
 7   status             93296 non-null  object        
 8   title              93420 non-null  object        
 9   vote_average       93420 non-null  float64       
 10  vote_count         93420 non-null  float64       
 11  release_year       93420 non-null  datetime64[ns]
 12  return             93420 non-null  float64       
 13  genre_name         93420 non-null  object        
dtypes: datetime

In [25]:
data_movies

Unnamed: 0,budget,id,original_language,overview,popularity,release_date,revenue,status,title,vote_average,vote_count,release_year,return,genre_name
0,30000000.0,862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,373554033.0,Released,Toy Story,7.7,5415.0,1970-01-01 00:00:00.000001995,12.451801,Animation
0,30000000.0,862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,373554033.0,Released,Toy Story,7.7,5415.0,1970-01-01 00:00:00.000001995,12.451801,Comedy
0,30000000.0,862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,373554033.0,Released,Toy Story,7.7,5415.0,1970-01-01 00:00:00.000001995,12.451801,Family
1,65000000.0,8844,en,When siblings Judy and Peter discover an encha...,17.015539,1995-12-15,262797249.0,Released,Jumanji,6.9,2413.0,1970-01-01 00:00:00.000001995,4.043035,Adventure
1,65000000.0,8844,en,When siblings Judy and Peter discover an encha...,17.015539,1995-12-15,262797249.0,Released,Jumanji,6.9,2413.0,1970-01-01 00:00:00.000001995,4.043035,Fantasy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45463,0.0,67758,en,"When one of her hits goes wrong, a professiona...",0.903007,2003-08-01,0.0,Released,Betrayal,3.8,6.0,1970-01-01 00:00:00.000002003,0.000000,Action
45463,0.0,67758,en,"When one of her hits goes wrong, a professiona...",0.903007,2003-08-01,0.0,Released,Betrayal,3.8,6.0,1970-01-01 00:00:00.000002003,0.000000,Drama
45463,0.0,67758,en,"When one of her hits goes wrong, a professiona...",0.903007,2003-08-01,0.0,Released,Betrayal,3.8,6.0,1970-01-01 00:00:00.000002003,0.000000,Thriller
45464,0.0,227506,en,"In a small town live two brothers, one a minis...",0.003503,1917-10-21,0.0,Released,Satan Triumphant,0.0,0.0,1970-01-01 00:00:00.000001917,0.000000,


### Guardamos el archivo en parquet y pasamos al EDA

In [36]:
data_movies.to_parquet('../Source/data_movies.parquet')