In [0]:
# Declare the "p_environment" text widget (notebook parameter) with an empty default value.
dbutils.widgets.text(name="p_environment", defaultValue="")

In [0]:
# Read the "p_environment" widget once and keep its value for later cells.
# (The previous extra bare call to dbutils.widgets.get() discarded its result.)
v_environment = dbutils.widgets.get("p_environment")
print(v_environment)

## Ingestión del archivo movie_cast.json

### Paso 1 - Leer el archivo JSON usando "DataFrameReader" de Spark

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [0]:
# Example of one record in movie_cast.json:
# {
#     "movieId": 285,
#     "personId": 85,
#     "characterName": "Captain Jack Sparrow",
#     "genderId": 2,
#     "castOrder": 0
#   }

# Explicit schema for the file: every field is declared nullable.
movie_cast_schema = (
    StructType()
    .add("movieId", IntegerType(), True)
    .add("personId", IntegerType(), True)
    .add("characterName", StringType(), True)
    .add("genderId", IntegerType(), True)
    .add("castOrder", IntegerType(), True)
)


In [0]:
# Load movie_cast.json with the explicit schema and the multiLine reader option enabled.
movie_cast_df = spark.read.json(
    "/mnt/moviehistoryl/bronce/movie_cast.json",
    schema=movie_cast_schema,
    multiLine=True,
)

In [0]:
# Print the schema of the DataFrame that was just read, to check it against movie_cast_schema.
movie_cast_df.printSchema()

In [0]:
# Render the raw DataFrame in the notebook output for a visual check.
display(movie_cast_df)

### Paso 2 - Cambiar el nombre de las columnas y añadir "ingestion_date" y "environment"
1. "movieId" renombrar a "movie_id"
2. "personId" renombrar a "person_id"
3. "characterName" renombrar a "character_name"
4. Agregar las columnas "ingestion_date" y "environment"
5. Eliminar las columnas "genderId" y "castOrder"


In [0]:
from pyspark.sql.functions import col, concat, current_timestamp, lit

In [0]:
# Build the final DataFrame: snake_case column names, audit columns added,
# and the columns that are not kept (genderId, castOrder) removed.
movie_final_cast_df = (
    movie_cast_df
    .withColumnRenamed("movieId", "movie_id")
    .withColumnRenamed("personId", "person_id")
    # Target name is "character_name"; it was previously misspelled "character_named".
    .withColumnRenamed("characterName", "character_name")
    # Timestamp of this ingestion run.
    .withColumn("ingestion_date", current_timestamp())
    # Tag every row with the environment value read from the notebook widget.
    .withColumn("environment", lit(v_environment))
    # Drop by column name: passing several Column objects to drop() raises
    # TypeError on Spark versions before 3.4, while string names always work.
    .drop("genderId", "castOrder")
)

movie_final_cast_df.display()

### Paso 3 - Escribir la salida en formato parquet

In [0]:
# Persist the final DataFrame as parquet, replacing whatever is already at the target path.
movie_final_cast_df.write.parquet(
    "/mnt/moviehistoryl/silver/movie_cast",
    mode="overwrite",
)

In [0]:
# Read the written parquet back and render it to verify the output.
display(spark.read.parquet("/mnt/moviehistoryl/silver/movie_cast"))

In [0]:
# End the notebook run, handing the string "success" back as its exit value.
dbutils.notebook.exit("success")