- #### Transformaciones
    - ##### select - when
    - ##### where/filter


In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
        .appName("sesion_1") \
        .master("local[*]") \
        .getOrCreate()

In [2]:
def read_csv(path):
    return spark.read\
        .option("header","true")\
        .option("delimiter",",")\
        .option("inferSchema","false")\
        .csv(path)

base_path = "../../resources/data/csv/"
contracts_df = read_csv(base_path + "contracts.csv")

contracts_df.show()

+-------+-----------+------------+----------+------+
|cod_iuc|cod_titular|cod_producto|  fec_alta|activo|
+-------+-----------+------------+----------+------+
|  30000|      00006|         100|2012-05-01|  true|
|  30001|      00006|         200|2014-05-01|  true|
|  30002|      00006|         300|2006-02-01| false|
|  30003|      00006|         150|2012-05-01|  true|
|  30002|      00005|         300|2012-05-01|  true|
|  30004|      00006|         400|2012-05-01| false|
|  30005|      00006|         500|2012-05-01|  true|
|  30006|      00006|         600|2012-05-01| false|
|  30003|      00003|         150|2019-10-14|  true|
|  30007|      00006|         700|2014-02-01| false|
|  30008|      00006|         800|2012-05-01|  true|
|  30009|      00006|         900|2015-09-01|  true|
|  30009|      00002|         900|2009-10-01|  true|
|  30010|      00006|        1000|2014-02-01|  true|
|  30003|      00002|         150|2018-09-18|  true|
|  30011|      00003|        1100|2018-10-01| 

In [3]:
import pyspark.sql.functions as f
import pyspark.sql.types as t

# Select
# Método que regresa diferencia entre dos listas, se puede usar para eliminar columnas dentro de un select:
def difference(l1, l2):
    return list(set(l1) - set(l2))
# ------------------------------------------------
resulted_df = contracts_df \
    .select(
        *difference(contracts_df.columns, ["fec_alta", "activo"]),
        f.col("activo").cast(t.BooleanType()),
        f.col("fec_alta").alias("fec_alta_ini").cast(t.DateType()),
        f.current_date().alias("actual_date"),
        f.round(f.rand(0) * f.lit(10)).cast(t.IntegerType()).alias("randm_num"),
        f.date_add(f.col("fec_alta_ini"), f.col("randm_num")).alias("fec_alta_fin"),
        f.datediff(f.col("fec_alta_fin"), f.col("fec_alta_ini")).alias("diff")
        ) \
    .drop("randm_num")

resulted_df.show()
resulted_df.printSchema()

+-------+-----------+------------+------+------------+-----------+------------+----+
|cod_iuc|cod_titular|cod_producto|activo|fec_alta_ini|actual_date|fec_alta_fin|diff|
+-------+-----------+------------+------+------------+-----------+------------+----+
|  30000|      00006|         100|  true|  2012-05-01| 2024-02-22|  2012-05-09|   8|
|  30001|      00006|         200|  true|  2014-05-01| 2024-02-22|  2014-05-06|   5|
|  30002|      00006|         300| false|  2006-02-01| 2024-02-22|  2006-02-02|   1|
|  30003|      00006|         150|  true|  2012-05-01| 2024-02-22|  2012-05-04|   3|
|  30002|      00005|         300|  true|  2012-05-01| 2024-02-22|  2012-05-08|   7|
|  30004|      00006|         400| false|  2012-05-01| 2024-02-22|  2012-05-04|   3|
|  30005|      00006|         500|  true|  2012-05-01| 2024-02-22|  2012-05-03|   2|
|  30006|      00006|         600| false|  2012-05-01| 2024-02-22|  2012-05-07|   6|
|  30003|      00003|         150|  true|  2019-10-14| 2024-02-22

In [4]:
# select- when

cond_1 = f.col("cod_producto") <= 300 # baja
cond_2 = f.col("cod_producto") <= 600 # media
cond_3 = f.col("cod_producto") <= 1000 # alta

select_when_df = resulted_df \
    .select(
        *difference(resulted_df.columns, ["activo"]),
        f.when(cond_1, f.lit("baja"))
            .when(cond_2, f.lit("media"))
            .when(cond_3, f.lit("alta"))
            .otherwise(f.lit("muy alta")).alias("calidad"),
        f.when((f.col("activo") == f.lit(True)) & (f.col("calidad").isin("alta", "muy alta")), f.lit("ok")).alias("prioridad_alta"),
        f.when(f.col("activo") == True, f.lit(True)).alias("activo")
    )

select_when_df.show()

+------------+-----------+------------+-----------+-------+----+------------+--------+--------------+------+
|fec_alta_fin|cod_titular|fec_alta_ini|actual_date|cod_iuc|diff|cod_producto| calidad|prioridad_alta|activo|
+------------+-----------+------------+-----------+-------+----+------------+--------+--------------+------+
|  2012-05-09|      00006|  2012-05-01| 2024-02-22|  30000|   8|         100|    baja|          null|  true|
|  2014-05-06|      00006|  2014-05-01| 2024-02-22|  30001|   5|         200|    baja|          null|  true|
|  2006-02-02|      00006|  2006-02-01| 2024-02-22|  30002|   1|         300|    baja|          null|  null|
|  2012-05-04|      00006|  2012-05-01| 2024-02-22|  30003|   3|         150|    baja|          null|  true|
|  2012-05-08|      00005|  2012-05-01| 2024-02-22|  30002|   7|         300|    baja|          null|  true|
|  2012-05-04|      00006|  2012-05-01| 2024-02-22|  30004|   3|         400|   media|          null|  null|
|  2012-05-03|     

In [5]:
# where / filter
select_when_df \
    .filter(f.col("activo").isNotNull()) \
    .filter(~f.col("cod_producto").isin("100", "200", "150", "300")) \
    .where(f.col("fec_alta_ini").between("2014-02-01", "2018-10-01")) \
    .where((f.col("calidad") == "media") | (f.col("cod_titular").isin("00006", "00001"))) \
    .show()

+------------+-----------+------------+-----------+-------+----+------------+-------+--------------+------+
|fec_alta_fin|cod_titular|fec_alta_ini|actual_date|cod_iuc|diff|cod_producto|calidad|prioridad_alta|activo|
+------------+-----------+------------+-----------+-------+----+------------+-------+--------------+------+
|  2015-09-07|      00006|  2015-09-01| 2024-02-22|  30009|   6|         900|   alta|            ok|  true|
|  2014-02-06|      00006|  2014-02-01| 2024-02-22|  30010|   5|        1000|   alta|            ok|  true|
|  2016-07-03|      00001|  2016-07-01| 2024-02-22|  30007|   2|         700|   alta|            ok|  true|
|  2016-12-10|      00005|  2016-12-10| 2024-02-22|  30006|   0|         600|  media|          null|  true|
+------------+-----------+------------+-----------+-------+----+------------+-------+--------------+------+



In [6]:
select_when_df \
    .write.mode("overwrite").parquet("../../resources/data/parquet/contracts_tmp")