- #### Transformaciones
    - ##### Windows

In [1]:
from pyspark.sql import SparkSession

# Build the notebook's Spark session (or reuse one that already exists),
# named "sesion_1" and pointed at the "local[*]" master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("sesion_1")
    .getOrCreate()
)

In [2]:
def read_csv(path):
    """Read the CSV at ``path`` into a DataFrame through the notebook's ``spark`` session.

    The reader is configured with ``header="true"``, a ``","`` delimiter and
    ``inferSchema="false"``.

    Args:
        path: Location of the CSV to load, passed straight to ``spark.read.csv``.

    Returns:
        The DataFrame returned by the Spark CSV reader.
    """
    csv_options = {
        "header": "true",
        "delimiter": ",",
        "inferSchema": "false",
    }
    return spark.read.options(**csv_options).csv(path)

# Folder (relative path) that holds the CSV inputs used by this notebook.
base_path = "../../resources/data/csv/"

# Load the contracts file with the shared reader settings and preview it.
contracts_path = base_path + "contracts.csv"
contracts_df = read_csv(contracts_path)
contracts_df.show()

+-------+-----------+------------+----------+------+
|cod_iuc|cod_titular|cod_producto|  fec_alta|activo|
+-------+-----------+------------+----------+------+
|  30000|      00006|         100|2012-05-01|  true|
|  30001|      00006|         200|2014-05-01|  true|
|  30002|      00006|         300|2006-02-01| false|
|  30003|      00006|         150|2012-05-01|  true|
|  30002|      00005|         300|2012-05-01|  true|
|  30004|      00006|         400|2012-05-01| false|
|  30005|      00006|         500|2012-05-01|  true|
|  30006|      00006|         600|2012-05-01| false|
|  30003|      00003|         150|2019-10-14|  true|
|  30007|      00006|         700|2014-02-01| false|
|  30008|      00006|         800|2012-05-01|  true|
|  30009|      00006|         900|2015-09-01|  true|
|  30009|      00002|         900|2009-10-01|  true|
|  30010|      00006|        1000|2014-02-01|  true|
|  30003|      00002|         150|2018-09-18|  true|
|  30011|      00003|        1100|2018-10-01| 

In [3]:
# Windows
    # Aggregation -> max, min, count, avg, mean, sum, ...
    # Ranking -> row_number, rank, dense_rank, ...
    # Offset -> lag, lead

from pyspark.sql import Window
import pyspark.sql.functions as f

# Preview the contracts sorted by cod_iuc before applying any window.
contracts_df.orderBy("cod_iuc").show()

# window_1: one partition per cod_iuc, with no ordering inside the partition.
window_1 = Window.partitionBy("cod_iuc")

# window_2: the same partitions, with rows ordered by ascending fec_alta.
# NOTE(review): per the Spark Window documentation, an ordered window without
# an explicit frame uses a growing frame (unbounded preceding -> current row),
# so aggregates over window_2 should be running values rather than
# per-partition totals -- confirm that this contrast is the intent.
window_2 = window_1.orderBy(f.asc("fec_alta"))

# Keep every original column and append one aggregate per window.
# NOTE(review): cod_producto is read with inferSchema "false", so the sum
# presumably relies on Spark casting the string values to numbers -- confirm.
contracts_df.select(
    "*",
    f.count("*").over(window_1).alias("count_w"),
    f.max("fec_alta").over(window_2).alias("max_fec_alta_w"),
    f.sum("cod_producto").over(window_2).alias("sum_cod_producto_w"),
).show()

+-------+-----------+------------+----------+------+
|cod_iuc|cod_titular|cod_producto|  fec_alta|activo|
+-------+-----------+------------+----------+------+
|  30000|      00006|         100|2012-05-01|  true|
|  30000|      00005|         100|2015-05-25|  true|
|  30001|      00006|         200|2014-05-01|  true|
|  30001|      00004|         200|2017-12-01|  true|
|  30001|      00005|         200|2019-11-10|  true|
|  30002|      00006|         300|2006-02-01| false|
|  30002|      00005|         300|2012-05-01|  true|
|  30002|      00007|         300|2001-04-11| false|
|  30003|      00006|         150|2012-05-01|  true|
|  30003|      00003|         150|2019-10-14|  true|
|  30003|      00002|         150|2018-09-18|  true|
|  30003|      00001|         150|2016-05-01|  true|
|  30003|      00004|         150|2020-03-01|  true|
|  30004|      00006|         400|2012-05-01| false|
|  30004|      00002|         400|2008-02-01| false|
|  30005|      00006|         500|2012-05-01| 

In [4]:
# Preview the contracts sorted by cod_iuc for comparison with the ranked output.
contracts_df.orderBy("cod_iuc").show()

# window_3: one partition per cod_iuc, rows ordered by ascending cod_titular.
window_3 = Window.partitionBy("cod_iuc").orderBy(f.asc("cod_titular"))

# The three ranking functions evaluated over the same window, side by side.
ranking_columns = [
    f.row_number().over(window_3).alias("row_number"),
    f.rank().over(window_3).alias("rank"),
    f.dense_rank().over(window_3).alias("dense_rank"),
]

# Keep every original column, append the ranking columns and show up to 100 rows.
contracts_df.select("*", *ranking_columns).show(100)

+-------+-----------+------------+----------+------+
|cod_iuc|cod_titular|cod_producto|  fec_alta|activo|
+-------+-----------+------------+----------+------+
|  30000|      00006|         100|2012-05-01|  true|
|  30000|      00005|         100|2015-05-25|  true|
|  30001|      00006|         200|2014-05-01|  true|
|  30001|      00004|         200|2017-12-01|  true|
|  30001|      00005|         200|2019-11-10|  true|
|  30002|      00006|         300|2006-02-01| false|
|  30002|      00005|         300|2012-05-01|  true|
|  30002|      00007|         300|2001-04-11| false|
|  30003|      00006|         150|2012-05-01|  true|
|  30003|      00003|         150|2019-10-14|  true|
|  30003|      00002|         150|2018-09-18|  true|
|  30003|      00001|         150|2016-05-01|  true|
|  30003|      00004|         150|2020-03-01|  true|
|  30004|      00006|         400|2012-05-01| false|
|  30004|      00002|         400|2008-02-01| false|
|  30005|      00006|         500|2012-05-01| 

In [5]:
# Preview the contracts sorted by cod_iuc for comparison with the lag/lead output.
contracts_df.orderBy("cod_iuc").show()

# window_4: one partition per cod_iuc, rows ordered by descending fec_alta.
window_4 = Window.partitionBy("cod_iuc").orderBy(f.desc("fec_alta"))

# lag/lead read fec_alta one row away from the current row inside window_4;
# "1970-01-01" is the default passed for rows that have no such neighbour.
contracts_df.select(
    "*",
    f.lag("fec_alta", offset=1, default="1970-01-01").over(window_4).alias("lag"),
    f.lead("fec_alta", offset=1, default="1970-01-01").over(window_4).alias("lead"),
).show(100)

+-------+-----------+------------+----------+------+
|cod_iuc|cod_titular|cod_producto|  fec_alta|activo|
+-------+-----------+------------+----------+------+
|  30000|      00006|         100|2012-05-01|  true|
|  30000|      00005|         100|2015-05-25|  true|
|  30001|      00006|         200|2014-05-01|  true|
|  30001|      00004|         200|2017-12-01|  true|
|  30001|      00005|         200|2019-11-10|  true|
|  30002|      00006|         300|2006-02-01| false|
|  30002|      00005|         300|2012-05-01|  true|
|  30002|      00007|         300|2001-04-11| false|
|  30003|      00006|         150|2012-05-01|  true|
|  30003|      00003|         150|2019-10-14|  true|
|  30003|      00002|         150|2018-09-18|  true|
|  30003|      00001|         150|2016-05-01|  true|
|  30003|      00004|         150|2020-03-01|  true|
|  30004|      00006|         400|2012-05-01| false|
|  30004|      00002|         400|2008-02-01| false|
|  30005|      00006|         500|2012-05-01| 