- #### Acciones
    - ##### count
    - ##### collect
    - ##### show
    - ##### isEmpty
    - ##### head
    - ##### first
    - ##### take
    - ##### tail
- #### Transformaciones
    - ##### withColumn
    - ##### withColumnRenamed
    - ##### drop

In [1]:
from pyspark.sql import SparkSession

# Build (or reuse, if one already exists) the Spark session for this notebook.
# The "local[*]" master runs Spark locally on all available cores.
spark = (
    SparkSession.builder
    .appName("sesion_1")
    .master("local[*]")
    .getOrCreate()
)

In [2]:
def read_csv(path):
    """Load a comma-delimited CSV file with a header row into a DataFrame.

    Schema inference is switched off, so every column is read as a string.

    Args:
        path: Location of the CSV file (or directory of CSV files) to read.

    Returns:
        The DataFrame produced by the notebook's ``spark`` session.
    """
    reader = spark.read.options(
        header="true",
        delimiter=",",
        inferSchema="false",
    )
    return reader.csv(path)

# Directory holding the sample CSV files, relative to the notebook's working directory.
base_path = "../../resources/data/csv/"
# Contracts dataset used by every example below (all columns are strings, see read_csv).
contracts_df = read_csv(base_path + "contracts.csv")

# Preview the data with show()'s default row limit and formatting.
contracts_df.show()

+-------+-----------+------------+----------+------+
|cod_iuc|cod_titular|cod_producto|  fec_alta|activo|
+-------+-----------+------------+----------+------+
|  30000|      00006|         100|2012-05-01|  true|
|  30001|      00006|         200|2014-05-01|  true|
|  30002|      00006|         300|2006-02-01| false|
|  30003|      00006|         150|2012-05-01|  true|
|  30002|      00005|         300|2012-05-01|  true|
|  30004|      00006|         400|2012-05-01| false|
|  30005|      00006|         500|2012-05-01|  true|
|  30006|      00006|         600|2012-05-01| false|
|  30003|      00003|         150|2019-10-14|  true|
|  30007|      00006|         700|2014-02-01| false|
|  30008|      00006|         800|2012-05-01|  true|
|  30009|      00006|         900|2015-09-01|  true|
|  30009|      00002|         900|2009-10-01|  true|
|  30010|      00006|        1000|2014-02-01|  true|
|  30003|      00002|         150|2018-09-18|  true|
|  30011|      00003|        1100|2018-10-01| 

## Actions

In [3]:
# count: action that returns the number of rows in the DataFrame
print(contracts_df.count())

32


In [4]:
# collect: action that returns every row of the DataFrame as a Python list of Row objects.
# It materializes the whole dataset at once, so reserve it for small DataFrames.
print(contracts_df.collect())

[Row(cod_iuc='30000', cod_titular='00006', cod_producto='100', fec_alta='2012-05-01', activo='true'), Row(cod_iuc='30001', cod_titular='00006', cod_producto='200', fec_alta='2014-05-01', activo='true'), Row(cod_iuc='30002', cod_titular='00006', cod_producto='300', fec_alta='2006-02-01', activo='false'), Row(cod_iuc='30003', cod_titular='00006', cod_producto='150', fec_alta='2012-05-01', activo='true'), Row(cod_iuc='30002', cod_titular='00005', cod_producto='300', fec_alta='2012-05-01', activo='true'), Row(cod_iuc='30004', cod_titular='00006', cod_producto='400', fec_alta='2012-05-01', activo='false'), Row(cod_iuc='30005', cod_titular='00006', cod_producto='500', fec_alta='2012-05-01', activo='true'), Row(cod_iuc='30006', cod_titular='00006', cod_producto='600', fec_alta='2012-05-01', activo='false'), Row(cod_iuc='30003', cod_titular='00003', cod_producto='150', fec_alta='2019-10-14', activo='true'), Row(cod_iuc='30007', cod_titular='00006', cod_producto='700', fec_alta='2014-02-01', ac

In [5]:
# show: print the first 5 rows as a text table without truncating long cell values
contracts_df.show(n=5, truncate=False)

+-------+-----------+------------+----------+------+
|cod_iuc|cod_titular|cod_producto|fec_alta  |activo|
+-------+-----------+------------+----------+------+
|30000  |00006      |100         |2012-05-01|true  |
|30001  |00006      |200         |2014-05-01|true  |
|30002  |00006      |300         |2006-02-01|false |
|30003  |00006      |150         |2012-05-01|true  |
|30002  |00005      |300         |2012-05-01|true  |
+-------+-----------+------------+----------+------+
only showing top 5 rows



In [6]:
# isEmpty: action that returns True when the DataFrame has no rows, False otherwise
contracts_df.isEmpty()

False

In [7]:
# How the "first rows" actions relate to each other:
#   first()  -> first row of the DataFrame; delegates to head()
#   head(n)  -> first n rows of the DataFrame; delegates to take(n)
#   take(n)  -> first n rows of the DataFrame; equivalent to limit(n).collect()

contracts_df.head(n=3)

[Row(cod_iuc='30000', cod_titular='00006', cod_producto='100', fec_alta='2012-05-01', activo='true'),
 Row(cod_iuc='30001', cod_titular='00006', cod_producto='200', fec_alta='2014-05-01', activo='true'),
 Row(cod_iuc='30002', cod_titular='00006', cod_producto='300', fec_alta='2006-02-01', activo='false')]

In [8]:
# tail: action that returns the last 3 rows of the DataFrame as a list of Row objects
contracts_df.tail(num=3)

[Row(cod_iuc='30008', cod_titular='00002', cod_producto='800', fec_alta='2014-03-21', activo='true'),
 Row(cod_iuc='30004', cod_titular='00002', cod_producto='400', fec_alta='2008-02-01', activo='false'),
 Row(cod_iuc='30010', cod_titular='00007', cod_producto='1000', fec_alta='2014-08-01', activo='false')]

## Transformations

In [9]:
import pyspark.sql.functions as f
import pyspark.sql.types as t

#### withColumn, withColumns, withColumnRenamed, drop

In [10]:
# Each step below builds on the columns produced by the previous one.
resulted_df = (
    contracts_df
    # Rename the original start-date column.
    .withColumnRenamed("fec_alta", "fec_alta_ini")
    # Date on which the cell is executed.
    .withColumn("actual_date", f.current_date())
    # Seeded pseudo-random integer between 0 and 10, used as a number of days.
    .withColumn("randm_num", f.round(f.rand(0) * f.lit(10)).cast(t.IntegerType()))
    # End date = start date + the random number of days.
    .withColumn("fec_alta_fin", f.date_add(f.col("fec_alta_ini"), f.col("randm_num")))
    # Days elapsed between the start and end dates.
    .withColumn("diff", f.datediff(f.col("fec_alta_fin"), f.col("fec_alta_ini")))
    # The helper column is no longer needed once the derived columns exist.
    .drop("randm_num")
)

resulted_df.show()
resulted_df.printSchema()

+-------+-----------+------------+------------+------+-----------+------------+----+
|cod_iuc|cod_titular|cod_producto|fec_alta_ini|activo|actual_date|fec_alta_fin|diff|
+-------+-----------+------------+------------+------+-----------+------------+----+
|  30000|      00006|         100|  2012-05-01|  true| 2024-02-21|  2012-05-09|   8|
|  30001|      00006|         200|  2014-05-01|  true| 2024-02-21|  2014-05-06|   5|
|  30002|      00006|         300|  2006-02-01| false| 2024-02-21|  2006-02-02|   1|
|  30003|      00006|         150|  2012-05-01|  true| 2024-02-21|  2012-05-04|   3|
|  30002|      00005|         300|  2012-05-01|  true| 2024-02-21|  2012-05-08|   7|
|  30004|      00006|         400|  2012-05-01| false| 2024-02-21|  2012-05-04|   3|
|  30005|      00006|         500|  2012-05-01|  true| 2024-02-21|  2012-05-03|   2|
|  30006|      00006|         600|  2012-05-01| false| 2024-02-21|  2012-05-07|   6|
|  30003|      00003|         150|  2019-10-14|  true| 2024-02-21

In [11]:
# Columns that depend only on what the renamed DataFrame already contains can
# be added together in a single withColumns call.
dict_cols = {
    "actual_date": f.current_date(),
    "randm_num": f.round(f.rand(0) * f.lit(10)).cast(t.IntegerType()),
}

# withColumns adds its whole dict in one projection, so an entry that refers to
# a sibling entry (fec_alta_fin -> randm_num, diff -> fec_alta_fin) only
# resolves when the engine supports lateral column aliases (newer Spark
# releases). Deriving those columns in later steps keeps the cell working
# wherever withColumns itself is available and makes the dependency order
# explicit instead of relying on dict ordering.
resulted_df = (
    contracts_df
    .withColumnRenamed("fec_alta", "fec_alta_ini")
    .withColumns(dict_cols)
    # End date = start date + the random number of days.
    .withColumn("fec_alta_fin", f.date_add(f.col("fec_alta_ini"), f.col("randm_num")))
    # Days elapsed between the start and end dates.
    .withColumn("diff", f.datediff(f.col("fec_alta_fin"), f.col("fec_alta_ini")))
    # The helper column is no longer needed once the derived columns exist.
    .drop("randm_num")
)

resulted_df.show()
resulted_df.printSchema()

+-------+-----------+------------+------------+------+-----------+------------+----+
|cod_iuc|cod_titular|cod_producto|fec_alta_ini|activo|actual_date|fec_alta_fin|diff|
+-------+-----------+------------+------------+------+-----------+------------+----+
|  30000|      00006|         100|  2012-05-01|  true| 2024-02-21|  2012-05-09|   8|
|  30001|      00006|         200|  2014-05-01|  true| 2024-02-21|  2014-05-06|   5|
|  30002|      00006|         300|  2006-02-01| false| 2024-02-21|  2006-02-02|   1|
|  30003|      00006|         150|  2012-05-01|  true| 2024-02-21|  2012-05-04|   3|
|  30002|      00005|         300|  2012-05-01|  true| 2024-02-21|  2012-05-08|   7|
|  30004|      00006|         400|  2012-05-01| false| 2024-02-21|  2012-05-04|   3|
|  30005|      00006|         500|  2012-05-01|  true| 2024-02-21|  2012-05-03|   2|
|  30006|      00006|         600|  2012-05-01| false| 2024-02-21|  2012-05-07|   6|
|  30003|      00003|         150|  2019-10-14|  true| 2024-02-21