### ![Spark Logo Tiny](https://files.training.databricks.com/images/105/logo_spark_tiny.png) Date functions

In [0]:
from pyspark.sql.functions import current_date, date_format, col, to_date

# Sample rows: a string id plus an ISO-style (yyyy-MM-dd) date kept as plain text.
data = [
    ['1', '2020-02-01'],
    ['2', '2019-03-01'],
    ['3', '2021-06-15'],
]

# Only column names are given, so Spark infers the column types from the values.
df = spark.createDataFrame(data, schema=['id', 'input'])

# Print the inferred schema first, then the rows.
df.printSchema()
df.show()

root
 |-- id: string (nullable = true)
 |-- input: string (nullable = true)

+---+----------+
| id|     input|
+---+----------+
|  1|2020-02-01|
|  2|2019-03-01|
|  3|2021-06-15|
+---+----------+



#### current_date( )

Utilice current_date() para obtener la fecha actual del sistema. Por defecto, la fecha se devuelve como DateType y se muestra en formato yyyy-MM-dd.

In [0]:
# Append today's date (evaluated by Spark at query time) as a new column.
hoy = current_date()
df_modif = df.withColumn('current_date', hoy)

df_modif.printSchema()
df_modif.show()

root
 |-- id: string (nullable = true)
 |-- input: string (nullable = true)
 |-- current_date: date (nullable = false)

+---+----------+------------+
| id|     input|current_date|
+---+----------+------------+
|  1|2020-02-01|  2023-01-12|
|  2|2019-03-01|  2023-01-12|
|  3|2021-06-15|  2023-01-12|
+---+----------+------------+



#### date_format( )

##### Cambiar de orden día, mes y año (String)

El siguiente ejemplo utiliza **date_format()** para convertir la fecha del formato *yyyy-MM-dd* al formato *dd-MM-yyyy*; el resultado es un string.

In [0]:
# Re-render the ISO date text as day-month-year; the resulting column is a string.
fecha_dmy = date_format(col('input'), 'dd-MM-yyyy').alias('fecha_nueva')
df_modif = df.select(col('input'), fecha_dmy)

df_modif.printSchema()
df_modif.show()

root
 |-- input: string (nullable = true)
 |-- fecha_nueva: string (nullable = true)

+----------+-----------+
|     input|fecha_nueva|
+----------+-----------+
|2020-02-01| 01-02-2020|
|2019-03-01| 01-03-2019|
|2021-06-15| 15-06-2021|
+----------+-----------+



In [0]:
# 'MMMM' spells out the full month name in the formatted string.
fecha_larga = date_format(col('input'), 'MMMM dd, yyyy').alias('fecha_nueva')
df_modif = df.select(col('input'), fecha_larga)

df_modif.printSchema()
df_modif.show()

root
 |-- input: string (nullable = true)
 |-- fecha_nueva: string (nullable = true)

+----------+-----------------+
|     input|      fecha_nueva|
+----------+-----------------+
|2020-02-01|February 01, 2020|
|2019-03-01|   March 01, 2019|
|2021-06-15|    June 15, 2021|
+----------+-----------------+



##### Timestamp a String

In [0]:
# Timestamp -> String: date_format() accepts a timestamp column as well as a date.
# The previous cell body was a prose placeholder, which is a SyntaxError when the
# notebook is run top to bottom.
from pyspark.sql.functions import current_timestamp  # not imported by earlier cells

# One timestamp column holding the current instant, repeated once per row of df.
df_ts = df.select(current_timestamp().alias('timestamp_actual'))

# Render the timestamp as text, keeping the original column for comparison.
df_ts_string = df_ts.select(
    col('timestamp_actual'),
    date_format(col('timestamp_actual'), 'dd-MM-yyyy HH:mm:ss').alias('timestamp_string'),
)

df_ts_string.printSchema()
df_ts_string.show(truncate=False)

##### Date a String

In [0]:
# A single date column with today's date, produced once per row of df.
fecha_actual = current_date().alias('fecha_actual')
df_modif = df.select(fecha_actual)

df_modif.printSchema()
df_modif.show()

root
 |-- fecha_actual: date (nullable = false)

+------------+
|fecha_actual|
+------------+
|  2023-01-12|
|  2023-01-12|
|  2023-01-12|
+------------+



In [0]:
# Date -> String: format the date column from the previous cell as dd-MM-yyyy text.
fecha_actual_string = date_format(col('fecha_actual'), 'dd-MM-yyyy').alias('fecha_actual_string')
df_ds = df_modif.select(fecha_actual_string)

df_ds.printSchema()
df_ds.show()

root
 |-- fecha_actual_string: string (nullable = false)

+-------------------+
|fecha_actual_string|
+-------------------+
|         12-01-2023|
|         12-01-2023|
|         12-01-2023|
+-------------------+



#### to_date()

##### String a Date

El siguiente ejemplo convierte un string con formato de fecha yyyy-MM-dd en un DateType usando to_date(). También puedes usarlo para convertir strings en cualquier otro formato, indicando el patrón correspondiente. Los patrones admitidos son los documentados en «Datetime Patterns» de Spark, que se basan en los de Java DateTimeFormatter.

##### Ejemplo 1

In [0]:
# String -> Date: parse the text with an explicit pattern that matches its layout.
fecha_iso = to_date(col('input'), 'yyyy-MM-dd').alias('fecha_nueva')
df_modif = df.select(col('input'), fecha_iso)

df_modif.printSchema()
df_modif.show()

root
 |-- input: string (nullable = true)
 |-- fecha_nueva: date (nullable = true)

+----------+-----------+
|     input|fecha_nueva|
+----------+-----------+
|2020-02-01| 2020-02-01|
|2019-03-01| 2019-03-01|
|2021-06-15| 2021-06-15|
+----------+-----------+



##### Ejemplo 2

In [0]:
from pyspark.sql.functions import current_date, date_format, col, to_date

# Same kind of sample as before, but the dates use '/' as the separator.
data = [
    ['1', '2020/02/01'],
    ['2', '2019/03/01'],
    ['3', '2021/06/12'],
]

# Rebuild df for this example; only the column names are supplied.
df = spark.createDataFrame(data, schema=['id', 'input'])

df.printSchema()
df.show()

root
 |-- id: string (nullable = true)
 |-- input: string (nullable = true)

+---+----------+
| id|     input|
+---+----------+
|  1|2020/02/01|
|  2|2019/03/01|
|  3|2021/06/12|
+---+----------+



In [0]:
# Parse the same text with two patterns: fecha_1 reads year/month/day, while
# fecha_2 reads year/day/month, so day and month end up swapped.
df_modif = (
    df
    .withColumn('fecha_1', to_date(col('input'), 'yyyy/MM/dd'))
    .withColumn('fecha_2', to_date(col('input'), 'yyyy/dd/MM'))
)

df_modif.printSchema()
df_modif.show()

root
 |-- id: string (nullable = true)
 |-- input: string (nullable = true)
 |-- fecha_1: date (nullable = true)
 |-- fecha_2: date (nullable = true)

+---+----------+----------+----------+
| id|     input|   fecha_1|   fecha_2|
+---+----------+----------+----------+
|  1|2020/02/01|2020-02-01|2020-01-02|
|  2|2019/03/01|2019-03-01|2019-01-03|
|  3|2021/06/12|2021-06-12|2021-12-06|
+---+----------+----------+----------+



##### Ejemplo 3

In [0]:
from pyspark.sql.functions import current_date, date_format, col, to_date, lit, datediff

# Dates written with the year last; whether the first field is the day or the
# month depends on the pattern used to parse them.
data = [
    ['1', '02/02/2022'],
    ['2', '01/03/2021'],
    ['3', '04/06/2020'],
]

# Rebuild df for this example; only the column names are supplied.
df = spark.createDataFrame(data, schema=['id', 'input'])

df.printSchema()
df.show()

root
 |-- id: string (nullable = true)
 |-- input: string (nullable = true)

+---+----------+
| id|     input|
+---+----------+
|  1|02/02/2022|
|  2|01/03/2021|
|  3|04/06/2020|
+---+----------+



In [0]:
# fecha_1 treats the first field as the day, fecha_2 treats it as the month;
# the two columns only agree when day and month are the same number.
df_modif = (
    df
    .withColumn('fecha_1', to_date(col('input'), 'dd/MM/yyyy'))
    .withColumn('fecha_2', to_date(col('input'), 'MM/dd/yyyy'))
)

df_modif.printSchema()
df_modif.show()

root
 |-- id: string (nullable = true)
 |-- input: string (nullable = true)
 |-- fecha_1: date (nullable = true)
 |-- fecha_2: date (nullable = true)

+---+----------+----------+----------+
| id|     input|   fecha_1|   fecha_2|
+---+----------+----------+----------+
|  1|02/02/2022|2022-02-02|2022-02-02|
|  2|01/03/2021|2021-03-01|2021-01-03|
|  3|04/06/2020|2020-06-04|2020-04-06|
+---+----------+----------+----------+



##### Ejemplo 4

In [0]:
# Parse a literal that spells out the month name; the literal is evaluated once
# per row of df, so the result has as many rows as df.
fecha_literal = lit('March 2, 2021')
df_modif = df.select(to_date(fecha_literal, 'MMMM d, yyyy').alias('fecha_nueva'))

df_modif.printSchema()
df_modif.show()

root
 |-- fecha_nueva: date (nullable = true)

+-----------+
|fecha_nueva|
+-----------+
| 2021-03-02|
| 2021-03-02|
| 2021-03-02|
+-----------+



In [0]:
# Same idea with an ISO-style literal and the matching pattern.
fecha_literal = lit('2021-03-01')
df_modif = df.select(to_date(fecha_literal, 'yyyy-MM-dd').alias('fecha_nueva'))

df_modif.printSchema()
df_modif.show()

root
 |-- fecha_nueva: date (nullable = true)

+-----------+
|fecha_nueva|
+-----------+
| 2021-03-01|
| 2021-03-01|
| 2021-03-01|
+-----------+



##### Timestamp a Date

In [0]:
# Timestamp -> Date: to_date() on a timestamp column keeps only the calendar date.
# The previous cell body was a prose placeholder, which is a SyntaxError when the
# notebook is run top to bottom.
from pyspark.sql.functions import current_timestamp  # not imported by earlier cells

# One timestamp column holding the current instant, repeated once per row of df.
df_ts = df.select(current_timestamp().alias('timestamp_actual'))

# Keep the timestamp next to its date-only version for comparison.
df_ts_date = df_ts.select(
    col('timestamp_actual'),
    to_date(col('timestamp_actual')).alias('fecha'),
)

df_ts_date.printSchema()
df_ts_date.show(truncate=False)

#### datediff( )

El siguiente ejemplo devuelve la diferencia entre dos fechas utilizando datediff().

In [0]:
from pyspark.sql.functions import current_date, date_format, col, to_date

# Back to the ISO-style sample, since df was replaced by the previous examples.
data = [
    ['1', '2020-02-01'],
    ['2', '2019-03-01'],
    ['3', '2021-06-15'],
]

df = spark.createDataFrame(data, schema=['id', 'input'])

df.printSchema()
df.show()

root
 |-- id: string (nullable = true)
 |-- input: string (nullable = true)

+---+----------+
| id|     input|
+---+----------+
|  1|2020-02-01|
|  2|2019-03-01|
|  3|2021-06-15|
+---+----------+



In [0]:
# Number of days from each input date up to today (end date first, start date second).
diferencia = datediff(current_date(), col('input')).alias('diferencia')
df.select(col('input'), diferencia).show()

+----------+----------+
|     input|diferencia|
+----------+----------+
|2020-02-01|      1076|
|2019-03-01|      1413|
|2021-06-15|       576|
+----------+----------+



#### from_unixtime( )

Tenga en cuenta que la columna fecha_apertura es de tipo integer y representa una fecha en formato UNIX epoch (segundos transcurridos desde 1970-01-01 UTC).

In [0]:
from pyspark.sql.functions import current_date, date_format, col, to_date, from_unixtime

# (id, opening date) pairs; the second value is a UNIX epoch expressed in seconds.
data = [
    (0, 1100746394),
    (1, 1474410343),
    (2, 1116610009),
    (3, 1408024997),
]

# NOTE(review): this list is not referenced by any cell visible here, and it says
# 'local_id' while the schema string below says 'local_int' -- confirm which name
# is intended.
columnas = ['local_id', 'fecha_apertura']

# A DDL-style schema string fixes both column names and declares them as INT.
esquema = 'local_int INT, fecha_apertura INT'
df = spark.createDataFrame(data, esquema)

df.printSchema()
df.show()

root
 |-- local_int: integer (nullable = true)
 |-- fecha_apertura: integer (nullable = true)

+---------+--------------+
|local_int|fecha_apertura|
+---------+--------------+
|        0|    1100746394|
|        1|    1474410343|
|        2|    1116610009|
|        3|    1408024997|
+---------+--------------+



##### Ejemplo 1

UNIX epoch (Integer) a String (con formato MM/dd/yyyy)

In [0]:
# Epoch seconds -> formatted text; the new column is a string, not a date.
fecha_apertura_string = from_unixtime(col('fecha_apertura'), 'MM/dd/yyyy')
df_modif = df.withColumn('fecha_apertura_string', fecha_apertura_string)

df_modif.printSchema()
df_modif.show(truncate=False)

root
 |-- local_int: integer (nullable = true)
 |-- fecha_apertura: integer (nullable = true)
 |-- fecha_apertura_string: string (nullable = true)

+---------+--------------+---------------------+
|local_int|fecha_apertura|fecha_apertura_string|
+---------+--------------+---------------------+
|0        |1100746394    |11/18/2004           |
|1        |1474410343    |09/20/2016           |
|2        |1116610009    |05/20/2005           |
|3        |1408024997    |08/14/2014           |
+---------+--------------+---------------------+



#### months_between( )

El siguiente ejemplo devuelve los meses entre dos fechas utilizando months_between().

In [0]:
from pyspark.sql.functions import *

# Restore the ISO-style sample; the remaining cells all read from this df.
data = [
    ['1', '2020-02-01'],
    ['2', '2019-03-01'],
    ['3', '2021-06-15'],
]

df = spark.createDataFrame(data, schema=['id', 'input'])

df.printSchema()
df.show()

root
 |-- id: string (nullable = true)
 |-- input: string (nullable = true)

+---+----------+
| id|     input|
+---+----------+
|  1|2020-02-01|
|  2|2019-03-01|
|  3|2021-06-15|
+---+----------+



In [0]:
# Months from each input date up to today; the result can be fractional.
diferencia = months_between(current_date(), col('input')).alias('diferencia')
df.select(col('input'), diferencia).show()

+----------+-----------+
|     input| diferencia|
+----------+-----------+
|2020-02-01|35.35483871|
|2019-03-01|46.35483871|
|2021-06-15|18.90322581|
+----------+-----------+



#### trunc()

In [0]:
# Truncate each date to the first day of its month and of its year.
columnas_trunc = [
    col('input'),
    trunc(col('input'), 'Month').alias('Mes trunc'),
    trunc(col('input'), 'Year').alias('Año trunc'),
]
df_modif = df.select(*columnas_trunc)

df_modif.printSchema()
df_modif.show()

root
 |-- input: string (nullable = true)
 |-- Mes trunc: date (nullable = true)
 |-- Año trunc: date (nullable = true)

+----------+----------+----------+
|     input| Mes trunc| Año trunc|
+----------+----------+----------+
|2020-02-01|2020-02-01|2020-01-01|
|2019-03-01|2019-03-01|2019-01-01|
|2021-06-15|2021-06-01|2021-01-01|
+----------+----------+----------+



#### add_months()
#### date_add()
#### date_sub()

In [0]:
# Shift each date by whole months (a negative count goes backwards) and by days.
columnas_desplazadas = [
    col('input'),
    add_months(col('input'), 3).alias('add_months'),
    add_months(col('input'), -3).alias('sub months'),
    date_add(col('input'), 4).alias('date_add'),
    date_sub(col('input'), 4).alias('date_sub'),
]
df_modif = df.select(*columnas_desplazadas)

df_modif.printSchema()
df_modif.show()

root
 |-- input: string (nullable = true)
 |-- add_months: date (nullable = true)
 |-- sub months: date (nullable = true)
 |-- date_add: date (nullable = true)
 |-- date_sub: date (nullable = true)

+----------+----------+----------+----------+----------+
|     input|add_months|sub months|  date_add|  date_sub|
+----------+----------+----------+----------+----------+
|2020-02-01|2020-05-01|2019-11-01|2020-02-05|2020-01-28|
|2019-03-01|2019-06-01|2018-12-01|2019-03-05|2019-02-25|
|2021-06-15|2021-09-15|2021-03-15|2021-06-19|2021-06-11|
+----------+----------+----------+----------+----------+



#### year( )
#### month( )
#### next_day( )
#### weekofyear( )

In [0]:
# Extract calendar parts from each date; next_day gives the first Sunday that
# falls after the date.
columnas_calendario = [
    col('input'),
    year(col('input')).alias('year'),
    month(col('input')).alias('month'),
    next_day(col('input'), 'Sunday').alias('next_day'),
    weekofyear(col('input')).alias('weekofyear'),
]
df_modif = df.select(*columnas_calendario)

df_modif.printSchema()
df_modif.show()

root
 |-- input: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- next_day: date (nullable = true)
 |-- weekofyear: integer (nullable = true)

+----------+----+-----+----------+----------+
|     input|year|month|  next_day|weekofyear|
+----------+----+-----+----------+----------+
|2020-02-01|2020|    2|2020-02-02|         5|
|2019-03-01|2019|    3|2019-03-03|         9|
|2021-06-15|2021|    6|2021-06-20|        24|
+----------+----+-----+----------+----------+



#### dayofweek( )
#### dayofmonth( )
#### dayofyear( )

In [0]:
# Position of each date within its week, month and year.
# dayofweek counts from Sunday = 1 through Saturday = 7.
columnas_dia = [
    col('input'),
    dayofweek(col('input')).alias('dayofweek'),
    dayofmonth(col('input')).alias('dayofmonth'),
    dayofyear(col('input')).alias('dayofyear'),
]
df_modif = df.select(*columnas_dia)

df_modif.printSchema()
df_modif.show()

root
 |-- input: string (nullable = true)
 |-- dayofweek: integer (nullable = true)
 |-- dayofmonth: integer (nullable = true)
 |-- dayofyear: integer (nullable = true)

+----------+---------+----------+---------+
|     input|dayofweek|dayofmonth|dayofyear|
+----------+---------+----------+---------+
|2020-02-01|        7|         1|       32|
|2019-03-01|        6|         1|       60|
|2021-06-15|        3|        15|      166|
+----------+---------+----------+---------+

