# Funciones especiales

In [1]:
from pyspark.sql import SparkSession

# Fecha y hora
from pyspark.sql.functions import col, to_date, to_timestamp
from pyspark.sql.functions import date_format
from pyspark.sql.functions import datediff, months_between, last_day
from pyspark.sql.functions import date_add, date_sub
from pyspark.sql.functions import (year, month, dayofmonth,
                                   dayofyear, hour, minute, second)

# Cadenas de texto
from pyspark.sql.functions import ltrim, rtrim, trim
from pyspark.sql.functions import col, lpad, rpad
from pyspark.sql.functions import concat_ws, lower, upper, initcap, reverse
from pyspark.sql.functions import regexp_replace

# Relative directory prefix that is prepended to every dataset name read below.
path = 'files/'

In [2]:
# Session entry point used by every cell below; getOrCreate() hands back the
# session produced by the builder.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

24/01/08 11:44:19 WARN Utils: Your hostname, luis-Nitro-AN515-52 resolves to a loopback address: 127.0.1.1; using 192.168.1.17 instead (on interface enp7s0f1)
24/01/08 11:44:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/08 11:44:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Fecha y Hora

In [3]:
# Read the 'convertir' parquet dataset and print its schema to inspect the
# column types before any conversion.
convertir_path = path + 'convertir'
data = spark.read.parquet(convertir_path)

data.printSchema()

                                                                                

root
 |-- date: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- date_str: string (nullable = true)
 |-- ts_str: string (nullable = true)



In [4]:
# Preview the raw rows; long values are truncated in the default display.
data.show()

+----------+--------------------+----------+----------------+
|      date|           timestamp|  date_str|          ts_str|
+----------+--------------------+----------+----------------+
|2021-01-01|2021-01-01 20:10:...|01-01-2021|18-08-2021 46:58|
+----------+--------------------+----------+----------------+



### date & timestamp

In [5]:

# Convert the string columns into real date / timestamp columns.
# 'date' and 'timestamp' are parsed without a pattern; the *_str columns use a
# day-first layout and therefore need an explicit one.
date_default = to_date(col('date')).alias('date1')
ts_default = to_timestamp(col('timestamp')).alias('ts1')
date_custom = to_date(col('date_str'), 'dd-MM-yyyy').alias('date2')
# 'mm:ss' means minutes:seconds (not hours:minutes), so no hour is parsed.
ts_custom = to_timestamp(col('ts_str'), 'dd-MM-yyyy mm:ss').alias('ts2')

data1 = data.select(date_default, ts_default, date_custom, ts_custom)

data1.show(truncate=False)

+----------+-----------------------+----------+-------------------+
|date1     |ts1                    |date2     |ts2                |
+----------+-----------------------+----------+-------------------+
|2021-01-01|2021-01-01 20:10:50.723|2021-01-01|2021-08-18 00:46:58|
+----------+-----------------------+----------+-------------------+



In [6]:
# Print the schema of the converted frame to check the resulting column types.
data1.printSchema()

root
 |-- date1: date (nullable = true)
 |-- ts1: timestamp (nullable = true)
 |-- date2: date (nullable = true)
 |-- ts2: timestamp (nullable = true)



### date_format

In [7]:
# Render the date column back to text using a day-month-year pattern.
formatted_date = date_format(col('date1'), 'dd-MM-yyyy')
data1.select(formatted_date).show()

+------------------------------+
|date_format(date1, dd-MM-yyyy)|
+------------------------------+
|                    01-01-2021|
+------------------------------+



In [8]:
# Load the 'calculo' parquet dataset and preview its rows.
calculo_path = path + 'calculo'
calculo = spark.read.parquet(calculo_path)
calculo.show()

+------+-------------+------------+-------------------+
|nombre|fecha_ingreso|fecha_salida|       baja_sistema|
+------+-------------+------------+-------------------+
|  Jose|   2021-01-01|  2021-11-14|2021-10-14 15:35:59|
|Mayara|   2021-02-06|  2021-11-25|2021-11-25 10:35:55|
+------+-------------+------------+-------------------+



### datediff, months_between & last_day

In [9]:
# Date arithmetic between each person's start and end dates.
end_date = col('fecha_salida')
start_date = col('fecha_ingreso')

calculo.select(
    col('nombre'),
    datediff(end_date, start_date).alias('dias'),          # days between the two dates
    months_between(end_date, start_date).alias('meses'),   # months between them (fractional)
    last_day(end_date).alias('ultimo_dia_mes'),            # last day of the end date's month
).show()

+------+----+-----------+--------------+
|nombre|dias|      meses|ultimo_dia_mes|
+------+----+-----------+--------------+
|  Jose| 317|10.41935484|    2021-11-30|
|Mayara| 292| 9.61290323|    2021-11-30|
+------+----+-----------+--------------+



### add & sub

In [10]:
# Shift the start date forward and backward by a fixed number of days.
ingreso = col('fecha_ingreso')

calculo.select(
    col('nombre'),
    ingreso,
    date_add(ingreso, 14).alias('mas_14_dias'),  # 14 days later
    date_sub(ingreso, 1).alias('menos_1_dia'),   # 1 day earlier
).show()

+------+-------------+-----------+-----------+
|nombre|fecha_ingreso|mas_14_dias|menos_1_dia|
+------+-------------+-----------+-----------+
|  Jose|   2021-01-01| 2021-01-15| 2020-12-31|
|Mayara|   2021-02-06| 2021-02-20| 2021-02-05|
+------+-------------+-----------+-----------+



### year, month, day, hour, minute, second

In [11]:
# Extract each calendar / clock component from the timestamp column.
# The extractors are applied in the listed order, so the output columns keep
# the same order and auto-generated names as calling each one by hand.
baja = col('baja_sistema')
part_extractors = (year, month, dayofmonth, dayofyear, hour, minute, second)

calculo.select(
    baja,
    *[extract(baja) for extract in part_extractors],
).show()

+-------------------+------------------+-------------------+------------------------+-----------------------+------------------+--------------------+--------------------+
|       baja_sistema|year(baja_sistema)|month(baja_sistema)|dayofmonth(baja_sistema)|dayofyear(baja_sistema)|hour(baja_sistema)|minute(baja_sistema)|second(baja_sistema)|
+-------------------+------------------+-------------------+------------------------+-----------------------+------------------+--------------------+--------------------+
|2021-10-14 15:35:59|              2021|                 10|                      14|                    287|                15|                  35|                  59|
|2021-11-25 10:35:55|              2021|                 11|                      25|                    329|                10|                  35|                  55|
+-------------------+------------------+-------------------+------------------------+-----------------------+------------------+--------------------+--------------------+

## Cadenas de texto

In [13]:
# Load the one-column string dataset; its value carries a space on each
# side: ' Spark '
data_str_path = path + 'data.parquet'
data_str = spark.read.parquet(data_str_path)
data_str.show()

+-------+
| nombre|
+-------+
| Spark |
+-------+



In [15]:
# Strip whitespace from one or both ends of the value.
left_stripped = ltrim('nombre').alias('ltrim')    # remove spaces on the left
right_stripped = rtrim('nombre').alias('rtrim')   # remove spaces on the right
both_stripped = trim('nombre').alias('trim')      # remove spaces on both sides

data_str.select(left_stripped, right_stripped, both_stripped).show()

+------+------+-----+
| ltrim| rtrim| trim|
+------+------+-----+
|Spark | Spark|Spark|
+------+------+-----+



In [20]:
# Pad the trimmed value up to a width of 10 characters.
trimmed = data_str.select(
    trim(col('nombre')).alias('trim')  # drop surrounding spaces first
)

trimmed.select(
    lpad(col('trim'), 10, '-*').alias('lpad'),  # fill on the left with '-*'
    rpad(col('trim'), 10, '^=').alias('rpad'),  # fill on the right with '^='
).show()

+----------+----------+
|      lpad|      rpad|
+----------+----------+
|-*-*-Spark|Spark^=^=^|
+----------+----------+



In [21]:
# Build a single-row DataFrame with one word per column.
df1_rows = [('Spark', 'es', 'maravilloso')]
df1_columns = ['sujeto', 'verbo', 'adjetivo']
df1 = spark.createDataFrame(df1_rows, df1_columns)

df1.show()

                                                                                

+------+-----+-----------+
|sujeto|verbo|   adjetivo|
+------+-----+-----------+
| Spark|   es|maravilloso|
+------+-----+-----------+



In [23]:
# Text transformations: join the three words into one sentence, then derive
# case and order variants of it.
sentence_df = df1.select(
    # concatenate the columns with a single space as separator
    concat_ws(' ', col('sujeto'), col('verbo'), col('adjetivo')).alias('frase')
)
frase = col('frase')

sentence_df.select(
    frase,
    lower(frase).alias('minuscula'),   # all lower case
    upper(frase).alias('mayuscula'),   # all upper case
    initcap(frase).alias('initcap'),   # first letter of every word capitalised
    reverse(frase).alias('reversa'),   # characters in reverse order
).show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|               frase|           minuscula|           mayuscula|             initcap|             reversa|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Spark es maravilloso|spark es maravilloso|SPARK ES MARAVILLOSO|Spark Es Maravilloso|osollivaram se krapS|
+--------------------+--------------------+--------------------+--------------------+--------------------+



In [24]:
# Single-row, single-column DataFrame holding the sentence used by the
# regular-expression example; shown without truncation.
df2_rows = [(' voy a casa por mis llaves',)]
df2 = spark.createDataFrame(df2_rows, ['frase'])
df2.show(truncate=False)

+--------------------------+
|frase                     |
+--------------------------+
| voy a casa por mis llaves|
+--------------------------+



In [25]:
# Apply a regular expression: every match of the alternation 'voy|por' is
# replaced with 'ir'.
nueva_frase = regexp_replace(col('frase'), 'voy|por', 'ir').alias('nueva_frase')
df2.select(nueva_frase).show(truncate=False)

+------------------------+
|nueva_frase             |
+------------------------+
| ir a casa ir mis llaves|
+------------------------+

