In [1]:
### Probando con datos
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

# Executor sizing and event logging used by the session created below.
conf = SparkConf().setAll([
    ("spark.executor.memory", "6g"),
    ("spark.executor.cores", "3"),
    ("spark.executor.instances", "2"),
    ("spark.eventLog.enabled", "true"),
])

# Create (or reuse) the session against the master at spark://spark-master:7077.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("ContratosReadParquet")
    .config(conf=conf)
    .getOrCreate()
)

# Bare last expression so the notebook renders the session summary.
spark

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/10/23 12:27:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
import time

# Print the wall-clock start time, then store a high-resolution start mark
# in `inicio`; a later cell subtracts it to report the elapsed seconds.
print(f'Inicio: {time.strftime("%c")}')
inicio = time.perf_counter()

Inicio: Sun Oct 23 12:27:12 2022


In [4]:
#spark job monitoring
from pyspark.sql.types import StructType
from pyspark.sql.types import StringType,BooleanType,DateType,IntegerType
from pyspark.sql.functions import sum, col, desc
import pyspark.sql.functions as F
from pyspark.sql import Window

# Declared schema for the contracts dataset. "proc" is also the partition
# directory name in the path loaded below (proc=CT0015).
esquema = (
    StructType()
    .add("ctId", StringType(), True)
    .add("fecRef", StringType(), True)
    .add("dato", StringType(), True)
    .add("saldo", IntegerType(), True)
    .add("proc", StringType(), True)
)

# Apply the schema through DataFrameReader.schema(). The previous
# .option("schema", "esquema") only set an unrecognised string option with the
# literal value "esquema", so the StructType above was never used and the
# schema was inferred from the files instead.
# "mergeSchema" is no longer set: it only influences schema inference, which
# is skipped when an explicit schema is supplied.
# "basePath" keeps the partition column "proc" in the result even though only
# one partition directory is loaded.
df = (
    spark.read
    .schema(esquema)
    .option("basePath", "file:///opt/workspace/datos/fakeContractsSinFechas.parquet/")
    .format("parquet")
    .load('./datos/fakeContractsSinFechas.parquet/proc=CT0015')
)

df.printSchema()

# Row count, a sample of rows, and the query plans (extended and cost-based).
print(df.count())
df.show()
df.explain(extended=True)
df.explain(mode='cost')


root
 |-- ctId: string (nullable = true)
 |-- fecRef: string (nullable = true)
 |-- dato: string (nullable = true)
 |-- saldo: integer (nullable = true)
 |-- proc: string (nullable = true)

79928
+------------+----------+--------------------+-----+------+
|        ctId|    fecRef|                dato|saldo|  proc|
+------------+----------+--------------------+-----+------+
|CT0015371502|2020-08-31|Dato del contrato...|10800|CT0015|
|CT0015371502|2021-04-08|Dato del contrato...| 4700|CT0015|
|CT0015371502|2021-03-06|Dato del contrato...| 4200|CT0015|
|CT0015371502|2021-06-04|Dato del contrato...|56500|CT0015|
|CT0015371502|2020-09-20|Dato del contrato...|34900|CT0015|
|CT0015371502|2021-09-28|Dato del contrato...|24100|CT0015|
|CT0015371502|2021-08-07|Dato del contrato...|16400|CT0015|
|CT0015371516|2021-01-08|Dato del contrato...|79900|CT0015|
|CT0015371516|2020-06-23|Dato del contrato...| 4700|CT0015|
|CT0015371516|2020-04-26|Dato del contrato...| 5700|CT0015|
|CT0015371516|2021-08-07

In [5]:
# Print the wall-clock end time and the seconds elapsed since `inicio` was set.
print(f'Final: {time.strftime("%c")}')
final = time.perf_counter()
print('Hecho en', round(final - inicio, 4), 'segundo(s)')

Final: Sun Oct 23 12:28:47 2022
Hecho en 95.419 segundo(s)


## La función Window define una especificación de ventana: los registros se agrupan por un campo y el resultado se calcula para cada registro, conservando todas las filas (a diferencia de Group By, que devuelve una sola fila por grupo).

In [6]:
# Time this cell: print the start time and keep a high-resolution start mark.
print(f'Inicio: {time.strftime("%c")}')
inicio = time.perf_counter()

# Window partitioned by contract id only (no ordering, no explicit frame).
WinSpec = Window.partitionBy(F.col('ctId'))

# Attach to every row the list of fecRef values collected over its window.
dfNew = df.withColumn(
    'lista',
    F.collect_list('fecRef').over(WinSpec),
)
dfNew.select(F.col('ctId'), F.col('lista')).show(3, vertical=True, truncate=150)

# Report the end time and the elapsed seconds.
print(f'Final: {time.strftime("%c")}')
final = time.perf_counter()
print('Hecho en', round(final - inicio, 4), 'segundo(s)')

Inicio: Sun Oct 23 12:29:13 2022




-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------
 ctId  | CT0015123497                                                                                                                                           
 lista | [2021-04-07, 2021-04-27, 2021-12-20, 2021-11-22, 2021-09-03, 2021-07-06, 2021-01-24, 2021-01-16, 2020-08-01, 2020-07-17, 2021-09-22, 2020-06-27, 20... 
-RECORD 1-------------------------------------------------------------------------------------------------------------------------------------------------------
 ctId  | CT0015123497                                                                                                                                           
 lista | [2021-04-07, 2021-04-27, 2021-12-20, 2021-11-22, 2021-09-03, 2021-07-06, 2021-01-24, 2021-01-16, 2020-08-01, 2020-07-17, 2021-09-22, 2020-06-27, 20... 
-RECORD 2-------------------------

                                                                                

In [7]:
# Time this cell: print the start time and keep a high-resolution start mark.
print(f'Inicio: {time.strftime("%c")}')
inicio = time.perf_counter()

# Adding orderBy makes the list accumulate: with an ordering and no explicit
# frame, Spark's default frame runs from the start of the partition up to the
# current row, so each row's list grows as fecRef advances.
WinSpec = Window.partitionBy(F.col('ctId')).orderBy(F.col('fecRef'))

dfNew = df.withColumn(
    'lista',
    F.collect_list('fecRef').over(WinSpec),
)
dfNew.select(F.col('ctId'), F.col('fecRef'), F.col('lista')).show(15, vertical=True, truncate=150)

# Report the end time and the elapsed seconds.
print(f'Final: {time.strftime("%c")}')
final = time.perf_counter()
print('Hecho en', round(final - inicio, 4), 'segundo(s)')

Inicio: Sun Oct 23 12:29:46 2022
-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------
 ctId   | CT0015123497                                                                                                                                           
 fecRef | 2020-06-27                                                                                                                                             
 lista  | [2020-06-27]                                                                                                                                           
-RECORD 1--------------------------------------------------------------------------------------------------------------------------------------------------------
 ctId   | CT0015123497                                                                                                                                       

In [8]:
# Time this cell: print the start time and keep a high-resolution start mark.
print(f'Inicio: {time.strftime("%c")}')
inicio = time.perf_counter()

# Trying rowsBetween: the frame spans the previous row and the current row
# within each ctId partition ordered by fecRef.
WinSpec = (
    Window
    .partitionBy(F.col('ctId'))
    .orderBy(F.col('fecRef'))
    .rowsBetween(start=-1, end=Window.currentRow)
)

dfNew = df.withColumn(
    'lista',
    F.collect_list('fecRef').over(WinSpec),
)
dfNew.select(F.col('ctId'), F.col('fecRef'), F.col('lista')).show(15, vertical=True, truncate=150)

# Report the end time and the elapsed seconds.
print(f'Final: {time.strftime("%c")}')
final = time.perf_counter()
print('Hecho en', round(final - inicio, 4), 'segundo(s)')

Inicio: Sun Oct 23 12:30:18 2022
-RECORD 0--------------------------
 ctId   | CT0015123497             
 fecRef | 2020-06-27               
 lista  | [2020-06-27]             
-RECORD 1--------------------------
 ctId   | CT0015123497             
 fecRef | 2020-07-17               
 lista  | [2020-06-27, 2020-07-17] 
-RECORD 2--------------------------
 ctId   | CT0015123497             
 fecRef | 2020-08-01               
 lista  | [2020-07-17, 2020-08-01] 
-RECORD 3--------------------------
 ctId   | CT0015123497             
 fecRef | 2020-10-13               
 lista  | [2020-08-01, 2020-10-13] 
-RECORD 4--------------------------
 ctId   | CT0015123497             
 fecRef | 2021-01-16               
 lista  | [2020-10-13, 2021-01-16] 
-RECORD 5--------------------------
 ctId   | CT0015123497             
 fecRef | 2021-01-24               
 lista  | [2021-01-16, 2021-01-24] 
-RECORD 6--------------------------
 ctId   | CT0015123497             
 fecRef | 2021-04-07           

In [9]:
# Inspect the row at position 15. take(16) fetches only the first 16 rows
# instead of collecting the whole DataFrame to the driver to index one row.
dfNew.take(16)[15]

Row(ctId='CT0015123556', fecRef='2020-12-03', dato='Dato del contrato CT0015123556 en 2020-12-03', saldo=10300, proc='CT0015', lista=['2020-07-10', '2020-12-03'])

In [11]:
# Inspect the 'lista' value of the row at position 8900. take(8901) fetches
# only the rows up to that position instead of collecting the whole DataFrame.
dfNew.take(8901)[8900]['lista']

['2021-01-22', '2021-02-03']

In [13]:
# Group-by counterpart of the window examples: one output row per ctId holding
# the list of its fecRef values.
dfGrp = (
    df
    .groupBy(F.col('ctId'))
    .agg(F.collect_list(F.col('fecRef')).alias('lista'))
)
dfGrp.show(truncate=False)

# Number of groups (distinct ctId values); displayed as the cell result.
dfGrp.count()

+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|ctId        |lista                                                                                                                                                                   |
+------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|CT0015123497|[2021-04-07, 2021-04-27, 2021-12-20, 2021-11-22, 2021-09-03, 2021-07-06, 2021-01-24, 2021-01-16, 2020-08-01, 2020-07-17, 2021-09-22, 2020-06-27, 2020-10-13]            |
|CT0015123543|[2021-02-13, 2020-07-31, 2021-07-06, 2020-08-26, 2021-03-29, 2021-04-28, 2021-12-06, 2020-07-05, 2020-02-27, 2020-09-22, 2021-01-11, 2021-01-23]                        |
|CT0015123556|[2021-10-28, 2021-03-25, 2021-08-30, 2020-05-14, 2021-02-26, 2020-

10000

In [None]:
# Stop through the session rather than the raw context: SparkSession.stop()
# stops the underlying SparkContext and also clears the session, so a later
# getOrCreate() starts from a clean state.
spark.stop()
print('Sacabao')