In [1]:
### Probando con datos
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

# Executor sizing and event logging for this session. SparkConf.set() mutates
# the conf in place, so the settings can be applied one statement at a time.
conf = SparkConf()
conf.set("spark.executor.memory", "6g")
conf.set("spark.executor.cores", "3")
conf.set("spark.executor.instances", "2")
conf.set("spark.eventLog.enabled", "true")

# Attach to the standalone master, reusing an already-running session if any.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("ContratosReadParquet")
    .config(conf=conf)
    .getOrCreate()
)
spark

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/10/23 10:15:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
import time
# Print the wall-clock start time, then record a perf_counter reference in
# `inicio`; a later cell subtracts it to report the elapsed seconds.
print('Inicio: ' + time.strftime("%c"))
inicio = time.perf_counter()

Inicio: Sun Oct 23 10:15:56 2022


In [3]:
#spark job monitoring
from pyspark.sql.types import StructType
from pyspark.sql.types import StringType,BooleanType,DateType,IntegerType
from pyspark.sql.functions import sum, col, desc
import pyspark.sql.functions as F
from pyspark.sql import Window

# Expected layout of the data columns stored in the parquet files.
# The partition column `proc` is intentionally not listed: it comes from the
# directory name (proc=CT0015) relative to `basePath`.
esquema = (
    StructType()
    .add("ctId", StringType(), True)
    .add("fecIni", StringType(), True)
    .add("fecFin", StringType(), True)
    .add("dato", StringType(), True)
    .add("saldo", IntegerType(), True)
)

# Apply the schema with DataFrameReader.schema(). The previous
# .option("schema", "esquema") only stored the *string* "esquema" under an
# option called "schema", so `esquema` was never actually used.
# NOTE(review): `proc` is expected to still be appended by partition
# discovery when an explicit schema is given — confirm with printSchema().
df = (
    spark.read
    .schema(esquema)
    .option("mergeSchema", True)
    .option("basePath", "file:///opt/workspace/datos/fakeContracts01.parquet/")
    .format("parquet")
    .load('./datos/fakeContracts01.parquet/proc=CT0015')
)

df.printSchema()

# Row count, a sample of rows, and the query plans for the read.
print(df.count())
df.show()
df.explain(extended=True)
df.explain(mode='cost')


                                                                                

root
 |-- ctId: string (nullable = true)
 |-- fecIni: string (nullable = true)
 |-- fecFin: string (nullable = true)
 |-- dato: string (nullable = true)
 |-- saldo: integer (nullable = true)
 |-- proc: string (nullable = true)



                                                                                

89262


[Stage 4:>                                                          (0 + 1) / 1]

+------------+----------+----------+--------------------+-----+------+
|        ctId|    fecIni|    fecFin|                dato|saldo|  proc|
+------------+----------+----------+--------------------+-----+------+
|CT0015259018|1999-12-28|2020-03-29|Dato del contrato...| 7500|CT0015|
|CT0015259018|2020-03-29|2020-04-08|Dato del contrato...|46400|CT0015|
|CT0015259018|2020-04-08|2020-05-07|Dato del contrato...|34400|CT0015|
|CT0015259018|2020-05-07|2020-06-05|Dato del contrato...|44300|CT0015|
|CT0015259018|2020-06-05|2020-07-25|Dato del contrato...| 9500|CT0015|
|CT0015259018|2020-07-25|2020-08-18|Dato del contrato...|29700|CT0015|
|CT0015259018|2020-08-18|2020-08-18|Dato del contrato...|10200|CT0015|
|CT0015259018|2020-08-18|2020-10-01|Dato del contrato...| 2900|CT0015|
|CT0015259018|2020-10-01|2020-11-20|Dato del contrato...|12800|CT0015|
|CT0015259018|2020-11-20|2020-12-04|Dato del contrato...| 7300|CT0015|
|CT0015259018|2020-12-04|2020-12-31|Dato del contrato...| 6400|CT0015|
|CT001

                                                                                

In [4]:
# Print the wall-clock end time and the seconds elapsed since `inicio`
# (set by the earlier timing cell), rounded to 4 decimals.
print('Final: ' + time.strftime("%c"))
final = time.perf_counter()
print(f'Hecho en {round(final - inicio, 4)} segundo(s)')

Final: Sun Oct 23 10:16:03 2022
Hecho en 6.473 segundo(s)


## La función Window define una especificación que agrupa por un campo pero conserva cada registro: el resultado se calcula a nivel de registro para todos los registros (esa es la diferencia con el Group By, que devuelve una sola fila por grupo).

In [None]:
print('Inicio: ' + time.strftime("%c"))
inicio = time.perf_counter()

# Window made of all rows sharing the same ctId; no ordering is defined.
WinSpec = Window.partitionBy('ctId')

# Add `lista` to every row: the fecFin values collected over that row's window.
# The row count is unchanged (unlike a groupBy, nothing is collapsed).
dfNew = df.withColumn(
    'lista',
    F.collect_list('fecFin').over(WinSpec),
)
dfNew.select('ctId', 'lista').show(3, vertical=True, truncate=150)

print('Final: ' + time.strftime("%c"))
final = time.perf_counter()
print(f'Hecho en {round(final - inicio, 4)} segundo(s)')

Inicio: Sun Oct 23 11:51:02 2022
-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------
 ctId  | CT0015123497                                                                                                                                           
 lista | [2020-06-27, 2020-07-17, 2020-08-01, 2020-10-13, 2021-01-16, 2021-01-24, 2021-04-07, 2021-04-27, 2021-07-06, 2021-09-03, 2021-09-22, 2021-11-22, 20... 
-RECORD 1-------------------------------------------------------------------------------------------------------------------------------------------------------
 ctId  | CT0015123497                                                                                                                                           
 lista | [2020-06-27, 2020-07-17, 2020-08-01, 2020-10-13, 2021-01-16, 2021-01-24, 2021-04-07, 2021-04-27, 2021-07-06, 2021-09-03, 2021-09-22, 2021-11-22, 20... 
-

In [30]:
print('Inicio: ' + time.strftime("%c"))
inicio = time.perf_counter()

# Adding orderBy makes the list accumulate: each row's `lista` holds the
# fecFin values from the start of its ctId partition up to that row.
# NOTE(review): the original note said the list grows "without repeating";
# collect_list itself does not deduplicate — confirm what was meant.
WinSpec = Window.partitionBy('ctId').orderBy('fecFin')

dfNew = df.withColumn(
    'lista',
    F.collect_list('fecFin').over(WinSpec),
)
dfNew.select('ctId', 'fecFin', 'lista').show(15, vertical=True, truncate=150)

print('Final: ' + time.strftime("%c"))
final = time.perf_counter()
print(f'Hecho en {round(final - inicio, 4)} segundo(s)')

Inicio: Sun Oct 23 11:53:35 2022
-RECORD 0--------------------------------------------------------------------------------------------------------------------------------------------------------
 ctId   | CT0015123497                                                                                                                                           
 fecFin | 2020-06-27                                                                                                                                             
 lista  | [2020-06-27]                                                                                                                                           
-RECORD 1--------------------------------------------------------------------------------------------------------------------------------------------------------
 ctId   | CT0015123497                                                                                                                                       

In [45]:
print('Inicio: ' + time.strftime("%c"))
inicio = time.perf_counter()

# Trying rowsBetween: the frame is limited to the previous row (-1) and the
# current row, within each ctId ordered by fecFin, so `lista` holds at most
# two values.
WinSpec = (
    Window.partitionBy('ctId')
    .orderBy('fecFin')
    .rowsBetween(-1, Window.currentRow)
)

dfNew = df.withColumn(
    'lista',
    F.collect_list('fecFin').over(WinSpec),
)
dfNew.select('ctId', 'fecFin', 'lista').show(15, vertical=True, truncate=150)

print('Final: ' + time.strftime("%c"))
final = time.perf_counter()
print(f'Hecho en {round(final - inicio, 4)} segundo(s)')

Inicio: Sun Oct 23 12:16:24 2022
-RECORD 0--------------------------
 ctId   | CT0015123497             
 fecFin | 2020-06-27               
 lista  | [2020-06-27]             
-RECORD 1--------------------------
 ctId   | CT0015123497             
 fecFin | 2020-07-17               
 lista  | [2020-06-27, 2020-07-17] 
-RECORD 2--------------------------
 ctId   | CT0015123497             
 fecFin | 2020-08-01               
 lista  | [2020-07-17, 2020-08-01] 
-RECORD 3--------------------------
 ctId   | CT0015123497             
 fecFin | 2020-10-13               
 lista  | [2020-08-01, 2020-10-13] 
-RECORD 4--------------------------
 ctId   | CT0015123497             
 fecFin | 2021-01-16               
 lista  | [2020-10-13, 2021-01-16] 
-RECORD 5--------------------------
 ctId   | CT0015123497             
 fecFin | 2021-01-24               
 lista  | [2021-01-16, 2021-01-24] 
-RECORD 6--------------------------
 ctId   | CT0015123497             
 fecFin | 2021-04-07           

In [38]:
# Inspect the row at position 15. take(16) brings only the first 16 rows to
# the driver instead of collecting the whole DataFrame to read one of them.
dfNew.take(16)[15]

Row(ctId='CT0015123556', fecIni='2020-05-14', fecFin='2020-07-10', dato='Dato del contrato CT0015123556 en 2020-07-10', saldo=61700, proc='CT0015', lista=['2020-05-14', '2020-07-10'])

In [44]:
# Inspect `lista` for the row at position 89000. Only that column is
# projected before collecting, so the other columns are never sent to the
# driver.
dfNew.select('lista').collect()[89000]['lista']

['2020-03-15', '2020-04-22', '2020-09-02', '2020-09-20']

In [18]:
# groupBy counterpart of the window examples: one output row per ctId, with
# all of its fecFin values gathered into the `lista` array.
dfGrp = (
    df.groupBy('ctId')
      .agg(F.collect_list('fecFin').alias('lista'))
)
dfGrp.show()
dfGrp.count()

+------------+--------------------+
|        ctId|               lista|
+------------+--------------------+
|CT0015123497|[2020-06-27, 2020...|
|CT0015123543|[2020-02-27, 2020...|
|CT0015123556|[2020-05-14, 2020...|
|CT0015123575|[2020-03-10, 2020...|
|CT0015123580|[2020-10-30, 2021...|
|CT0015123600|[2020-05-05, 2020...|
|CT0015123649|        [2222-12-31]|
|CT0015123690|[2020-05-31, 2020...|
|CT0015123729|[2021-01-01, 2021...|
|CT0015123740|[2020-02-27, 2020...|
|CT0015123773|[2020-02-14, 2020...|
|CT0015123796|[2020-01-10, 2021...|
|CT0015123807|[2020-01-23, 2020...|
|CT0015123809|[2020-01-22, 2020...|
|CT0015123826|[2020-02-01, 2020...|
|CT0015123852|[2021-02-27, 2021...|
|CT0015123885|[2020-05-28, 2021...|
|CT0015123921|[2020-02-11, 2020...|
|CT0015123966|[2020-04-07, 2020...|
|CT0015124008|[2020-05-05, 2020...|
+------------+--------------------+
only showing top 20 rows



10000

In [None]:
# Shut down through the session rather than only its SparkContext:
# SparkSession.stop() stops the underlying context as well.
spark.stop()
print('Sacabao')