In [4]:
### Probando con datos
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

# Executor sizing and event logging for the application.
# NOTE(review): the original inline notes disagreed on the executor count
# (one said two instances, the other said four executors * 3 cores = 12 cores).
# Confirm in the Spark master UI how many executors are actually launched.
conf = SparkConf()
conf.set("spark.executor.memory", "6g")       # memory requested per executor
conf.set("spark.executor.cores", "3")         # cores requested per executor
conf.set("spark.executor.instances", "2")     # requested number of executors
conf.set("spark.eventLog.enabled", "true")    # turn on Spark event logging

# Create (or reuse) the session against the cluster master.
spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")
    .appName("SparkStreaming")
    .config(conf=conf)
    .getOrCreate()
)
# Bare expression so the notebook renders the session summary.
spark

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/01/24 17:52:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [8]:
%%time
# Load the full synthetic transactions file; the first row is the header and
# column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('./datos/fakecards/synth_composite.csv')
)

# Row count first, then a preview of the first rows.
print(df.count())
df.show()

# Query plans: the extended (logical + physical) plan, then the plan with
# cost statistics.
df.explain(mode='extended')
df.explain(mode='cost')


                                                                                

6362620
+----+--------+------------------+-------------+-------------------+------------------+-------------+--------------------+--------------------+-------+--------------+
|step|    type|            amount|     nameOrig|      oldbalanceOrg|    newbalanceOrig|     nameDest|      oldbalanceDest|      newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+------------------+-------------+-------------------+------------------+-------------+--------------------+--------------------+-------+--------------+
| 211|CASH_OUT|184185.75527735116|160_2696646_0| 203108.47605069657|18922.720773345412| 160_901564_0|  1019993.7073250777|   1204127.136276725|      0|             0|
| 394|CASH_OUT|237093.19600862195|160_5585569_0|  382707.4642382287|145614.26822960674|160_1586945_0|   587089.2765989383|   824320.2512419948|      0|             0|
| 306| CASH_IN|163600.51879411662|160_4221372_0|   3093.43116922955|               0.0|160_1701821_0|    653453.795387125|  490121.90304387547|      0|      

In [9]:
# Display the list of column names of the full dataset.
df.columns


['step',
 'type',
 'amount',
 'nameOrig',
 'oldbalanceOrg',
 'newbalanceOrig',
 'nameDest',
 'oldbalanceDest',
 'newbalanceDest',
 'isFraud',
 'isFlaggedFraud']

In [10]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)
 |-- isFlaggedFraud: integer (nullable = true)



### Step viene siendo como un número de sesión que permite agrupar los datos

In [11]:
# Count the rows for each distinct 'step' value; show() prints only the first 20 groups.
df.groupBy('step').count().show()



+----+-----+
|step|count|
+----+-----+
| 471| 2532|
| 496|  831|
| 540| 5172|
| 137|33017|
| 251|35155|
| 451| 3500|
| 580|   50|
| 458|  397|
| 255|28825|
| 588| 4634|
| 133|35787|
| 472|   84|
| 322|26823|
| 321|17460|
| 362|  471|
| 673|  376|
| 613|   44|
| 375|25445|
| 597| 2291|
| 155|30521|
+----+-----+
only showing top 20 rows



                                                                                

In [25]:
# Print summary statistics (count, mean, stddev, ...) for every column of df.
df.describe().show()



+-------+------------------+--------+-------------------+--------+------------------+-----------------+--------+--------------------+--------------------+--------------------+--------------------+
|summary|              step|    type|             amount|nameOrig|     oldbalanceOrg|   newbalanceOrig|nameDest|      oldbalanceDest|      newbalanceDest|             isFraud|      isFlaggedFraud|
+-------+------------------+--------+-------------------+--------+------------------+-----------------+--------+--------------------+--------------------+--------------------+--------------------+
|  count|           6362620| 6362620|            6362620| 6362620|           6362620|          6362620| 6362620|             6362620|             6362620|             6362620|             6362620|
|   mean| 239.0055508579799|    null|  179855.5421124945|    null| 833897.5850304173|793085.3291809693|    null|  1100707.7318166096|  1225001.5492753538|0.001290820448180152|2.200351427556572...|
| stddev|141.65

                                                                                

#### Guarda la información de cada step en un archivo separado. Sólo se ejecuta una vez porque es muy pesado

In [3]:
%%time
# The export below is deliberately disabled: it is wrapped in a triple-quoted
# string, so this cell only evaluates (and displays) the string literal and
# never touches Spark. The code inside the string collects the distinct 'step'
# values and, for each one, writes that step's rows as a single CSV part file
# (coalesce(1)) into ./datos/fakecards/archivos using append mode.
# NOTE(review): because the writes use mode("append"), re-enabling and
# re-running this would presumably add a second copy of every file — confirm
# the output directory is empty before running it again.
"""
# Saca todos los códigos de step distintos que hay:
steps = df.select('step').distinct().collect()

for step in steps[:]:
    df1 = df.where(f"step = {step[0]}")
    # Añade coalesce para evitar que particione en más de un archivo:
    df1.coalesce(1).write.mode("append").option("header","true").csv("./datos/fakecards/archivos")
"""

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 2.86 µs


'\n# Saca todos los códigos de step distintos que hay:\nsteps = df.select(\'step\').distinct().collect()\n\nfor step in steps[:]:\n    df1 = df.where(f"step = {step[0]}")\n    # Añade coalesce para evitar que particione en más de un archivo:\n    df1.coalesce(1).write.mode("append").option("header","true").csv("./datos/fakecards/archivos")\n'

In [None]:
# Shell escape: list every entry in ./datos/fakecards/archivos in long, human-readable form.
!cd ./datos/fakecards/archivos && ls -lah

#### Comprobamos que sólo hay un step en cada partición

In [5]:
%%time
# Read a single exported part file (header row, inferred types).
# NOTE(review): the file name embeds a generated-looking identifier, which
# presumably changes if the export is re-run — confirm the path still exists.
part = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('./datos/fakecards/archivos/part-00000-fff1cfd2-3371-44a5-a985-f66d84b9e291-c000.csv')
)

# Rows per 'step' in this file; a single output row means the file holds one step.
part.groupBy('step').count().show()

                                                                                

+----+-----+
|step|count|
+----+-----+
|  13|38919|
+----+-----+

CPU times: user 6.96 ms, sys: 110 µs, total: 7.07 ms
Wall time: 7.67 s


In [6]:
# Keep the schema that was inferred for the sample part file so it can be reused.
dataEsquema = part.schema
# Bare expression so the notebook displays the StructType.
dataEsquema

StructType(List(StructField(step,IntegerType,true),StructField(type,StringType,true),StructField(amount,DoubleType,true),StructField(nameOrig,StringType,true),StructField(oldbalanceOrg,DoubleType,true),StructField(newbalanceOrig,DoubleType,true),StructField(nameDest,StringType,true),StructField(oldbalanceDest,DoubleType,true),StructField(newbalanceDest,DoubleType,true),StructField(isFraud,IntegerType,true),StructField(isFlaggedFraud,IntegerType,true)))

In [3]:
# Shut down through the session rather than only its SparkContext:
# SparkSession.stop() stops the underlying SparkContext as well, so the
# session object is not left pointing at a dead context.
spark.stop()
# Visible marker that the shutdown cell ran.
print('Sacabao')

Sacabao
