## Probando la librería MLLib

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

conf = (SparkConf().set("spark.executor.memory", "6g")
                    .set("spark.executor.cores", "3")
                    .set("spark.executor.instances", "2")
                    .set("spark.eventLog.enabled", "true"))
spark = SparkSession.builder.master("spark://spark-master:7077").appName("PlayingWithMLLib").config(conf=conf).getOrCreate()
spark

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/10/25 17:31:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
import time
print ('Inicio: '+time.strftime("%c"))
inicio = time.perf_counter()

Inicio: Tue Oct 25 17:31:41 2022


In [3]:
%%time

#spark job monitoring
from pyspark.sql.types import StructType
from pyspark.sql.types import StringType,BooleanType,DateType,IntegerType
from pyspark.sql.functions import sum, col, desc

df = spark.read.csv('./datos/PS_20174392719_1491204439457_log.csv', sep=",", header = True, inferSchema=True)

df.printSchema()

print(df.count())
df.show(3, vertical=True)
df.explain(extended=True)
df.explain(mode='cost')


                                                                                

root
 |-- step: integer (nullable = true)
 |-- type: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- nameOrig: string (nullable = true)
 |-- oldbalanceOrg: double (nullable = true)
 |-- newbalanceOrig: double (nullable = true)
 |-- nameDest: string (nullable = true)
 |-- oldbalanceDest: double (nullable = true)
 |-- newbalanceDest: double (nullable = true)
 |-- isFraud: integer (nullable = true)
 |-- isFlaggedFraud: integer (nullable = true)

6362620
-RECORD 0---------------------
 step           | 1           
 type           | PAYMENT     
 amount         | 9839.64     
 nameOrig       | C1231006815 
 oldbalanceOrg  | 170136.0    
 newbalanceOrig | 160296.36   
 nameDest       | M1979787155 
 oldbalanceDest | 0.0         
 newbalanceDest | 0.0         
 isFraud        | 0           
 isFlaggedFraud | 0           
-RECORD 1---------------------
 step           | 1           
 type           | PAYMENT     
 amount         | 1864.28     
 nameOrig       | C1666544295

In [4]:
print ('Final: '+time.strftime("%c"))
final = time.perf_counter()
print(f'Hecho en {round(final - inicio, 4)} segundo(s)')

Final: Tue Oct 25 17:31:50 2022
Hecho en 8.7829 segundo(s)


In [5]:
print ('Inicio: '+time.strftime("%c"))
inicio = time.perf_counter()

df.groupBy('type').count().show(200)

print ('Final: '+time.strftime("%c"))
final = time.perf_counter()
print(f'Hecho en {round(final - inicio, 4)} segundo(s)')

Inicio: Tue Oct 25 17:31:50 2022




+--------+-------+
|    type|  count|
+--------+-------+
|TRANSFER| 532909|
| CASH_IN|1399284|
|CASH_OUT|2237500|
| PAYMENT|2151495|
|   DEBIT|  41432|
+--------+-------+

Final: Tue Oct 25 17:31:52 2022
Hecho en 2.3377 segundo(s)


                                                                                

In [6]:
# Selecciona algunas columnas por facilitar el tema
df = df.select('type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'isFraud')
df.show(3, vertical=True)

-RECORD 0-------------------
 type           | PAYMENT   
 amount         | 9839.64   
 oldbalanceOrg  | 170136.0  
 newbalanceOrig | 160296.36 
 isFraud        | 0         
-RECORD 1-------------------
 type           | PAYMENT   
 amount         | 1864.28   
 oldbalanceOrg  | 21249.0   
 newbalanceOrig | 19384.72  
 isFraud        | 0         
-RECORD 2-------------------
 type           | TRANSFER  
 amount         | 181.0     
 oldbalanceOrg  | 181.0     
 newbalanceOrig | 0.0       
 isFraud        | 1         
only showing top 3 rows



### Separa los test de entrenamiento y test

In [7]:
train, test = df.randomSplit([0.7, 0.3], seed=7)

In [8]:
print ('Inicio: '+time.strftime("%c"))
inicio = time.perf_counter()

print(f'Train dataset size: {train.count()} records')
print(f'Test dataset size: {test.count()} records')

print ('Final: '+time.strftime("%c"))
final = time.perf_counter()
print(f'Hecho en {round(final - inicio, 4)} segundo(s)')

Inicio: Tue Oct 25 17:31:52 2022


                                                                                

Train dataset size: 4452656 records


[Stage 13:====>                                                   (1 + 11) / 12]

Test dataset size: 1909964 records
Final: Tue Oct 25 17:31:57 2022
Hecho en 4.9684 segundo(s)


                                                                                

### Separación de las variables categóricas (strings). Sólo hay una en este ejemplo:

In [9]:
train.dtypes

[('type', 'string'),
 ('amount', 'double'),
 ('oldbalanceOrg', 'double'),
 ('newbalanceOrig', 'double'),
 ('isFraud', 'int')]

In [10]:
categCols = [x for(x, dataType) in train.dtypes if dataType == 'string']
numerCols = [x for(x, dataType) in train.dtypes if (dataType == 'double') & (x != 'isFraud')]

print(categCols)
print(numerCols)

['type']
['amount', 'oldbalanceOrg', 'newbalanceOrig']


### Vamos con las tranformaciones

### Primero vemos lo que hay en los datos

In [11]:
import pyspark.sql.functions as F
import pyspark.sql.types as T

train.agg(F.countDistinct('type')).show()



+-----------+
|count(type)|
+-----------+
|          5|
+-----------+



                                                                                

In [12]:
train.groupBy('type').count().show()



+--------+-------+
|    type|  count|
+--------+-------+
|TRANSFER| 372584|
| CASH_IN| 979523|
|CASH_OUT|1565778|
| PAYMENT|1505805|
|   DEBIT|  28966|
+--------+-------+



                                                                                

In [13]:
# Una forma de sacar la media de todas las columnas

df.select(*[F.mean(c) for c in df.columns]).show()



+---------+-----------------+------------------+-------------------+--------------------+
|avg(type)|      avg(amount)|avg(oldbalanceOrg)|avg(newbalanceOrig)|        avg(isFraud)|
+---------+-----------------+------------------+-------------------+--------------------+
|     null|179861.9035491319| 833883.1040744885|  855113.6685785907|0.001290820448180152|
+---------+-----------------+------------------+-------------------+--------------------+



                                                                                

In [14]:
# Saca la media de cada columna por tipo

df.groupby('type').agg({col: 'avg' for col in df.columns[1:]}).show()



+--------+--------------------+------------------+-------------------+------------------+
|    type|        avg(isFraud)|       avg(amount)|avg(newbalanceOrig)|avg(oldbalanceOrg)|
+--------+--------------------+------------------+-------------------+------------------+
|TRANSFER|0.007687991758442811| 910647.0096454887| 10288.156703208235| 54441.85172470344|
| CASH_IN|                 0.0| 168920.2420040969|  3759378.712078749|  3590463.50829942|
|CASH_OUT|0.001839553072625...| 176273.9643461408| 17474.192737135192| 46023.80479455195|
| PAYMENT|                 0.0|13057.604660187482|  61837.89091078535| 68216.82757282731|
|   DEBIT|                 0.0|5483.6653137671365|  65161.65196273415| 68647.33712589303|
+--------+--------------------+------------------+-------------------+------------------+



                                                                                

### Importamos encoder e indexer de ml para hacer las transformaciones (hay que instalar numpy)

In [15]:
from pyspark.ml.feature import (
    OneHotEncoder,
    StringIndexer,
    VectorAssembler
)

In [16]:
str_indx = [StringIndexer(inputCol=x, outputCol=x + "_StringIndexer", handleInvalid='skip') for x in categCols]

str_indx

[StringIndexer_eb5557096d53]

In [17]:
one_HE = [ OneHotEncoder(
        inputCols=[f"{x}_StringIndexer" for x in categCols],
        outputCols=[f"{x}_OneHotEncoder" for x in categCols],)]

one_HE

[OneHotEncoder_049e4df4e15c]

### Vector assembler. Para pasar todos los datos a doble precisión, creo.

In [18]:
assembInput = [x for x in numerCols]
assembInput += [f"{x}_OneHotEncoder" for x in categCols]

assembInput

['amount', 'oldbalanceOrg', 'newbalanceOrig', 'type_OneHotEncoder']

In [19]:
##Voy por el segundo 13:24 de https://www.youtube.com/watch?v=1a7bB1ZcZ3k&t=821s

vect_assm = VectorAssembler (inputCols=assembInput, outputCol='VectAssm_features')


In [20]:
## Construye el pipeline para ML

stages = []
stages += str_indx 
stages += one_HE
stages += [vect_assm]

In [21]:
stages

[StringIndexer_eb5557096d53,
 OneHotEncoder_049e4df4e15c,
 VectorAssembler_f1694ab3f735]

In [22]:
%%time
# Transforma el dataset de entrenamiento:

from pyspark.ml import Pipeline

pipeline = Pipeline().setStages(stages)
model = pipeline.fit(train)

pp_df = model.transform(test)


                                                                                

CPU times: user 41.1 ms, sys: 4.89 ms, total: 46 ms
Wall time: 3.25 s


In [23]:
pp_df.show(truncate=False)from 

SyntaxError: invalid syntax (1942972909.py, line 1)

### Regresión logística

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
datos = pp_df.select(
    F.col("VectAssm_features").alias("features"),
    F.col("isFraud").alias("label")
)

In [None]:
datos.show(5,truncate=False)

In [None]:
%%time
model = LogisticRegression().fit(datos)

In [None]:
model.summary.areaUnderROC

In [None]:
model.summary.pr.show()

In [None]:
spark.sparkContext.stop()
print('Sacabao')