# Basado en el video https://www.youtube.com/watch?v=rWobNcypVHw

In [None]:
### Probando con datos
import pyspark
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf

conf = (SparkConf().set("spark.executor.memory", "6g")   #<--- Por executor. Como hay dos instancias, la mitad por executor.
                    .set("spark.executor.cores", "3")    #<--- Por cada executor. Como hay 4, 4*3=12 cores se usan.
                    .set("spark.executor.instances", "2")
                    .set("spark.eventLog.enabled", "true"))
spark = SparkSession.builder.master("spark://spark-master:7077").appName("cursitoML").config(conf=conf).getOrCreate()
spark

#### En teoría vamos a hacer:
+ Data prepration
+ Feature engineering
+ Construcción del modelo
+ Evaluación

In [None]:
%%time
df = spark.read.csv("../datos/hcvdat0.csv", header=True, inferSchema=True)
df.show(1, vertical=True)
df.show()

# Reordena las columnas no sé muy bien para qué y quita la primera:

In [None]:
df = df.select("Age","Sex","ALB","ALP","ALT","AST","BIL","CHE","CHOL","CREA","GGT","PROT","Category")
df.show(5)
df.describe().show()

#### Se pone a contar algún dato:

In [None]:
for c in ['Category','Sex','Age']:df.groupBy(c).count().show(truncate=False)

### Empieza la conversión de valores de las variables categóricas:

In [None]:
!pip install numpy

In [None]:
import pyspark.ml
dir(pyspark.ml)

In [None]:
from pyspark.ml.feature import VectorAssembler, StringIndexer

### Conversión de la columna sex:

In [None]:
sexCoder = StringIndexer(inputCol='Sex',outputCol='SexNum').fit(df)
df = sexCoder.transform(df)
df.show()

### Convierte la columna Category

In [None]:
catCoder = StringIndexer(inputCol='Category',outputCol='CategoryNum').fit(df)
df = catCoder.transform(df)
df.show()

### Para saber a qué corresponden las etiquetas:

In [None]:
sexCoder.labels

In [None]:
catCoder.labels

### Se puede hacer lo contrario para saber a qué etiqueta corresponde un valor:

In [None]:
from pyspark.ml.feature import IndexToString
convertidor = IndexToString(inputCol='CategoryNum',outputCol='catOriginal')
df_conv = convertidor.transform(df)
df_conv.groupBy('catOriginal').count().show(truncate=False)

### Casteo de las variables string que todavía no se han convertido

In [None]:
df.dtypes

In [None]:
df = df.withColumn("ALB",df.ALB.cast('double'))
df = df.withColumn("ALP",df.ALP.cast('double'))
df = df.withColumn("ALT",df.ALT.cast('double'))
df = df.withColumn("CHOL",df.CHOL.cast('double'))
df = df.withColumn("PROT",df.PROT.cast('double'))

In [None]:
df.dtypes

### Cuenta cuántos nan or null hay en cada columna:

In [None]:
from pyspark.sql.functions import *
print(df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show())

### Cambio los nulos por ceros

In [None]:
df = df.na.fill(value=0,subset=["ALB","ALP","ALT","AST","BIL","CHE","CHOL","CREA","GGT","PROT"])
print(df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show())

### Selección de las variables numéricas y vectorización:

In [None]:
featuresReq = ["Age","ALB","ALP","ALT","AST","BIL","CHE","CHOL","CREA","GGT","PROT","SexNum","CategoryNum"]

vec_ass = VectorAssembler(inputCols=featuresReq,outputCol='features')

df_vect = vec_ass.transform(df)
df_vect.show(1, truncate=False, vertical=True)

### División de los DF de entrenamiento y test:

In [None]:
train_df,test_df = df_vect.randomSplit([0.8,0.2])

print(train_df.count())
print(test_df.count())


### Construcción del modelo

In [45]:
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier

#Modelo logístico:
lr = LogisticRegression(featuresCol='features', labelCol='CategoryNum')

lr_model= lr.fit(train_df)

### Detección:

In [46]:
detect = lr_model.transform(test_df)
detect.show(1, truncate=False, vertical=True)

-RECORD 0---------------------------------------------------------------------------------------------------------
 Age           | 32                                                                                               
 Sex           | m                                                                                                
 ALB           | 39.2                                                                                             
 ALP           | 74.1                                                                                             
 ALT           | 32.6                                                                                             
 AST           | 24.8                                                                                             
 BIL           | 9.6                                                                                              
 CHE           | 9.15                                                           

In [64]:
detect.groupBy('prediction').count().show(truncate=False)

+----------+-----+
|prediction|count|
+----------+-----+
|0.0       |100  |
|1.0       |5    |
|4.0       |3    |
|3.0       |7    |
|2.0       |6    |
+----------+-----+



### Evaluación de la efectividad del modelo

In [58]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [62]:
evaluator = MulticlassClassificationEvaluator(labelCol='CategoryNum',metricName='accuracy')

In [63]:
evaluator.evaluate(detect)

0.9586776859504132

### Precisión y otras cosas:

In [65]:
from pyspark.mllib.evaluation import MulticlassMetrics

In [None]:
lr_metric = MulticlassMetrics(detect)

In [None]:
"""
spark.sparkContext.stop()
print('Sacabao')catOriginal
"""
print("Pasó por aquí")