In [1]:
#sc.stop()

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

spark = SparkSession\
    .builder\
    .master("local[4]")\
    .appName("MLib Decision Tree")\
    .getOrCreate()

sc = spark.sparkContext

In [3]:
sc._conf.getAll()

[('spark.app.id', 'local-1638856180271'),
 ('spark.local.dir', '/home/marco/claseBigData/ProyectoBD/tmp'),
 ('spark.executor.id', 'driver'),
 ('spark.driver.memory', '16g'),
 ('spark.app.name', 'PySparkShell'),
 ('spark.driver.host', '192.168.3.5'),
 ('spark.sql.catalogImplementation', 'hive'),
 ('spark.rdd.compress', 'True'),
 ('spark.driver.port', '35803'),
 ('spark.app.startTime', '1638856179666'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.submit.pyFiles', ''),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true')]

### Carga de Datos

In [4]:
irisData = sc.textFile("clase/iris.csv")
irisData.cache()
irisData.take(5)

['5.1,3.5,1.4,0.2,setosa',
 '4.9,3,1.4,0.2,setosa',
 '4.7,3.2,1.3,0.2,setosa',
 '4.6,3.1,1.5,0.2,setosa',
 '5,3.6,1.4,0.2,setosa']

In [5]:
# Archivo no contiene encabezados
irisData.take(5)

['5.1,3.5,1.4,0.2,setosa',
 '4.9,3,1.4,0.2,setosa',
 '4.7,3.2,1.3,0.2,setosa',
 '4.6,3.1,1.5,0.2,setosa',
 '5,3.6,1.4,0.2,setosa']

In [6]:
irisData.count()

150

### Preprocesamiento de datos

In [7]:
from pyspark.sql import Row

# Crea DataFrame a partir de los datos
parts = irisData.map(lambda l: l.split(","))
irisMap = parts.map(lambda p: Row(SEPAL_LENGTH=float(p[0]),\
                                 SEPAL_WIDTH=float(p[1]),\
                                 PETAL_LENGTH=float(p[2]),\
                                 PETAL_WIDTH=float(p[3]),\
                                 SPECIES=p[4]))

# Infiere el esquema y registra el dataframe como tabla
irisDf = spark.createDataFrame(irisMap)
irisDf.cache()

DataFrame[SEPAL_LENGTH: double, SEPAL_WIDTH: double, PETAL_LENGTH: double, PETAL_WIDTH: double, SPECIES: string]

In [8]:
irisDf.show()

+------------+-----------+------------+-----------+-------+
|SEPAL_LENGTH|SEPAL_WIDTH|PETAL_LENGTH|PETAL_WIDTH|SPECIES|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1| 

In [9]:
# Agregamos un indexador numerico para la etiqueta / columna de destino
from pyspark.ml.feature import StringIndexer
stringIndexer = StringIndexer(inputCol="SPECIES",outputCol="IND_SPECIES")
si_model = stringIndexer.fit(irisDf)
irisNormDf = si_model.transform(irisDf)

irisNormDf.select("SPECIES","IND_SPECIES").distinct().collect()

[Row(SPECIES='setosa', IND_SPECIES=0.0),
 Row(SPECIES='virginica', IND_SPECIES=2.0),
 Row(SPECIES='versicolor', IND_SPECIES=1.0)]

In [10]:
irisNormDf.cache()

DataFrame[SEPAL_LENGTH: double, SEPAL_WIDTH: double, PETAL_LENGTH: double, PETAL_WIDTH: double, SPECIES: string, IND_SPECIES: double]

### Data Analisis

In [11]:
irisNormDf.describe().show()

+-------+------------------+-------------------+-----------------+------------------+---------+------------------+
|summary|      SEPAL_LENGTH|        SEPAL_WIDTH|     PETAL_LENGTH|       PETAL_WIDTH|  SPECIES|       IND_SPECIES|
+-------+------------------+-------------------+-----------------+------------------+---------+------------------+
|  count|               150|                150|              150|               150|      150|               150|
|   mean| 5.843333333333332| 3.0573333333333337|3.758000000000001|1.1993333333333331|     null|               1.0|
| stddev|0.8280661279778633|0.43586628493669777|1.765298233259466|0.7622376689603467|     null|0.8192319205190407|
|    min|               4.3|                2.0|              1.0|               0.1|   setosa|               0.0|
|    max|               7.9|                4.4|              6.9|               2.5|virginica|               2.0|
+-------+------------------+-------------------+-----------------+--------------

### El siguiente paso no se deberia realizar ya que estariamos tomando correlacion de numericas vs categoricas

In [12]:
# Encuentra la correlacion entre los predictores y el objetivo
for i in irisNormDf.columns:
    if not (isinstance(irisNormDf.select(i).take(1)[0][0], str)):
        print("Correlation to SPECIES for ",i,irisNormDf.stat.corr("IND_SPECIES",i))

Correlation to SPECIES for  SEPAL_LENGTH 0.7825612318100816
Correlation to SPECIES for  SEPAL_WIDTH -0.4266575607811234
Correlation to SPECIES for  PETAL_LENGTH 0.9490346990083889
Correlation to SPECIES for  PETAL_WIDTH 0.9565473328764028
Correlation to SPECIES for  IND_SPECIES 1.0


### Preparar datos para usar MLlib

In [13]:
from pyspark.ml.linalg import Vectors

def transformToLabeledPoint(row):
    lp = (row["SPECIES"], row["IND_SPECIES"], Vectors.dense([row["SEPAL_LENGTH"],\
                                                             row["SEPAL_WIDTH"],\
                                                             row["PETAL_LENGTH"],\
                                                             row["PETAL_WIDTH"]]))
    return lp

In [14]:
irisLp = irisNormDf.rdd.map(transformToLabeledPoint)
irisLpDf = spark.createDataFrame(irisLp,["species","label","features"])
irisLpDf.select("species","label","features").show(10)
irisLpDf.cache()

+-------+-----+-----------------+
|species|label|         features|
+-------+-----+-----------------+
| setosa|  0.0|[5.1,3.5,1.4,0.2]|
| setosa|  0.0|[4.9,3.0,1.4,0.2]|
| setosa|  0.0|[4.7,3.2,1.3,0.2]|
| setosa|  0.0|[4.6,3.1,1.5,0.2]|
| setosa|  0.0|[5.0,3.6,1.4,0.2]|
| setosa|  0.0|[5.4,3.9,1.7,0.4]|
| setosa|  0.0|[4.6,3.4,1.4,0.3]|
| setosa|  0.0|[5.0,3.4,1.5,0.2]|
| setosa|  0.0|[4.4,2.9,1.4,0.2]|
| setosa|  0.0|[4.9,3.1,1.5,0.1]|
+-------+-----+-----------------+
only showing top 10 rows



DataFrame[species: string, label: double, features: vector]

### Realizar aprendizaje de maquina

In [15]:
(trainingData, testData) = irisLpDf.randomSplit([0.8,0.2])
print("trainingData.count(): ",trainingData.count())
print("testData.count(): ",testData.count())

trainingData.count():  127
testData.count():  23


In [16]:
testData.collect()

[Row(species='setosa', label=0.0, features=DenseVector([4.3, 3.0, 1.1, 0.1])),
 Row(species='setosa', label=0.0, features=DenseVector([4.5, 2.3, 1.3, 0.3])),
 Row(species='setosa', label=0.0, features=DenseVector([4.6, 3.2, 1.4, 0.2])),
 Row(species='setosa', label=0.0, features=DenseVector([4.6, 3.6, 1.0, 0.2])),
 Row(species='setosa', label=0.0, features=DenseVector([4.9, 3.6, 1.4, 0.1])),
 Row(species='setosa', label=0.0, features=DenseVector([5.0, 3.3, 1.4, 0.2])),
 Row(species='setosa', label=0.0, features=DenseVector([5.1, 3.5, 1.4, 0.3])),
 Row(species='setosa', label=0.0, features=DenseVector([5.1, 3.8, 1.5, 0.3])),
 Row(species='setosa', label=0.0, features=DenseVector([5.4, 3.9, 1.7, 0.4])),
 Row(species='versicolor', label=1.0, features=DenseVector([4.9, 2.4, 3.3, 1.0])),
 Row(species='versicolor', label=1.0, features=DenseVector([5.9, 3.0, 4.2, 1.5])),
 Row(species='versicolor', label=1.0, features=DenseVector([5.9, 3.2, 4.8, 1.8])),
 Row(species='versicolor', label=1.0, fe

In [17]:
# Construir el modelo sobre los datos de entrenamiento
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Crear el modelo
dtClassifier = DecisionTreeClassifier(maxDepth=2, labelCol="label",featuresCol="features")
dtModel = dtClassifier.fit(trainingData)

# Muestra de metricas
print("Nro de Nodos: ",dtModel.numNodes)
print("Profundidad: ",dtModel.depth)


# Predict sobre los datos de test
predictions = dtModel.transform(testData)
predictions.select("prediction","species","label").collect()

Nro de Nodos:  5
Profundidad:  2


[Row(prediction=0.0, species='setosa', label=0.0),
 Row(prediction=0.0, species='setosa', label=0.0),
 Row(prediction=0.0, species='setosa', label=0.0),
 Row(prediction=0.0, species='setosa', label=0.0),
 Row(prediction=0.0, species='setosa', label=0.0),
 Row(prediction=0.0, species='setosa', label=0.0),
 Row(prediction=0.0, species='setosa', label=0.0),
 Row(prediction=0.0, species='setosa', label=0.0),
 Row(prediction=0.0, species='setosa', label=0.0),
 Row(prediction=1.0, species='versicolor', label=1.0),
 Row(prediction=1.0, species='versicolor', label=1.0),
 Row(prediction=2.0, species='versicolor', label=1.0),
 Row(prediction=1.0, species='versicolor', label=1.0),
 Row(prediction=1.0, species='versicolor', label=1.0),
 Row(prediction=1.0, species='versicolor', label=1.0),
 Row(prediction=1.0, species='versicolor', label=1.0),
 Row(prediction=1.0, species='versicolor', label=1.0),
 Row(prediction=2.0, species='virginica', label=2.0),
 Row(prediction=2.0, species='virginica', label

In [18]:
# Evaluar Accuracy

evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",\
                               labelCol="label", metricName="accuracy")
evaluator.evaluate(predictions)

0.9565217391304348

### Matriz de confusion

In [19]:
predictions.groupBy("label","prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|    7|
|  2.0|       2.0|    6|
|  1.0|       2.0|    1|
|  0.0|       0.0|    9|
+-----+----------+-----+



In [20]:
sc.stop()

In [21]:
import sys
print(sys.executable)
print(sys.version)
print(sys.version_info)

/home/marco/anaconda3/bin/python
3.8.3 (default, Jul  2 2020, 16:21:59) 
[GCC 7.3.0]
sys.version_info(major=3, minor=8, micro=3, releaselevel='final', serial=0)
