In [1]:
import seaborn as sns
import pandas as pd
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this notebook.
spark = SparkSession.builder.appName('clasificacion_penguins').getOrCreate()

# Load the penguins dataset through seaborn and drop the rows that contain
# nulls before converting the frame into a Spark DataFrame.
penguins_without_nulls = sns.load_dataset('penguins').dropna()
df = spark.createDataFrame(penguins_without_nulls)
df.show(3)

+-------+---------+--------------+-------------+-----------------+-----------+------+
|species|   island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|
+-------+---------+--------------+-------------+-----------------+-----------+------+
| Adelie|Torgersen|          39.1|         18.7|            181.0|     3750.0|  Male|
| Adelie|Torgersen|          39.5|         17.4|            186.0|     3800.0|Female|
| Adelie|Torgersen|          40.3|         18.0|            195.0|     3250.0|Female|
+-------+---------+--------------+-------------+-----------------+-----------+------+
only showing top 3 rows



## Encoding y Assembler

Para poder usar las columnas categóricas tenemos que codificarlas, como ocurría en scikit-learn.

La diferencia es que, para poder usar OneHotEncoder, primero tenemos que usar StringIndexer, porque el OneHotEncoder de Spark requiere índices numéricos como entrada.

#### Es obligatorio pasar las columnas categóricas a numéricas y después aplicarles el OneHotEncoder

In [2]:
# StringIndexers: turn the categorical string columns into numeric indices.

from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder

# Fit the indexer on 'species' first, then use the fitted model to append
# the 'species_indexed' column to the frame.
indexer_species = StringIndexer(inputCol='species', outputCol='species_indexed')
species_indexer_model = indexer_species.fit(df)
df = species_indexer_model.transform(df)
df.show(2)

+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+
|species|   island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|species_indexed|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+
| Adelie|Torgersen|          39.1|         18.7|            181.0|     3750.0|  Male|            0.0|
| Adelie|Torgersen|          39.5|         17.4|            186.0|     3800.0|Female|            0.0|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+
only showing top 2 rows



In [3]:
# Column to predict: 'island', indexed into the numeric 'label' column.
indexer_islands = StringIndexer(inputCol='island', outputCol='label')
island_indexer_model = indexer_islands.fit(df)
df = island_indexer_model.transform(df)
df.show(2)

+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+
|species|   island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|species_indexed|label|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+
| Adelie|Torgersen|          39.1|         18.7|            181.0|     3750.0|  Male|            0.0|  2.0|
| Adelie|Torgersen|          39.5|         17.4|            186.0|     3800.0|Female|            0.0|  2.0|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+
only showing top 2 rows



In [4]:
# Sex column: indexed into the numeric 'sex_indexed' column.
indexer_sex = StringIndexer(inputCol='sex', outputCol='sex_indexed')
sex_indexer_model = indexer_sex.fit(df)
df = sex_indexer_model.transform(df)
df.show(2)

+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+
|species|   island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|species_indexed|label|sex_indexed|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+
| Adelie|Torgersen|          39.1|         18.7|            181.0|     3750.0|  Male|            0.0|  2.0|        0.0|
| Adelie|Torgersen|          39.5|         17.4|            186.0|     3800.0|Female|            0.0|  2.0|        1.0|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+
only showing top 2 rows



In [5]:
# One-hot encode the indexed categorical inputs intended for 'features'.
# Mapping of indexed input column -> one-hot output column (order matters:
# inputCols and outputCols are paired positionally).
onehot_column_map = {
    'species_indexed': 'species_onehot',
    'sex_indexed': 'sex_onehot',
}

encoder = OneHotEncoder(
    inputCols=list(onehot_column_map.keys()),
    outputCols=list(onehot_column_map.values()),
)

# Fit the encoder, then append the one-hot vector columns to the frame.
encoder_model = encoder.fit(df)
df = encoder_model.transform(df)
df.show(2)

+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+--------------+-------------+
|species|   island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|species_indexed|label|sex_indexed|species_onehot|   sex_onehot|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+--------------+-------------+
| Adelie|Torgersen|          39.1|         18.7|            181.0|     3750.0|  Male|            0.0|  2.0|        0.0| (2,[0],[1.0])|(1,[0],[1.0])|
| Adelie|Torgersen|          39.5|         17.4|            186.0|     3800.0|Female|            0.0|  2.0|        1.0| (2,[0],[1.0])|    (1,[],[])|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+--------------+-------------+
only showing top 2 rows



In [6]:
# VectorAssembler: pack every model input into a single 'features' vector.
# Numeric measurements are used as-is.
numeric_feature_cols = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
# The one-hot encoded categorical columns produced by the OneHotEncoder cell
# must be listed here as well; otherwise species and sex never reach the model
# even though they were encoded for that purpose.
onehot_feature_cols = ['species_onehot', 'sex_onehot']

assembler = VectorAssembler(
    inputCols=numeric_feature_cols + onehot_feature_cols,
    outputCol='features'
)

# VectorAssembler is a pure transformer (no fit step); it appends 'features'.
df = assembler.transform(df)
df.show(2)

+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+--------------+-------------+--------------------+
|species|   island|bill_length_mm|bill_depth_mm|flipper_length_mm|body_mass_g|   sex|species_indexed|label|sex_indexed|species_onehot|   sex_onehot|            features|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+--------------+-------------+--------------------+
| Adelie|Torgersen|          39.1|         18.7|            181.0|     3750.0|  Male|            0.0|  2.0|        0.0| (2,[0],[1.0])|(1,[0],[1.0])|[39.1,18.7,181.0,...|
| Adelie|Torgersen|          39.5|         17.4|            186.0|     3800.0|Female|            0.0|  2.0|        1.0| (2,[0],[1.0])|    (1,[],[])|[39.5,17.4,186.0,...|
+-------+---------+--------------+-------------+-----------------+-----------+------+---------------+-----+-----------+--------------+-------------+--

In [7]:
# Keep only 'features' and 'label' so the frame is ready for modelling.
modelling_columns = ['features', 'label']
df_to_predict = df.select(*modelling_columns)
df_to_predict.show(2)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[39.1,18.7,181.0,...|  2.0|
|[39.5,17.4,186.0,...|  2.0|
+--------------------+-----+
only showing top 2 rows



In [8]:
# Random train/test split with weights 0.8 / 0.2; the seed is fixed at 42.
split_weights = [0.8, 0.2]
df_train, df_test = df_to_predict.randomSplit(split_weights, seed=42)