#### PARTE 1 (10 %) Carga de datos de diamonds desde CSV con schema: https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/diamonds.csv

In [1]:
import pyspark
import seaborn as sns
import pandas as pd
import requests
from pyspark.sql.types import StructType, StructField, FloatType, StringType, IntegerType, NumericType
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.feature import StringIndexer, Imputer, OneHotEncoder, VectorAssembler, MinMaxScaler

In [2]:
spark = SparkSession.builder.appName('diamonds_evaluate').getOrCreate()

# Cargamos el CSV y hacemos el Schema

url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/refs/heads/master/diamonds.csv'
csv_path = 'diamonds.csv'

with open(csv_path, 'wb') as file:
    file.write(requests.get(url).content)
    
schema = StructType([
    StructField('carat', FloatType(), True),
    StructField('cut', StringType(), True),
    StructField('color', StringType(), True),
    StructField('clarity', StringType(), True),
    StructField('depth', FloatType(), True),
    StructField('table', FloatType(), True),
    StructField('price', IntegerType(), True),
    StructField('x', FloatType(), True),
    StructField('y', FloatType(), True),
    StructField('z', FloatType(), True),
])

df = spark.read.csv(csv_path, header=True, inferSchema=False, schema=schema)
df.show(3)
df.printSchema()

+-----+-------+-----+-------+-----+-----+-----+----+----+----+
|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+
| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|
| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|
| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+
only showing top 3 rows

root
 |-- carat: float (nullable = true)
 |-- cut: string (nullable = true)
 |-- color: string (nullable = true)
 |-- clarity: string (nullable = true)
 |-- depth: float (nullable = true)
 |-- table: float (nullable = true)
 |-- price: integer (nullable = true)
 |-- x: float (nullable = true)
 |-- y: float (nullable = true)
 |-- z: float (nullable = true)



#### PARTE 2 (40 %) Pipeline regresión price con preprocesados
  * Imputer, StringIndexer, OneHotEncoder, MinMaxScaler o StandardScaler, VectorAssembler

In [3]:
# Como vamos a predecir 'Price' borramos filas donde 'Price' sea nan:
df_reg = df.dropna(subset=['price'])

# Contamos nulos en todas las columnas: equivalente a pandas df.isna().sum()
df_reg.select([sum(col(c).isNull().cast('int')).alias(c) for c in df_reg.columns]).show()

+-----+---+-----+-------+-----+-----+-----+---+---+---+
|carat|cut|color|clarity|depth|table|price|  x|  y|  z|
+-----+---+-----+-------+-----+-----+-----+---+---+---+
|    0|  0|    0|      0|    0|    0|    0|  0|  0|  0|
+-----+---+-----+-------+-----+-----+-----+---+---+---+



In [4]:
# Renombramos la columna 'Price' como 'Label'
df_reg = df_reg.withColumnRenamed('price', 'label')
df_reg.show(3)

+-----+-------+-----+-------+-----+-----+-----+----+----+----+
|carat|    cut|color|clarity|depth|table|label|   x|   y|   z|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+
| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|
| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|
| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+
only showing top 3 rows



In [5]:
# Separamos las columnas a las que aplicar preprocesados
numerical_columns = [field.name for field in df_reg.schema.fields if isinstance(field.dataType, NumericType) and field.name != 'price']
categorical_columns = [field.name for field in df_reg.schema.fields if isinstance(field.dataType, StringType)]
label_col = 'price'

print(numerical_columns)
print(categorical_columns)

['carat', 'depth', 'table', 'label', 'x', 'y', 'z']
['cut', 'color', 'clarity']


In [6]:
# Hacemos el Indexers para las columnas(features) categoricas de la entrada
# Crea un objeto StringIndexer por cada columna
indexers_features = [
    StringIndexer(inputCol=c, outputCol=c + '_indexed', handleInvalid='keep') for c in categorical_columns
]
categorical_columns_indexed = [c + '_indexed' for c in categorical_columns]

print(categorical_columns_indexed)

['cut_indexed', 'color_indexed', 'clarity_indexed']


In [7]:
# Hacemos el Imputer con la moda para las columnas categoricas indexadas
imputer_categorical = Imputer(
    inputCols=categorical_columns_indexed,
    outputCols=[c + '_imputed' for c in categorical_columns_indexed],
    strategy='mode'
)
categorical_cols_indexed_imputed = [c + '_imputed' for c in categorical_columns_indexed]

print(categorical_cols_indexed_imputed)

['cut_indexed_imputed', 'color_indexed_imputed', 'clarity_indexed_imputed']


In [8]:
# Hacemos el OneHotEncoder para las columnas categoricas 
encoders_onehot = [
    OneHotEncoder(inputCol=c, outputCol=c + '_onehot') 
    for c in categorical_cols_indexed_imputed
]
categorical_cols_onehot = [c + '_onehot' for c in categorical_cols_indexed_imputed]

print(categorical_cols_onehot)

['cut_indexed_imputed_onehot', 'color_indexed_imputed_onehot', 'clarity_indexed_imputed_onehot']


In [9]:
# Hacemos el Imputer a las columnas numericas
imputer_numerical = Imputer(
    inputCols=numerical_columns,
    outputCols=[c + '_imputed' for c in numerical_columns],
    strategy='median'
)
numerical_cols_imputed = [c + '_imputed' for c in numerical_columns]

print(numerical_cols_imputed)

['carat_imputed', 'depth_imputed', 'table_imputed', 'label_imputed', 'x_imputed', 'y_imputed', 'z_imputed']


In [10]:
# Escalamos columnas numericas con MinMaxScaler
assembler_numerical = VectorAssembler(
    inputCols=numerical_cols_imputed,
    outputCol='numeric_features'
)
scaler = MinMaxScaler(
    inputCol='numeric_features',
    outputCol='numeric_features_scaled'
)

In [11]:
# Juntamos todas las columnas
all_columns = ['numeric_features_scaled'] + categorical_cols_onehot

print(all_columns)

['numeric_features_scaled', 'cut_indexed_imputed_onehot', 'color_indexed_imputed_onehot', 'clarity_indexed_imputed_onehot']


In [12]:
# Ensamblamos las columnas numéricas + categóricas y obtenemos features
assembler_all = VectorAssembler(
    inputCols=all_columns,
    outputCol='features'
)

In [13]:
# Hacemos el Regressor 
regressor = RandomForestRegressor(seed=42)

In [14]:
# Hacemos el particionamiento de datos
df_train, df_test = df_reg.randomSplit([0.8, 0.2], seed=42)

In [15]:
# Hacemos el PipeLine
pipeline = Pipeline(stages = [
    # Hacemos el Indexers para las columnas categóricas 
    *indexers_features, # Ponemos * porque es una lista de objetos
    # Hacemos el Imputer para las columnas categóricas
    imputer_categorical,
    # Hacemos el OneHotEncoders para columnas categóricas
    *encoders_onehot,
    # Hacemos el Imputer para columnas numéricas
    imputer_numerical,
    # Ensamblamos columnas numéricas + escalado
    assembler_numerical,
    scaler,
    # Ensamblamos columnas numéricas escaladas + columnas categóricas en una sola columna 'features'
    assembler_all,
    # 8. Modelo
    regressor
])

In [16]:
# Entrenamos el PipeLine
pipeline_model = pipeline.fit(df_train)
df_pred = pipeline_model.transform(df_test)

In [17]:
# Creamos los evaluadores
evaluator_r2 = RegressionEvaluator(metricName='r2')
evaluator_mae = RegressionEvaluator(metricName='mae')
evaluator_mse = RegressionEvaluator(metricName='mse')
evaluator_rmse = RegressionEvaluator(metricName='rmse')

print(f"r2: {evaluator_r2.evaluate(df_pred):.3f}")
print(f"mae: {evaluator_mae.evaluate(df_pred):.3f}")
print(f"mse: {evaluator_mse.evaluate(df_pred):.3f}")
print(f"rmse: {evaluator_rmse.evaluate(df_pred):.3f}")

r2: 0.983
mae: 303.824
mse: 280472.219
rmse: 529.596


#### PARTE 3 (40 %) Pipeline clasificación multiclase sobre variable cut con preprocesados
  * Imputer, StringIndexer, OneHotEncoder, MinMaxScaler o StandardScaler, VectorAssembler

In [18]:
# Como vamos a predecir la columna 'Cut' borramos filas donde 'Cut' sea nan:
df_class = df.dropna(subset=['cut'])

# Contamos nulos en todas las columnas: equivalente a pandas df.isna().sum()
df_class.select([sum(col(c).isNull().cast('int')).alias(c) for c in df_class.columns]).show()

+-----+---+-----+-------+-----+-----+-----+---+---+---+
|carat|cut|color|clarity|depth|table|price|  x|  y|  z|
+-----+---+-----+-------+-----+-----+-----+---+---+---+
|    0|  0|    0|      0|    0|    0|    0|  0|  0|  0|
+-----+---+-----+-------+-----+-----+-----+---+---+---+



In [19]:
# Separamos las columnas a las que aplicar preprocesados
numerical_columns = [field.name for field in df_class.schema.fields if isinstance(field.dataType, NumericType)]
categorical_columns = [field.name for field in df_class.schema.fields if isinstance(field.dataType, StringType) and field.name != 'cut']
label_col = 'cut'

In [20]:
# Hacemos Indexer para la columna 'Cut'
indexer_label = StringIndexer(
    inputCol=label_col, # Indexa columna 'Cut', que es la que queremos predecir
    outputCol='label',
    handleInvalid='keep'
)

In [21]:
# Hacemos Indexer para las columnas(features) de la entrada
# Crea un objeto StringIndexer por cada columna categórica a indexar
indexers_features = [
    StringIndexer(inputCol=c, outputCol=c + '_indexed', handleInvalid='keep') for c in categorical_columns
]
categorical_cols_indexed = [c + '_indexed' for c in categorical_columns]

print(categorical_cols_indexed)

['color_indexed', 'clarity_indexed']


In [22]:
# Hacemos Imputer a las columnas categoricas
imputer_categorical = Imputer(
    inputCols=categorical_cols_indexed,
    outputCols=[c + '_imputed' for c in categorical_cols_indexed],
    strategy='mode'
)
categorical_cols_indexed_imputed = [c + '_imputed' for c in categorical_cols_indexed]

print(categorical_cols_indexed_imputed)

['color_indexed_imputed', 'clarity_indexed_imputed']


In [23]:
# Hacemos OneHotEncoder a als columnas categoricas
encoders_onehot = [
    OneHotEncoder(inputCol=c, outputCol=c + '_onehot') 
    for c in categorical_cols_indexed_imputed
]
categorical_cols_onehot = [c + '_onehot' for c in categorical_cols_indexed_imputed]

print(categorical_cols_onehot)

['color_indexed_imputed_onehot', 'clarity_indexed_imputed_onehot']


In [24]:
# Hacemos Imputer a las columnas numericas
imputer_numerical = Imputer(
    inputCols=numerical_columns,
    outputCols=[c + '_imputed' for c in numerical_columns],
    strategy='median'
)
numerical_cols_imputed = [c + '_imputed' for c in numerical_columns]

print(numerical_cols_imputed)

['carat_imputed', 'depth_imputed', 'table_imputed', 'price_imputed', 'x_imputed', 'y_imputed', 'z_imputed']


In [25]:
# Escalamos las columnas numéricas con MinMaxScaler
assembler_numerical = VectorAssembler(
    inputCols=numerical_cols_imputed,
    outputCol='numeric_features'
)
scaler = MinMaxScaler(
    inputCol='numeric_features',
    outputCol='numeric_features_scaled'
)

In [26]:
# JUntamos todas las columnas
all_columns = ['numeric_features_scaled'] + categorical_cols_onehot

print(all_columns)

['numeric_features_scaled', 'color_indexed_imputed_onehot', 'clarity_indexed_imputed_onehot']


In [27]:
# Ensamblamos las columnas numéricas + categóricas y obtenemos features
assembler_all = VectorAssembler(
    inputCols=all_columns,
    outputCol='features'
)

In [28]:
# Hacemos el clasificador
classifier = RandomForestClassifier(seed=42)

In [29]:
# Hacemos el particionamiento
df_train, df_test = df_class.randomSplit([0.8, 0.2], seed=42)

In [30]:
pipeline = Pipeline(stages = [
    # Hacemos Indexer para 'cut' que es la columna a predecir
    indexer_label,
    # Hacemos Indexer para columnas categóricas
    *indexers_features, # ponemos * porque es una lista de objetos
    # Hacemos Imputer para columnas categóricas
    imputer_categorical,
    # Hacemo OneHotEncoders para columnas categóricas
    *encoders_onehot,
    # Hacemos Imputer para columnas numéricas
    imputer_numerical,
    # Ensamblamos columnas numéricas y hacemos escalado
    assembler_numerical,
    scaler,
    # Ensamblamos numéricas escaladas y categóricas en la columna 'features'
    assembler_all,
    # Hacemos el modelo de clasificación
    classifier
])

In [31]:
# Entrenamos el PipeLine
pipeline_model = pipeline.fit(df_train)
df_pred = pipeline_model.transform(df_test)

In [32]:
# Creamos los evaluadores
evaluator_accuracy = MulticlassClassificationEvaluator(metricName='accuracy')
evaluator_f1 = MulticlassClassificationEvaluator(metricName='f1')
evaluator_precision = MulticlassClassificationEvaluator(metricName='weightedPrecision')
evaluator_recall = MulticlassClassificationEvaluator(metricName='weightedRecall')

print(f"accuracy: {evaluator_accuracy.evaluate(df_pred):.3f}")
print(f"f1: {evaluator_f1.evaluate(df_pred):.3f}")
print(f"precision: {evaluator_precision.evaluate(df_pred):.3f}")
print(f"recall: {evaluator_recall.evaluate(df_pred):.3f}")

accuracy: 0.669
f1: 0.619
precision: 0.647
recall: 0.669


#### PARTE 4 (10 %) Gridsearch con CrossValidation sobre cualquiera de los pipelines

Los modelos, se puede utilizar RandomForest para los dos por ejemplo o el que se quiera. Ejemplo RandomForestRegressor para regresión y MultiLayerPerceptronClassifier para clasificación.

In [33]:
# Creamos el Gridsearch
paramGrid = (
    ParamGridBuilder()
    .addGrid(classifier.numTrees, [15, 20, 25, 30]) # Por defecto es 20
    .addGrid(classifier.maxDepth, [1, 3, 5, 10, 15, 20]) # Por defecto es 5(Tiene un rango de [0, 30])
    .build()
)

In [34]:
# Creamo sel CrossValidator
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid, # Parámetros para grid search hyper parameter tuning
    evaluator=evaluator_f1,
    numFolds=3, # Valor por defecto
    parallelism=4,
    seed=42
)

In [35]:
# Entrenamos el modelo
cv_model = crossval.fit(df_train)
df_pred = cv_model.transform(df_test)

In [None]:
# Mostramos las metricas
print(f"accuracy: {evaluator_accuracy.evaluate(df_pred):.3f}")
print(f"f1: {evaluator_f1.evaluate(df_pred):.3f}")
print(f"precision: {evaluator_precision.evaluate(df_pred):.3f}")
print(f"recall: {evaluator_recall.evaluate(df_pred):.3f}")

accuracy: 0.721
f1: 0.707
precision: 0.706
recall: 0.721
