In [1]:
import seaborn as sns
import pandas as pd
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used throughout this notebook.
spark = SparkSession.builder.appName('regresion_mpg').getOrCreate()

# Load the seaborn 'mpg' dataset with pandas, drop the rows that contain
# missing values, and convert the cleaned frame into a Spark DataFrame.
mpg_pandas = sns.load_dataset('mpg').dropna()
df = spark.createDataFrame(mpg_pandas)
df.show(3)

+----+---------+------------+----------+------+------------+----------+------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|model_year|origin|                name|
+----+---------+------------+----------+------+------------+----------+------+--------------------+
|18.0|        8|       307.0|     130.0|  3504|        12.0|        70|   usa|chevrolet chevell...|
|15.0|        8|       350.0|     165.0|  3693|        11.5|        70|   usa|   buick skylark 320|
|18.0|        8|       318.0|     150.0|  3436|        11.0|        70|   usa|  plymouth satellite|
+----+---------+------------+----------+------+------------+----------+------+--------------------+
only showing top 3 rows



In [2]:
# Option 1
# Assemble the feature vector before the train/test split partitions the data.
from pyspark.ml.feature import VectorAssembler


# VectorAssembler packs the values of the listed input columns into a single
# vector column. The output is called 'features' so that it matches the column
# name the ML algorithms expect.
feature_cols = [
    'cylinders',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
    'model_year',
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
df_assembled = assembler.transform(df)
df_assembled.show(3)

+----+---------+------------+----------+------+------------+----------+------+--------------------+--------------------+
| mpg|cylinders|displacement|horsepower|weight|acceleration|model_year|origin|                name|            features|
+----+---------+------------+----------+------+------------+----------+------+--------------------+--------------------+
|18.0|        8|       307.0|     130.0|  3504|        12.0|        70|   usa|chevrolet chevell...|[8.0,307.0,130.0,...|
|15.0|        8|       350.0|     165.0|  3693|        11.5|        70|   usa|   buick skylark 320|[8.0,350.0,165.0,...|
|18.0|        8|       318.0|     150.0|  3436|        11.0|        70|   usa|  plymouth satellite|[8.0,318.0,150.0,...|
+----+---------+------------+----------+------+------------+----------+------+--------------------+--------------------+
only showing top 3 rows



In [3]:
# Rename the target column 'mpg' to 'label', then keep only the two columns
# needed for modelling.
df_renamed = df_assembled.withColumnRenamed('mpg', 'label')
df_features_label = df_renamed.select('features', 'label')
df_features_label.show(3)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[8.0,307.0,130.0,...| 18.0|
|[8.0,350.0,165.0,...| 15.0|
|[8.0,318.0,150.0,...| 18.0|
+--------------------+-----+
only showing top 3 rows



In [4]:
# Train/test partitioning with weights 0.8 / 0.2 and a fixed seed.
split_weights = [0.8, 0.2]
df_train, df_test = df_features_label.randomSplit(split_weights, seed=42)
df_train.show(3)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[4.0,97.0,46.0,18...| 26.0|
|[4.0,97.0,88.0,21...| 27.0|
|[4.0,104.0,95.0,2...| 25.0|
+--------------------+-----+
only showing top 3 rows



In [5]:
# Option 2
# Partition the data first, then apply VectorAssembler to each partition.
numeric_cols = [
    'cylinders',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
    'model_year',
]
label_col = 'mpg'
df_selected = df.select(*numeric_cols, label_col)

# Keep the un-assembled partitions under their own names so the final
# df_train / df_test are each assigned exactly once.
train_raw, test_raw = df_selected.randomSplit([0.8, 0.2], seed=42)

assembler = VectorAssembler(inputCols=numeric_cols, outputCol='features')

# The target keeps its original name 'mpg' (instead of 'label') to show how
# a custom label column is passed to the ML algorithms.
df_train = assembler.transform(train_raw).select('features', label_col)
df_test = assembler.transform(test_raw).select('features', label_col)
df_train.show(3)

+--------------------+----+
|            features| mpg|
+--------------------+----+
|[4.0,97.0,46.0,18...|26.0|
|[4.0,97.0,88.0,21...|27.0|
|[4.0,104.0,95.0,2...|25.0|
+--------------------+----+
only showing top 3 rows



In [6]:
# Linear regression: build the estimator, fit it on the training partition
# and predict on the test partition.
from pyspark.ml.regression import LinearRegression

# The label column comes from `label_col` (defined next to the train/test
# split) rather than a repeated literal, so the estimator and the metric
# evaluators always refer to the same column.
lr = LinearRegression(labelCol=label_col)  # estimator
model = lr.fit(df_train)  # training
df_pred = model.transform(df_test)  # adds the 'prediction' column
df_pred.show(2)

+--------------------+----+------------------+
|            features| mpg|        prediction|
+--------------------+----+------------------+
|[4.0,97.0,88.0,21...|27.0|25.250093988412512|
|[4.0,113.0,95.0,2...|25.0|24.676738282045054|
+--------------------+----+------------------+
only showing top 2 rows



In [None]:
# Compute the regression metrics on the test predictions.

from pyspark.ml.evaluation import RegressionEvaluator

metric_names = ('r2', 'mae', 'mse', 'rmse')

# One evaluator per metric, all reading the same label column.
evaluators = {
    metric: RegressionEvaluator(metricName=metric, labelCol=label_col)
    for metric in metric_names
}
evaluator_r2 = evaluators['r2']
evaluator_mae = evaluators['mae']
evaluator_mse = evaluators['mse']
evaluator_rmse = evaluators['rmse']


# Print "<metric> <value>" for each metric, in the order listed above.
for metric, evaluator in evaluators.items():
    print(metric, evaluator.evaluate(df_pred))

r2 0.7855411849361144
mae 2.729576315612864
mse 12.79695049563809
rmse 3.577282557422336


In [None]:
# Detect the numeric columns automatically from the schema (the Spark
# counterpart of pandas' `select_dtypes`).
# NOTE(review): this rebinds `numeric_cols`, and the detected list also
# contains the target column 'mpg' - drop it before reusing the list as
# model features.

from pyspark.sql.types import NumericType

numeric_fields = [f for f in df.schema.fields if isinstance(f.dataType, NumericType)]
numeric_cols = [f.name for f in numeric_fields]
numeric_cols

['mpg',
 'cylinders',
 'displacement',
 'horsepower',
 'weight',
 'acceleration',
 'model_year']

In [11]:
# Detect the categorical (string-typed) columns automatically from the schema
# (the Spark counterpart of pandas' `select_dtypes`).

from pyspark.sql.types import StringType

string_fields = filter(lambda f: isinstance(f.dataType, StringType), df.schema.fields)
categorical_cols = [f.name for f in string_fields]
df_categorical = df.select(categorical_cols)
df_categorical.show(3)

+------+--------------------+
|origin|                name|
+------+--------------------+
|   usa|chevrolet chevell...|
|   usa|   buick skylark 320|
|   usa|  plymouth satellite|
+------+--------------------+
only showing top 3 rows

