### MLlib

MLlib es la librería de Machine Learning de PySpark; su API es similar a la de scikit-learn.

In [1]:
import seaborn as sns
import pandas as pd

In [3]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session that backs every DataFrame in this notebook.
session_builder = SparkSession.builder.appName('regresion_diamonds')
spark = session_builder.getOrCreate()

# Load the seaborn "diamonds" sample dataset and convert it to a Spark DataFrame.
diamonds_pdf = sns.load_dataset('diamonds')
df = spark.createDataFrame(diamonds_pdf)
df.show(3)

+-----+-------+-----+-------+-----+-----+-----+----+----+----+
|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+
| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|
| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|
| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+
only showing top 3 rows



In [None]:
from pyspark.ml.regression import LinearRegression

# Uncomment the line below to print the official usage documentation of the estimator.
# help(LinearRegression)

In [7]:
# Build the model input ("X"), which pyspark.ml expects as a single vector column.
from pyspark.ml.feature import VectorAssembler

# Numeric columns that are packed, in this order, into the feature vector.
numeric_inputs = ['carat', 'depth', 'x', 'y', 'z', 'table']

# The output column is called 'features' so it matches what the algorithms ask for.
assembler = VectorAssembler(inputCols=numeric_inputs, outputCol='features')
df_assembled = assembler.transform(df)
df_assembled.show(3)
# The new 'features' column holds the values of the listed numeric columns for each row.

+-----+-------+-----+-------+-----+-----+-----+----+----+----+--------------------+
|carat|    cut|color|clarity|depth|table|price|   x|   y|   z|            features|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+--------------------+
| 0.23|  Ideal|    E|    SI2| 61.5| 55.0|  326|3.95|3.98|2.43|[0.23,61.5,3.95,3...|
| 0.21|Premium|    E|    SI1| 59.8| 61.0|  326|3.89|3.84|2.31|[0.21,59.8,3.89,3...|
| 0.23|   Good|    E|    VS1| 56.9| 65.0|  327|4.05|4.07|2.31|[0.23,56.9,4.05,4...|
+-----+-------+-----+-------+-----+-----+-----+----+----+----+--------------------+
only showing top 3 rows



In [None]:
# Keep only the two columns used for modelling, under the names pyspark expects:
#   - 'features' is the input ("X")
#   - 'label' is the output ("Y"), i.e. the original 'price' column renamed
label_column = df_assembled['price'].alias('label')
df_features_label = df_assembled.select('features', label_column)
df_features_label.show(3)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.23,61.5,3.95,3...|  326|
|[0.21,59.8,3.89,3...|  326|
|[0.23,56.9,4.05,4...|  327|
+--------------------+-----+
only showing top 3 rows



In [10]:
# Train/test partition: roughly 80% of the rows for training, 20% held out for testing.
# The seed is fixed so the same split is requested on every run.
train_test_splits = df_features_label.randomSplit(weights=[0.8, 0.2], seed=42)
df_train, df_test = train_test_splits

In [None]:
# Linear regression: define the estimator, fit it on the training split,
# then add a 'prediction' column to the held-out split.
lr = LinearRegression()
model = lr.fit(df_train)
df_pred = model.transform(df_test)
df_pred.show(n=4)

+--------------------+-----+------------------+
|            features|label|        prediction|
+--------------------+-----+------------------+
|[0.22,59.3,3.91,3...|  404| 23.60821074172054|
|[0.23,56.9,4.05,4...|  327| 135.6672490111814|
|[0.23,59.4,4.0,4....|  338| 104.8061350055832|
|[0.23,60.5,3.96,3...|  357|-69.15086283582787|
+--------------------+-----+------------------+
only showing top 4 rows



In [20]:
from pyspark.ml.regression import DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# One evaluator per regression metric (R², MAE, MSE, RMSE).
# Only metricName is set; the evaluator's column settings are left at their defaults.
evaluator_r2, evaluator_mae, evaluator_mse, evaluator_rmse = (
    RegressionEvaluator(metricName=metric)
    for metric in ('r2', 'mae', 'mse', 'rmse')
)

In [21]:
# Decision tree regression: define, fit on the training split, predict on the test split.
tree = DecisionTreeRegressor()
model = tree.fit(df_train)
df_pred = model.transform(df_test)

# Report every metric on the test predictions, one "<name> <value>" line each.
for metric_name, evaluator in (
    ('r2', evaluator_r2),
    ('mae', evaluator_mae),
    ('mse', evaluator_mse),
    ('rmse', evaluator_rmse),
):
    print(metric_name, evaluator.evaluate(df_pred))

r2 0.8735318379567073
mae 803.4717018889393
mse 2015965.0804856266
rmse 1419.8468510672644


In [22]:
# Random forest regression with 200 trees.
# A random forest is stochastic (bootstrap sampling and feature subsetting),
# so the seed is fixed to make the metrics reproducible from run to run.
# The value 42 matches the seed already used for the train/test split.
forest = RandomForestRegressor(numTrees=200, seed=42)
model = forest.fit(df_train)        # Training on the training split
df_pred = model.transform(df_test)  # Adds a 'prediction' column to the test split

# Report every metric on the test predictions, one "<name> <value>" line each.
print('r2', evaluator_r2.evaluate(df_pred))
print('mae', evaluator_mae.evaluate(df_pred))
print('mse', evaluator_mse.evaluate(df_pred))
print('rmse', evaluator_rmse.evaluate(df_pred))

r2 0.8772337207183463
mae 796.0495113421505
mse 1956955.2375422209
rmse 1398.9121621968338


In [23]:
# Gradient-boosted trees regression: define, fit on the training split, predict on the test split.
gbt = GBTRegressor()
model = gbt.fit(df_train)
df_pred = model.transform(df_test)

# Report every metric on the test predictions, one "<name> <value>" line each.
for metric_name, evaluator in (
    ('r2', evaluator_r2),
    ('mae', evaluator_mae),
    ('mse', evaluator_mse),
    ('rmse', evaluator_rmse),
):
    print(metric_name, evaluator.evaluate(df_pred))

r2 0.8815473800708988
mae 779.4271243946961
mse 1888193.373027426
rmse 1374.1154875145778
