# Ejemplo de modelización: regresión lineal

In [0]:
df = spark.read.table("insurance_final") # load the "insurance_final" table as a Spark DataFrame

# Show the loaded data; the rendered table appears as this cell's output.
df.display()

age,sex,bmi,children,smoker,region,charges
19,female,27.9,0,yes,southwest,16884.924
18,male,33.77,1,no,southeast,1725.5523
28,male,33.0,3,no,southeast,4449.462
33,male,22.705,0,no,northwest,21984.47061
32,male,28.88,0,no,northwest,3866.8552
31,female,25.74,0,no,southeast,3756.6216
46,female,33.44,1,no,southeast,8240.5896
37,female,27.74,3,no,northwest,7281.5056
37,male,29.83,2,no,northeast,6406.4107
60,female,25.84,0,no,northwest,28923.13692


Definir muestra de entrenamiento y test

In [0]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Collect the Spark DataFrame as a pandas DataFrame so scikit-learn can use it.
pdf = df.toPandas()

# Target: the "charges" column.
y = pdf["charges"]

# Explanatory variables: every other column, with the categorical ones
# one-hot encoded as integers (drop_first=True drops one level per variable).
X = pd.get_dummies(pdf.drop("charges", axis=1), drop_first=True, dtype=int)

# Reproducible split thanks to the fixed random_state.
# NOTE(review): test_size=0.7 leaves only 30% of the rows for training —
# confirm this proportion is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.7,
    random_state=123,
)

Realización del modelo de regresión lineal

In [0]:
from sklearn.linear_model import LinearRegression


# Fit a linear regression that uses only the "smoker_yes" dummy as predictor.
# The double brackets keep the input as a one-column DataFrame (2-D) for fit().
modelo = LinearRegression()
modelo.fit(X_train[["smoker_yes"]], y_train)

In [0]:
# Bare expression as the cell's last statement: the notebook displays the fitted estimator.
modelo

Convertir la muestra de test en un DataFrame de Spark

In [0]:
# Spark copies of the test sample. The target is a pandas Series, so it is
# wrapped into a one-column DataFrame before being handed to Spark.
y_test_sdf = spark.createDataFrame(y_test.to_frame())
X_test_sdf = spark.createDataFrame(X_test)

Realización de las predicciones con una UDF de Spark

In [0]:
import pyspark.sql.functions as F

def predecir(col1, modelo):
    """Predict the target for a single observation of a one-feature model.

    Args:
        col1: Scalar value of the single explanatory variable.
        modelo: Fitted estimator exposing ``predict`` on a 2-D array.

    Returns:
        The prediction for that observation as a Python ``float``.
    """
    # predict() expects a 2-D (n_samples, n_features) input: one row, one column.
    X = np.array([[col1]])
    return float(modelo.predict(X)[0])


# Wrap the function as a Spark UDF; the fitted model is captured by the lambda.
# The return type is declared explicitly because F.udf defaults to StringType,
# which would turn the prediction column into strings instead of numbers.
predecir_udf = F.udf(lambda col1: predecir(col1, modelo), returnType="double")


Obtener los resultados 

In [0]:
# Append a "Prediccion" column by applying the UDF to each row's smoker indicator.
df_con_predicciones = X_test_sdf.withColumn(
    "Prediccion",
    predecir_udf(X_test_sdf.smoker_yes),
)

# Render the test sample together with its predictions.
df_con_predicciones.display()

age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest,Prediccion
49,42.68,2,0,0,0,1,0,8266.640086249998
32,37.335,1,1,0,0,0,0,8266.640086249998
27,31.4,0,0,1,0,0,1,31547.66788910113
35,24.13,1,1,0,1,0,0,8266.640086249998
60,25.74,0,1,0,0,1,0,8266.640086249998
50,28.16,3,0,0,0,1,0,8266.640086249998
27,18.905,3,1,0,0,0,0,8266.640086249998
33,18.5,1,0,0,0,0,1,8266.640086249998
22,33.77,0,1,0,0,1,0,8266.640086249998
51,22.42,0,1,0,0,0,0,8266.640086249998


age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest,Prediccion
49,42.68,2,0,0,0,1,0,8266.640086249998
32,37.335,1,1,0,0,0,0,8266.640086249998
27,31.4,0,0,1,0,0,1,31547.66788910113
35,24.13,1,1,0,1,0,0,8266.640086249998
60,25.74,0,1,0,0,1,0,8266.640086249998
50,28.16,3,0,0,0,1,0,8266.640086249998
27,18.905,3,1,0,0,0,0,8266.640086249998
33,18.5,1,0,0,0,0,1,8266.640086249998
22,33.77,0,1,0,0,1,0,8266.640086249998
51,22.42,0,1,0,0,0,0,8266.640086249998



El enfoque puede generalizarse usando todas las variables explicativas

In [0]:
# NOTE: this cell rebinds `predecir` and `predecir_udf`, which an earlier cell
# of this notebook already defines for the single-feature model.
def predecir(variables_explicativas, modelo):
    """Predict the target for one observation described by several features.

    Args:
        variables_explicativas: Sequence with the feature values of a single
            row, in the same order as the columns the model was fitted on.
        modelo: Fitted estimator exposing ``predict`` on a 2-D array.

    Returns:
        The prediction for that row as a Python ``float``.
    """
    # Wrap the row in a list so predict() receives a 2-D (1, n_features) array.
    X = np.array([variables_explicativas])
    return float(modelo.predict(X)[0])


# Fit the linear regression on every explanatory variable.
modelo_all = LinearRegression().fit(X_train, y_train)

# Create the UDF with an explicit return type: without it F.udf defaults to
# StringType and the prediction column would hold strings.
# The lambda receives one row's feature array and forwards it, together with
# the fitted model, to predecir().
predecir_udf = F.udf(lambda row: predecir(row, modelo_all), returnType="double")

# Apply the UDF to the DataFrame.
# The features are packed into a single array column following the column
# order of X_train (the order the model was fitted with), so the prediction
# does not depend on how the columns of X_test_sdf happen to be ordered.
df_con_predicciones_all = X_test_sdf.withColumn(
    "Prediccion",
    predecir_udf(F.array(*[F.col(nombre) for nombre in X_train.columns])),
)
df_con_predicciones_all.display()

age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest,Prediccion
49,42.68,2,0,0,0,1,0,15629.071895460937
32,37.335,1,1,0,0,0,0,9726.574777674708
27,31.4,0,0,1,0,0,1,28461.67324526529
35,24.13,1,1,0,1,0,0,5031.117345729424
60,25.74,0,1,0,0,1,0,10435.405931880809
50,28.16,3,0,0,0,1,0,10183.721543855772
27,18.905,3,1,0,0,0,0,1464.90600180185
33,18.5,1,0,0,0,0,1,1766.2919539404356
22,33.77,0,1,0,0,1,0,4340.421889699657
51,22.42,0,1,0,0,0,0,8021.699094248765
