In [1]:
#Descargamos spark
!pip install pyspark




In [2]:
from pyspark.sql import SparkSession
import pandas as pd

In [3]:
# Start the Spark session for this notebook (an already-running one is reused)
spark = (
    SparkSession.builder
    .appName("DementiaModel")
    .getOrCreate()
)

In [5]:
# Load the raw data set into a pandas DataFrame for inspection
df = pd.read_csv(filepath_or_buffer="data.csv")

In [6]:
# Turn the pandas DataFrame into a Spark DataFrame
spark_df = spark.createDataFrame(data=df)

In [7]:
# Preview the first five rows
spark_df.show(n=5)

+---+---+------+--------+------------+--------------+----+----+------+-----------+-----------+-------+----------------+----------------+---------------+-----------------+-----------+-----------+----------+------+----------------+-----------------+-------+---------+---------+
| ID|age|gender|dementia|dementia_all|educationyears|  EF|  PS|Global|glucose_min|glucose_max|smoking|hypertension_sys|hypertension_dia|cholesterol_ldl|cholesterol_total|lacunes_num|fazekas_cat|     study|study1|SVD Simple Score|SVD Amended Score|Fazekas|lac_count|CMB_count|
+---+---+------+--------+------------+--------------+----+----+------+-----------+-----------+-------+----------------+----------------+---------------+-----------------+-----------+-----------+----------+------+----------------+-----------------+-------+---------+---------+
|  1| 80|     F|       0|           0|             8|0.98|0.67|  0.82|         82|        128|      1|             132|              75|            174|              221|  

In [8]:
import pickle

# Deserialize the trained model stored in the .pkl file.
# NOTE(review): unpickling executes code embedded in the file, so "modelo.pkl"
# must come from a trusted source — confirm its provenance.
with open("modelo.pkl", mode="rb") as f:
    modelo = pickle.load(f)


In [9]:
from pyspark.sql.functions import when, col

# Encode 'gender' as a numeric column: "F" -> 0, everything else -> 1.
#
# The test is made on the string form of the column and also accepts "0", so
# re-running this cell on the already-encoded integer column leaves the values
# unchanged. Comparing the integer column against "F" directly would evaluate
# to null (or raise a cast error under ANSI SQL mode), which would silently
# turn every row into 1 on a second execution.
#
# NOTE(review): nulls and unexpected labels also fall into the `otherwise(1)`
# branch — confirm that this is the intended encoding.
spark_df = spark_df.withColumn(
    "gender",
    when(col("gender").cast("string").isin("F", "0"), 0).otherwise(1),
)


In [10]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
import numpy as np

# Columns the model expects as input
feature_columns = [
    "age",
    "gender",
    "educationyears",
    "glucose_min",
    "cholesterol_total",
    "EF",
    "PS",
    "Global",
]

# Function that applies the model to one row of feature values
def predict_dementia(*cols):
    """Run the loaded model on a single row and return its prediction.

    Args:
        *cols: The feature values of one row, passed positionally.

    Returns:
        The first element of ``modelo.predict`` for that row, as a ``float``
        (presumably 0.0 or 1.0, per the original author's note — depends on
        the model stored in ``modelo``).
    """
    # The values are packed into a single-row 2-D array before predicting.
    row = np.array(cols)
    features = row.reshape(1, -1)
    label = modelo.predict(features)[0]
    return float(label)

# Wrap the prediction function as a Spark UDF that returns a double
predict_udf = udf(f=predict_dementia, returnType=DoubleType())


In [11]:
# Add a "prediction" column by passing one column per entry of
# feature_columns to the UDF
spark_df = spark_df.withColumn(
    "prediction",
    predict_udf(*map(col, feature_columns)),
)

# Look at a few predictions
spark_df.select("age", "gender", "prediction").show(n=10)


+---+------+----------+
|age|gender|prediction|
+---+------+----------+
| 80|     0|       0.0|
| 67|     0|       0.0|
| 88|     0|       0.0|
| 81|     0|       0.0|
| 79|     0|       0.0|
| 75|     0|       0.0|
| 80|     0|       0.0|
| 74|     0|       0.0|
| 72|     0|       0.0|
| 75|     1|       0.0|
+---+------+----------+
only showing top 10 rows

