In [None]:
# Spark bootstrap: use a custom temp directory (to avoid shuffle errors),
# then create or reuse the session for this notebook.
import os
from pyspark.sql import SparkSession

# Local scratch directory for Spark; set before the session is built below
os.environ.update({"SPARK_LOCAL_DIRS": "C:/temp/spark"})

# Name the application, then obtain the session from the builder
session_builder = SparkSession.builder.appName("BigDataAnalysis")
spark = session_builder.getOrCreate()

# Bare expression so the notebook renders the session summary
spark

In [None]:
# Read the cleaned patient CSV: the first row is used as the header and
# column types are inferred from the data.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv("cleaned_patients.csv")

# Preview the first five rows
df.show(5)

# Print the column names and inferred types
df.printSchema()

In [None]:
# Feature selection: the columns below are packed into a single vector column
# named "features", which the clustering / classification cells consume.
from pyspark.ml.feature import VectorAssembler

features = ["AGE", "OBESITY", "TOBACCO", "HIPERTENSION", "DIABETES"]

# Build the assembler and append the "features" vector column to the data
assembler = VectorAssembler(inputCols=features, outputCol="features")
df_transformed = assembler.transform(df)

# Sanity check: show the raw inputs next to the assembled vector
preview_columns = features + ["features"]
df_transformed.select(*preview_columns).show(5, truncate=False)

In [None]:
# Unsupervised grouping: fit a 3-cluster KMeans on the assembled feature
# vector and tag every patient with the cluster it was assigned to.
from pyspark.ml.clustering import KMeans

kmeans = KMeans(featuresCol="features", k=3, seed=42)  # fixed seed for repeatable clusters
kmeans_model = kmeans.fit(df_transformed)

# transform() adds a "prediction" column holding the cluster id
clusters = kmeans_model.transform(df_transformed)

# Peek at the first ten assignments
clusters.select(["features", "prediction"]).show(10)

In [None]:
# Cross-tabulate cluster id against the ICU column: one row per
# (cluster, ICU value) pair with the number of patients in it.
icu_by_cluster = clusters.groupBy("prediction", "ICU").count()
icu_by_cluster.show()

In [None]:
# Training a Random Forest model to predict ICU
#
# This first model is fit on the full (unbalanced) dataset and then scored on
# the same rows, so the output below is a smoke test, not a measure of how
# well the model generalises.
from pyspark.ml.classification import RandomForestClassifier

# seed is fixed (same value as the KMeans and randomSplit cells) so that the
# forest -- and therefore every number printed from it -- is reproducible
# after a kernel restart.
# NOTE(review): Spark's tree classifiers expect labels indexed from 0; ICU
# appears to be coded 1/2 elsewhere in this notebook -- confirm whether it
# should be re-indexed before training.
rf = RandomForestClassifier(labelCol="ICU", featuresCol="features", numTrees=100, seed=42)
rf_model = rf.fit(df_transformed)

# Making predictions on full dataset
rf_predictions = rf_model.transform(df_transformed)
rf_predictions.select("features", "ICU", "prediction", "probability").show(10, truncate=False)

In [None]:
# Balancing the dataset by oversampling ICU = 1 cases
from pyspark.sql.functions import rand

# One fixed seed for both the resampling and the shuffle so this cell gives
# the same balanced_data after every kernel restart (same value the KMeans
# and randomSplit cells use).
RESAMPLE_SEED = 42

# Splitting the data into ICU = 1 (minority) and ICU = 2 (majority)
minority = df_transformed.filter(df_transformed["ICU"] == 1)
majority = df_transformed.filter(df_transformed["ICU"] == 2)

# Work out how much oversampling is actually needed instead of assuming the
# majority class is exactly twice the size of the minority class.
minority_count = minority.count()
majority_count = majority.count()
if minority_count == 0:
    raise ValueError("No ICU = 1 rows found; cannot oversample the minority class")
oversample_fraction = majority_count / minority_count

# Duplicating the minority class to balance it.
# Sampling with replacement is random, so the resulting count is only
# approximately equal to majority_count.
oversampled = minority.sample(
    withReplacement=True,
    fraction=oversample_fraction,
    seed=RESAMPLE_SEED,
)

# Putting both groups back together and shuffling the rows in one step, so
# re-running this cell never shuffles an already-shuffled frame.
balanced_data = majority.union(oversampled).orderBy(rand(seed=RESAMPLE_SEED))

# Checking how balanced it is now
balanced_data.groupBy("ICU").count().show()

In [None]:
# Training the model again on the balanced dataset
#
# seed is fixed (same value as the KMeans and randomSplit cells) so the
# forest is reproducible after a kernel restart.
rf_balanced = RandomForestClassifier(labelCol="ICU", featuresCol="features", numTrees=100, seed=42)
rf_balanced_model = rf_balanced.fit(balanced_data)

# Making new predictions.
# These are predictions on the same rows the model was just trained on, so
# any metric computed from them describes training fit only.
balanced_predictions = rf_balanced_model.transform(balanced_data)
balanced_predictions.select("features", "ICU", "prediction", "probability").show(10, truncate=False)

In [None]:
# Accuracy of the balanced-data model.
# balanced_predictions was produced from the rows the model was fitted on,
# so this figure reflects training fit.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    labelCol="ICU",
    predictionCol="prediction",
    metricName="accuracy",
)

# Compare the "prediction" column against the "ICU" label
accuracy = evaluator.evaluate(balanced_predictions)

print(f"Model Accuracy: {accuracy:.2f}")

In [None]:
# Additional metrics for the balanced-data model: F1 plus weighted precision
# and weighted recall, each from its own evaluator.
f1_eval, precision_eval, recall_eval = (
    MulticlassClassificationEvaluator(
        labelCol="ICU",
        predictionCol="prediction",
        metricName=metric_name,
    )
    for metric_name in ("f1", "weightedPrecision", "weightedRecall")
)

# Score the same prediction frame with every evaluator, in the order above
f1, precision, recall = (
    metric_evaluator.evaluate(balanced_predictions)
    for metric_evaluator in (f1_eval, precision_eval, recall_eval)
)

print(f"F1 Score: {f1:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

In [None]:
# Splitting the data into train and test sets
#
# The split is taken from the original rows, BEFORE any oversampling.
# Oversampling with replacement creates exact copies of ICU = 1 rows; if the
# split is made afterwards, copies of the same patient land in both train and
# test and the test metrics are inflated (data leakage). Here only the
# training part is rebalanced and the test part keeps the real class mix.

# Keep only the two ICU classes used by the model (1 = minority, 2 = majority)
labelled = df_transformed.filter(
    (df_transformed["ICU"] == 1) | (df_transformed["ICU"] == 2)
)

train_raw, test_data = labelled.randomSplit([0.8, 0.2], seed=42)

# Rebalance the training rows only
train_minority = train_raw.filter(train_raw["ICU"] == 1)
train_majority = train_raw.filter(train_raw["ICU"] == 2)

train_minority_count = train_minority.count()
train_majority_count = train_majority.count()
if train_minority_count == 0:
    raise ValueError("No ICU = 1 rows in the training split; cannot oversample")

# Sampling with replacement is random, so the classes end up approximately
# (not exactly) the same size.
train_data = train_majority.union(
    train_minority.sample(
        withReplacement=True,
        fraction=train_majority_count / train_minority_count,
        seed=42,
    )
)

# Just checking how many rows in each
print("Train count:", train_data.count())
print("Test count:", test_data.count())

In [None]:
# Training model on just the training data
#
# seed is fixed (same value as the KMeans and randomSplit cells) so the
# forest, and the test metrics derived from it, are reproducible after a
# kernel restart.
final_rf = RandomForestClassifier(labelCol="ICU", featuresCol="features", numTrees=100, seed=42)
final_model = final_rf.fit(train_data)

# Predicting on test data
test_preds = final_model.transform(test_data)
test_preds.select("features", "ICU", "prediction", "probability").show(10, truncate=False)

In [None]:
# Held-out evaluation: one evaluator is re-pointed at each metric in turn and
# run against the test-set predictions.
evaluator = MulticlassClassificationEvaluator(labelCol="ICU", predictionCol="prediction")

test_scores = [
    evaluator.setMetricName(metric_name).evaluate(test_preds)
    for metric_name in ("accuracy", "f1", "weightedPrecision", "weightedRecall")
]
acc, f1, prec, rec = test_scores

print(f"Accuracy: {acc:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Precision: {prec:.2f}")
print(f"Recall: {rec:.2f}")