In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.impute import KNNImputer
import numpy as np

# Improved Data Cleansing
def removeUnnecessaryColumns(df_patients):
    df_patients.drop(columns=['index', 'PATIENT_TYPE'], inplace=True)

# Identifying and handling missing values
def handleMissingValues(df_patients):

    # Make ? in pregnant 1 instead of deleting as ? means male
    df_patients['PREGNANT'] = df_patients['PREGNANT'].replace('?', 1)

    #Make all 9999-99-99 dates NaT then an average date 
    df_patients['DATE_DIED'] = pd.to_datetime(df_patients['DATE_DIED'], errors='coerce')
    average_date = pd.to_datetime(df_patients['DATE_DIED'].dropna().astype(int).mean(), unit='ns')
    df_patients['DATE_DIED'] = df_patients['DATE_DIED'].fillna(average_date)
    
    for col in df_patients.select_dtypes(include=['object']).columns:
        df_patients[col] = df_patients[col].replace('?', np.nan)
        df_patients[col] = LabelEncoder().fit_transform(df_patients[col].astype(str))

    # Apply KNNImputer to handle missing values
    imputer = KNNImputer(n_neighbors=5)
    numeric_cols = df_patients.select_dtypes(include=['float64', 'int64']).columns
    df_patients[numeric_cols] = imputer.fit_transform(df_patients[numeric_cols])

# Outlier Detection and Removal using IQR
def outlierDetection(df_patients):
    numeric_cols = df_patients.select_dtypes(include=['float64', 'int64']).columns

    Q1 = df_patients[numeric_cols].quantile(0.25)
    Q3 = df_patients[numeric_cols].quantile(0.75)
    IQR = Q3 - Q1

    df_patients = df_patients[~((df_patients[numeric_cols] < (Q1 - 1.5 * IQR)) | 
                                (df_patients[numeric_cols] > (Q3 + 1.5 * IQR))).any(axis=1)]
    

#Normalization of the AGE column using StandardScaler
def normalizeData(df_patients):
    print(df_patients['AGE'].head())
    scaler = StandardScaler()
    df_patients['AGE'] = scaler.fit_transform(df_patients[['AGE']])

# Loading the dataset from the CSV file
data_file = 'Dataset.csv'
df_patients = pd.read_csv(data_file, low_memory=False)

# Clean the dataset
removeUnnecessaryColumns(df_patients)
handleMissingValues(df_patients)
outlierDetection(df_patients)
normalizeData(df_patients)

# Save the cleaned dataset to a new CSV file
df_patients.to_csv('deep_cleaned_patients.csv', index=False)

# Checking the data types
df_patients['SEX'] = df_patients['SEX'].astype('object')


0    55.0
1    40.0
2    37.0
3    25.0
4    24.0
Name: AGE, dtype: float64


In [1]:
# Starting Spark properly with a custom temp directory to avoid shuffle errors
from pyspark.sql import SparkSession
import os

# Set a safe local temp directory
os.environ["SPARK_LOCAL_DIRS"] = "C:/temp/spark"

# Start Spark session
spark = SparkSession.builder.appName("BigDataAnalysis").getOrCreate()
spark



In [2]:
# Loading in the cleaned patient dataset
df = spark.read.csv("deep_cleaned_patients.csv", header=True, inferSchema=True)

# Quick look at first few rows
df.show(5)

# Checking what the structure of the data looks like
df.printSchema()

+-----+------------+---+--------------------+-------+---------+-------------------+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+
|USMER|MEDICAL_UNIT|SEX|           DATE_DIED|INTUBED|PNEUMONIA|                AGE|PREGNANT|DIABETES|COPD|ASTHMA|INMSUPR|HIPERTENSION|OTHER_DISEASE|CARDIOVASCULAR|OBESITY|RENAL_CHRONIC|TOBACCO|CLASIFFICATION_FINAL|ICU|
+-----+------------+---+--------------------+-------+---------+-------------------+--------+--------+----+------+-------+------------+-------------+--------------+-------+-------------+-------+--------------------+---+
|  2.0|         1.0|2.0| 2020-09-06 00:00:00|    0.0|      1.0|0.11245548466355285|     0.0|     0.0| 1.0|   1.0|    1.0|         1.0|          1.0|           1.0|    1.0|          1.0|    1.0|                 3.0|1.0|
|  2.0|         1.0|1.0|2020-06-25 08:06:...|    1.0|      0.0|-0.6439819373845154|     1.0|     1.0| 1.0|   1.0|    1.0|   

In [3]:
# Picking out the main features to use in the model
from pyspark.ml.feature import VectorAssembler

features = ["AGE", "OBESITY", "TOBACCO", "HIPERTENSION", "DIABETES"]

# Combining them into one feature column
assembler = VectorAssembler(inputCols=features, outputCol="features")
df_transformed = assembler.transform(df)

# Just checking that the features column looks alright
df_transformed.select("AGE", "OBESITY", "TOBACCO", "HIPERTENSION", "DIABETES", "features").show(5, truncate=False)

+-------------------+-------+-------+------------+--------+-------------------------------------+
|AGE                |OBESITY|TOBACCO|HIPERTENSION|DIABETES|features                             |
+-------------------+-------+-------+------------+--------+-------------------------------------+
|0.11245548466355285|1.0    |1.0    |1.0         |0.0     |[0.11245548466355285,1.0,1.0,1.0,0.0]|
|-0.6439819373845154|1.0    |1.0    |1.0         |1.0     |[-0.6439819373845154,1.0,1.0,1.0,1.0]|
|-0.7952694217941291|0.0    |1.0    |0.0         |0.0     |(5,[0,2],[-0.7952694217941291,1.0])  |
|-1.4004193594325836|1.0    |1.0    |1.0         |1.0     |[-1.4004193594325836,1.0,1.0,1.0,1.0]|
|-1.4508485209024549|1.0    |1.0    |1.0         |1.0     |[-1.4508485209024549,1.0,1.0,1.0,1.0]|
+-------------------+-------+-------+------------+--------+-------------------------------------+
only showing top 5 rows



In [4]:
# Using KMeans to group patients based on the features
from pyspark.ml.clustering import KMeans

kmeans = KMeans(k=3, seed=42, featuresCol="features")
kmeans_model = kmeans.fit(df_transformed)
clusters = kmeans_model.transform(df_transformed)

# Checking what cluster each patient ended up in
clusters.select("features", "prediction").show(10)

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[0.11245548466355...|         0|
|[-0.6439819373845...|         2|
|(5,[0,2],[-0.7952...|         0|
|[-1.4004193594325...|         2|
|[-1.4508485209024...|         2|
|[-1.1482735520832...|         2|
|[1.37318452141033...|         1|
|[0.31417213054303...|         0|
|[-0.3918361300351...|         2|
|[-0.3918361300351...|         2|
+--------------------+----------+
only showing top 10 rows



In [5]:
# Seeing how ICU cases are spread across each cluster
clusters.groupBy("prediction", "ICU").count().show()

+----------+---+-----+
|prediction|ICU|count|
+----------+---+-----+
|         2|1.0|57828|
|         1|0.0| 6614|
|         0|2.0| 1532|
|         0|0.0| 4653|
|         1|2.0| 2230|
|         2|2.0| 3726|
|         1|1.0|70650|
|         2|0.0| 5591|
|         0|1.0|47207|
+----------+---+-----+



In [6]:
# Training a Random Forest model to predict ICU
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(labelCol="ICU", featuresCol="features", numTrees=100)
rf_model = rf.fit(df_transformed)

# Making predictions on full dataset
rf_predictions = rf_model.transform(df_transformed)
rf_predictions.select("features", "ICU", "prediction", "probability").show(10, truncate=False)

+-------------------------------------+---+----------+-------------------------------------------------------------+
|features                             |ICU|prediction|probability                                                  |
+-------------------------------------+---+----------+-------------------------------------------------------------+
|[0.11245548466355285,1.0,1.0,1.0,0.0]|1.0|1.0       |[0.07819016120327144,0.890493321909399,0.031316516887329594] |
|[-0.6439819373845154,1.0,1.0,1.0,1.0]|1.0|1.0       |[0.07786334886867124,0.8905219366081505,0.03161471452317827] |
|(5,[0,2],[-0.7952694217941291,1.0])  |1.0|1.0       |[0.09669544402339211,0.871601532575399,0.03170302340120894]  |
|[-1.4004193594325836,1.0,1.0,1.0,1.0]|1.0|1.0       |[0.07607678129615723,0.886954532440889,0.03696868626295389]  |
|[-1.4508485209024549,1.0,1.0,1.0,1.0]|1.0|1.0       |[0.07607678129615723,0.886954532440889,0.03696868626295389]  |
|[-1.1482735520832275,1.0,1.0,1.0,1.0]|1.0|1.0       |[0.0761614

In [7]:
# Balancing the dataset by oversampling ICU = 1 cases

# Splitting the data into ICU = 1 (minority) and ICU = 2 (majority)
minority = df_transformed.filter(df_transformed["ICU"] == 1)
majority = df_transformed.filter(df_transformed["ICU"] == 2)

# Duplicating the minority class to balance it
oversampled = minority.sample(withReplacement=True, fraction=2.0)

# Putting both groups back together
balanced_data = majority.union(oversampled)

# Shuffling the data
from pyspark.sql.functions import rand
balanced_data = balanced_data.orderBy(rand())

# Checking how balanced it is now
balanced_data.groupBy("ICU").count().show()

+---+------+
|ICU| count|
+---+------+
|2.0|  7488|
|1.0|351566|
+---+------+



In [8]:
# Training the model again on the balanced dataset
rf_balanced = RandomForestClassifier(labelCol="ICU", featuresCol="features", numTrees=100)
rf_balanced_model = rf_balanced.fit(balanced_data)

# Making new predictions
balanced_predictions = rf_balanced_model.transform(balanced_data)
balanced_predictions.select("features", "ICU", "prediction", "probability").show(10, truncate=False)

+--------------------------------------+---+----------+---------------------------------------------+
|features                              |ICU|prediction|probability                                  |
+--------------------------------------+---+----------+---------------------------------------------+
|[-0.19011948415567445,0.0,1.0,0.0,1.0]|1.0|1.0       |[0.0,0.979783873151759,0.020216126848241072] |
|[0.3646012920129089,1.0,1.0,1.0,1.0]  |1.0|1.0       |[0.0,0.9795425539343663,0.02045744606563369] |
|[0.3141721305430377,1.0,1.0,0.0,1.0]  |1.0|1.0       |[0.0,0.979783873151759,0.020216126848241072] |
|[0.8184637452417498,1.0,1.0,1.0,1.0]  |1.0|1.0       |[0.0,0.9795425539343663,0.02045744606563369] |
|[-0.0388319997460608,1.0,1.0,1.0,0.0] |1.0|1.0       |[0.0,0.9798422311243377,0.020157768875662296]|
|[0.8688929067116211,1.0,1.0,0.0,0.0]  |1.0|1.0       |[0.0,0.9800835503417303,0.019916449658269678]|
|[-0.8456985832640003,1.0,1.0,1.0,1.0] |1.0|1.0       |[0.0,0.9795425539343663,0.0

In [9]:
# Checking how accurate the model is
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="ICU", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(balanced_predictions)

print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.98


In [10]:
# Extra evaluation metrics
f1_eval = MulticlassClassificationEvaluator(labelCol="ICU", predictionCol="prediction", metricName="f1")
f1 = f1_eval.evaluate(balanced_predictions)
print(f"F1 Score: {f1:.2f}")

precision_eval = MulticlassClassificationEvaluator(labelCol="ICU", predictionCol="prediction", metricName="weightedPrecision")
precision = precision_eval.evaluate(balanced_predictions)
print(f"Precision: {precision:.2f}")

recall_eval = MulticlassClassificationEvaluator(labelCol="ICU", predictionCol="prediction", metricName="weightedRecall")
recall = recall_eval.evaluate(balanced_predictions)
print(f"Recall: {recall:.2f}")

F1 Score: 0.97
Precision: 0.96
Recall: 0.98


In [11]:
# Splitting the data into train and test sets
train_data, test_data = balanced_data.randomSplit([0.8, 0.2], seed=42)

# Just checking how many rows in each
print("Train count:", train_data.count())
print("Test count:", test_data.count())

Train count: 287628
Test count: 71421


In [12]:
# Training model on just the training data
final_rf = RandomForestClassifier(labelCol="ICU", featuresCol="features", numTrees=100)
final_model = final_rf.fit(train_data)

# Predicting on test data
test_preds = final_model.transform(test_data)
test_preds.select("features", "ICU", "prediction", "probability").show(10, truncate=False)

+-------------------------------------+---+----------+---------------------------------------------+
|features                             |ICU|prediction|probability                                  |
+-------------------------------------+---+----------+---------------------------------------------+
|[-2.6107192347094927,1.0,1.0,1.0,1.0]|2.0|1.0       |[0.0,0.9723290744811335,0.027670925518866508]|
|[-2.661148396179364,1.0,1.0,1.0,1.0] |2.0|1.0       |[0.0,0.9723290744811335,0.027670925518866508]|
|[0.616747099362265,1.0,1.0,1.0,1.0]  |1.0|1.0       |[0.0,0.979361343802665,0.020638656197334876] |
|[-0.7448402603242578,1.0,1.0,0.0,0.0]|1.0|1.0       |[0.0,0.980289444739156,0.01971055526084399]  |
|[0.11245548466355285,1.0,1.0,0.0,0.0]|1.0|1.0       |[0.0,0.980289444739156,0.01971055526084399]  |
|[1.6757594902295605,1.0,1.0,0.0,0.0] |1.0|1.0       |[0.0,0.9794794843676553,0.02052051563234471] |
|[-0.8961277447338715,1.0,1.0,1.0,1.0]|1.0|1.0       |[0.0,0.979361343802665,0.020638656197

In [13]:
# Final evaluation on test data
evaluator = MulticlassClassificationEvaluator(labelCol="ICU", predictionCol="prediction")

acc = evaluator.setMetricName("accuracy").evaluate(test_preds)
f1 = evaluator.setMetricName("f1").evaluate(test_preds)
prec = evaluator.setMetricName("weightedPrecision").evaluate(test_preds)
rec = evaluator.setMetricName("weightedRecall").evaluate(test_preds)

print(f"Accuracy: {acc:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Precision: {prec:.2f}")
print(f"Recall: {rec:.2f}")

Accuracy: 0.98
F1 Score: 0.97
Precision: 0.96
Recall: 0.98
