# Predicting Heart Disease

In [1]:
# initializing the Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate() 

24/05/26 13:04:50 WARN Utils: Your hostname, Nikas-Macbook-Pro.local resolves to a loopback address: 127.0.0.1; using 172.72.3.173 instead (on interface en0)
24/05/26 13:04:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/26 13:04:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# read the csv file
df = spark.read.csv("heart_statlog_cleveland_hungary_final.csv", inferSchema = True, header=True)
df = df.repartition(10)

df.show(5)

                                                                                

+---+---+---------------+------------+-----------+-------------------+-----------+--------------+---------------+-------+--------+------+
|age|sex|chest pain type|resting bp s|cholesterol|fasting blood sugar|resting ecg|max heart rate|exercise angina|oldpeak|ST slope|target|
+---+---+---------------+------------+-----------+-------------------+-----------+--------------+---------------+-------+--------+------+
| 42|  1|              3|         134|        240|                  0|          0|           160|              0|    0.0|       1|     0|
| 54|  0|              3|         110|        214|                  0|          0|           158|              0|    1.6|       2|     0|
| 46|  1|              1|         140|        272|                  1|          0|           175|              0|    2.0|       2|     1|
| 29|  1|              2|         130|        204|                  0|          2|           202|              0|    0.0|       1|     0|
| 52|  1|              1|         

In [3]:
# View data types
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- chest pain type: integer (nullable = true)
 |-- resting bp s: integer (nullable = true)
 |-- cholesterol: integer (nullable = true)
 |-- fasting blood sugar: integer (nullable = true)
 |-- resting ecg: integer (nullable = true)
 |-- max heart rate: integer (nullable = true)
 |-- exercise angina: integer (nullable = true)
 |-- oldpeak: double (nullable = true)
 |-- ST slope: integer (nullable = true)
 |-- target: integer (nullable = true)



### Splitting the data into train and test dat

In [4]:
# Split the data into train and test sets
train_data, test_data = df.randomSplit([0.7, 0.3], seed=42)

print(train_data.count())
print(test_data.count())

                                                                                

841
349


In [16]:
from pyspark.ml.feature import VectorAssembler, PCA
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.classification import GBTClassifier

In [7]:
# Assemble the feature vectors
assembler = VectorAssembler(inputCols=['age', 'sex', 'chest pain type', 'resting bp s', 'cholesterol', 'fasting blood sugar', 'resting ecg', 'max heart rate', 'exercise angina', 'oldpeak', 'ST slope'], outputCol='features')

# Transform the data
trainData = assembler.transform(train_data)
testData = assembler.transform(test_data)

### Logistic regression

In [None]:
# Define the Logistic Regression estimator
lr = LogisticRegression(featuresCol='features', labelCol='target', maxIter=100)

# Define the parameter grid
paramGrid = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.01, 0.1, 1.0]) \
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

# Define the evaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='target')

# Define the cross-validator
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

# Fit the cross-validator and get the best model
cvModel = cv.fit(trainData)
bestLRModel = cvModel.bestModel

In [10]:
lrPredictions = bestLRModel.transform(testData)
lrAccuracy = evaluator.evaluate(lrPredictions, {evaluator.metricName: 'areaUnderROC'})
print('Logistic Regression AUC: ', lrAccuracy)

Logistic Regression AUC:  0.9051517406764238


### Random forest

In [18]:
# Define the Random Forest estimator
rf = RandomForestClassifier(featuresCol='features', labelCol='target')

# Define the parameter grid
paramGrid = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [5, 10, 20]) \
    .addGrid(rf.numTrees, [50, 100, 200]) \
    .build()

# Define the evaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='target')

# Define the cross-validator
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

# Fit the cross-validator and get the best model
cvModel = cv.fit(trainData)
bestRFModel = cvModel.bestModel

24/05/23 14:18:41 WARN DAGScheduler: Broadcasting large task binary with size 1068.8 KiB
24/05/23 14:18:44 WARN DAGScheduler: Broadcasting large task binary with size 1078.8 KiB
24/05/23 14:18:44 WARN DAGScheduler: Broadcasting large task binary with size 1394.7 KiB
24/05/23 14:18:44 WARN DAGScheduler: Broadcasting large task binary with size 1655.6 KiB
24/05/23 14:18:44 WARN DAGScheduler: Broadcasting large task binary with size 1848.6 KiB
24/05/23 14:18:45 WARN DAGScheduler: Broadcasting large task binary with size 1370.2 KiB
24/05/23 14:18:48 WARN DAGScheduler: Broadcasting large task binary with size 1411.6 KiB
24/05/23 14:18:48 WARN DAGScheduler: Broadcasting large task binary with size 2025.7 KiB
24/05/23 14:18:49 WARN DAGScheduler: Broadcasting large task binary with size 2.6 MiB
24/05/23 14:18:49 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
24/05/23 14:18:49 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
24/05/23 14:18:50 WARN DAGSche

In [19]:
rfPredictions = bestRFModel.transform(testData)
rfAccuracy = evaluator.evaluate(rfPredictions, {evaluator.metricName: 'areaUnderROC'})
print('Random Forest AUC: ', rfAccuracy)

24/05/23 14:20:52 WARN DAGScheduler: Broadcasting large task binary with size 1681.5 KiB


Random Forest AUC:  0.936620644312952


### Support Vector Machines

In [12]:
# Define the SVM estimator
svm = LinearSVC(featuresCol='features', labelCol='target', maxIter=100)

# Define the parameter grid
paramGrid = ParamGridBuilder() \
    .addGrid(svm.regParam, [0.01, 0.1, 1.0]) \
    .build()

# Define the evaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='target')

# Define the cross-validator
cv = CrossValidator(estimator=svm, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

# Fit the cross-validator and get the best model
cvModel = cv.fit(trainData)
bestSVMModel = cvModel.bestModel

# Make predictions on the test data using the best SVM model
svmPredictions = bestSVMModel.transform(testData)
svmAccuracy = evaluator.evaluate(svmPredictions, {evaluator.metricName: 'areaUnderROC'})
print('SVM AUC: ', svmAccuracy)

24/05/26 13:10:47 WARN CacheManager: Asked to cache already cached data.        
24/05/26 13:10:47 WARN CacheManager: Asked to cache already cached data.


SVM AUC:  0.8843523997370152


### Conclusion

The best model: Random Forests

The accuracy: 93.7%