In [1]:
from pyspark.sql import SparkSession
# Obtain the SparkSession used by every cell below (getOrCreate reuses an already-active session if there is one).
spark = SparkSession.builder.getOrCreate()

In [3]:
import os

# Location of the heart-disease CSV. Set the HEART_DATA_PATH environment variable to run
# the notebook on another machine; the original local path is kept as the fallback so
# existing behaviour is unchanged when the variable is not set.
DATA_PATH = os.environ.get(
    "HEART_DATA_PATH",
    r"C:\Users\Britt\Documents\RUG\Year 1\Big Data Analystics\heart_statlog_cleveland_hungary_final.csv",
)

# Read the CSV with the first row as the header and let Spark infer the column types.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(DATA_PATH)
)

In [4]:
# Show the column names and the types Spark inferred while loading the CSV.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- chest pain type: integer (nullable = true)
 |-- resting bp s: integer (nullable = true)
 |-- cholesterol: integer (nullable = true)
 |-- fasting blood sugar: integer (nullable = true)
 |-- resting ecg: integer (nullable = true)
 |-- max heart rate: integer (nullable = true)
 |-- exercise angina: integer (nullable = true)
 |-- oldpeak: double (nullable = true)
 |-- ST slope: integer (nullable = true)
 |-- target: integer (nullable = true)



In [5]:
# Preview the first five rows of the loaded data.
df.show(n=5)

+---+---+---------------+------------+-----------+-------------------+-----------+--------------+---------------+-------+--------+------+
|age|sex|chest pain type|resting bp s|cholesterol|fasting blood sugar|resting ecg|max heart rate|exercise angina|oldpeak|ST slope|target|
+---+---+---------------+------------+-----------+-------------------+-----------+--------------+---------------+-------+--------+------+
| 40|  1|              2|         140|        289|                  0|          0|           172|              0|    0.0|       1|     0|
| 49|  0|              3|         160|        180|                  0|          0|           156|              0|    1.0|       2|     1|
| 37|  1|              2|         130|        283|                  0|          1|            98|              0|    0.0|       1|     0|
| 48|  0|              4|         138|        214|                  0|          0|           108|              1|    1.5|       2|     1|
| 54|  1|              3|         

## Logistic Regression

In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [8]:
# Split the rows 60/40 into a training set and a test set.
# The fixed seed makes the split repeatable between runs.
train_df, test_df = df.randomSplit(weights=[0.6, 0.4], seed=42)

In [17]:
# DataFrame holding only the predictor columns: every column of df except the label "target".
features = df.select(
    "age",
    "sex",
    "chest pain type",
    "resting bp s",
    "cholesterol",
    "fasting blood sugar",
    "resting ecg",
    "max heart rate",
    "exercise angina",
    "oldpeak",
    "ST slope",
)

In [11]:
# Logistic regression estimator: predicts the "target" label from the vector
# column named "features".
lr = LogisticRegression(
    featuresCol="features",
    labelCol="target",
)

In [12]:
# Hyperparameter grid searched during cross-validation:
# 2 regParam values x 3 elasticNetParam values = 6 candidate settings.
lr_grid_builder = ParamGridBuilder()
lr_grid_builder = lr_grid_builder.addGrid(lr.regParam, [0.1, 0.01])
lr_grid_builder = lr_grid_builder.addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
Paramgrid_lr = lr_grid_builder.build()

In [None]:
#using cross validation to get the best results
# train_df only contains the raw scalar columns, but `lr` reads a single vector column.
# Assemble the predictor columns selected in `features` into that vector column and tune
# the assembler + logistic regression together as one pipeline, so the fitted model can
# be applied to any DataFrame that has the raw columns (e.g. test_df).
assembler = VectorAssembler(
    inputCols=features.columns,
    outputCol=lr.getFeaturesCol(),
)
lr_pipeline = Pipeline(stages=[assembler, lr])

cv_lr = CrossValidator(
    estimator=lr_pipeline,
    estimatorParamMaps=Paramgrid_lr,
    evaluator=BinaryClassificationEvaluator(labelCol="target"),
    numFolds=5,
    seed=42,  # fixed fold assignment so repeated runs pick the same best model
)
cv_model_lr = cv_lr.fit(train_df)

In [None]:
# Score the held-out test rows with the best model found by cross-validation.
best_lr_model = cv_model_lr.bestModel
lr_predictions = best_lr_model.transform(test_df)

In [None]:
#evaluating the model
# BinaryClassificationEvaluator reports area under the ROC curve, not accuracy; the metric
# is named explicitly here so the printed number is not misread.
evaluator = BinaryClassificationEvaluator(labelCol="target", metricName="areaUnderROC")
lr_auc = evaluator.evaluate(lr_predictions)

# Kept under the old name for any later cell that still refers to it.
# NOTE: despite the name, this value is the AUC, not classification accuracy.
lr_accuracy = lr_auc

print(f"Logistic regression test AUC (area under ROC): {lr_auc}")