## Heart Disease Detection with VowpalWabbit Classifier

#### Read dataset

In [None]:
# Load the heart-disease CSV from blob storage, treating the first row as the header.
dataset = (
  spark.read.format("csv")
  .option("header", True)
  .load("wasbs://publicwasb@mmlspark.blob.core.windows.net/heart_disease_prediction_data.csv")
)
# Report how many rows were loaded (count() triggers a Spark job).
print("records read: {}".format(dataset.count()))

In [None]:
# convert features to double type
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType

# Cast every column in a single projection. Calling withColumn once per column
# adds a separate projection for each call and inflates the query plan; one
# select produces the same column names, order and DoubleType schema.
dataset = dataset.select(
  [col(colName).cast(DoubleType()).alias(colName) for colName in dataset.columns]
)
print("Schema: ")
dataset.printSchema()

In [None]:
# Preview the first 10 rows without truncating wide cell values.
dataset.show(n=10, truncate=False)

#### Split the dataset into train and test

In [None]:
# 85% / 15% random split; the fixed seed keeps the split repeatable.
train, test = dataset.randomSplit(weights=[0.85, 0.15], seed=1)

#### Use VowpalWabbitFeaturizer to convert data features into a feature vector

In [None]:
from mmlspark.vw import VowpalWabbitFeaturizer

# Every column except the last one is fed to the featurizer.
# NOTE(review): this assumes the label column "target" is the last column — confirm against the CSV.
featurizer = VowpalWabbitFeaturizer(
  inputCols=dataset.columns[:-1],
  outputCol="features",
)
# Keep only the label and the assembled feature vector for training and evaluation.
train_data = featurizer.transform(train).select("target", "features")
test_data = featurizer.transform(test).select("target", "features")

In [None]:
# Show how many training rows fall under each label value.
(
  train_data
  .groupBy("target")
  .count()
  .show()
)

#### Model Training

In [None]:
from mmlspark.vw import VowpalWabbitClassifier

# Fit a VowpalWabbit classifier on the featurized training rows.
model = VowpalWabbitClassifier(
  labelCol="target",
  featuresCol="features",
  numPasses=20,
).fit(train_data)

#### Model Prediction

In [None]:
# Score the held-out test rows with the fitted model.
predictions = model.transform(test_data)
# Pull the first 10 scored rows into pandas; as the cell's last expression it is rendered by the notebook.
(
  predictions
  .limit(10)
  .toPandas()
)

In [None]:
from mmlspark.train import ComputeModelStatistics

# Compare the "prediction" column against the "target" labels using classification metrics.
metrics = ComputeModelStatistics(
  evaluationMetric="classification",
  labelCol="target",
  scoredLabelsCol="prediction",
).transform(predictions)
display(metrics)