# Parkinson's Disease Detector with Apache Cassandra and PySpark Machine Learning

### Jupyter notebook inspired by the template at https://github.com/datastaxdevs/workshop-machine-learning/blob/master/jupyter/Random%20Forest.ipynb

In [14]:
!pip3 install matplotlib --quiet

In [15]:
!pip3 install ipykernel --quiet

In [17]:
# Register a Jupyter kernelspec named vs-l-pd-detector for the current user (--user).
!python3 -m ipykernel install --user --name=vs-l-pd-detector

0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
Installed kernelspec vs-l-pd-detector in /Users/mariannelynemanaog/Library/Jupyter/kernels/vs-l-pd-detector


In [18]:
!PYDEVD_DISABLE_FILE_VALIDATION=1

In [19]:
import warnings
warnings.filterwarnings('ignore')

In [20]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt

In [21]:
!pip install cassandra-driver --quiet

In [22]:
!pip install pyspark --quiet

In [23]:
import pandas
import cassandra
import pyspark
import re
import os
import random
from random import randint, randrange
import matplotlib.pyplot as plt
from IPython.display import display, Markdown
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#### Helper function to have nicer formatting of Spark DataFrames

In [24]:
#Helper for pretty formatting for Spark DataFrames
def showDF(df, limitRows =  5, truncate = True):
    """Display the first rows of a Spark DataFrame as a pandas table.

    Args:
        df: Object whose ``limit(n)`` result provides ``toPandas()``
            (a Spark DataFrame in this notebook).
        limitRows: Maximum number of rows to fetch and to render.
        truncate: When True, cell text is clipped at 50 characters;
            when False, cell text is shown in full.

    The pandas display options are changed only while the table is rendered
    and are restored to their previous values afterwards.
    """
    # None means "no width limit"; the old sentinel -1 is rejected by current pandas.
    colWidth = 50 if truncate else None
    with pandas.option_context('display.max_colwidth', colWidth,
                               'display.max_rows', limitRows):
        display(df.limit(limitRows).toPandas())

## Creating Tables and Loading Tables

### Connect to Cassandra

In [11]:
from cassandra.cluster import Cluster

# Contact points are read from CASSANDRA_HOSTS (comma-separated host names or
# addresses). The default stays 'dse', which only resolves where a host with
# that name exists; set the variable when running elsewhere.
contact_points = [
    host.strip()
    for host in os.environ.get('CASSANDRA_HOSTS', 'dse').split(',')
    if host.strip()
] or ['dse']

cluster = Cluster(contact_points)
session = cluster.connect()

UnresolvableContactPoints: {}

### Create Demo Keyspace 

In [None]:
# Create the demo keyspace once; the statement is a no-op when it already exists.
create_keyspace_cql = """
    CREATE KEYSPACE IF NOT EXISTS accelerate 
    WITH REPLICATION = 
    { 'class' : 'SimpleStrategy', 'replication_factor' : 1 }"""
session.execute(create_keyspace_cql)

### Set keyspace 

In [None]:
# Point the session at the demo keyspace so later statements can use unqualified table names.
keyspace_name = 'accelerate'
session.set_keyspace(keyspace_name)

### Create a table called `speech_data`. Its PRIMARY KEY will be a unique key (`subject_id`) that we generate for each row. The table will hold the rows of two datasets, "train" and "test".

In [None]:
# Schema of speech_data: subject_id (varchar) is the sole primary key, followed
# by seven float measurement columns and an int status column.
# The literal uses backslash continuations, so the leading spaces of each
# continued line are part of the statement text sent to the server.
query = "CREATE TABLE IF NOT EXISTS speech_data \
                                   (subject_id varchar, jitter_percent float, jitter_abs float, rap float, ppq float, \
                                   apq_3 float, apq_5 float, apq_11 float, status int, \
                                   PRIMARY KEY (subject_id))"
session.execute(query)

### Load the train and test datasets from csv files

#### Insert all speech data into the DSE table `speech_data`

In [None]:
# Directory holding the pre-split CSV files. Override with the SPEECH_DATA_DIR
# environment variable when the data lives somewhere else.
DATA_DIR = os.environ.get(
    'SPEECH_DATA_DIR',
    '/Users/mariannelynemanaog/PycharmProjects/vs-ml-pd-detector/src/data/train_and_test_sets',
)

# Prepared once and reused for every row instead of rebuilding the statement per line.
insert_speech_row = session.prepare(
    "INSERT INTO speech_data (subject_id, jitter_percent, jitter_abs, rap, ppq, "
    "apq_3, apq_5, apq_11, status) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)"
)


def load_speech_csv(file_name, first_id):
    """Insert every row of a ';'-separated CSV file into `speech_data`.

    Each non-blank line must provide at least eight fields: seven float
    measurements followed by the status value.

    Args:
        file_name: Path of the CSV file to read.
        first_id: Subject id assigned to the first row; later rows count up from it.

    Returns:
        The next unused subject id, so a following file can continue the sequence.

    Raises:
        ValueError: If a field cannot be parsed as a number.
        IndexError: If a line has fewer than eight fields.
    """
    next_id = first_id
    # The context manager closes the file even when a row fails to parse or insert.
    with open(file_name, 'r') as input_file:
        for line in input_file:
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline at end of file)
            row = line.split(';')
            measurements = [float(value) for value in row[:7]]
            # status is declared as int in the table; go through float so "1.0" also parses
            status = int(float(row[7]))
            # subject_id is declared as varchar in the table, so bind it as a string
            session.execute(insert_speech_row, [str(next_id)] + measurements + [status])
            next_id += 1
    return next_id


# Train rows get ids starting at 1; test rows continue the same sequence.
next_subject_id = load_speech_csv(os.path.join(DATA_DIR, 'train_data.csv'), 1)
next_subject_id = load_speech_csv(os.path.join(DATA_DIR, 'test_data.csv'), next_subject_id)
    

## Machine Learning with Apache Cassandra and Apache Spark

#### Create a spark session that is connected to the database. From there load each table into a Spark Dataframe and take a count of the number of rows in each.

In [None]:
# Local Spark session for the demo.
spark = (
    SparkSession.builder
    .appName('demo')
    .master("local")
    .getOrCreate()
)

# Load the Cassandra table accelerate.speech_data as a Spark DataFrame.
# NOTE(review): this format needs the Spark Cassandra connector on the classpath - confirm for this environment.
cassandraReader = spark.read.format("org.apache.spark.sql.cassandra")
speechDF = cassandraReader.options(table="speech_data", keyspace="accelerate").load()

print ("Table Speech Data Row Count: ")
print (speechDF.count())

In [None]:
# Preview the first five rows with truncated cell text (the helper's defaults, spelled out).
showDF(speechDF, limitRows=5, truncate=True)

#### Create Vector with all elements of the speech data 

In [None]:
# Pack the seven numeric measurements into a single vector column.
# The output column must be a new one: 'status' already exists in speechDF
# (it is the label source), and the classifier reads its inputs from 'features'.
featureCols = ['jitter_percent', 'jitter_abs', 'rap', 'ppq', 'apq_3', 'apq_5', 'apq_11']
assembler = VectorAssembler(inputCols=featureCols, outputCol='features')

trainingData = assembler.transform(speechDF)

# Index the 'status' column into the numeric 'label' column used for training.
labelIndexer = StringIndexer(inputCol="status", outputCol="label", handleInvalid='keep')
trainingData1 = labelIndexer.fit(trainingData).transform(trainingData)

showDF(trainingData1)
print(trainingData1.count())

### We will be training a model with Random Forest, and because of this we need to split our dataset into a training set and a test set. We will split 80/20.

## TODO: Split the data using the predetermined train and test sets instead of a random split.

In [None]:
# Split the data into train and test
splits = trainingData1.randomSplit([0.8, 0.2], 1234)
train = splits[0]
test = splits[1]

print ("Train Dataframe Row Count: ")
print (train.count())
print ("Test Datafram Row Count: ")
print (test.count())

In [None]:
# Fixed seed so the trained forest, and therefore the reported accuracy, is
# repeatable across kernel restarts; 1234 matches the seed used for the split.
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10, seed=1234)

model = rf.fit(train)

# Score the held-out rows; transform() adds the prediction columns to the test frame.
predictions = model.transform(test)
print (predictions.count())
showDF(predictions)

In [None]:
# Show the original status next to the indexed label and the model output.
predictionColumns = ["status", "label", "prediction", "probability"]
showDF(predictions.select(*predictionColumns))

### We can now use the MulticlassClassificationEvaluator to evaluate the accuracy of our predictions. 

In [None]:
# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

In [None]:
# Clean up the demo table. IF EXISTS keeps this cell re-runnable after the table is gone,
# mirroring the IF NOT EXISTS used when the keyspace and table are created.
session.execute("""DROP TABLE IF EXISTS speech_data""")