In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
import matplotlib.pyplot as plt
import numpy as np

In [0]:
df = spark.read.csv('dataset/drug200.csv', header = True, inferSchema = True)
display(df)

Age,Sex,BP,Cholesterol,Na_to_K,Drug
23,F,HIGH,HIGH,25.355,DrugY
47,M,LOW,HIGH,13.093,drugC
47,M,LOW,HIGH,10.114,drugC
28,F,NORMAL,HIGH,7.798,drugX
61,F,LOW,HIGH,18.043,DrugY
22,F,NORMAL,HIGH,8.607,drugX
49,F,NORMAL,HIGH,16.275,DrugY
41,M,LOW,HIGH,11.037,drugC
60,M,NORMAL,HIGH,15.171,DrugY
43,M,LOW,NORMAL,19.368,DrugY


In [0]:
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

stages = []

# StringIndexer for output
label_stringIdx = StringIndexer(inputCol = 'Drug', outputCol = 'label')
stages += [label_stringIdx]

# One Hot Encoder for categorical feature
categoricalColumns = ['Sex', 'BP', 'Cholesterol']
for categoricalCol in categoricalColumns:
    stringIndexer = StringIndexer(inputCol = categoricalCol, outputCol = categoricalCol + 'Index')
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]
    
numericCols = ['Age', 'Na_to_K']
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols

# Assembler all input to feature
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")

stages += [assembler]

In [0]:
# data transform
pipeline = Pipeline(stages=stages)
fullData = pipeline.fit(df).transform(df)

train, test = fullData.randomSplit([0.8, 0.2], seed = 2020)

In [0]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# define RandomForestClassifier input and output
rf = RandomForestClassifier(labelCol="label", featuresCol="features")

# evaluate base on MulticlassClassification
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

# Grid Search Space
paramGrid = ParamGridBuilder() \
    .addGrid(rf.numTrees, list( range(3,10) ) ) \
    .addGrid(rf.maxDepth, list( range(3,10) ) ) \
    .build()

# CrossValidator
cv = CrossValidator(estimator = rf, estimatorParamMaps = paramGrid, evaluator = evaluator, numFolds = 4)
cvModel = cv.fit(train)

In [0]:
X = []
Y = []
Z = []
for tem in zip(paramGrid,cvModel.avgMetrics):
  print( list(tem[0].values())[0], list(tem[0].values())[1], tem[1] )
  X.append( list(tem[0].values())[0] )
  Y.append( list(tem[0].values())[1] )
  Z.append( tem[1] )

import plotly.graph_objects as go
from plotly.subplots import make_subplots

fig = go.Figure(data=[go.Mesh3d(x=X, y=Y, z=Z, color='lightpink', opacity=0.3)] )

fig.update_layout(
    autosize=False,
    width=800,
    height=800)

fig.show()

In [0]:
# print best parameter
print( "the best parameter is:" )
print('numTrees - ', cvModel.bestModel.getNumTrees)
print('maxDepth - ', cvModel.bestModel.getOrDefault('maxDepth'))

cv_predictions = cvModel.bestModel.transform(test)
evaluator.evaluate(cv_predictions)

In [0]:
# display predict result
display(cv_predictions.select('Sex', 'BP', 'Cholesterol','Age', 'Na_to_K',"label","prediction"))

Sex,BP,Cholesterol,Age,Na_to_K,label,prediction
M,HIGH,NORMAL,15,17.206,0.0,0.0
F,HIGH,NORMAL,16,15.516,0.0,0.0
F,HIGH,HIGH,18,37.188,0.0,0.0
F,HIGH,NORMAL,18,24.276,0.0,0.0
F,HIGH,HIGH,19,13.313,2.0,2.0
M,LOW,HIGH,22,8.151,3.0,3.0
M,HIGH,HIGH,23,8.011,2.0,2.0
M,LOW,HIGH,23,7.298,3.0,3.0
M,NORMAL,HIGH,23,31.686,0.0,0.0
M,NORMAL,NORMAL,23,14.02,1.0,1.0


In [0]:
import pandas as pd
import numpy as np
from bayes_opt import BayesianOptimization
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

In [0]:
df = spark.read.csv('dataset/drug200.csv', header = True, inferSchema = True)
display(df)

df = df.toPandas()
df = pd.concat([df[['Age','Na_to_K']],pd.get_dummies(df[['Sex', 'BP','Cholesterol']], prefix=['Sex', 'BP','Cholesterol']),df[['Drug']]],axis=1)
 
x = df.iloc[:,:-1]
y = df.iloc[:,-1]
 
x_train,x_test,y_train,y_test = train_test_split( x, y, test_size=0.25, random_state=0 )

Age,Sex,BP,Cholesterol,Na_to_K,Drug
23,F,HIGH,HIGH,25.355,DrugY
47,M,LOW,HIGH,13.093,drugC
47,M,LOW,HIGH,10.114,drugC
28,F,NORMAL,HIGH,7.798,drugX
61,F,LOW,HIGH,18.043,DrugY
22,F,NORMAL,HIGH,8.607,drugX
49,F,NORMAL,HIGH,16.275,DrugY
41,M,LOW,HIGH,11.037,drugC
60,M,NORMAL,HIGH,15.171,DrugY
43,M,LOW,NORMAL,19.368,DrugY


In [0]:
# define RandomForest 
def function(n_estimators, max_depth):
  return cross_val_score(
         RandomForestClassifier(
             n_estimators=int(max(n_estimators,0)),                                                               
             max_depth=int(max(max_depth,1)),
             random_state=42,   
             class_weight="balanced"),  
         X=x_train, 
         y=y_train, 
         cv = 4).mean()

In [0]:
# define search space
parameters = {"n_estimators": (3,10),
                  "max_depth": (3,10)}

# Bayesian Search
BO = BayesianOptimization(function, parameters)
BO.maximize(n_iter=10)
best_solution = BO.max

# keep best params
params = best_solution["params"]
print( "best_params:",params )

In [0]:
X = []
Y = []
Z = []

for tem in BO.res:
  X.append( int(tem['params']['n_estimators']) )
  Y.append( int(tem['params']['max_depth']) )
  Z.append( tem['target'] )
  
fig = go.Figure(data=[go.Mesh3d(x=X, y=Y, z=Z, color='lightpink', opacity=0.3)] )

fig.update_layout(
    autosize=False,
    width=800,
    height=800)

fig.show()

In [0]:
# build model on best solution
model = RandomForestClassifier(
         n_estimators=int(params["n_estimators"]),
         max_depth=int(params["max_depth"]),
         random_state=42,   
         class_weight="balanced")

# train best model
model.fit(x_train,y_train)

# display predict result
res_DF = pd.concat( [x_test.reset_index(),y_test.reset_index(), pd.DataFrame(  list(model.predict(x_test))  ,columns =['predict_Drug'])],axis =1   )
res_DF['flag'] = res_DF['Drug'] == res_DF['predict_Drug']
res_DF

Unnamed: 0,index,Age,Na_to_K,Sex_F,Sex_M,BP_HIGH,BP_LOW,BP_NORMAL,Cholesterol_HIGH,Cholesterol_NORMAL,index.1,Drug,predict_Drug,flag
0,18,23,7.298,0,1,0,1,0,1,0,18,drugC,drugC,True
1,170,28,12.879,1,0,0,0,1,1,0,170,drugX,drugX,True
2,107,42,20.013,0,1,0,1,0,1,0,107,DrugY,DrugY,True
3,98,20,35.639,0,1,1,0,0,0,1,98,DrugY,DrugY,True
4,177,25,19.011,0,1,0,0,1,1,0,177,DrugY,DrugY,True
5,182,20,11.686,1,0,0,1,0,0,1,182,drugX,drugX,True
6,5,22,8.607,1,0,0,0,1,1,0,5,drugX,drugX,True
7,146,37,12.006,1,0,0,1,0,0,1,146,drugX,drugX,True
8,12,43,15.376,0,1,0,1,0,1,0,12,DrugY,DrugY,True
9,152,55,7.261,0,1,0,0,1,0,1,152,drugX,drugX,True
