# ML Trial 01 
### _Test predictive performance using all Property features_

* specify spark environment for this pc

In [2]:
# MSM VM config prep
import os
import findspark

# Spark install used on the original VM. It is still preferred when present, so
# behaviour on that machine is unchanged; on any other machine we pass None and
# let findspark locate Spark itself (e.g. via the SPARK_HOME environment variable).
_LOCAL_SPARK_HOME = '/home/mitch/spark-3.3.0-bin-hadoop2'
findspark.init(_LOCAL_SPARK_HOME if os.path.isdir(_LOCAL_SPARK_HOME) else None)

# Keep this import after findspark.init(): the init call is what puts the
# chosen Spark installation's pyspark on the import path.
import pyspark

* create a spark session & load data

In [4]:
import pandas as pd
from pyspark.sql import SparkSession

# One Spark session for the whole notebook (reused if it already exists).
spark = SparkSession.builder.appName('BApredsV1').getOrCreate()

# --- suppress future spark warnings/error/etc output ---
spark.sparkContext.setLogLevel("OFF")

# Read the pickled pandas frame, tidy its column names, drop the column that is
# not needed downstream, and hand the result to Spark in a single expression.
# NOTE(review): pd.read_pickle can execute arbitrary code while loading; only
# use it on a file from a trusted source.
data = spark.createDataFrame(
    pd.read_pickle("data/bioavailability_data_wFeatures.pkl")
    .rename(columns={'Name':'name','_c0':'index'})
    .drop(columns='drug_name')
)

#### first regression and classification test

* prepare feature vectors

In [11]:
# Names of every column that is assembled into the model's feature vector.
# The first 30 entries are listed explicitly; the three numbered fracVSA_*
# families (zero-padded, 1-based suffixes) are generated to avoid typos.
cols_to_feature = [
    'MolWt', 'ExactMolWt', 'qed', 'MolLogP', 'MolMR',
    'VSA_total', 'LabuteASA', 'TPSA',
    'MaxPartialCharge', 'MinPartialCharge',
    'MaxAbsPartialCharge', 'MinAbsPartialCharge',
    'NumHAcceptors', 'NumHDonors', 'HeavyAtomCount',
    'NumHeteroatoms', 'NumRotatableBonds',
    'NHOHCount', 'NOCount', 'FractionCSP3',
    'RingCount', 'NumAliphaticRings', 'NumAromaticRings',
    'NumAliphaticHeterocycles', 'NumAromaticHeterocycles',
    'NumSaturatedHeterocycles', 'NumSaturatedRings',
    'BalabanJ', 'BertzCT', 'HallKierAlpha',
    # fracVSA_PEOE01 .. fracVSA_PEOE14
    *(f'fracVSA_PEOE{bin_no:02d}' for bin_no in range(1, 15)),
    # fracVSA_SMR01 .. fracVSA_SMR10
    *(f'fracVSA_SMR{bin_no:02d}' for bin_no in range(1, 11)),
    # fracVSA_SlogP01 .. fracVSA_SlogP12
    *(f'fracVSA_SlogP{bin_no:02d}' for bin_no in range(1, 13)),
]

from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml.linalg import Vector

# Combine every column named in cols_to_feature into one vector column called
# 'features', which is the input the regression/classification models read.
vec_assembler = VectorAssembler(
    inputCols=cols_to_feature,
    outputCol='features',
)
data_w_features = vec_assembler.transform(data)

* index/encode categorical columns

In [12]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import (StringIndexer,OneHotEncoder)

# Name of the quantitative (regression) target column.
label_quant0 = 'BA_pct'

# One StringIndexer per string-valued label column; the n-th source column is
# written to 'label_cat<n>_index' (n = 1..4).
label_cat1_index, label_cat2_index, label_cat3_index, label_cat4_index = (
    StringIndexer(inputCol=source_col, outputCol=f'label_cat{position}_index')
    for position, source_col in enumerate(['label1','label2','label3a','label3b'], start=1)
)

# Run the four indexers as a single pipeline so they are fitted and applied together.
data_pipeline = Pipeline(stages=[
    label_cat1_index,
    label_cat2_index,
    label_cat3_index,
    label_cat4_index,
])

# Keep only the identifier, the label columns and the assembled feature vector
# before fitting the indexers.
data_w_features = data_w_features.select(
    ['Name','BA_pct','label_QD5','label1','label2','label3a','label3b','features']
)
data_prefinal = data_pipeline.fit(data_w_features).transform(data_w_features)

                                                                                

In [13]:
# clean up data
data_prefinal2 = data_prefinal.withColumnRenamed('BA_pct','label_q0')
data_prefinal2 = data_prefinal2.withColumnRenamed('label_QD5','label_cat0')
data_prefinal2 = data_prefinal2.withColumnRenamed('label_cat1_index','label_cat1')
data_prefinal2 = data_prefinal2.withColumnRenamed('label_cat2_index','label_cat2')
data_prefinal2 = data_prefinal2.withColumnRenamed('label_cat3_index','label_cat3')
data_prefinal2 = data_prefinal2.withColumnRenamed('label_cat4_index','label_cat4')

data_final = data_prefinal2.select(['Name',
                                    'label_q0',
                                    'label_cat0','label_cat1',
                                    'label_cat2','label_cat3','label_cat4',
                                    'features'])

* test a linear regression model

In [14]:
from pyspark.ml.regression import LinearRegression

# Fixed seed for the train/test split so the metrics printed below can be
# reproduced when the notebook is re-run. Output saved from an earlier,
# unseeded run will not match these numbers.
SPLIT_SEED = 42

# Only the quantitative label and the feature vector are needed here.
subset_q0 = data_final.select(['label_q0','features'])
train1_q0,test1_q0 = subset_q0.randomSplit([0.7,0.3], seed=SPLIT_SEED)

lm_A = LinearRegression(featuresCol='features',labelCol='label_q0',predictionCol='prediction')

# Fit on the 70% split and evaluate on the held-out 30%.
lmModel_1A = lm_A.fit(train1_q0)
lmResults1A = lmModel_1A.evaluate(test1_q0)

# RMSE and R^2 on the held-out split.
print(lmResults1A.rootMeanSquaredError, lmResults1A.r2)

[Stage 18:>                                                         (0 + 4) / 4]

28.792385189676 0.21404417800257503




<font color='purple'> ***Observations:*** </font> <br>
The Linear Regression model explains only about 21% of the variance in `BA_pct` on the held-out test set (R² ≈ 0.21, RMSE ≈ 28.8), which is rather disappointing.
<br>
<font color='orange'> ***Next step:*** </font> <br>
Let's also test the performance of a Classification model, to predict the categorical label columns

* test a logistic regression model using `label_cat0` (Spark's QuantileDiscretizer label column)

In [16]:
def log_reg_test(dataset,modelname,labelName,featuresName,eval_comparison,seed=None):
    """Fit a logistic regression on a 70/30 split and tabulate test-set metrics.

    Parameters
    ----------
    dataset : Spark DataFrame
        Must contain the columns named by ``labelName`` and ``featuresName``.
    modelname : str
        Name of the column that will hold this model's metric values in the
        returned table.
    labelName : str
        Name of the label column to predict.
    featuresName : str
        Name of the feature-vector column.
    eval_comparison : pandas.DataFrame
        Existing comparison table (may be empty). It is not modified; a copy
        with the new column added is returned.
    seed : int, optional
        Seed for the random train/test split. The default ``None`` leaves the
        split unseeded, as before; pass an integer for a repeatable split.

    Returns
    -------
    pandas.DataFrame
        Copy of ``eval_comparison`` with a ``'metric'`` column (areaUnderROC,
        areaUnderPR, f1, weightedPrecision, weightedRecall, accuracy, in that
        order) and a ``modelname`` column holding the matching values.

    Notes
    -----
    NOTE(review): the two area-under-curve metrics are computed by a
    BinaryClassificationEvaluator that is given the hard ``prediction`` column
    as its raw score. When the label has more than two classes these values
    are unlikely to be meaningful; confirm before relying on them.
    """
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator

    # Select and split data.
    subset = dataset.select([labelName,featuresName])
    train,test = subset.randomSplit([0.7,0.3], seed=seed)

    # Fit on the training split and score the held-out split.
    lr = LogisticRegression(featuresCol=featuresName,labelCol=labelName,predictionCol='prediction')
    predictions = lr.fit(train).transform(test)

    # Each evaluator is paired with the metrics it is asked to report; the
    # order here fixes the row order of the returned table.
    multiEvaluator = MulticlassClassificationEvaluator(labelCol=labelName, predictionCol="prediction")
    binEvaluator = BinaryClassificationEvaluator(labelCol=labelName, rawPredictionCol="prediction")
    metrics_by_evaluator = [
        (binEvaluator, ['areaUnderROC','areaUnderPR']),
        (multiEvaluator, ['f1','weightedPrecision','weightedRecall','accuracy']),
    ]

    # The metric name is overridden per call through the evaluator's param map.
    evaluation = [
        (metric, evaluator.evaluate(predictions, {evaluator.metricName: metric}))
        for evaluator, metrics in metrics_by_evaluator
        for metric in metrics
    ]

    # Work on a copy so the caller's table is not mutated as a side effect.
    eval_comparison = eval_comparison.copy()
    eval_comparison['metric'] = [name for name, _ in evaluation]
    eval_comparison[modelname] = [value for _, value in evaluation]

    return eval_comparison

In [29]:
# Start a fresh comparison table; later model runs add their own column to it.
# NOTE: log_reg_test must already have been defined in this kernel session.
eval_comparison = pd.DataFrame()

# Inputs for this run: the 5-class label column 'label_cat0'.
dataset, featuresName = data_final, 'features'
labelName, modelname = 'label_cat0', 'lr_cat0'

eval_comparison = log_reg_test(
    dataset=dataset,
    modelname=modelname,
    labelName=labelName,
    featuresName=featuresName,
    eval_comparison=eval_comparison,
)

# Show the six metric rows.
eval_comparison.head(6)

NameError: name 'log_reg_test' is not defined

<font color='purple'> ***Observations:*** </font> <br>
Logistic Regression reaches 33.5% accuracy when predicting BA as one of five classes: _very low, low, mid, high, very high_
<br>
<font color='orange'> ***Next step:*** </font> <br>
Check performance when predicting between a 3-class label column

* test a logistic regression model using `label_cat1` (3-category BA labels: _low, medium, high_)

In [18]:
# Inputs for this run: the 3-class label column 'label_cat1'. The existing
# eval_comparison table is passed in so the new results sit next to lr_cat0.
dataset, featuresName = data_final, 'features'
labelName, modelname = 'label_cat1', 'lr_cat1'

eval_comparison = log_reg_test(
    dataset=dataset,
    modelname=modelname,
    labelName=labelName,
    featuresName=featuresName,
    eval_comparison=eval_comparison,
)

# Show the six metric rows for both models side by side.
eval_comparison.head(6)

Unnamed: 0,metric,lr_cat0,lr_cat1
0,areaUnderROC,0.664308,0.673774
1,areaUnderPR,0.834057,0.686729
2,f1,0.337569,0.527839
3,weightedPrecision,0.343967,0.535
4,weightedRecall,0.334773,0.54898
5,accuracy,0.334773,0.54898


<font color='purple'> ***Observations:*** </font> <br>
Logistic Regression reaches 54.9% accuracy when predicting BA as one of three classes: _low, medium, high_
<br>
<font color='orange'> ***Next step:*** </font> <br>
In the next notebook, we'll work on some feature engineering to see if we can improve predictive quality.