# ML Trial 01 
### _Test predictive performance using all Property features_

* specify spark environment for this pc

In [1]:
# MSM VM config prep
import findspark
findspark.init('/home/mitch/spark-3.3.0-bin-hadoop2')
import pyspark

* create a spark session & load data

In [43]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('BApredsV1').getOrCreate()

# --- suppress future spark warnings/error/etc output ---
spark.sparkContext.setLogLevel("OFF")

In [4]:
import pandas as pd
data = pd.read_pickle("data/bioavailability_data_wFeatures.pkl")
data = data.rename(columns={'Name':'name','_c0':'index'})
data = data.drop(columns='drug_name')
data = spark.createDataFrame(data)

#### first regression and classification test

* prepare feature vectors

In [5]:
cols_to_feature = ['MolWt','ExactMolWt','qed','MolLogP','MolMR','VSA_total','LabuteASA','TPSA',
                   'MaxPartialCharge','MinPartialCharge','MaxAbsPartialCharge','MinAbsPartialCharge',
                   'NumHAcceptors','NumHDonors','HeavyAtomCount','NumHeteroatoms','NumRotatableBonds',
                   'NHOHCount','NOCount','FractionCSP3','RingCount','NumAliphaticRings','NumAromaticRings',
                   'NumAliphaticHeterocycles','NumAromaticHeterocycles','NumSaturatedHeterocycles',
                   'NumSaturatedRings','BalabanJ','BertzCT','HallKierAlpha',
                   'fracVSA_PEOE01','fracVSA_PEOE02','fracVSA_PEOE03','fracVSA_PEOE04','fracVSA_PEOE05',
                   'fracVSA_PEOE06','fracVSA_PEOE07','fracVSA_PEOE08','fracVSA_PEOE09','fracVSA_PEOE10',
                   'fracVSA_PEOE11','fracVSA_PEOE12','fracVSA_PEOE13','fracVSA_PEOE14',
                   'fracVSA_SMR01','fracVSA_SMR02','fracVSA_SMR03','fracVSA_SMR04','fracVSA_SMR05',
                   'fracVSA_SMR06','fracVSA_SMR07','fracVSA_SMR08','fracVSA_SMR09','fracVSA_SMR10',
                   'fracVSA_SlogP01','fracVSA_SlogP02','fracVSA_SlogP03','fracVSA_SlogP04',
                   'fracVSA_SlogP05','fracVSA_SlogP06','fracVSA_SlogP07','fracVSA_SlogP08',
                   'fracVSA_SlogP09','fracVSA_SlogP10','fracVSA_SlogP11','fracVSA_SlogP12']

from pyspark.ml.linalg import Vector
from pyspark.ml.feature import (VectorAssembler,VectorIndexer)

vec_assembler = VectorAssembler(inputCols = cols_to_feature, outputCol='features')
data_w_features = vec_assembler.transform(data)

* index/encode categorical columns

In [6]:
from pyspark.ml.feature import (StringIndexer,OneHotEncoder)

label_quant0 = 'BA_pct'

label_cat1_index = StringIndexer(inputCol='label1',outputCol='label_cat1_index')

label_cat2_index = StringIndexer(inputCol='label2',outputCol='label_cat2_index')

label_cat3_index = StringIndexer(inputCol='label3a',outputCol='label_cat3_index')

label_cat4_index = StringIndexer(inputCol='label3b',outputCol='label_cat4_index')

from pyspark.ml import Pipeline

data_pipeline = Pipeline(stages=[label_cat1_index,
                                 label_cat2_index,
                                 label_cat3_index,
                                 label_cat4_index])

data_w_features = data_w_features.select(['Name','BA_pct','label_QD5','label1','label2','label3a','label3b','features'])
data_prefinal = data_pipeline.fit(data_w_features).transform(data_w_features)

                                                                                

In [7]:
# clean up data
data_prefinal2 = data_prefinal.withColumnRenamed('BA_pct','label_q0')
data_prefinal2 = data_prefinal2.withColumnRenamed('label_QD5','label_cat0')
data_prefinal2 = data_prefinal2.withColumnRenamed('label_cat1_index','label_cat1')
data_prefinal2 = data_prefinal2.withColumnRenamed('label_cat2_index','label_cat2')
data_prefinal2 = data_prefinal2.withColumnRenamed('label_cat3_index','label_cat3')
data_prefinal2 = data_prefinal2.withColumnRenamed('label_cat4_index','label_cat4')

data_final = data_prefinal2.select(['Name',
                                    'label_q0',
                                    'label_cat0','label_cat1',
                                    'label_cat2','label_cat3','label_cat4',
                                    'features'])

# Set up ML Flow experiment for ML Trial 1

* test a linear regression model

In [9]:
import mlflow
import mlflow.spark
from pathlib import Path

In [10]:
experiment_id = mlflow.create_experiment(
    "ML Trial 1",
    artifact_location=Path.cwd().joinpath("mlruns/trial1").as_uri(),
    tags={"version": "v1", "priority": "P1"},
)
experiment = mlflow.get_experiment(experiment_id)
print("Name: {}".format(experiment.name))
print("Experiment_id: {}".format(experiment.experiment_id))
print("Artifact Location: {}".format(experiment.artifact_location))
print("Tags: {}".format(experiment.tags))
print("Lifecycle_stage: {}".format(experiment.lifecycle_stage))
print("Creation timestamp: {}".format(experiment.creation_time))

Name: ML Trial 1
Experiment_id: 1
Artifact Location: file:///media/sf_Public/new%20BA%20folder/mlruns/trial1
Tags: {'priority': 'P1', 'version': 'v1'}
Lifecycle_stage: active
Creation timestamp: 1664390446509


# Define trial runs with MLflow tracking

In [45]:
import os
import warnings
import sys
import pandas as pd
import numpy as np
from urllib.parse import urlparse
import mlflow
import mlflow.spark

import logging
logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)
warnings.filterwarnings("ignore")

import mlflow
import mlflow.spark
from pyspark.ml.regression import (LinearRegression,RandomForestRegressor,GBTRegressor,
                                   DecisionTreeRegressor,GeneralizedLinearRegression)
from pyspark.ml.evaluation import RegressionEvaluator
    
def evaluation(myresults):
    
    regEvaluator = RegressionEvaluator(labelCol='label_q0',predictionCol='prediction')
    
    evalMetrics = {regEvaluator:['rmse','mse','mae','r2','var']}
    evaluator = regEvaluator
    eval_results = {}
    for each_metric in evalMetrics[evaluator]:
        metric = each_metric
        result = evaluator.evaluate(myresults, {evaluator.metricName: metric})

        eval_results[each_metric] = result

    #df_out = pd.DataFrame.from_dict(eval_results)
    return eval_results

def trial_run(train,test,trialName='trial1',modelType='linreg'):
    features_choice = 'rdkit'
    
    import logging
    logging.basicConfig(level=logging.WARN)
    logger = logging.getLogger(__name__)
    warnings.filterwarnings("ignore")
    with mlflow.start_run(experiment_id=experiment_id,run_name=modelType):

        if modelType=='linreg':
            model = LinearRegression(featuresCol='features',labelCol='label_q0',predictionCol='prediction')
        elif modelType=='glr':
            model = GeneralizedLinearRegression(featuresCol='features',labelCol='label_q0',predictionCol='prediction')
        elif modelType=='rfr':
            model = RandomForestRegressor(featuresCol='features',labelCol='label_q0',predictionCol='prediction')
        elif modelType=='dtr':
            model = DecisionTreeRegressor(featuresCol='features',labelCol='label_q0',predictionCol='prediction')
        elif modelType=='gbtr':
            model = GBTRegressor(featuresCol='features',labelCol='label_q0',predictionCol='prediction')

        mymodel = model.fit(train)
        myresults = mymodel.transform(test)

        #print(lmResults1A.rootMeanSquaredError, lmResults1A.r2)
        eval_results = evaluation(myresults)
        
        print(f"MODEL:\t{modelType}\nRMSE:\t{eval_results['rmse']}\nR2:\t{eval_results['r2']}")

        ''' # LOG PARAMS & METRICS IN MLFLOW '''
        mlflow.log_param("experiment",mlflow.get_experiment(experiment_id).name)
        mlflow.log_param("features",features_choice)
        mlflow.log_param("model type",modelType)
        mlflow.log_metric("rmse", eval_results['rmse'])
        mlflow.log_metric("mae", eval_results['mae'])
        mlflow.log_metric("r2", eval_results['r2'])
        mlflow.spark.log_model(mymodel,trialName)

In [34]:
subset = data_final.select(['label_q0','features'])
        
train,test = subset.randomSplit([0.7,0.3])

In [35]:
trial_run(train,test,'trial1','linreg')

                                                                                

MODEL:	linreg
RMSE:	29.93428244704524
R2:	0.18841203511998927


In [37]:
trial_run(train,test,'trial1','glr')

                                                                                

MODEL:	glr
RMSE:	29.93434126369391
R2:	0.1884088458047838


In [47]:
trial_run(train,test,'trial1','rfr')

                                                                                

MODEL:	rfr
RMSE:	29.704073253033936
R2:	0.20084704741940584


In [46]:
trial_run(train,test,'trial1','dtr')

                                                                                

MODEL:	dtr
RMSE:	32.38434214493323
R2:	0.05012151182790403


In [40]:
trial_run(train,test,'trial1','gbtr')

                                                                                

MODEL:	gbtr
RMSE:	31.570289689979017
R2:	0.097275918803156


<font color='purple'> ***Observations:*** </font> <br>
The r-squared value was best for the Random Forest regressor, followed by Linear Regression model.
<br>
<font color='orange'> ***Next step:*** </font> <br>
This trial was conducted on all rdkit features, comparing different Regression models. <br> The next trial will compare different feature set variants of the rdkit features.  