# ML Trial 2

### <font color='blue'> Test the attempted improvements in feature selection </font>

* Recall, in the last notebook we did some feature engineering to provide the following feature sets:

In [1]:
import pandas as pd
featuresDF = pd.read_parquet('featuresCatalogDF.parquet')
featuresDF.head(10)

Unnamed: 0,name,description,featureCount,features
0,F1a,"F1a: All calculations, with Property-VSA range...",66,"[MolWt, ExactMolWt, qed, MolLogP, MolMR, VSA_t..."
1,F1b,"F1b: All calculations, with Property-VSA range...",66,"[MolWt, ExactMolWt, qed, MolLogP, MolMR, VSA_t..."
2,F2a,"F2a: Same as F1a, but excluding all features h...",57,"[MolWt, qed, MolLogP, TPSA, MaxPartialCharge, ..."
3,F2b,"F2b: Same as F1b, but excluding all features h...",57,"[MolWt, qed, MolLogP, TPSA, MaxPartialCharge, ..."
4,F3,"F3: Same as F2a/F2b, but excluding Property-VS...",21,"[MolWt, qed, MolLogP, TPSA, MaxPartialCharge, ..."
5,F4a,F4a: Metrics related to common filters (Lipins...,9,"[MolWt, MolLogP, MolMR, TPSA, NHOHCount, NOCou..."
6,F4b,F4a: Metrics related to common filters (Lipins...,9,"[MolWt, MolLogP, MolMR, TPSA, NumHAcceptors, N..."
7,F1bANOVA,F1bANOVA: F1b vars passing ANOVA (0.05 Sig.) a...,49,"[MolWt, ExactMolWt, qed, MolLogP, MolMR, VSA_t..."
8,F2bANOVA,F2bANOVA: F2b vars passing ANOVA (0.05 Sig.) a...,41,"[MolWt, qed, MolLogP, TPSA, MinPartialCharge, ..."


1. With all of the new feature sets saved in the **features catalog**, we can load the original data set and run through the data processing steps

In [6]:
# MSM VM config prep
import findspark
findspark.init('/home/mitch/spark-3.3.0-bin-hadoop2')
import pyspark

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('BApredsV1').getOrCreate()

import pandas as pd

# load the data
data = spark.read.csv("bioavailability_data_final.csv",inferSchema=True,sep=',',header=True)
# --- suppress future spark warnings/error/etc output ---
spark.sparkContext.setLogLevel("OFF")

In [7]:
''' # FEATURE SELECTION:
'''
featuresDF = pd.read_parquet('featuresCatalogDF.parquet')
feature_set1a = featuresDF.loc[0,'features']
feature_set1b = featuresDF.loc[1,'features']
feature_set2a = featuresDF.loc[2,'features']
feature_set2b = featuresDF.loc[3,'features']
feature_set3  = featuresDF.loc[4,'features']
feature_set4a = featuresDF.loc[5,'features']
feature_set4b = featuresDF.loc[6,'features']
F1bANOVA = featuresDF.loc[7,'features']
F2bANOVA = featuresDF.loc[8,'features']

''' # VECTORIZE FEATURES
'''
# VECTOR ASSEMBLY - feature sets 1a,1b,2a,2b,3,4a,4b
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import (VectorAssembler,VectorIndexer)

vec_assembler1a = VectorAssembler(inputCols = feature_set1a, outputCol='features1a')
vec_assembler1b = VectorAssembler(inputCols = feature_set1b, outputCol='features1b')
vec_assembler2a = VectorAssembler(inputCols = feature_set2a, outputCol='features2a')
vec_assembler2b = VectorAssembler(inputCols = feature_set2b, outputCol='features2b')
vec_assembler3 = VectorAssembler(inputCols = feature_set3, outputCol='features3')
vec_assembler4a = VectorAssembler(inputCols = feature_set4a, outputCol='features4a')
vec_assembler4b = VectorAssembler(inputCols = feature_set4b, outputCol='features4b')
vec_assembler1bANOVA = VectorAssembler(inputCols = F1bANOVA, outputCol='F1bANOVA')
vec_assembler2bANOVA = VectorAssembler(inputCols = F2bANOVA, outputCol='F2bANOVA')

from pyspark.ml import Pipeline
feature_pipeline = Pipeline(stages=[vec_assembler1a,
                                    vec_assembler1b,
                                    vec_assembler2a,
                                    vec_assembler2b,
                                    vec_assembler3,
                                    vec_assembler4a,
                                    vec_assembler4b,
                                    vec_assembler1bANOVA,
                                   vec_assembler2bANOVA])
data_features = feature_pipeline.fit(data).transform(data)


''' # DEPENDENT VARIABLE LABELS 
'''
from pyspark.ml.feature import QuantileDiscretizer
import pandas as pd
qd5 = QuantileDiscretizer(numBuckets=5,inputCol='BA_pct',outputCol='label_QD5')

data_features = qd5.fit(data_features).transform(data_features)

# -- INDEX / ENCODE LABELS
from pyspark.ml.feature import (StringIndexer,OneHotEncoder)

label_quant0 = 'BA_pct'
label_cat0_vector = OneHotEncoder(inputCol='label_QD5',outputCol='label_cat0_vector')

label_cat1_index = StringIndexer(inputCol='label1',outputCol='label_cat1_index')
label_cat1_vector = OneHotEncoder(inputCol='label_cat1_index',outputCol='label_cat1_vector')

label_cat2_index = StringIndexer(inputCol='label2',outputCol='label_cat2_index')
label_cat2_vector = OneHotEncoder(inputCol='label_cat2_index',outputCol='label_cat2_vector')

label_cat3_index = StringIndexer(inputCol='label3a',outputCol='label_cat3_index')
label_cat3_vector = OneHotEncoder(inputCol='label_cat3_index',outputCol='label_cat3_vector')

label_cat4_index = StringIndexer(inputCol='label3b',outputCol='label_cat4_index')
label_cat4_vector = OneHotEncoder(inputCol='label_cat4_index',outputCol='label_cat4_vector')

from pyspark.ml import Pipeline
label_pipeline = Pipeline(stages=[label_cat0_vector,
                                 label_cat1_index,label_cat1_vector,
                                 label_cat2_index,label_cat2_vector,
                                 label_cat3_index,label_cat3_vector,
                                 label_cat4_index,label_cat4_vector])

data_features = data_features.select(['Name','BA_pct',
                                      'label_QD5','label1','label2','label3a','label3b',
                                      'features1a','features1b','features2a','features2b',
                                      'features3','features4a','features4b','F1bANOVA','F2bANOVA'])

data_prefinal = label_pipeline.fit(data_features).transform(data_features)

data_prefinal2 = data_prefinal.withColumnRenamed('BA_pct','label_q0')
data_prefinal2 = data_prefinal2.withColumnRenamed('label_QD5','label_cat0')
data_prefinal2 = data_prefinal2.withColumnRenamed('label_cat1_index','label_cat1')
data_prefinal2 = data_prefinal2.withColumnRenamed('label_cat2_index','label_cat2')
data_prefinal2 = data_prefinal2.withColumnRenamed('label3a','label3')
data_prefinal2 = data_prefinal2.withColumnRenamed('label3b','label4')
data_prefinal2 = data_prefinal2.withColumnRenamed('label_cat3_index','label_cat3')
data_prefinal2 = data_prefinal2.withColumnRenamed('label_cat4_index','label_cat4')

data_final = data_prefinal2.select(['Name',
                                    'label_q0',
                                    'label_cat0','label_cat1',
                                    'label_cat2','label_cat3','label_cat4',
                                    'features1a','features1b','features2a','features2b',
                                    'features3','features4a','features4b',
                                    'F1bANOVA','F2bANOVA'])

2. let's also check if **Feature Scaling** has any effect; let's add that in

In [8]:
''' # TEST FEATURE SCALING
'''
# Scale values
from pyspark.ml.feature import StandardScaler

scaler1 = StandardScaler(inputCol="features2b", outputCol="features2bSs", withStd=True, withMean=False)
scaler2 = StandardScaler(inputCol="features2b", outputCol="features2bSm", withStd=False, withMean=True)

data_final = scaler1.fit(data_final).transform(data_final)
data_final = scaler2.fit(data_final).transform(data_final)

<font color='gray'> ..._now, all of the different labels and feature sets have been added to the data._ </font>

3. Let's **backup the dataset** so we can avoid executing all of the above data prep again in a new session

In [4]:
data_final.toPandas().to_pickle("bioavailability_data_final_wVectorFeaturesV1.pkl")

4. Now, we can easily load the processed data and get right into ML testing:

In [14]:
# load the dataset
data_final = pd.read_pickle("bioavailability_data_final_wVectorFeaturesV1.pkl")
data_final = spark.createDataFrame(data_final)

''' 
# RUN ML COMPARISON ACROSS ALL FEATURE SETS
'''
from pyspark.ml.regression import LinearRegression

labelName = 'label_q0'  # SPECIFY
lr_df = pd.DataFrame()
df = lr_df

allFeatures = ['features1a','features1b',
               'features2a','features2b',
               'features3','features4a','features4b',
               'F1bANOVA','F2bANOVA',"features2bSs","features2bSm"]

subset = data_final.select(['Name',labelName,
                            'features1a','features1b',
                            'features2a','features2b',
                            'features3','features4a','features4b',
                            'F1bANOVA','F2bANOVA',"features2bSs","features2bSm"]) 
train,test = subset.randomSplit([0.7,0.3])


i = 1
for index,features in enumerate(allFeatures):
    featnum = ['F1a','F1b','F2a','F2b','F3','F4a','F4b','F1bANOVA','F2bANOVA','F2bSs','F2bSm']
    featuresName = features
    
    lr = LinearRegression(featuresCol=featuresName,labelCol=labelName,predictionCol='prediction')
    
    ''' # SPECIFY MODEL 
    '''
    modeltype = lr 
    modeltypeVariantNo = '1' 
    modelname = f"lr{modeltypeVariantNo}_{featnum[index]}"  
    
    # FIT/TRAIN MODEL & TRANSFORM DATA
    mymodel = modeltype.fit(train)
    myresults = mymodel.transform(test)
    
    
    # CALCULATE KEY EVALS
    from pyspark.ml.evaluation import RegressionEvaluator
    regEvaluator = RegressionEvaluator(labelCol=labelName,predictionCol='prediction')
    evalMetrics = {regEvaluator:['rmse','mse','mae','r2','var']}
    evaluation2 = []
    evaluator = regEvaluator
    for each_metric in evalMetrics[evaluator]:        
        metric = each_metric
        result = evaluator.evaluate(myresults, {evaluator.metricName: metric})
        evaluation2.append((metric,result))
    r2_adj = mymodel.summary.r2adj
    evaluation2.append(('r2_adj',r2_adj))
    column0 = [x for x,y in evaluation2]
    column1 = [y for x,y in evaluation2]
    lr_df['metric'] = column0
    lr_df[modelname] = column1
    
    i += 1

lr_df.head(6)

                                                                                

Unnamed: 0,metric,lr1_F1a,lr1_F1b,lr1_F2a,lr1_F2b,lr1_F3,lr1_F4a,lr1_F4b,lr1_F1bANOVA,lr1_F2bANOVA,lr1_F2bSs,lr1_F2bSm
0,rmse,30.234244,29.754981,30.150807,29.618337,29.768845,30.465028,30.454849,29.512901,29.393705,29.618337,29.618337
1,mse,914.109504,885.3589,909.07119,877.245879,886.184156,928.117943,927.497829,871.011301,863.989874,877.24587,877.245858
2,mae,25.219772,24.791387,25.238552,24.705971,24.906565,25.834481,25.621583,24.453239,24.576838,24.705971,24.705971
3,r2,0.160231,0.186643,0.164859,0.194096,0.185885,0.147361,0.147931,0.199824,0.206274,0.194096,0.194096
4,var,338.481319,329.195543,324.194645,299.214879,242.757866,184.293318,183.516265,310.216213,280.727415,299.215002,299.215157
5,r2_adj,0.206959,0.213314,0.205683,0.201723,0.159328,0.121521,0.126379,0.202875,0.190284,0.201723,0.201723


<font color='blue'> ___for easier comparison of feature sets, we can transpose the Eval DF and sort by r2 value:___ </font>

In [15]:
# best features in terms of r2: 
testx = lr_df.set_index('metric')
testx = testx.transpose()
testx.sort_values(by=['r2'],ascending=False)

metric,rmse,mse,mae,r2,var,r2_adj
lr1_F2bANOVA,29.393705,863.989874,24.576838,0.206274,280.727415,0.190284
lr1_F1bANOVA,29.512901,871.011301,24.453239,0.199824,310.216213,0.202875
lr1_F2bSm,29.618337,877.245858,24.705971,0.194096,299.215157,0.201723
lr1_F2bSs,29.618337,877.24587,24.705971,0.194096,299.215002,0.201723
lr1_F2b,29.618337,877.245879,24.705971,0.194096,299.214879,0.201723
lr1_F1b,29.754981,885.3589,24.791387,0.186643,329.195543,0.213314
lr1_F3,29.768845,886.184156,24.906565,0.185885,242.757866,0.159328
lr1_F2a,30.150807,909.07119,25.238552,0.164859,324.194645,0.205683
lr1_F1a,30.234244,914.109504,25.219772,0.160231,338.481319,0.206959
lr1_F4b,30.454849,927.497829,25.621583,0.147931,183.516265,0.126379


### Investigate ML model Hyperparameter Optimization

* ML hyperparameter tuning

we will optimize hyperparameters such as maxIter, and regularization method
* Regularization by Ordinary Least Squares (OLS) method
  * `regParam = 0`
* Regularization by  Ridge regression
  * `regParam > 0`, `elasticNetParam = 0` 
* Regularization by  LASSO method
  * `regParam  > 0`, `elasticNetParam = 1` 
* Regularization by Elastic Net method
  * `regParam > 0` , `1 > elasticNetParam > 0` 

In [11]:
'''
# TEST 1: Optimize regParam, elasticNet, maxIter
'''
subset_opt = data_final.select(['label_q0','F1bANOVA']) 

train_opt,test_opt = subset_opt.randomSplit([0.7,0.3])

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

modelname = 'linreg_optimized'

from pyspark.ml.evaluation import RegressionEvaluator

lr=LinearRegression(labelCol='label_q0',featuresCol='F1bANOVA')

# Choose parameters to vary over test
paramGrid = ParamGridBuilder()\
.addGrid(lr.regParam, [0, 0.0001, 0.01, 0.1, 0.5, 1.0, 2.0])\
.addGrid(lr.elasticNetParam, [0.0, 0.3, 0.7, 1.0])\
.addGrid(lr.maxIter, [1,2, 10, 100])\
.build()

evaluator=RegressionEvaluator(predictionCol='prediction',labelCol='label_q0',metricName='r2')
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=3)

cvModel = cv.fit(train_opt)

# Test the NLP model
testresults = cvModel.transform(test_opt)

* let's see which was the best model

In [12]:
best_mod = cvModel.bestModel
param_dict = best_mod.extractParamMap() 

sane_dict = {}
for k, v in param_dict.items():
    sane_dict[k.name] = v

best_reg = sane_dict["regParam"]
best_elastic_net = sane_dict["elasticNetParam"]
best_max_iter = sane_dict["maxIter"]
sane_dict

{'aggregationDepth': 2,
 'elasticNetParam': 0.3,
 'epsilon': 1.35,
 'featuresCol': 'F1bANOVA',
 'fitIntercept': True,
 'labelCol': 'label_q0',
 'loss': 'squaredError',
 'maxBlockSizeInMB': 0.0,
 'maxIter': 100,
 'predictionCol': 'prediction',
 'regParam': 0.5,
 'solver': 'auto',
 'standardization': True,
 'tol': 1e-06}

* It appears that the best model used **Elastic Net** regularization (since regParam=0.5, elasticNetParam=0.3)

* let's see how it compares 

In [17]:
'''
RE RUN ML COMPARISON WITH NEW ANOVA FEATURES & SCALED FEATURES
'''
finalColumns = ['Name','label_q0','features2b','features2bSs','features2bSm','F1bANOVA','F2bANOVA','features1a','features1b','features3','features4a','features4b']
from pyspark.ml.regression import LinearRegression

featuresName = '' # temp value, redefined below

#lr_df = pd.DataFrame()     # already exists from prior run
labelName = 'label_q0'  # SPECIFY

df = lr_df

i = 1
coefficients = {}
allFeatures = finalColumns
allFeatures.remove('Name')
allFeatures.remove(labelName)

for index,features in enumerate(allFeatures):
    featnum = allFeatures
    featuresName = features
    
    lr1_opt = LinearRegression(featuresCol=featuresName,
                                labelCol='label_q0',
                                regParam=best_reg, #0.1,
                                elasticNetParam=best_elastic_net, #1,
                                maxIter=best_max_iter) #100)
    
    modeltype = lr1_opt #lr1
    modeltypeVariantNo = 'lr1_opt' # SPECIFY modeltype variation number
    modelname = f"{modeltypeVariantNo}_{featnum[index]}" 
    
    mymodel = modeltype.fit(train)
    myresults = mymodel.transform(test)
    
    # CALCULATE KEY EVALS
    from pyspark.ml.evaluation import RegressionEvaluator
    regEvaluator = RegressionEvaluator(labelCol=labelName,predictionCol='prediction')
    evalMetrics = {regEvaluator:['rmse','mse','mae','r2','var']}
    evaluation2 = []
    evaluator = regEvaluator
    for each_metric in evalMetrics[evaluator]:        
        metric = each_metric
        
        result = evaluator.evaluate(myresults, {evaluator.metricName: metric})
        evaluation2.append((metric,result))
        
    r2_adj = mymodel.summary.r2adj
    evaluation2.append(('r2_adj(Training)',r2_adj))
    column0 = [x for x,y in evaluation2]
    column1 = [y for x,y in evaluation2]
    lr_df['metric'] = column0
    lr_df[modelname] = column1
    
    i += 1


testx = lr_df.set_index('metric')
testx = testx.transpose()
testx.sort_values(by=['r2'],ascending=False)

metric,rmse,mse,mae,r2,var,r2_adj(Training)
lr1_F2bANOVA,29.393705,863.989874,24.576838,0.206274,280.727415,0.190284
lr1_opt_features1b,29.504458,870.513044,24.704965,0.200282,271.69712,0.187619
lr1_opt_F1bANOVA,29.507031,870.664904,24.722409,0.200142,260.884935,0.183945
lr1_F1bANOVA,29.512901,871.011301,24.453239,0.199824,310.216213,0.202875
lr1_opt_features2b,29.540723,872.654297,24.738766,0.198315,257.119047,0.188303
lr1_opt_features2bSm,29.540723,872.654297,24.738766,0.198315,257.119047,0.188303
lr1_opt_features2bSs,29.540723,872.654297,24.738766,0.198315,257.119047,0.188303
lr1_opt_F2bANOVA,29.598158,876.050976,24.814714,0.195194,247.205445,0.183302
lr1_F2bSm,29.618337,877.245858,24.705971,0.194096,299.215157,0.201723
lr1_F2bSs,29.618337,877.24587,24.705971,0.194096,299.215002,0.201723
