# ML Trial 2

### <font color='blue'> Test the attempted improvements in feature selection </font>

* Recall, in the last notebook we did some feature engineering to provide the following feature sets:

In [None]:
# Load the features catalog and preview its first rows.
# pandas is imported here because this is the first cell that uses `pd`;
# without it, running the notebook top-to-bottom in a fresh kernel raises NameError.
import pandas as pd

featuresDF = pd.read_parquet('featuresCatalogDF_2022-08-17_new.parquet')
featuresDF.head(10)

1. With all of the new feature sets saved in the **features catalog**, we can load the original data set and run through the data processing steps

In [None]:
# MSM VM config prep: point findspark at the local Spark install before importing pyspark
import findspark
findspark.init('/home/mitch/spark-3.3.0-bin-hadoop2')
import pyspark

import pandas as pd

from pyspark.sql import SparkSession

# create the session for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName('BApredsV1')
    .getOrCreate()
)

# load the data: comma-separated, first row is the header, column types inferred
data = spark.read.csv(
    "bioavailability_data_final.csv",
    sep=',',
    header=True,
    inferSchema=True,
)

''' # FEATURE SELECTION:
'''
# Catalog rows 0-8 are read, in order, into the nine feature-set variables below;
# each value comes from the row's 'features' column.
featuresDF = pd.read_parquet('featuresCatalogDF_2022-08-17_new.parquet')
(feature_set1a, feature_set1b,
 feature_set2a, feature_set2b,
 feature_set3,
 feature_set4a, feature_set4b,
 F1bANOVA, F2bANOVA) = (featuresDF.loc[row, 'features'] for row in range(9))

''' # VECTORIZE FEATURES
'''
# VECTOR ASSEMBLY - feature sets 1a,1b,2a,2b,3,4a,4b (plus F1bANOVA and F2bANOVA)
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import (VectorAssembler,VectorIndexer)

# (input column list, name of the assembled vector column), in pipeline order
assembler_specs = [
    (feature_set1a, 'features1a'),
    (feature_set1b, 'features1b'),
    (feature_set2a, 'features2a'),
    (feature_set2b, 'features2b'),
    (feature_set3, 'features3'),
    (feature_set4a, 'features4a'),
    (feature_set4b, 'features4b'),
    (F1bANOVA, 'F1bANOVA'),
    (F2bANOVA, 'F2bANOVA'),
]
assemblers = [VectorAssembler(inputCols=cols, outputCol=out_col)
              for cols, out_col in assembler_specs]

# keep every assembler reachable under its individual name
(vec_assembler1a, vec_assembler1b,
 vec_assembler2a, vec_assembler2b,
 vec_assembler3,
 vec_assembler4a, vec_assembler4b,
 vec_assembler1bANOVA, vec_assembler2bANOVA) = assemblers

# a single pipeline adds all nine assembled vector columns to the loaded data
feature_pipeline = Pipeline(stages=assemblers)
feature_model = feature_pipeline.fit(data)
data_features = feature_model.transform(data)


''' # DEPENDENT VARIABLE LABELS 
'''
from pyspark.ml.feature import QuantileDiscretizer
import pandas as pd

# bucket the continuous BA_pct column into 5 quantile bins, written to 'label_QD5'
qd5 = QuantileDiscretizer(inputCol='BA_pct', outputCol='label_QD5', numBuckets=5)
qd5_model = qd5.fit(data_features)
data_features = qd5_model.transform(data_features)

# -- INDEX / ENCODE LABELS
from pyspark.ml import Pipeline
from pyspark.ml.feature import (StringIndexer,OneHotEncoder)

label_quant0 = 'BA_pct'

# the quantile-bucket column is one-hot encoded directly
label_cat0_vector = OneHotEncoder(inputCol='label_QD5', outputCol='label_cat0_vector')

# every other label column goes through StringIndexer first, then its index is one-hot encoded
label_cat1_index = StringIndexer(inputCol='label1', outputCol='label_cat1_index')
label_cat1_vector = OneHotEncoder(inputCol='label_cat1_index', outputCol='label_cat1_vector')

label_cat2_index = StringIndexer(inputCol='label2', outputCol='label_cat2_index')
label_cat2_vector = OneHotEncoder(inputCol='label_cat2_index', outputCol='label_cat2_vector')

label_cat3_index = StringIndexer(inputCol='label3a', outputCol='label_cat3_index')
label_cat3_vector = OneHotEncoder(inputCol='label_cat3_index', outputCol='label_cat3_vector')

label_cat4_index = StringIndexer(inputCol='label3b', outputCol='label_cat4_index')
label_cat4_vector = OneHotEncoder(inputCol='label_cat4_index', outputCol='label_cat4_vector')

label_stages = [
    label_cat0_vector,
    label_cat1_index, label_cat1_vector,
    label_cat2_index, label_cat2_vector,
    label_cat3_index, label_cat3_vector,
    label_cat4_index, label_cat4_vector,
]
label_pipeline = Pipeline(stages=label_stages)

# narrow the frame to the identifier, the raw labels and the assembled feature vectors
columns_before_encoding = [
    'Name', 'BA_pct',
    'label_QD5', 'label1', 'label2', 'label3a', 'label3b',
    'features1a', 'features1b', 'features2a', 'features2b',
    'features3', 'features4a', 'features4b', 'F1bANOVA', 'F2bANOVA',
]
data_features = data_features.select(columns_before_encoding)

data_prefinal = label_pipeline.fit(data_features).transform(data_features)

# (current name, new name) pairs, applied one after another in this order
column_renames = [
    ('BA_pct', 'label_q0'),
    ('label_QD5', 'label_cat0'),
    ('label_cat1_index', 'label_cat1'),
    ('label_cat2_index', 'label_cat2'),
    ('label3a', 'label3'),
    ('label3b', 'label4'),
    ('label_cat3_index', 'label_cat3'),
    ('label_cat4_index', 'label_cat4'),
]
data_prefinal2 = data_prefinal
for current_name, new_name in column_renames:
    data_prefinal2 = data_prefinal2.withColumnRenamed(current_name, new_name)

# final layout: identifier, numeric label, indexed categorical labels, feature vectors
final_columns = [
    'Name',
    'label_q0',
    'label_cat0', 'label_cat1',
    'label_cat2', 'label_cat3', 'label_cat4',
    'features1a', 'features1b', 'features2a', 'features2b',
    'features3', 'features4a', 'features4b',
    'F1bANOVA', 'F2bANOVA',
]
data_final = data_prefinal2.select(final_columns)

2. Let's also check whether **Feature Scaling** has any effect by adding scaled versions of a feature set

In [None]:
''' # TEST FEATURE SCALING
'''
from pyspark.ml.feature import StandardScaler

# Two scaled variants of the 'features2b' vector column:
#   features2bSs - scaled by standard deviation (withStd=True), not centred
#   features2bSm - mean-centred (withMean=True), not rescaled
scaler1 = StandardScaler(withMean=False, withStd=True,
                         inputCol="features2b", outputCol="features2bSs")
scaler2 = StandardScaler(withMean=True, withStd=False,
                         inputCol="features2b", outputCol="features2bSm")

# fit each scaler on the current frame and append its output column
for scaler in (scaler1, scaler2):
    data_final = scaler.fit(data_final).transform(data_final)

<font color='gray'> ..._now, all of the different labels and feature sets have been added to the data._ </font>

3. Let's **back up the dataset** so we can avoid re-running all of the data prep above in a new session

In [None]:
# collect the Spark DataFrame into pandas, then pickle it to disk
backup_pdf = data_final.toPandas()
backup_pdf.to_pickle("bioavailability_data_final_withLabelsAndFeaturesV1.pkl")

4. Now, we can easily load the processed data and get right into ML testing:

In [None]:
# load the dataset: read the pickled pandas frame and turn it back into a Spark DataFrame
data_final = spark.createDataFrame(
    pd.read_pickle("bioavailability_data_final_withLabelsAndFeaturesV1.pkl")
)

# RUN ML COMPARISON ACROSS ALL FEATURE SETS
#
# Fits one default LinearRegression per feature column on a shared train/test
# split and collects the evaluation metrics in `lr_df`: a 'metric' column plus
# one column of values per model.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression

labelName = 'label_q0'  # SPECIFY
lr_df = pd.DataFrame()
df = lr_df

allFeatures = ['features1a','features1b',
               'features2a','features2b',
               'features3','features4a','features4b',
               'F1bANOVA','F2bANOVA',"features2bSs","features2bSm"]
# short tags used in the model names, in the same order as allFeatures
featnum = ['F1a','F1b','F2a','F2b','F3','F4a','F4b','F1bANOVA','F2bANOVA','F2bSs','F2bSm']

subset = data_final.select(['Name', labelName] + allFeatures)
# fixed seed so the split, and the metrics computed from it, can be reproduced
train,test = subset.randomSplit([0.7,0.3], seed=42)

# the evaluator does not depend on the feature set, so it is built once;
# the metric is switched per call through the param override passed to evaluate()
regEvaluator = RegressionEvaluator(labelCol=labelName,predictionCol='prediction')
evaluator = regEvaluator
testMetrics = ['rmse','mse','mae','r2','var']

for tag,featuresName in zip(featnum,allFeatures):
    lr = LinearRegression(featuresCol=featuresName,labelCol=labelName,predictionCol='prediction')

    # SPECIFY MODEL
    modeltype = lr
    modeltypeVariantNo = '1'
    modelname = f"lr{modeltypeVariantNo}_{tag}"

    # FIT/TRAIN MODEL & TRANSFORM DATA
    mymodel = modeltype.fit(train)
    myresults = mymodel.transform(test)

    # CALCULATE KEY EVALS (on the held-out test rows)
    evaluation2 = [(metric, evaluator.evaluate(myresults, {evaluator.metricName: metric}))
                   for metric in testMetrics]
    # model.summary describes the fit on the training data, so this value is
    # labelled as a training metric to keep it apart from the test metrics above
    evaluation2.append(('r2_adj(Training)', mymodel.summary.r2adj))

    lr_df['metric'] = [name for name, _ in evaluation2]
    lr_df[modelname] = [value for _, value in evaluation2]

lr_df.head(6)

<font color='gray'> _for easier comparison of feature sets, we can transpose the Eval DF and sort by r2 value:_ </font>

In [None]:
# best features in terms of r2:
# index by metric name, flip to one row per model, then show highest r2 first
testx = lr_df.set_index('metric').transpose()
testx.sort_values(by=['r2'], ascending=False)

<font color='red'> If this part above worked: </font>
* delete the commented code blocks above, as they're not needed
* delete the code below which seems messier 

### Investigate ML model Hyperparameter Optimization

* ML hyperparameter tuning

We will optimize hyperparameters such as `maxIter` and the regularization method:
* No regularization, i.e. Ordinary Least Squares (OLS)
  * `regParam = 0`
* Regularization by Ridge regression (L2 penalty)
  * `regParam > 0`, `elasticNetParam = 0` 
* Regularization by the LASSO method (L1 penalty)
  * `regParam > 0`, `elasticNetParam = 1` 
* Regularization by the Elastic Net method (mix of L1 and L2 penalties)
  * `regParam > 0`, `0 < elasticNetParam < 1` 

In [None]:
# TEST 1: Optimize regParam, elasticNet, maxIter
#
# Grid-searches a LinearRegression on the 'F1bANOVA' feature column with 3-fold
# cross-validation, selecting the parameter combination with the best r2.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# This frame carries only the label and one feature column, so it gets its own
# names: reusing `subset`/`train`/`test` would replace the full-width split that
# the feature-set comparison cells fit on.
cv_subset = data_final.select(['label_q0','F1bANOVA'])
# fixed seed so the split can be reproduced
cv_train,cv_test = cv_subset.randomSplit([0.7,0.3], seed=42)

modelname = 'linreg_optimized'

lr=LinearRegression(labelCol='label_q0',featuresCol='F1bANOVA')

# Choose parameters to vary over test
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0, 0.0001, 0.01, 0.1, 0.5, 1.0, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.2, 0.5, 0.8, 1.0])
             .addGrid(lr.maxIter, [2, 4, 16, 32, 100])
             .build())

evaluator=RegressionEvaluator(predictionCol='prediction',labelCol='label_q0',metricName='r2')
# seed fixes the fold assignment so repeated runs compare the same folds
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator,
                    numFolds=3, seed=42)

cvModel = cv.fit(cv_train)

# Score the held-out rows with the best linear-regression model found by the search
testresults = cvModel.transform(cv_test)

* let's see which was the best model

In [None]:
# pull the winning model out of the cross-validation result
best_mod = cvModel.bestModel
param_dict = best_mod.extractParamMap()

# re-key the param map by plain parameter name so it is readable and easy to index
sane_dict = {param.name: value for param, value in param_dict.items()}

best_reg, best_elastic_net, best_max_iter = (
    sane_dict[key] for key in ("regParam", "elasticNetParam", "maxIter")
)
sane_dict

* It appears that the best model used the **LASSO** method of regularization (`elasticNetParam = 1.0`, `regParam = 0.1`): <br>
{'aggregationDepth': 2,
 'elasticNetParam': 1.0,
 'epsilon': 1.35,
 'featuresCol': 'F1bANOVA',
 'fitIntercept': True,
 'labelCol': 'label_q0',
 'loss': 'squaredError',
 'maxBlockSizeInMB': 0.0,
 'maxIter': 100,
 'predictionCol': 'prediction',
 'regParam': 0.1,
 'solver': 'auto',
 'standardization': True,
 'tol': 1e-06}

* let's see how it compares 

In [None]:
# RE RUN ML COMPARISON WITH NEW ANOVA FEATURES & SCALED FEATURES
#
# Fits the tuned LinearRegression (regParam=0.1, elasticNetParam=1, maxIter=100)
# on each feature column and appends one metrics column per model to `lr_df`.
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import (LinearRegression,
                                   DecisionTreeRegressor,RandomForestRegressor,GBTRegressor,
                                   GeneralizedLinearRegression,IsotonicRegression)

finalColumns = ['Name','label_q0','features2b','features2bSs','features2bSm','F1bANOVA','F2bANOVA','features1a','features1b','features3','features4a','features4b']

#lr_df = pd.DataFrame()     # already exists from prior run
labelName = 'label_q0'  # SPECIFY

df = lr_df

coefficients = {}
# build a new list instead of aliasing finalColumns and removing items from it in place
allFeatures = [col for col in finalColumns if col not in ('Name', labelName)]
featnum = allFeatures

# The hyperparameter-tuning cell re-splits a frame holding only 'label_q0' and
# 'F1bANOVA'. If the current `train` lacks any column needed here, build a fresh
# seeded split from the full data; otherwise the existing split is reused.
# NOTE(review): after a rebuild the new rows differ from the split behind the lr1_*
# columns already in lr_df, so those columns are not directly comparable.
neededColumns = [labelName] + allFeatures
if any(col not in train.columns for col in neededColumns):
    subset = data_final.select(['Name'] + neededColumns)
    train,test = subset.randomSplit([0.7,0.3], seed=42)

# the evaluator does not depend on the feature set, so it is built once;
# the metric is switched per call through the param override passed to evaluate()
regEvaluator = RegressionEvaluator(labelCol=labelName,predictionCol='prediction')
evaluator = regEvaluator
testMetrics = ['rmse','mse','mae','r2','var']

for featuresName in allFeatures:
    # lr_opt2 (LASSO) was detected as bestmodel for optimizing r2
    lr1_opt2 = LinearRegression(featuresCol=featuresName,labelCol=labelName,
                                regParam=0.1,elasticNetParam=1,maxIter=100)

    modeltype = lr1_opt2
    modeltypeVariantNo = 'lr1_opt2'  # SPECIFY modeltype variation number
    modelname = f"{modeltypeVariantNo}_{featuresName}"

    mymodel = modeltype.fit(train)
    myresults = mymodel.transform(test)

    # CALCULATE KEY EVALS (on the held-out test rows)
    evaluation2 = [(metric, evaluator.evaluate(myresults, {evaluator.metricName: metric}))
                   for metric in testMetrics]
    # model.summary describes the fit on the training data, hence the label
    evaluation2.append(('r2_adj(Training)', mymodel.summary.r2adj))

    lr_df['metric'] = [name for name, _ in evaluation2]
    lr_df[modelname] = [value for _, value in evaluation2]


testx = lr_df.set_index('metric')
# Drop the lr1 run on the mean-centred features. The comparison loop names that
# column 'lr1_F2bSm'; 'lr1_features2bSm' is presumably an earlier naming and is
# still accepted. Only names actually present are dropped, so a missing column
# no longer raises KeyError.
redundantColumns = [col for col in ('lr1_features2bSm', 'lr1_F2bSm') if col in testx.columns]
testx = testx.drop(columns=redundantColumns)
testx = testx.transpose()
testx.sort_values(by=['r2'],ascending=False)