In [14]:
# MSM VM config prep
import findspark
findspark.init('/home/mitch/spark-3.3.0-bin-hadoop2')
import pyspark
 
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('BApredsV2').getOrCreate()

# --- suppress future spark warnings/error/etc output ---
spark.sparkContext.setLogLevel("OFF")

In [15]:
import pandas as pd
def load_data_and_merge():
    labels_and_calcs = spark.read.csv("bioavailability_data_final.csv",inferSchema=True,sep=',',header=True)
    df1 = labels_and_calcs.toPandas()
    
    df3 = pd.read_pickle('bioavailabilityData_w_Frags__final.pkl')
    df3 = df3.drop(columns=['ba_pct'])
    
    #temp1 = pd.merge(df1,df2,how='left',left_on='_c0',right_on=df2.index)
    
    temp1 = labels_and_calcs
    #temp1 = spark.createDataFrame(temp1)
    
    temp2 = spark.createDataFrame(df3)
    
    data = temp2.join(temp1,(temp2.drug_smiles==temp1.smile),"left")
    
    return data
    

data = load_data_and_merge()

In [16]:
''' 
# INTIAL LABELS:
# --- Data has 1 continuous label column, and 4 categorical label columns (discretized variants of continuous label).
# ------ categorical labels applied by dividing the continuous label values into 3-5 categories 
# ------ the value range associated with each group were selected based on histogram dist./mean/stdev
# --- We'll add one more discretization variant,  using Spark's built-in QuantileDiscretizer
'''
# -- Add QuantileDiscretizer labels
from pyspark.ml.feature import QuantileDiscretizer
import pandas as pd
qd5 = QuantileDiscretizer(numBuckets=5,inputCol='BA_pct',outputCol='label_QD5')

data_wLabels = qd5.fit(data).transform(data)

# -- INDEX / ENCODE LABELS
from pyspark.ml.feature import (StringIndexer,OneHotEncoder)

label_quant0 = 'BA_pct'
#label_cat0_vector = OneHotEncoder(inputCol='label_QD5',outputCol='label_cat0_vector')

label_cat1_index = StringIndexer(inputCol='label1',outputCol='label_cat1_index')
#label_cat1_vector = OneHotEncoder(inputCol='label_cat1_index',outputCol='label_cat1_vector')

label_cat2_index = StringIndexer(inputCol='label2',outputCol='label_cat2_index')
#label_cat2_vector = OneHotEncoder(inputCol='label_cat2_index',outputCol='label_cat2_vector')

label_cat3_index = StringIndexer(inputCol='label3a',outputCol='label_cat3_index')
#label_cat3_vector = OneHotEncoder(inputCol='label_cat3_index',outputCol='label_cat3_vector')

label_cat4_index = StringIndexer(inputCol='label3b',outputCol='label_cat4_index')
#label_cat4_vector = OneHotEncoder(inputCol='label_cat4_index',outputCol='label_cat4_vector')

from pyspark.ml import Pipeline
label_pipeline = Pipeline(stages=[#label_cat0_vector,
                                 label_cat1_index,#label_cat1_vector,
                                 label_cat2_index,#label_cat2_vector,
                                 label_cat3_index,#label_cat3_vector,
                                 label_cat4_index])#,label_cat4_vector])
data_wLabels = label_pipeline.fit(data_wLabels).transform(data_wLabels)

## Test predictions by fragment data alone

In [17]:
test_subset = data_wLabels.select(['Name','BA_pct',
                                   'label1','label2','label3a','label3b',
                                   'label_QD5',
                                   'label_cat1_index','label_cat2_index','label_cat3_index','label_cat4_index',
                                   'frags_all','frags_subset','frags_subset2','frags_efgs','frags_brics'])

In [18]:
from pyspark.ml.feature import NGram,Word2Vec,CountVectorizer,HashingTF,IDF

In [20]:
''' # Fragments NLP processing  - NEWER -
'''

from pyspark.ml.feature import NGram,Word2Vec,CountVectorizer,IDF

fragment_types = ['frags_all','frags_subset','frags_subset2','frags_efgs','frags_brics']
                  
fragment_shortname = ['f_all','f_subset','f_subset2','f_efgs','f_brics']

test_subset_nlp = test_subset
for i,frag_type in enumerate(fragment_types):
    
    frag_type_short = fragment_shortname[i]
    
    cv2 = CountVectorizer(inputCol=frag_type, outputCol=f"{frag_type_short}_cv2", minDF=2.0)
    cv2_idf = IDF(inputCol=f"{frag_type_short}_cv2", outputCol=f"{frag_type_short}_cv2_idf")
    
    cv5 = CountVectorizer(inputCol=frag_type, outputCol=f"{frag_type_short}_cv5")
    
    w2v = Word2Vec(inputCol=frag_type, outputCol=f"{frag_type_short}_w2v")
    
    n2gram = NGram(n=2, inputCol=frag_type, outputCol=f"{frag_type_short}_n2g")
    n2gram_cv2 = CountVectorizer(inputCol=f"{frag_type_short}_n2g", outputCol=f"{frag_type_short}_n2g_cv2", minDF=2.0)
    
    n2gram_cv5 = CountVectorizer(inputCol=f"{frag_type_short}_n2g", outputCol=f"{frag_type_short}_n2g_cv5")
    
    nlp_pipeline = Pipeline(stages=[cv2, cv2_idf, 
                                    cv5, w2v,
                                    n2gram, n2gram_cv2, n2gram_cv5])
    
    test_subset_nlp = nlp_pipeline.fit(test_subset_nlp).transform(test_subset_nlp)
    
    column_to_drop = f"{frag_type_short}_n2g"
    test_subset_nlp = test_subset_nlp.drop(column_to_drop)

                                                                                

In [21]:
test_subset_nlp.columns

['Name',
 'BA_pct',
 'label1',
 'label2',
 'label3a',
 'label3b',
 'label_QD5',
 'label_cat1_index',
 'label_cat2_index',
 'label_cat3_index',
 'label_cat4_index',
 'frags_all',
 'frags_subset',
 'frags_subset2',
 'frags_efgs',
 'frags_brics',
 'f_all_cv2',
 'f_all_cv2_idf',
 'f_all_cv5',
 'f_all_w2v',
 'f_all_n2g_cv2',
 'f_all_n2g_cv5',
 'f_subset_cv2',
 'f_subset_cv2_idf',
 'f_subset_cv5',
 'f_subset_w2v',
 'f_subset_n2g_cv2',
 'f_subset_n2g_cv5',
 'f_subset2_cv2',
 'f_subset2_cv2_idf',
 'f_subset2_cv5',
 'f_subset2_w2v',
 'f_subset2_n2g_cv2',
 'f_subset2_n2g_cv5',
 'f_efgs_cv2',
 'f_efgs_cv2_idf',
 'f_efgs_cv5',
 'f_efgs_w2v',
 'f_efgs_n2g_cv2',
 'f_efgs_n2g_cv5',
 'f_brics_cv2',
 'f_brics_cv2_idf',
 'f_brics_cv5',
 'f_brics_w2v',
 'f_brics_n2g_cv2',
 'f_brics_n2g_cv5']

In [22]:
test_subset_nlp = test_subset_nlp.select(['Name','BA_pct',
                                          'label_cat1_index','label_cat2_index','label_cat3_index','label_cat4_index','label_QD5',
                                         
                                          'f_all_cv2','f_all_cv2_idf','f_all_cv5','f_all_w2v',
                                          'f_all_n2g_cv2','f_all_n2g_cv5',
                                          
                                          'f_subset_cv2','f_subset_cv2_idf','f_subset_cv5','f_subset_w2v',
                                          'f_subset_n2g_cv2','f_subset_n2g_cv5',
                                          
                                          'f_subset2_cv2','f_subset2_cv2_idf','f_subset2_cv5','f_subset2_w2v',
                                          'f_subset2_n2g_cv2','f_subset2_n2g_cv5',
                                          
                                          'f_efgs_cv2','f_efgs_cv2_idf','f_efgs_cv5','f_efgs_w2v',
                                          'f_efgs_n2g_cv2','f_efgs_n2g_cv5',
                                          
                                          'f_brics_cv2','f_brics_cv2_idf','f_brics_cv5','f_brics_w2v',
                                          'f_brics_n2g_cv2','f_brics_n2g_cv5'])

from pyspark.ml.linalg import Vector
from pyspark.ml.feature import (VectorAssembler,VectorIndexer)

vector_assemblers = []

alternative_features = ['f_all_cv2','f_all_cv2_idf','f_all_cv5','f_all_w2v',
                        'f_all_n2g_cv2','f_all_n2g_cv5',
                        
                        'f_subset_cv2','f_subset_cv2_idf','f_subset_cv5','f_subset_w2v',
                        'f_subset_n2g_cv2','f_subset_n2g_cv5',
                        
                        'f_subset2_cv2','f_subset2_cv2_idf','f_subset2_cv5','f_subset2_w2v',
                        'f_subset2_n2g_cv2','f_subset2_n2g_cv5',
                        
                        'f_efgs_cv2','f_efgs_cv2_idf','f_efgs_cv5','f_efgs_w2v',
                        'f_efgs_n2g_cv2','f_efgs_n2g_cv5',
                        
                        'f_brics_cv2','f_brics_cv2_idf','f_brics_cv5','f_brics_w2v',
                        'f_brics_n2g_cv2','f_brics_n2g_cv5']
output_features = ""
for feats in alternative_features:
    #feats_input = ['MolWt','MolLogP','TPSA',feats]
    feats_input = [feats]
    feats_output = f"FEAT_{feats}"
    
    vec_assembler = VectorAssembler(inputCols=feats_input, outputCol=feats_output)
    
    vector_assemblers.append(vec_assembler)
    
    output_features += "'"+feats_output+"'"+", "
output_features = output_features[0:len(output_features)-2]
print(output_features)

from pyspark.ml import Pipeline
feature_pipeline = Pipeline(stages=[x for x in vector_assemblers])

test_subset_nlpFeatures = feature_pipeline.fit(test_subset_nlp).transform(test_subset_nlp)

'FEAT_f_all_cv2', 'FEAT_f_all_cv2_idf', 'FEAT_f_all_cv5', 'FEAT_f_all_w2v', 'FEAT_f_all_n2g_cv2', 'FEAT_f_all_n2g_cv5', 'FEAT_f_subset_cv2', 'FEAT_f_subset_cv2_idf', 'FEAT_f_subset_cv5', 'FEAT_f_subset_w2v', 'FEAT_f_subset_n2g_cv2', 'FEAT_f_subset_n2g_cv5', 'FEAT_f_subset2_cv2', 'FEAT_f_subset2_cv2_idf', 'FEAT_f_subset2_cv5', 'FEAT_f_subset2_w2v', 'FEAT_f_subset2_n2g_cv2', 'FEAT_f_subset2_n2g_cv5', 'FEAT_f_efgs_cv2', 'FEAT_f_efgs_cv2_idf', 'FEAT_f_efgs_cv5', 'FEAT_f_efgs_w2v', 'FEAT_f_efgs_n2g_cv2', 'FEAT_f_efgs_n2g_cv5', 'FEAT_f_brics_cv2', 'FEAT_f_brics_cv2_idf', 'FEAT_f_brics_cv5', 'FEAT_f_brics_w2v', 'FEAT_f_brics_n2g_cv2', 'FEAT_f_brics_n2g_cv5'


In [23]:
test_subset_final = test_subset_nlpFeatures.select(['Name','BA_pct',
                                                   'FEAT_f_all_cv2', 'FEAT_f_all_cv2_idf', 'FEAT_f_all_cv5', 
                                                    'FEAT_f_all_w2v', 'FEAT_f_all_n2g_cv2', 'FEAT_f_all_n2g_cv5',
                                                    'FEAT_f_subset_cv2', 'FEAT_f_subset_cv2_idf', 'FEAT_f_subset_cv5',
                                                    'FEAT_f_subset_w2v', 'FEAT_f_subset_n2g_cv2', 'FEAT_f_subset_n2g_cv5',
                                                    'FEAT_f_subset2_cv2', 'FEAT_f_subset2_cv2_idf', 'FEAT_f_subset2_cv5',
                                                    'FEAT_f_subset2_w2v', 'FEAT_f_subset2_n2g_cv2', 'FEAT_f_subset2_n2g_cv5',
                                                    'FEAT_f_efgs_cv2', 'FEAT_f_efgs_cv2_idf', 'FEAT_f_efgs_cv5', 
                                                    'FEAT_f_efgs_w2v', 'FEAT_f_efgs_n2g_cv2', 'FEAT_f_efgs_n2g_cv5', 
                                                    'FEAT_f_brics_cv2', 'FEAT_f_brics_cv2_idf', 'FEAT_f_brics_cv5', 
                                                    'FEAT_f_brics_w2v', 'FEAT_f_brics_n2g_cv2', 'FEAT_f_brics_n2g_cv5'])

(training,testing) = test_subset_final.randomSplit([0.7,0.3])

In [24]:
allFeatures =  ['FEAT_f_all_cv2', 'FEAT_f_all_cv2_idf', 'FEAT_f_all_cv5', 
                'FEAT_f_all_w2v', 'FEAT_f_all_n2g_cv2', 'FEAT_f_all_n2g_cv5',
                'FEAT_f_subset_cv2', 'FEAT_f_subset_cv2_idf', 'FEAT_f_subset_cv5',
                'FEAT_f_subset_w2v', 'FEAT_f_subset_n2g_cv2', 'FEAT_f_subset_n2g_cv5',
                'FEAT_f_subset2_cv2', 'FEAT_f_subset2_cv2_idf', 'FEAT_f_subset2_cv5',
                'FEAT_f_subset2_w2v', 'FEAT_f_subset2_n2g_cv2', 'FEAT_f_subset2_n2g_cv5',
                'FEAT_f_efgs_cv2', 'FEAT_f_efgs_cv2_idf', 'FEAT_f_efgs_cv5', 
                'FEAT_f_efgs_w2v', 'FEAT_f_efgs_n2g_cv2', 'FEAT_f_efgs_n2g_cv5', 
                'FEAT_f_brics_cv2', 'FEAT_f_brics_cv2_idf', 'FEAT_f_brics_cv5', 
                'FEAT_f_brics_w2v', 'FEAT_f_brics_n2g_cv2', 'FEAT_f_brics_n2g_cv5']
featuresName = '' # temp value, redefined below

eval_df = pd.DataFrame() # already exists from prior run
labelName = 'BA_pct'  # SPECIFY

modelname_short = 'rfr'
iteration = 0


for index,features in enumerate(allFeatures):
    featuresName = features
    
    '''# SPECIFY MODEL 
    '''
    from pyspark.ml.regression import RandomForestRegressor
    rfr = RandomForestRegressor(featuresCol=features,labelCol='BA_pct')
    modeltype = rfr  # SPECIFY (lr,dtr,rfr,gbtr,glr,ir)
    
    
    modelname = f"rfr{iteration}_{allFeatures[index]}"
    modelname = modelname.replace('FEAT_f_','')
    
    #evaluation_history[modelname_short][iteration][features] = {}
    
    # FIT/TRAIN MODEL & TRANSFORM DATA
    mymodel = modeltype.fit(training)
    myresults = mymodel.transform(testing)
    
    # CALCULATE KEY EVALS
    from pyspark.ml.evaluation import RegressionEvaluator
    regEvaluator = RegressionEvaluator(labelCol=labelName,predictionCol='prediction')

    evaluator = regEvaluator
    evalMetrics = {regEvaluator:['rmse','mse','mae','r2','var']}
    
    evaluation = []
    
    for each_metric in evalMetrics[evaluator]:        
        metric = each_metric

        result = evaluator.evaluate(myresults, {evaluator.metricName: metric})

        evaluation.append((metric,result))
        
        #evaluation_history[modelname_short][iteration][features][metric] = result
        
    #r2_adj = mymodel.summary.r2adj
    #evaluation.append(('r2_adj(Training)',r2_adj))
    column0 = [x for x,y in evaluation]
    column1 = [y for x,y in evaluation]
    eval_df['metric'] = column0
    eval_df[modelname] = column1

                                                                                

In [25]:
# iteration 4
pd.set_option('display.max_columns',None)
eval_df

Unnamed: 0,metric,rfr0_all_cv2,rfr0_all_cv2_idf,rfr0_all_cv5,rfr0_all_w2v,rfr0_all_n2g_cv2,rfr0_all_n2g_cv5,rfr0_subset_cv2,rfr0_subset_cv2_idf,rfr0_subset_cv5,rfr0_subset_w2v,rfr0_subset_n2g_cv2,rfr0_subset_n2g_cv5,rfr0_subset2_cv2,rfr0_subset2_cv2_idf,rfr0_subset2_cv5,rfr0_subset2_w2v,rfr0_subset2_n2g_cv2,rfr0_subset2_n2g_cv5,rfr0_efgs_cv2,rfr0_efgs_cv2_idf,rfr0_efgs_cv5,rfr0_efgs_w2v,rfr0_efgs_n2g_cv2,rfr0_efgs_n2g_cv5,rfr0_brics_cv2,rfr0_brics_cv2_idf,rfr0_brics_cv5,rfr0_brics_w2v,rfr0_brics_n2g_cv2,rfr0_brics_n2g_cv5
0,rmse,30.320921,30.320921,30.391067,31.083407,30.280319,30.601199,30.213488,30.213488,30.245852,31.439973,30.354182,30.531797,30.460687,30.460687,30.173174,31.525836,30.497086,30.629666,30.73837,30.73837,30.78809,30.606959,31.620247,31.521389,31.39609,31.39609,31.594195,31.553144,32.313564,32.368579
1,mse,919.358221,919.358221,923.616958,966.178188,916.897711,936.433398,912.854853,912.854853,914.811545,988.471889,921.376373,932.190626,927.853449,927.853449,910.420429,993.878353,930.072284,938.176411,944.847368,944.847368,947.906484,936.785948,999.840037,993.597935,985.714457,985.714457,998.193148,995.600903,1044.166434,1047.72492
2,mae,26.106168,26.106168,26.208338,26.839702,26.104657,26.335899,25.951259,25.951259,26.078349,27.047467,26.076333,26.279762,26.253976,26.253976,26.015198,27.372333,26.207888,26.478834,26.461278,26.461278,26.571943,26.573573,27.274667,27.283648,27.244932,27.244932,27.335514,26.949939,27.803498,27.889929
3,r2,0.140859,0.140859,0.13688,0.097106,0.143159,0.124903,0.146937,0.146937,0.145108,0.076273,0.138973,0.128868,0.132921,0.132921,0.149212,0.07122,0.130847,0.123274,0.11704,0.11704,0.114181,0.124573,0.065649,0.071482,0.07885,0.07885,0.067188,0.069611,0.024226,0.020901
4,var,124.318748,124.318748,119.974605,85.785854,116.108132,105.615612,125.334503,125.334503,118.208406,95.195071,121.775725,119.702673,117.628682,117.628682,121.584871,102.727707,128.229044,107.647532,57.294891,57.294891,55.560586,103.232881,27.579935,28.728255,37.872567,37.872567,33.557996,78.885398,24.904375,22.465992


In [29]:
# best features in terms of r2: 
testx = eval_df.set_index('metric')
testx = testx.transpose()
#testx.sort_values(by=['r2'],ascending=False)
testx.sort_values(by=['r2'],ascending=False).reset_index()

metric,index,rmse,mse,mae,r2,var
0,rfr0_subset2_cv5,30.173174,910.420429,26.015198,0.149212,121.584871
1,rfr0_subset_cv2_idf,30.213488,912.854853,25.951259,0.146937,125.334503
2,rfr0_subset_cv2,30.213488,912.854853,25.951259,0.146937,125.334503
3,rfr0_subset_cv5,30.245852,914.811545,26.078349,0.145108,118.208406
4,rfr0_all_n2g_cv2,30.280319,916.897711,26.104657,0.143159,116.108132
5,rfr0_all_cv2,30.320921,919.358221,26.106168,0.140859,124.318748
6,rfr0_all_cv2_idf,30.320921,919.358221,26.106168,0.140859,124.318748
7,rfr0_subset_n2g_cv2,30.354182,921.376373,26.076333,0.138973,121.775725
8,rfr0_all_cv5,30.391067,923.616958,26.208338,0.13688,119.974605
9,rfr0_subset2_cv2_idf,30.460687,927.853449,26.253976,0.132921,117.628682


### observations
* The top 5 positions were:
  1. subset2 .. **cv5**
  2. subset ... **cv2** / **cv2-idf**
  3. subset ... **cv5** 
  4. all  ......... **n2g-cv2**
  5. all  ......... **cv2** / **cv2-idf**

* this shows that we can ignore **brics** and **efgs** fragment sets.
* this also shows that we can disregard **w2v** and **n2g-cv5** NLP processing.

* <font color='red'> Still unknown, however, is the effect of the train/test split. </font>
  * we should average the _r2_ values over several train/test splits and find the top-performing set & NLP model

In [40]:
eval_df.loc[0][0]

'rmse'

#### let's construct a dictionary of the previous trial's evaluation results

In [43]:
allFeatures =  ['FEAT_f_all_cv2', 'FEAT_f_all_cv5', 'FEAT_f_all_n2g_cv2', 
                'FEAT_f_subset_cv2', 'FEAT_f_subset_cv5','FEAT_f_subset_n2g_cv2',
                'FEAT_f_subset2_cv2', 'FEAT_f_subset2_cv5', 'FEAT_f_subset2_n2g_cv2']

metric_averages = {}

for index,features in enumerate(allFeatures):

    modelname = f"rfr0_{allFeatures[index]}"
    modelname = modelname.replace('FEAT_f_','')

    metric_averages[modelname] = {}
    
    for (metric,N) in [('rmse',0),('r2',3),('var',4)]:
        metric_averages[modelname]['num_trials'] = 1
        metric_averages[modelname][metric] = eval_df.loc[N][modelname]

In [59]:
# manual entry of the metric_averages after the first trial run

metric_averages = {'rfr0_all_cv2': {'num_trials': 1,
                  'rmse': 30.32092051137974,
                  'r2': 0.14085944821279428,
                  'var': 124.31874808234403},
                 'rfr0_all_cv5': {'num_trials': 1,
                  'rmse': 30.391067075572746,
                  'r2': 0.13687965680905978,
                  'var': 119.97460462038228},
                 'rfr0_all_n2g_cv2': {'num_trials': 1,
                  'rmse': 30.280318872939436,
                  'r2': 0.14315879523222386,
                  'var': 116.10813226263414},
                 'rfr0_subset_cv2': {'num_trials': 1,
                  'rmse': 30.213487937176403,
                  'r2': 0.14693684706417076,
                  'var': 125.33450341337398},
                 'rfr0_subset_cv5': {'num_trials': 1,
                  'rmse': 30.245851692871423,
                  'r2': 0.1451083184199059,
                  'var': 118.20840619976119},
                 'rfr0_subset_n2g_cv2': {'num_trials': 1,
                  'rmse': 30.354182128097218,
                  'r2': 0.1389734845143883,
                  'var': 121.77572510571312},
                 'rfr0_subset2_cv2': {'num_trials': 1,
                  'rmse': 30.4606869491698,
                  'r2': 0.13292065421646182,
                  'var': 117.62868221829186},
                 'rfr0_subset2_cv5': {'num_trials': 1,
                  'rmse': 30.17317399527205,
                  'r2': 0.14921181742820566,
                  'var': 121.58487100619823},
                 'rfr0_subset2_n2g_cv2': {'num_trials': 1,
                  'rmse': 30.497086481216126,
                  'r2': 0.13084715273304437,
                  'var': 128.22904356766273}}

#### now let's perform additional trials, _each with a new train/test split_, & update the average evaluation result each time
#### <font color='red'> Each time the below cell runs, it performs another trial and updates the average value </font>

In [64]:
allFeatures =  ['FEAT_f_all_cv2', 'FEAT_f_all_cv5', 'FEAT_f_all_n2g_cv2', 
                'FEAT_f_subset_cv2', 'FEAT_f_subset_cv5','FEAT_f_subset_n2g_cv2',
                'FEAT_f_subset2_cv2', 'FEAT_f_subset2_cv5', 'FEAT_f_subset2_n2g_cv2']
featuresName = '' # temp value, redefined below

eval_df = pd.DataFrame()   # already exists from prior run
labelName = 'BA_pct'  # SPECIFY

modelname_short = 'rfr'
iteration = 0

#for i in [1,2,3]:
(training,testing) = test_subset_final.randomSplit([0.7,0.3])
#evaluation = []

for index,features in enumerate(allFeatures):
    featuresName = features

    '''# SPECIFY MODEL 
    '''
    from pyspark.ml.regression import RandomForestRegressor
    rfr = RandomForestRegressor(featuresCol=features,labelCol='BA_pct')
    modeltype = rfr  # SPECIFY (lr,dtr,rfr,gbtr,glr,ir)


    modelname = f"rfr{iteration}_{allFeatures[index]}"
    modelname = modelname.replace('FEAT_f_','')

    #evaluation_history[modelname_short][iteration][features] = {}

    # FIT/TRAIN MODEL & TRANSFORM DATA
    mymodel = modeltype.fit(training)
    myresults = mymodel.transform(testing)

    # CALCULATE KEY EVALS
    from pyspark.ml.evaluation import RegressionEvaluator
    regEvaluator = RegressionEvaluator(labelCol=labelName,predictionCol='prediction')

    evaluator = regEvaluator
    evalMetrics = {regEvaluator:['rmse','r2','var']}
    evaluation = []

    for each_metric in evalMetrics[evaluator]:        
        metric = each_metric

        number_of_trials = metric_averages[modelname]['num_trials']
        previous_average = metric_averages[modelname][metric]

        result = evaluator.evaluate(myresults, {evaluator.metricName: metric})

        new_average = ((number_of_trials*previous_average) + result)/(number_of_trials+1)

        evaluation.append((metric,new_average))

        metric_averages[modelname][metric] = new_average
    
    column0 = [x for x,y in evaluation]
    column1 = [y for x,y in evaluation]
    #column2 = [z for x,y,z in evaluation]
    eval_df['metric'] = column0
    eval_df[modelname] = column1
    #eval_df['num_trials'] = column2

metric_averages[modelname]['num_trials'] = (number_of_trials+1)

                                                                                

#### finally, display the ranking of _average r2_ for each Feature/NLP combo after 4 trials

In [65]:
# best features in terms of r2: 
testx = eval_df.set_index('metric')
testx = testx.transpose()

num_trials = metric_averages[modelname]['num_trials']
print(f"Updated ranking of average r2 after {num_trials} trials")

testx.sort_values(by=['r2'],ascending=False).reset_index()

Updated ranking of average r2 after 3 trials


metric,index,rmse,r2,var
0,rfr0_all_n2g_cv2,29.859975,0.154954,107.332755
1,rfr0_all_cv2,29.964635,0.148975,115.25793
2,rfr0_subset2_cv2,30.0037,0.146811,110.744062
3,rfr0_subset2_cv5,30.00772,0.14659,108.869176
4,rfr0_subset_n2g_cv2,30.025197,0.145618,105.443823
5,rfr0_subset_cv2,30.047092,0.144318,115.071874
6,rfr0_subset2_n2g_cv2,30.148189,0.142212,110.59647
7,rfr0_all_cv5,30.103133,0.141111,114.482404
8,rfr0_subset_cv5,30.139479,0.138993,114.135159


### Results:
**Findings:**
1. the most effective fragment set is **`frags_all`**
2. the most effective NLP representation is **`n2g_cv2`**