In [2]:
# MSM VM config prep
# findspark.init is called before pyspark is imported, so the ordering of the
# statements below is kept exactly as-is.
import findspark

spark_install_dir = '/home/mitch/spark-3.3.0-bin-hadoop2'
findspark.init(spark_install_dir)
import pyspark

from pyspark.sql import SparkSession

# Reuse an already-running session when one exists, otherwise start a new one.
session_builder = SparkSession.builder.appName('BApredsV3')
spark = session_builder.getOrCreate()

# --- suppress future spark warnings/error/etc output ---
# NOTE(review): "OFF" silences every log level, including errors.
spark.sparkContext.setLogLevel("OFF")

In [3]:
import pandas as pd
def load_data_and_merge(labels_csv="bioavailability_data_final.csv",
                        frags_pickle='bioavailabilityData_w_Frags__final.pkl'):
    """Load the label/descriptor CSV and the fragment pickle, then left-join them.

    Relies on the notebook-level ``spark`` session and the ``pd`` pandas alias.

    Parameters
    ----------
    labels_csv : str
        Comma-separated file with a header row; read by Spark with schema
        inference enabled.
    frags_pickle : str
        Pickled pandas DataFrame of fragment data. Its ``ba_pct`` column is
        dropped before the join.
        NOTE(review): unpickling executes arbitrary code; only load files
        from a trusted source.

    Returns
    -------
    pyspark.sql.DataFrame
        Every row of the fragment table, with the columns of the CSV attached
        where ``drug_smiles`` equals ``smile`` (left join).
    """
    labels_and_calcs = spark.read.csv(labels_csv, inferSchema=True, sep=',', header=True)

    # The original also called labels_and_calcs.toPandas() into an unused
    # variable, which collected the whole CSV to the driver for no benefit.
    frags_pdf = pd.read_pickle(frags_pickle).drop(columns=['ba_pct'])
    frags_sdf = spark.createDataFrame(frags_pdf)

    return frags_sdf.join(
        labels_and_calcs,
        (frags_sdf.drug_smiles == labels_and_calcs.smile),
        "left",
    )
    

# Build the merged Spark DataFrame once; the labelling cell reads it as `data`.
data = load_data_and_merge()

[Stage 1:>                                                          (0 + 1) / 1]                                                                                

* Label data

In [12]:
''' 
# INITIAL LABELS:
# --- The data has 1 continuous label column and 4 categorical label columns (discretized variants of the continuous label).
# ------ The categorical labels were assigned by dividing the continuous label values into 3-5 categories.
# ------ The value range associated with each category was selected based on the histogram distribution, mean, and standard deviation.
# --- We add one more discretization variant, using Spark's built-in QuantileDiscretizer.
'''
# -- Imports used by the labelling steps of this cell
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.ml.feature import (StringIndexer,OneHotEncoder)

# -- Add QuantileDiscretizer labels: 5 quantile buckets over the continuous label
qd5 = QuantileDiscretizer(outputCol='label_QD5', inputCol='BA_pct', numBuckets=5)
qd5_model = qd5.fit(data)
data_wLabels = qd5_model.transform(data)

# -- INDEX LABELS
label_quant0 = 'BA_pct'

# One StringIndexer per pre-existing categorical label column; the output
# columns are numbered 1-4 in the order the source columns are listed.
label_cat1_index, label_cat2_index, label_cat3_index, label_cat4_index = [
    StringIndexer(inputCol=raw_label, outputCol=f"label_cat{position}_index")
    for position, raw_label in enumerate(('label1', 'label2', 'label3a', 'label3b'), start=1)
]

label_pipeline = Pipeline(stages=[
    label_cat1_index,
    label_cat2_index,
    label_cat3_index,
    label_cat4_index,
])

label_model = label_pipeline.fit(data_wLabels)
data_wLabels = label_model.transform(data_wLabels)



In [14]:
''' # Fragments NLP processing  - NEWER -
'''
from pyspark.ml import Pipeline
from pyspark.ml.feature import NGram,Word2Vec,CountVectorizer,IDF

fragment_types = ['frags_all','frags_subset','frags_subset2','frags_efgs','frags_brics']

fragment_shortname = ['f_all','f_subset','f_subset2','f_efgs','f_brics']

# Explicit seed so the Word2Vec embeddings can be reproduced after a kernel
# restart. Without it PySpark falls back to a default seed that is not
# guaranteed to be stable between interpreter sessions.
W2V_SEED = 42

# For every fragment column, fit and apply these encodings:
#   <short>_cv2      token counts, keeping terms seen in at least 2 rows
#   <short>_cv2_idf  IDF re-weighting of <short>_cv2
#   <short>_cv5      token counts with the default vocabulary settings
#   <short>_w2v      Word2Vec embedding
#   <short>_n2g_cv2  bigram counts, keeping bigrams seen in at least 2 rows
#   <short>_n2g_cv5  bigram counts with the default vocabulary settings
data_wLabels_NLP = data_wLabels
for frag_type, frag_type_short in zip(fragment_types, fragment_shortname):

    cv2 = CountVectorizer(inputCol=frag_type, outputCol=f"{frag_type_short}_cv2", minDF=2.0)
    cv2_idf = IDF(inputCol=f"{frag_type_short}_cv2", outputCol=f"{frag_type_short}_cv2_idf")

    cv5 = CountVectorizer(inputCol=frag_type, outputCol=f"{frag_type_short}_cv5")

    w2v = Word2Vec(inputCol=frag_type, outputCol=f"{frag_type_short}_w2v", seed=W2V_SEED)

    n2gram = NGram(n=2, inputCol=frag_type, outputCol=f"{frag_type_short}_n2g")
    n2gram_cv2 = CountVectorizer(inputCol=f"{frag_type_short}_n2g", outputCol=f"{frag_type_short}_n2g_cv2", minDF=2.0)

    n2gram_cv5 = CountVectorizer(inputCol=f"{frag_type_short}_n2g", outputCol=f"{frag_type_short}_n2g_cv5")

    nlp_pipeline = Pipeline(stages=[cv2, cv2_idf,
                                    cv5, w2v,
                                    n2gram, n2gram_cv2, n2gram_cv5])

    data_wLabels_NLP = nlp_pipeline.fit(data_wLabels_NLP).transform(data_wLabels_NLP)

    # The raw bigram column is only an intermediate for the two bigram count
    # vectors, so it is removed once they exist.
    column_to_drop = f"{frag_type_short}_n2g"
    data_wLabels_NLP = data_wLabels_NLP.drop(column_to_drop)

                                                                                

* make fragment NLP feature vectors

In [15]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import (VectorAssembler,VectorIndexer)

vector_assemblers = []

# Every fragment family crossed with every encoding suffix, grouped by family.
alternative_features = [
    f"{family}_{encoding}"
    for family in ('f_all', 'f_subset', 'f_subset2', 'f_efgs', 'f_brics')
    for encoding in ('cv2', 'cv2_idf', 'cv5', 'w2v', 'n2g_cv2', 'n2g_cv5')
]
from pyspark.ml import Pipeline

# One single-input assembler per encoded fragment column; each output column
# is the input column name prefixed with "FEAT_".
vector_assemblers.extend(
    VectorAssembler(inputCols=[source_col], outputCol=f"FEAT_{source_col}")
    for source_col in alternative_features
)

# Quoted, comma-separated list of the new column names, printed for reference.
output_features = ", ".join(f"'FEAT_{source_col}'" for source_col in alternative_features)
print(output_features)

feature_pipeline = Pipeline(stages=list(vector_assemblers))

assembler_model = feature_pipeline.fit(data_wLabels_NLP)
data_wLabels_NLPFeatures = assembler_model.transform(data_wLabels_NLP)

'FEAT_f_all_cv2', 'FEAT_f_all_cv2_idf', 'FEAT_f_all_cv5', 'FEAT_f_all_w2v', 'FEAT_f_all_n2g_cv2', 'FEAT_f_all_n2g_cv5', 'FEAT_f_subset_cv2', 'FEAT_f_subset_cv2_idf', 'FEAT_f_subset_cv5', 'FEAT_f_subset_w2v', 'FEAT_f_subset_n2g_cv2', 'FEAT_f_subset_n2g_cv5', 'FEAT_f_subset2_cv2', 'FEAT_f_subset2_cv2_idf', 'FEAT_f_subset2_cv5', 'FEAT_f_subset2_w2v', 'FEAT_f_subset2_n2g_cv2', 'FEAT_f_subset2_n2g_cv5', 'FEAT_f_efgs_cv2', 'FEAT_f_efgs_cv2_idf', 'FEAT_f_efgs_cv5', 'FEAT_f_efgs_w2v', 'FEAT_f_efgs_n2g_cv2', 'FEAT_f_efgs_n2g_cv5', 'FEAT_f_brics_cv2', 'FEAT_f_brics_cv2_idf', 'FEAT_f_brics_cv5', 'FEAT_f_brics_w2v', 'FEAT_f_brics_n2g_cv2', 'FEAT_f_brics_n2g_cv5'


* export data w/ NLP vector features

* Prepare Vector Features for RDKit calculations 

In [16]:
''' # RDKit "1D" FEATURE SELECTION:
'''
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import (VectorAssembler,VectorIndexer)

# The feature catalog stores one list of column names per row under 'features'.
# NOTE(review): row labels 1, 3, 7 and 8 are hard-coded; confirm they still
# match the catalog file.
featuresDF = pd.read_parquet('featuresCatalogDF.parquet')
feature_set1b, feature_set2b, F1bANOVA, F2bANOVA = (
    featuresDF.loc[catalog_row, 'features'] for catalog_row in (1, 3, 7, 8)
)

# VECTOR ASSEMBLY - one assembler per catalog feature set
vec_assembler1b = VectorAssembler(outputCol='FEAT_rdkit_1b', inputCols=feature_set1b)
vec_assembler2b = VectorAssembler(outputCol='FEAT_rdkit_2b', inputCols=feature_set2b)
vec_assembler1bANOVA = VectorAssembler(outputCol='FEAT_rdkit_1bANOVA', inputCols=F1bANOVA)
vec_assembler2bANOVA = VectorAssembler(outputCol='FEAT_rdkit_2bANOVA', inputCols=F2bANOVA)

rdkit_stages = [
    vec_assembler1b,
    vec_assembler2b,
    vec_assembler1bANOVA,
    vec_assembler2bANOVA,
]
feature_pipeline = Pipeline(stages=rdkit_stages)

rdkit_model = feature_pipeline.fit(data_wLabels_NLPFeatures)
data_allFeaturesAndLabels = rdkit_model.transform(data_wLabels_NLPFeatures)

* clean up the data

In [17]:
# Display the column names of the assembled frame before the cleanup cell runs.
data_allFeaturesAndLabels.columns

['drug_name',
 'drug_smiles',
 'frags_all',
 'num_frags_all',
 'frags_subset',
 'num_frags_subset',
 'frags_subset2',
 'frags_efgs',
 'frags_brics',
 '_c0',
 'Name',
 'smile',
 'BA_pct',
 'MolWt',
 'ExactMolWt',
 'qed',
 'MolLogP',
 'MolMR',
 'VSA_total',
 'LabuteASA',
 'TPSA',
 'MaxPartialCharge',
 'MinPartialCharge',
 'MaxAbsPartialCharge',
 'MinAbsPartialCharge',
 'NumHAcceptors',
 'NumHDonors',
 'HeavyAtomCount',
 'NumHeteroatoms',
 'NumRotatableBonds',
 'NHOHCount',
 'NOCount',
 'FractionCSP3',
 'RingCount',
 'NumAliphaticRings',
 'NumAromaticRings',
 'NumAliphaticHeterocycles',
 'NumAromaticHeterocycles',
 'NumSaturatedHeterocycles',
 'NumSaturatedRings',
 'BalabanJ',
 'BertzCT',
 'HallKierAlpha',
 'PEOE_VSA1',
 'PEOE_VSA2',
 'PEOE_VSA3',
 'PEOE_VSA4',
 'PEOE_VSA5',
 'PEOE_VSA6',
 'PEOE_VSA7',
 'PEOE_VSA8',
 'PEOE_VSA9',
 'PEOE_VSA10',
 'PEOE_VSA11',
 'PEOE_VSA12',
 'PEOE_VSA13',
 'PEOE_VSA14',
 'SMR_VSA1',
 'SMR_VSA2',
 'SMR_VSA3',
 'SMR_VSA4',
 'SMR_VSA5',
 'SMR_VSA6',
 'SMR_VS

In [18]:
# The string-valued label columns are dropped here; their indexed versions are
# renamed below.
raw_label_columns = ('label1', 'label2', 'label3a', 'label3b')
data_allFeaturesAndLabels = data_allFeaturesAndLabels.drop(*raw_label_columns)

# Give every label column a uniform name: label_q0 for the continuous label,
# label_cat0..label_cat4 for the categorical ones.
label_renames = {
    'BA_pct': 'label_q0',
    'label_QD5': 'label_cat0',
    'label_cat1_index': 'label_cat1',
    'label_cat2_index': 'label_cat2',
    'label_cat3_index': 'label_cat3',
    'label_cat4_index': 'label_cat4',
}
for current_name, final_name in label_renames.items():
    data_allFeaturesAndLabels = data_allFeaturesAndLabels.withColumnRenamed(current_name, final_name)

# VSA descriptor columns to remove: each family is numbered from 1, first with
# the plain name and then with the '.1' suffixed name.
features_to_drop = [
    f"{family}{number}{suffix}"
    for suffix in ('', '.1')
    for family, count in (('PEOE_VSA', 14), ('SMR_VSA', 10), ('SlogP_VSA', 12))
    for number in range(1, count + 1)
]

# Remove all listed VSA columns in a single call.
data_allFeaturesAndLabels = data_allFeaturesAndLabels.drop(*features_to_drop)

In [24]:
# Identifier columns from both source tables.
_id_columns = ['_c0', 'Name', 'smile', 'drug_name', 'drug_smiles']

# labels: the continuous label followed by label_cat0..label_cat4
_label_columns = ['label_q0'] + [f'label_cat{k}' for k in range(5)]

# fragment features: only the f_all / f_subset / f_subset2 families are
# selected, and the *_w2v vectors are left out of every family.
_fragment_feature_columns = [
    f'FEAT_{family}_{encoding}'
    for family in ('f_all', 'f_subset', 'f_subset2')
    for encoding in ('cv2', 'cv2_idf', 'cv5', 'n2g_cv2', 'n2g_cv5')
]

# rdkit features (assembled vectors)
_rdkit_vector_columns = ['FEAT_rdkit_1b', 'FEAT_rdkit_2b', 'FEAT_rdkit_1bANOVA', 'FEAT_rdkit_2bANOVA']

# rdkit data (scalar descriptor columns)
_rdkit_scalar_columns = [
    'MolWt', 'ExactMolWt', 'qed', 'MolLogP', 'MolMR', 'VSA_total', 'LabuteASA',
    'TPSA', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge',
    'MinAbsPartialCharge', 'NumHAcceptors', 'NumHDonors', 'HeavyAtomCount',
    'NumHeteroatoms', 'NumRotatableBonds', 'NHOHCount', 'NOCount', 'FractionCSP3',
    'RingCount', 'NumAliphaticRings', 'NumAromaticRings', 'NumAliphaticHeterocycles',
    'NumAromaticHeterocycles', 'NumSaturatedHeterocycles', 'NumSaturatedRings',
    'BalabanJ', 'BertzCT', 'HallKierAlpha',
]

# fracVSA_<family>NN columns, NN zero-padded to two digits and numbered from 01
_frac_vsa_columns = [
    f'fracVSA_{family}{number:02d}'
    for family, count in (('PEOE', 14), ('SMR', 10), ('SlogP', 12))
    for number in range(1, count + 1)
]

final_column_order = (
    _id_columns
    + _label_columns
    + _fragment_feature_columns
    + _rdkit_vector_columns
    + _rdkit_scalar_columns
    + _frac_vsa_columns
)

# Keep only the curated columns, in the order given by final_column_order.
data_masterFinal_clean = data_allFeaturesAndLabels.select(*final_column_order)

* export cleaned data

In [22]:
# Print the schema of the cleaned frame as a check of column names and types.
# NOTE(review): this cell's execution count (22) is lower than that of the cell
# defining final_column_order (24), and the saved output lists *_w2v columns
# that the current selection omits; the output looks stale, so re-run in order.
data_masterFinal_clean.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- smile: string (nullable = true)
 |-- drug_name: string (nullable = true)
 |-- drug_smiles: string (nullable = true)
 |-- label_q0: double (nullable = true)
 |-- label_cat0: double (nullable = true)
 |-- label_cat1: double (nullable = false)
 |-- label_cat2: double (nullable = false)
 |-- label_cat3: double (nullable = false)
 |-- label_cat4: double (nullable = false)
 |-- FEAT_f_all_cv2: vector (nullable = true)
 |-- FEAT_f_all_cv2_idf: vector (nullable = true)
 |-- FEAT_f_all_cv5: vector (nullable = true)
 |-- FEAT_f_all_w2v: vector (nullable = true)
 |-- FEAT_f_all_n2g_cv2: vector (nullable = true)
 |-- FEAT_f_all_n2g_cv5: vector (nullable = true)
 |-- FEAT_f_subset_cv2: vector (nullable = true)
 |-- FEAT_f_subset_cv2_idf: vector (nullable = true)
 |-- FEAT_f_subset_cv5: vector (nullable = true)
 |-- FEAT_f_subset_w2v: vector (nullable = true)
 |-- FEAT_f_subset_n2g_cv2: vector (nullable = true)
 |-- FE

In [25]:
# Collect the cleaned Spark DataFrame into pandas and write it to a pickle file.
# NOTE(review): pickle files should only ever be loaded from a trusted source.
data_masterFinal_clean.toPandas().to_pickle("bioavailability_data_masterFinal2.pkl")

                                                                                