In [None]:
# MSM VM config prep
# findspark.init() is called with the local Spark installation directory
# before `import pyspark` runs, so the order of the next three lines matters.
# NOTE(review): the Spark home is an absolute path specific to one machine;
# consider reading it from the environment or a config cell instead.
import findspark
findspark.init('/home/mitch/spark-3.3.0-bin-hadoop2')
import pyspark
 
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session; later cells use it through the
# module-level name `spark`.
spark = SparkSession.builder.appName('BApredsV3').getOrCreate()

# --- suppress future spark warnings/error/etc output ---
# NOTE(review): "OFF" silences error-level log output too, which can make
# failures in later cells harder to diagnose.
spark.sparkContext.setLogLevel("OFF")

In [None]:
import pandas as pd
def load_data_and_merge(labels_csv="bioavailability_data_final.csv",
                        frags_pickle='bioavailabilityData_w_Frags_simpler.pkl'):
    """Load the label/descriptor table and the fragment table and join them.

    Relies on the module-level ``spark`` session and ``pd`` (pandas) that
    earlier notebook cells create/import.

    Parameters
    ----------
    labels_csv : str
        Path to the comma-separated file (with a header row) that is read
        through Spark with schema inference. Defaults to the file the
        notebook originally hardcoded.
    frags_pickle : str
        Path to the pickled pandas DataFrame holding the fragment columns.
        Defaults to the file the notebook originally hardcoded.
        NOTE(review): unpickling executes arbitrary code; only load pickle
        files from a trusted source.

    Returns
    -------
    pyspark.sql.DataFrame
        Left join of the CSV rows with the pickle rows, matching the CSV
        column ``_c0`` against the pickle frame's index.
    """
    labels_and_calcs = spark.read.csv(labels_csv, inferSchema=True, sep=',', header=True)
    labels_pdf = labels_and_calcs.toPandas()

    frags_pdf = pd.read_pickle(frags_pickle)
    # These two columns are removed from the pickle frame before the join.
    # NOTE(review): presumably they duplicate information already present in
    # the CSV — confirm.
    frags_pdf = frags_pdf.drop(columns=['drug_smiles', 'ba_pct'])

    # Left join: every CSV row is kept; `_c0` is matched against the index of
    # the fragment frame.
    merged_pdf = pd.merge(labels_pdf, frags_pdf, how='left',
                          left_on='_c0', right_on=frags_pdf.index)

    # Hand the merged table back to Spark for the ML pipeline cells.
    return spark.createDataFrame(merged_pdf)


data = load_data_and_merge()

* Label data

In [None]:
# INITIAL LABELS:
# --- Data has 1 continuous label column, and 4 categorical label columns
#     (discretized variants of the continuous label).
# ------ categorical labels were applied by dividing the continuous label
#        values into 3-5 categories
# ------ the value range associated with each group was selected based on
#        histogram dist./mean/stdev
# --- One more discretization variant is added here, using Spark's built-in
#     QuantileDiscretizer.
from pyspark.ml import Pipeline
from pyspark.ml.feature import QuantileDiscretizer
from pyspark.ml.feature import (StringIndexer,OneHotEncoder)
import pandas as pd


def _index_then_encode(source_col, slot):
    """Return the (StringIndexer, OneHotEncoder) pair for categorical label `slot`.

    The indexer writes `label_cat<slot>_index`; the encoder reads that column
    and writes `label_cat<slot>_vector`.
    """
    index_col = f'label_cat{slot}_index'
    indexer = StringIndexer(inputCol=source_col, outputCol=index_col)
    encoder = OneHotEncoder(inputCol=index_col, outputCol=f'label_cat{slot}_vector')
    return indexer, encoder


# -- Add QuantileDiscretizer labels (5 buckets over the continuous label)
qd5 = QuantileDiscretizer(numBuckets=5, inputCol='BA_pct', outputCol='label_QD5')
data_quantized = qd5.fit(data).transform(data)

# -- INDEX / ENCODE LABELS
label_quant0 = 'BA_pct'

# The quantile bucket column is fed straight to the encoder (no StringIndexer).
label_cat0_vector = OneHotEncoder(inputCol='label_QD5', outputCol='label_cat0_vector')

# The four pre-existing categorical labels are indexed first, then encoded.
label_cat1_index, label_cat1_vector = _index_then_encode('label1', 1)
label_cat2_index, label_cat2_vector = _index_then_encode('label2', 2)
label_cat3_index, label_cat3_vector = _index_then_encode('label3a', 3)
label_cat4_index, label_cat4_vector = _index_then_encode('label3b', 4)

label_stages = [
    label_cat0_vector,
    label_cat1_index, label_cat1_vector,
    label_cat2_index, label_cat2_vector,
    label_cat3_index, label_cat3_vector,
    label_cat4_index, label_cat4_vector,
]
label_pipeline = Pipeline(stages=label_stages)
data_wLabels = label_pipeline.fit(data_quantized).transform(data_quantized)

In [None]:
from pyspark.ml.feature import NGram,Word2Vec,CountVectorizer,HashingTF,IDF

In [None]:
# Fragments NLP processing
from pyspark.ml.feature import NGram,Word2Vec,CountVectorizer,HashingTF,IDF

# Column holding the fragment tokens, and the prefix used for derived columns.
frag_type = 'frags_all'
frag_type_short = frag_type.replace('_all','')


def _frag_col(suffix):
    """Name of a derived fragment column: '<frag_type_short>_<suffix>'."""
    return f"{frag_type_short}_{suffix}"


# Count-vectorized fragments, then IDF-weighted.
cv = CountVectorizer(inputCol=frag_type, outputCol=_frag_col("cv"), minDF=2.0)
cv_idf = IDF(inputCol=_frag_col("cv"), outputCol=_frag_col("cv_idf"))

# Hashed term frequencies, then IDF-weighted.
tf = HashingTF(inputCol=frag_type, outputCol=_frag_col("tf"), numFeatures=2867)
tf_idf = IDF(inputCol=_frag_col("tf"), outputCol=_frag_col("tf_idf"))

# Word2Vec embedding computed from the fragment tokens.
w2v = Word2Vec(inputCol=frag_type, outputCol=_frag_col("w2v"))

# Bigrams of fragments, count-vectorized, then IDF-weighted.
n2gram = NGram(n=2, inputCol=frag_type, outputCol=_frag_col("n2g"))
n2gram_cv = CountVectorizer(inputCol=_frag_col("n2g"), outputCol=_frag_col("n2g_cv"), minDF=2.0)
n2gram_cv_idf = IDF(inputCol=_frag_col("n2g_cv"), outputCol=_frag_col("n2g_cv_idf"))

# `Pipeline` comes from the import in the label-encoding cell.
nlp_stages = [cv, cv_idf, tf, tf_idf, w2v, n2gram, n2gram_cv, n2gram_cv_idf]
nlp_pipeline = Pipeline(stages=nlp_stages)

nlp_model = nlp_pipeline.fit(data_wLabels)
data_wLabels_NLP = nlp_model.transform(data_wLabels)

# Snapshot of the intermediate result as a pickled pandas DataFrame.
data_wLabels_NLP.toPandas().to_pickle("data_wLabels_NLP_aspandas.pkl")

* make fragment NLP feature vectors

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import (VectorAssembler,VectorIndexer)

# Each NLP-derived column gets its own single-input assembler, producing a
# parallel "FEAT_<column>" vector column.
alternative_features = ['frags_cv', 'frags_cv_idf', 'frags_tf', 'frags_tf_idf',
                        'frags_w2v', 'frags_n2g_cv', 'frags_n2g_cv_idf']

assembled_names = [f"FEAT_{source_col}" for source_col in alternative_features]

vector_assemblers = [
    VectorAssembler(inputCols=[source_col], outputCol=assembled_col)
    for source_col, assembled_col in zip(alternative_features, assembled_names)
]

# Quoted, comma-separated list of the new column names (printed for reference).
output_features = ", ".join("'" + assembled_col + "'" for assembled_col in assembled_names)
print(output_features)

feature_pipeline = Pipeline(stages=list(vector_assemblers))

nlp_feature_model = feature_pipeline.fit(data_wLabels_NLP)
data_wLabels_NLPFeatures = nlp_feature_model.transform(data_wLabels_NLP)

* export data w/ NLP vector features

In [None]:
# Collect the Spark result into pandas, then pickle it to disk.
nlp_features_pdf = data_wLabels_NLPFeatures.toPandas()
nlp_features_pdf.to_pickle("data_wLabels_NLPFeatures_aspandas.pkl")

* Assemble vector features from the RDKit descriptor calculations

In [None]:
# RDKit "1D" FEATURE SELECTION:
# Load the feature catalog; each row's 'features' entry is used below as the
# list of input column names for one VectorAssembler.
featuresDF = pd.read_parquet('featuresCatalogDF.parquet')

# NOTE(review): the feature sets are picked by row label 0..8, which assumes
# the catalog keeps its current row order; a lookup by name would be sturdier:
#index_pos = featuresDF[featuresDF['name']=='F1a'].index[0]
feature_set1a = featuresDF.loc[0,'features']
feature_set1b = featuresDF.loc[1,'features']
feature_set2a = featuresDF.loc[2,'features']
feature_set2b = featuresDF.loc[3,'features']
feature_set3  = featuresDF.loc[4,'features']
feature_set4a = featuresDF.loc[5,'features']
feature_set4b = featuresDF.loc[6,'features']
F1bANOVA = featuresDF.loc[7,'features']
F2bANOVA = featuresDF.loc[8,'features']

# VECTOR ASSEMBLY - feature sets 1a,1b,2a,2b,3,4a,4b (+ the two ANOVA subsets)
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import (VectorAssembler,VectorIndexer)

vec_assembler1a = VectorAssembler(inputCols = feature_set1a, outputCol='FEAT_rdkit_1a')
vec_assembler1b = VectorAssembler(inputCols = feature_set1b, outputCol='FEAT_rdkit_1b')
vec_assembler2a = VectorAssembler(inputCols = feature_set2a, outputCol='FEAT_rdkit_2a')
vec_assembler2b = VectorAssembler(inputCols = feature_set2b, outputCol='FEAT_rdkit_2b')
vec_assembler3 = VectorAssembler(inputCols = feature_set3, outputCol='FEAT_rdkit_3')
vec_assembler4a = VectorAssembler(inputCols = feature_set4a, outputCol='FEAT_rdkit_4a')
vec_assembler4b = VectorAssembler(inputCols = feature_set4b, outputCol='FEAT_rdkit_4b')
vec_assembler1bANOVA = VectorAssembler(inputCols = F1bANOVA, outputCol='FEAT_rdkit_1bANOVA')
vec_assembler2bANOVA = VectorAssembler(inputCols = F2bANOVA, outputCol='FEAT_rdkit_2bANOVA')

from pyspark.ml import Pipeline
feature_pipeline = Pipeline(stages=[vec_assembler1a,
                                    vec_assembler1b,
                                    vec_assembler2a,
                                    vec_assembler2b,
                                    vec_assembler3,
                                    vec_assembler4a,
                                    vec_assembler4b,
                                    vec_assembler1bANOVA,
                                    vec_assembler2bANOVA])

# The RDKit vectors must be added on top of the frame that already carries the
# encoded labels and the fragment NLP columns (`data_wLabels_NLPFeatures`),
# not the raw `data` frame: the following cells rename label_QD5 /
# label_cat*_index and select frags_* / FEAT_frags_* columns, none of which
# exist on `data`, so the final select would fail on a fresh Run All.
data_features = feature_pipeline.fit(data_wLabels_NLPFeatures).transform(data_wLabels_NLPFeatures)

* clean up the data

In [None]:
# Raw categorical labels and their one-hot vectors are removed; the indexed
# label columns are kept and renamed further down.
label_columns_to_drop = [
    'label1', 'label2', 'label3a', 'label3b',
    'label_cat0_vector', 'label_cat1_vector', 'label_cat2_vector',
    'label_cat3_vector', 'label_cat4_vector',
]
data_features = data_features.drop(*label_columns_to_drop)

# (current name, final name) pairs for the label columns that stay.
label_renames = [
    ('BA_pct', 'label_q0'),
    ('label_QD5', 'label_cat0'),
    ('label_cat1_index', 'label_cat1'),
    ('label_cat2_index', 'label_cat2'),
    ('label_cat3_index', 'label_cat3'),
    ('label_cat4_index', 'label_cat4'),
]
for current_name, final_name in label_renames:
    data_features = data_features.withColumnRenamed(current_name, final_name)

# VSA descriptor columns (and their '.1'-suffixed counterparts) to remove.
features_to_drop = [
    'PEOE_VSA1','PEOE_VSA2','PEOE_VSA3','PEOE_VSA4','PEOE_VSA5','PEOE_VSA6','PEOE_VSA7',
    'PEOE_VSA8','PEOE_VSA9','PEOE_VSA10','PEOE_VSA11','PEOE_VSA12','PEOE_VSA13','PEOE_VSA14',
    'SMR_VSA1','SMR_VSA2','SMR_VSA3','SMR_VSA4','SMR_VSA5',
    'SMR_VSA6','SMR_VSA7','SMR_VSA8','SMR_VSA9','SMR_VSA10',
    'SlogP_VSA1','SlogP_VSA2','SlogP_VSA3','SlogP_VSA4','SlogP_VSA5','SlogP_VSA6',
    'SlogP_VSA7','SlogP_VSA8','SlogP_VSA9','SlogP_VSA10','SlogP_VSA11','SlogP_VSA12',
    'PEOE_VSA1.1','PEOE_VSA2.1','PEOE_VSA3.1','PEOE_VSA4.1','PEOE_VSA5.1','PEOE_VSA6.1','PEOE_VSA7.1',
    'PEOE_VSA8.1','PEOE_VSA9.1','PEOE_VSA10.1','PEOE_VSA11.1','PEOE_VSA12.1','PEOE_VSA13.1','PEOE_VSA14.1',
    'SMR_VSA1.1','SMR_VSA2.1','SMR_VSA3.1','SMR_VSA4.1','SMR_VSA5.1',
    'SMR_VSA6.1','SMR_VSA7.1','SMR_VSA8.1','SMR_VSA9.1','SMR_VSA10.1',
    'SlogP_VSA1.1','SlogP_VSA2.1','SlogP_VSA3.1','SlogP_VSA4.1','SlogP_VSA5.1','SlogP_VSA6.1',
    'SlogP_VSA7.1','SlogP_VSA8.1','SlogP_VSA9.1','SlogP_VSA10.1','SlogP_VSA11.1','SlogP_VSA12.1',
]
data_features = data_features.drop(*features_to_drop)



In [None]:
# The final column layout, built from named groups and concatenated in order.
identifier_columns = ['_c0', 'Name', 'drug_name']

label_columns = ['label_q0', 'label_cat0', 'label_cat1', 'label_cat2', 'label_cat3', 'label_cat4']

descriptor_columns = [
    'smile', 'MolWt', 'ExactMolWt', 'qed', 'MolLogP', 'MolMR', 'VSA_total', 'LabuteASA',
    'TPSA', 'MaxPartialCharge', 'MinPartialCharge', 'MaxAbsPartialCharge', 'MinAbsPartialCharge',
    'NumHAcceptors', 'NumHDonors', 'HeavyAtomCount', 'NumHeteroatoms', 'NumRotatableBonds',
    'NHOHCount', 'NOCount', 'FractionCSP3',
    'RingCount', 'NumAliphaticRings', 'NumAromaticRings', 'NumAliphaticHeterocycles',
    'NumAromaticHeterocycles', 'NumSaturatedHeterocycles', 'NumSaturatedRings',
    'BalabanJ', 'BertzCT', 'HallKierAlpha',
]

frac_vsa_columns = [
    'fracVSA_PEOE01', 'fracVSA_PEOE02', 'fracVSA_PEOE03', 'fracVSA_PEOE04', 'fracVSA_PEOE05',
    'fracVSA_PEOE06', 'fracVSA_PEOE07', 'fracVSA_PEOE08', 'fracVSA_PEOE09', 'fracVSA_PEOE10',
    'fracVSA_PEOE11', 'fracVSA_PEOE12', 'fracVSA_PEOE13', 'fracVSA_PEOE14',
    'fracVSA_SMR01', 'fracVSA_SMR02', 'fracVSA_SMR03', 'fracVSA_SMR04', 'fracVSA_SMR05',
    'fracVSA_SMR06', 'fracVSA_SMR07', 'fracVSA_SMR08', 'fracVSA_SMR09', 'fracVSA_SMR10',
    'fracVSA_SlogP01', 'fracVSA_SlogP02', 'fracVSA_SlogP03', 'fracVSA_SlogP04', 'fracVSA_SlogP05',
    'fracVSA_SlogP06', 'fracVSA_SlogP07', 'fracVSA_SlogP08', 'fracVSA_SlogP09', 'fracVSA_SlogP10',
    'fracVSA_SlogP11', 'fracVSA_SlogP12',
]

rdkit_vector_columns = [
    'FEAT_rdkit_1a', 'FEAT_rdkit_1b', 'FEAT_rdkit_2a', 'FEAT_rdkit_2b', 'FEAT_rdkit_3',
    'FEAT_rdkit_4a', 'FEAT_rdkit_4b', 'FEAT_rdkit_1bANOVA', 'FEAT_rdkit_2bANOVA',
]

fragment_token_columns = ['frags_all', 'frags_better', 'frags_best', 'frags_efgs', 'frags_brics']

fragment_nlp_columns = [
    'frags_cv', 'frags_cv_idf', 'frags_tf', 'frags_tf_idf', 'frags_w2v',
    'frags_n2g', 'frags_n2g_cv', 'frags_n2g_cv_idf',
]

fragment_vector_columns = [
    'FEAT_frags_cv', 'FEAT_frags_cv_idf', 'FEAT_frags_tf', 'FEAT_frags_tf_idf',
    'FEAT_frags_w2v', 'FEAT_frags_n2g_cv', 'FEAT_frags_n2g_cv_idf',
]

final_column_order = (
    identifier_columns
    + label_columns
    + descriptor_columns
    + frac_vsa_columns
    + rdkit_vector_columns
    + fragment_token_columns
    + fragment_nlp_columns
    + fragment_vector_columns
)

# Keep only the listed columns, in exactly this order.
data_features_clean = data_features.select(final_column_order)

* export cleaned data

In [None]:
# Collect the cleaned Spark frame into pandas, then pickle it to disk.
clean_features_pdf = data_features_clean.toPandas()
clean_features_pdf.to_pickle("data_final_NEW_2022-09-05_aspandas.pkl")