# Summary

# Intro

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from common_imports import *

In [None]:
import az_dream.functions as fn

In [355]:
import rdkit
from common import dat

In [356]:
%matplotlib inline

# Functions

In [357]:
from rdkit import Chem, DataStructs

def get_similarity_all(fp1, fp2):
    """
    Score one pair of fingerprints with every similarity measure enabled here.

    Returns a list of six scores, always in this order:
        Tanimoto, Dice, Cosine, Russel, Kulczynski, McConnaughey.
    The Sokal measure is listed below but commented out, so it is not computed.

    `fp1` and `fp2` are passed unchanged to the RDKit `DataStructs.*Similarity`
    functions. In this notebook they come from RDKFingerprint, FingerprintMol,
    MACCS keys and the atom-pair bit-vector fingerprinter.
    """
    measures = (
        DataStructs.TanimotoSimilarity,
        DataStructs.DiceSimilarity,
        DataStructs.CosineSimilarity,
        # DataStructs.SokalSimilarity,
        DataStructs.RusselSimilarity,
        DataStructs.KulczynskiSimilarity,
        DataStructs.McConnaugheySimilarity,
    )
    return [measure(fp1, fp2) for measure in measures]


def get_similarity_subset(fp1, fp2):
    """
    Score one pair of fingerprints with the two measures used for the
    remaining fingerprint types.

    Returns `[Tanimoto, Dice]`, in that order.

    In this notebook it is applied to the topological-torsion and Morgan
    fingerprints. Per the original note, these two measures work with input
    formats other than bit vectors (not verified here).
    """
    tanimoto = DataStructs.TanimotoSimilarity(fp1, fp2)
    dice = DataStructs.DiceSimilarity(fp1, fp2)
    return [tanimoto, dice]


def calculate_similarity_vector(smile_pair):
    """
    Compute chemical-similarity features for two molecules given as SMILES.

    Both molecules are fingerprinted with several RDKit fingerprinters, and each
    pair of fingerprints is scored with several similarity measures.

    Parameters
    ----------
    smile_pair : sequence of two str
        ``(smile1, smile2)``, the SMILES strings of the two molecules.

    Returns
    -------
    list
        30 scores, in the same order as ``chemical_similarity_feature_names``:
        six scores (see ``get_similarity_all``) for each of RDKFingerprint,
        FingerprintMol, MACCS keys and atom-pair fingerprints, followed by two
        scores (see ``get_similarity_subset``) for each of topological-torsion,
        Morgan (radius 2) and Morgan (radius 2, feature-based) fingerprints.

    Raises
    ------
    ValueError
        If either SMILES string cannot be parsed into a molecule.
    """
    from rdkit.Chem import MACCSkeys, rdMolDescriptors
    from rdkit.Chem.AtomPairs import Pairs, Torsions
    from rdkit.Chem.Fingerprints import FingerprintMols

    smile1, smile2 = smile_pair

    mol1 = Chem.MolFromSmiles(smile1)
    mol2 = Chem.MolFromSmiles(smile2)
    # MolFromSmiles signals a parse failure by returning None; fail here with the
    # offending input rather than deep inside a fingerprinting call.
    for smile, mol in ((smile1, mol1), (smile2, mol2)):
        if mol is None:
            raise ValueError('Could not parse SMILES string: {!r}'.format(smile))

    # (fingerprinter, scorer) pairs; the order defines the order of the output.
    fingerprint_scorers = [
        # RDK topological fingerprint for a molecule.
        (Chem.RDKFingerprint, get_similarity_all),
        # Topological fingerprints; uses Chem.RDKFingerprint internally.
        # http://www.rdkit.org/docs/GettingStartedInPython.html#topological-fingerprints
        # NOTE(review): in the output recorded in this notebook the FingerprintMol_*
        # columns equal the RDKFingerprint_* columns for the rows shown.
        (FingerprintMols.FingerprintMol, get_similarity_all),
        # MACCS keys: SMARTS-based implementation of the 166 public MACCS keys.
        # http://www.rdkit.org/docs/GettingStartedInPython.html#maccs-keys
        (MACCSkeys.GenMACCSKeys, get_similarity_all),
        # Atom pairs and topological torsions.
        # http://www.rdkit.org/docs/GettingStartedInPython.html#atom-pairs-and-topological-torsions
        (Pairs.GetAtomPairFingerprintAsBitVect, get_similarity_all),
        (Torsions.GetTopologicalTorsionFingerprint, get_similarity_subset),
        # Morgan (circular) fingerprints with radius 2, without and with features.
        # http://www.rdkit.org/docs/GettingStartedInPython.html#morgan-fingerprints-circular-fingerprints
        (lambda mol: rdMolDescriptors.GetMorganFingerprint(mol, 2),
         get_similarity_subset),
        (lambda mol: rdMolDescriptors.GetMorganFingerprint(mol, 2, useFeatures=True),
         get_similarity_subset),
    ]
    # LayeredFingerprint and PatternFingerprint were tried in earlier versions of
    # this function and are intentionally not part of the feature vector.

    molecule_similarity = []
    for fingerprint, score in fingerprint_scorers:
        molecule_similarity.extend(score(fingerprint(mol1), fingerprint(mol2)))

    return molecule_similarity

In [358]:
# Names of the scores returned by get_similarity_all / get_similarity_subset, in order.
similarity_metric_all = ['Tanimoto', 'Dice', 'Cosine', 'Russel', 'Kulczynski', 'McConnaughey']
similarity_metric_subset = ['Tanimoto', 'Dice']

# One '<fingerprint>_<metric>' name per value produced by calculate_similarity_vector,
# listed in the same order in which that function appends its scores.
chemical_similarity_feature_names = [
    fingerprint + '_' + metric
    for fingerprint, metrics in (
        ('RDKFingerprint', similarity_metric_all),
        ('FingerprintMol', similarity_metric_all),
        ('MACCSkeys', similarity_metric_all),
        ('AtomPairFingerprint', similarity_metric_all),
        ('TopologicalTorsionFingerprint', similarity_metric_subset),
        ('MorganFingerprintR2', similarity_metric_subset),
        ('MorganFingerprintR2withFeatures', similarity_metric_subset),
    )
    for metric in metrics
]

# Import data

In [359]:
# Load the whole 'drug_info_release_3' table from the 'az_dream' database; the
# connection prefix is taken from the BIODB_CONNECTION_STR environment variable.
drug_info = pd.read_sql_table(
    table_name='drug_info_release_3',
    con=sa.create_engine(os.environ['BIODB_CONNECTION_STR'] + '/az_dream'),
)

In [360]:
drug_info.head(2)

Unnamed: 0,ChallengeName,Target(Official Symbol),HBA,cLogP,HBD,Lipinski,SMILES or PubChem ID,MW,AtomStereoCount,BondStereoCount,CID,CanonicalSMILES,Charge,Complexity,ConformerCount3D,ConformerModelRMSD3D,CovalentUnitCount,DefinedAtomStereoCount,DefinedBondStereoCount,EffectiveRotorCount3D,ExactMass,FeatureAcceptorCount3D,FeatureAnionCount3D,FeatureCationCount3D,FeatureCount3D,FeatureDonorCount3D,FeatureHydrophobeCount3D,FeatureRingCount3D,Fingerprint2D,HBondAcceptorCount,HBondDonorCount,HeavyAtomCount,IUPACName,InChI,InChIKey,IsomericSMILES,IsotopeAtomCount,MolecularFormula,MolecularWeight,MonoisotopicMass,RotatableBondCount,TPSA,UndefinedAtomStereoCount,UndefinedBondStereoCount,Volume3D,XLogP,XStericQuadrupole3D,YStericQuadrupole3D,ZStericQuadrupole3D,cids,targets,targets_bioentities,enzymes,enzymes_bioentities,transporters,transporters_bioentities,atcs
0,ADAM17,ADAM17,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,AKT,"AKT1, AKT2, AKT3",8.0,1.18,5.0,0.0,c1cc(ccc1[C@H](CCO)NC(=O)C2(CCN(CC2)c3c4cc[nH]...,428.9,1.0,0.0,25227436.0,C1CN(CCC1(C(=O)NC(CCO)C2=CC=C(C=C2)Cl)N)C3=NC=...,0.0,580.0,256.0,1.0,1.0,1.0,0.0,8.2,428.172752,2.0,0.0,4.0,14.0,4.0,0.0,4.0,AAADceB7sAAEAAAAAAAAAAAAAAAAAWAAAAA8WIAAAAAAAF...,6.0,4.0,30.0,4-amino-N-[(1S)-1-(4-chlorophenyl)-3-hydroxypr...,InChI=1S/C21H25ClN6O2/c22-15-3-1-14(2-4-15)17(...,JDUBGYFRJFOXQC-KRWDZBQOSA-N,C1CN(CCC1(C(=O)N[C@@H](CCO)C2=CC=C(C=C2)Cl)N)C...,0.0,C21H25ClN6O2,428.9152,428.172752,6.0,120.0,0.0,0.0,319.3,1.7,14.27,5.17,1.08,252274364260226057750340,,,,,,,


In [361]:
# Mean synergy score per (d_1, d_2) pair over the rows of ALL_TRAINING_DATA_WSYNERGY
# that have source = 'train' and qa = 1. Grouping makes each pair appear once.
training_data = pd.read_sql_query(
    """\
SELECT d_1, d_2, avg(synergy_score) synergy_score FROM
ALL_TRAINING_DATA_WSYNERGY where source = 'train' and qa = 1
group by d_1, d_2
""",
    sa.create_engine('{}/az_dream_2015'.format(os.environ['BIODB_CONNECTION_STR'])),
)

In [362]:
training_data.head(2)

Unnamed: 0,d_1,d_2,synergy_score
0,ADAM17,AKT,9.837743
1,ADAM17,BCL2_BCL2L1,-3.37564


# Format features

## Rename

In [363]:
# Rename the 'ChallengeName' column to 'd' and pass every column name through
# dat.format_column, in a single pass over the existing names (still in place).
drug_info.columns = [
    dat.format_column('d' if name == 'ChallengeName' else name)
    for name in drug_info.columns
]

In [364]:
drug_info.head(2)

Unnamed: 0,d,target__official__symbol,hba,c_log_p,hbd,lipinski,smiles_or__pub_chem_id,mw,atom_stereo_count,bond_stereo_count,cid,canonical_smiles,charge,complexity,conformer_count3_d,conformer_model_rmsd3_d,covalent_unit_count,defined_atom_stereo_count,defined_bond_stereo_count,effective_rotor_count3_d,exact_mass,feature_acceptor_count3_d,feature_anion_count3_d,feature_cation_count3_d,feature_count3_d,feature_donor_count3_d,feature_hydrophobe_count3_d,feature_ring_count3_d,fingerprint2_d,h_bond_acceptor_count,h_bond_donor_count,heavy_atom_count,iupac_name,in_ch_i,in_ch_i_key,isomeric_smiles,isotope_atom_count,molecular_formula,molecular_weight,monoisotopic_mass,rotatable_bond_count,tpsa,undefined_atom_stereo_count,undefined_bond_stereo_count,volume3_d,x_log_p,x_steric_quadrupole3_d,y_steric_quadrupole3_d,z_steric_quadrupole3_d,cids,targets,targets_bioentities,enzymes,enzymes_bioentities,transporters,transporters_bioentities,atcs
0,ADAM17,ADAM17,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,AKT,"AKT1, AKT2, AKT3",8.0,1.18,5.0,0.0,c1cc(ccc1[C@H](CCO)NC(=O)C2(CCN(CC2)c3c4cc[nH]...,428.9,1.0,0.0,25227436.0,C1CN(CCC1(C(=O)NC(CCO)C2=CC=C(C=C2)Cl)N)C3=NC=...,0.0,580.0,256.0,1.0,1.0,1.0,0.0,8.2,428.172752,2.0,0.0,4.0,14.0,4.0,0.0,4.0,AAADceB7sAAEAAAAAAAAAAAAAAAAAWAAAAA8WIAAAAAAAF...,6.0,4.0,30.0,4-amino-N-[(1S)-1-(4-chlorophenyl)-3-hydroxypr...,InChI=1S/C21H25ClN6O2/c22-15-3-1-14(2-4-15)17(...,JDUBGYFRJFOXQC-KRWDZBQOSA-N,C1CN(CCC1(C(=O)N[C@@H](CCO)C2=CC=C(C=C2)Cl)N)C...,0.0,C21H25ClN6O2,428.9152,428.172752,6.0,120.0,0.0,0.0,319.3,1.7,14.27,5.17,1.08,252274364260226057750340,,,,,,,


## Make drug pairs

In [365]:
# Cross-join drug_info with itself through a constant helper key. The key is added
# to temporary copies only, so drug_info itself is not left with a stray 'tmp' column.
drug_pair = (
    drug_info.assign(tmp=1)
    .merge(drug_info.assign(tmp=1), on='tmp', suffixes=('_1', '_2'))
    .drop('tmp', axis=1)
)
# Keep each unordered pair once (self-pairs included): the case-insensitive name of
# d_1 must not sort after that of d_2. The explicit copy makes the result an
# independent frame, so adding a column below is not an assignment onto a slice.
drug_pair = drug_pair[
    drug_pair['d_1'].str.lower() <= drug_pair['d_2'].str.lower()
].copy()
# 1 when both sides of the pair are the same drug, 0 otherwise.
drug_pair['same_drugs'] = (drug_pair['d_1'] == drug_pair['d_2']).astype(int)

In [366]:
drug_pair.head(2)

Unnamed: 0,d_1,target__official__symbol_1,hba_1,c_log_p_1,hbd_1,lipinski_1,smiles_or__pub_chem_id_1,mw_1,atom_stereo_count_1,bond_stereo_count_1,cid_1,canonical_smiles_1,charge_1,complexity_1,conformer_count3_d_1,conformer_model_rmsd3_d_1,covalent_unit_count_1,defined_atom_stereo_count_1,defined_bond_stereo_count_1,effective_rotor_count3_d_1,exact_mass_1,feature_acceptor_count3_d_1,feature_anion_count3_d_1,feature_cation_count3_d_1,feature_count3_d_1,feature_donor_count3_d_1,feature_hydrophobe_count3_d_1,feature_ring_count3_d_1,fingerprint2_d_1,h_bond_acceptor_count_1,h_bond_donor_count_1,heavy_atom_count_1,iupac_name_1,in_ch_i_1,in_ch_i_key_1,isomeric_smiles_1,isotope_atom_count_1,molecular_formula_1,molecular_weight_1,monoisotopic_mass_1,rotatable_bond_count_1,tpsa_1,undefined_atom_stereo_count_1,undefined_bond_stereo_count_1,volume3_d_1,x_log_p_1,x_steric_quadrupole3_d_1,y_steric_quadrupole3_d_1,z_steric_quadrupole3_d_1,cids_1,targets_1,targets_bioentities_1,enzymes_1,enzymes_bioentities_1,transporters_1,transporters_bioentities_1,atcs_1,d_2,target__official__symbol_2,hba_2,c_log_p_2,hbd_2,lipinski_2,smiles_or__pub_chem_id_2,mw_2,atom_stereo_count_2,bond_stereo_count_2,cid_2,canonical_smiles_2,charge_2,complexity_2,conformer_count3_d_2,conformer_model_rmsd3_d_2,covalent_unit_count_2,defined_atom_stereo_count_2,defined_bond_stereo_count_2,effective_rotor_count3_d_2,exact_mass_2,feature_acceptor_count3_d_2,feature_anion_count3_d_2,feature_cation_count3_d_2,feature_count3_d_2,feature_donor_count3_d_2,feature_hydrophobe_count3_d_2,feature_ring_count3_d_2,fingerprint2_d_2,h_bond_acceptor_count_2,h_bond_donor_count_2,heavy_atom_count_2,iupac_name_2,in_ch_i_2,in_ch_i_key_2,isomeric_smiles_2,isotope_atom_count_2,molecular_formula_2,molecular_weight_2,monoisotopic_mass_2,rotatable_bond_count_2,tpsa_2,undefined_atom_stereo_count_2,undefined_bond_stereo_count_2,volume3_d_2,x_log_p_2,x_steric_quadrupole3_d_2,y_steric_quadrupole3_d_2,z_steric_quadrupole3_d_2,cids_2,targets_2,
targets_bioentities_2,enzymes_2,enzymes_bioentities_2,transporters_2,transporters_bioentities_2,atcs_2,same_drugs
0,ADAM17,ADAM17,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,ADAM17,ADAM17,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1
1,ADAM17,ADAM17,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,AKT,"AKT1, AKT2, AKT3",8.0,1.18,5.0,0.0,c1cc(ccc1[C@H](CCO)NC(=O)C2(CCN(CC2)c3c4cc[nH]...,428.9,1.0,0.0,25227436.0,C1CN(CCC1(C(=O)NC(CCO)C2=CC=C(C=C2)Cl)N)C3=NC=...,0.0,580.0,256.0,1.0,1.0,1.0,0.0,8.2,428.172752,2.0,0.0,4.0,14.0,4.0,0.0,4.0,AAADceB7sAAEAAAAAAAAAAAAAAAAAWAAAAA8WIAAAAAAAF...,6.0,4.0,30.0,4-amino-N-[(1S)-1-(4-chlorophenyl)-3-hydroxypr...,InChI=1S/C21H25ClN6O2/c22-15-3-1-14(2-4-15)17(...,JDUBGYFRJFOXQC-KRWDZBQOSA-N,C1CN(CCC1(C(=O)N[C@@H](CCO)C2=CC=C(C=C2)Cl)N)C...,0.0,C21H25ClN6O2,428.9152,428.172752,6.0,120.0,0.0,0.0,319.3,1.7,14.27,5.17,1.08,252274364260226057750340,,,,,,,,0


In [367]:
# Add synergy scores
drug_pair_len_before = drug_pair.shape[0]
drug_pair = drug_pair.merge(training_data, on=['d_1', 'd_2'], how='left')

In [368]:
# Every (d_1, d_2) pair present in training_data must also exist in drug_pair,
# i.e. no training pair was lost to the d_1 <= d_2 ordering used for drug_pair.
assert (
    {'.'.join(pair) for pair in training_data[['d_1', 'd_2']].values}
    <= {'.'.join(pair) for pair in drug_pair[['d_1', 'd_2']].values}
)

In [369]:
# Despite the name, this stores the full (rows, columns) shape tuple of drug_pair
# at this point; element [0] is compared again in the "Save to database" section.
drug_pair_len_start = drug_pair.shape
drug_pair_len_start

(7140, 116)

In [370]:
# Shape of the subset of pairs that have a synergy score.
drug_pair.loc[drug_pair['synergy_score'].notnull()].shape

(167, 116)

In [371]:
# Convert relative differences to absolute differences
drug_pair = fn.get_differences(drug_pair)
for column in drug_pair.columns:
    if column.endswith('_diff'):
        drug_pair[column] = drug_pair[column].abs()

Skipping column 'd_1' because it appears to be a string...
Skipping column 'target__official__symbol_1' because it appears to be a string...
Skipping column 'smiles_or__pub_chem_id_1' because it appears to be a string...
Skipping column 'canonical_smiles_1' because it appears to be a string...
Skipping column 'fingerprint2_d_1' because it appears to be a string...
Skipping column 'iupac_name_1' because it appears to be a string...
Skipping column 'in_ch_i_1' because it appears to be a string...
Skipping column 'in_ch_i_key_1' because it appears to be a string...
Skipping column 'isomeric_smiles_1' because it appears to be a string...
Skipping column 'molecular_formula_1' because it appears to be a string...
Skipping column 'cids_1' because it appears to be a string...
Skipping column 'targets_1' because it appears to be a string...
Skipping column 'targets_bioentities_1' because it appears to be a string...
Skipping column 'enzymes_1' because it appears to be a string...
Skipping colum

## Chemical similarity


In [372]:
# Pairs for which both drugs have a canonical SMILES string, reduced to the
# identifier and SMILES columns. The row index of drug_pair is preserved.
drug_pair_smiles = drug_pair.loc[
    drug_pair[['canonical_smiles_1', 'canonical_smiles_2']].notnull().all(axis=1),
    ['d_1', 'd_2', 'canonical_smiles_1', 'canonical_smiles_2']
]

In [373]:
drug_pair_smiles.head()

Unnamed: 0,d_1,d_2,canonical_smiles_1,canonical_smiles_2
119,AKT,AKT,C1CN(CCC1(C(=O)NC(CCO)C2=CC=C(C=C2)Cl)N)C3=NC=...,C1CN(CCC1(C(=O)NC(CCO)C2=CC=C(C=C2)Cl)N)C3=NC=...
120,AKT,AKT_1,C1CN(CCC1(C(=O)NC(CCO)C2=CC=C(C=C2)Cl)N)C3=NC=...,C1CC(C1)(C2=CC=C(C=C2)C3=C(C=C4C(=N3)C=CN5C4=N...
123,AKT,ALK,C1CN(CCC1(C(=O)NC(CCO)C2=CC=C(C=C2)Cl)N)C3=NC=...,CN1CCN(CC1)C2CCN(CC2)C3=CC(=C(C=C3)NC4=NC=C(C(...
125,AKT,ALK_IGFR,C1CN(CCC1(C(=O)NC(CCO)C2=CC=C(C=C2)Cl)N)C3=NC=...,COC1=C(C=CC(=C1)N2CCC(CC2)N)NC3=NC=C(C(=N3)C4=...
126,AKT,AR,C1CN(CCC1(C(=O)NC(CCO)C2=CC=C(C=C2)Cl)N)C3=NC=...,CC1(C(=O)N(C(=S)N1C2=CC(=C(C=C2)C(=O)NC)F)C3=C...


In [374]:
# One similarity vector per row of drug_pair_smiles, in row order.
chemical_similarity = [
    calculate_similarity_vector((smiles_1, smiles_2))
    for smiles_1, smiles_2 in zip(
        drug_pair_smiles['canonical_smiles_1'],
        drug_pair_smiles['canonical_smiles_2'],
    )
]

In [375]:
# Tabulate the similarity vectors, reusing the index of drug_pair_smiles so the
# rows can be aligned with drug_pair again.
chemical_similarity_df = pd.DataFrame(
    data=chemical_similarity,
    index=drug_pair_smiles.index,
    columns=chemical_similarity_feature_names,
)

In [376]:
chemical_similarity_df.head()

Unnamed: 0,RDKFingerprint_Tanimoto,RDKFingerprint_Dice,RDKFingerprint_Cosine,RDKFingerprint_Russel,RDKFingerprint_Kulczynski,RDKFingerprint_McConnaughey,FingerprintMol_Tanimoto,FingerprintMol_Dice,FingerprintMol_Cosine,FingerprintMol_Russel,FingerprintMol_Kulczynski,FingerprintMol_McConnaughey,MACCSkeys_Tanimoto,MACCSkeys_Dice,MACCSkeys_Cosine,MACCSkeys_Russel,MACCSkeys_Kulczynski,MACCSkeys_McConnaughey,AtomPairFingerprint_Tanimoto,AtomPairFingerprint_Dice,AtomPairFingerprint_Cosine,AtomPairFingerprint_Russel,AtomPairFingerprint_Kulczynski,AtomPairFingerprint_McConnaughey,TopologicalTorsionFingerprint_Tanimoto,TopologicalTorsionFingerprint_Dice,MorganFingerprintR2_Tanimoto,MorganFingerprintR2_Dice,MorganFingerprintR2withFeatures_Tanimoto,MorganFingerprintR2withFeatures_Dice
119,1.0,1.0,1.0,0.471191,1.0,1.0,1.0,1.0,1.0,0.471191,1.0,1.0,1.0,1.0,1.0,0.39521,1.0,1.0,1.0,1.0,1.0,2.8e-05,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
120,0.384458,0.555391,0.556079,0.275391,0.556768,0.113537,0.384458,0.555391,0.556079,0.275391,0.556768,0.113537,0.564103,0.721311,0.723747,0.263473,0.72619,0.452381,0.233974,0.379221,0.387466,9e-06,0.39589,-0.208221,0.165289,0.283688,0.213793,0.352273,0.248227,0.397727
123,0.398794,0.570196,0.571817,0.290527,0.573442,0.146883,0.398794,0.570196,0.571817,0.290527,0.573442,0.146883,0.5875,0.740157,0.740732,0.281437,0.741307,0.482613,0.195011,0.326376,0.328733,1e-05,0.331108,-0.337785,0.113636,0.204082,0.20122,0.335025,0.313333,0.477157
125,0.394001,0.565281,0.566177,0.282227,0.567074,0.134149,0.394001,0.565281,0.566177,0.282227,0.567074,0.134149,0.636364,0.777778,0.778661,0.293413,0.779545,0.559091,0.23545,0.381156,0.381164,1.1e-05,0.381172,-0.237656,0.127119,0.225564,0.234483,0.379888,0.335821,0.502793
126,0.398271,0.569662,0.5716,0.29248,0.573544,0.147087,0.398271,0.569662,0.5716,0.29248,0.573544,0.147087,0.440476,0.61157,0.614113,0.221557,0.616667,0.233333,0.12114,0.216102,0.216133,6e-06,0.216164,-0.567672,0.046512,0.088889,0.14,0.245614,0.276119,0.432749


In [377]:
# Attach the similarity features by row index; pairs without SMILES keep NaN.
drug_pair = pd.merge(
    drug_pair,
    chemical_similarity_df,
    left_index=True,
    right_index=True,
    how='left',
)

## Remove columns that are all the same and print stats

In [378]:
# For every integer/float column other than the target:
#   * drop it if it takes at most one distinct value among the scored pairs;
#   * otherwise record its Pearson correlation with synergy_score and keep it
#     as a feature.
# NOTE(review): identifier-like columns (e.g. the cid_diff row in the recorded
# output) pass this filter too - confirm they are wanted as features.
results = []
columns_to_drop = []
features = []
for column in drug_pair.columns:
    if column == 'synergy_score':
        continue
    # Select numeric columns by dtype kind code ('i' signed integer, 'f' float).
    # Unlike comparing the dtype with the builtins int / float, this accepts every
    # width and does not depend on what the builtin int maps to on this platform.
    if drug_pair[column].dtype.kind not in ('i', 'f'):
        continue
    # Only rows where both the feature and the synergy score are known.
    paired = drug_pair[[column, 'synergy_score']].dropna(how='any')
    if paired[column].nunique() <= 1:
        # No variation among the scored pairs: report the column and drop it.
        print(column)
        columns_to_drop.append(column)
        continue
    corr, pval = sp.stats.pearsonr(paired[column], paired['synergy_score'])
    results.append([column, corr, pval, len(paired)])
    features.append(column)

drug_pair.drop(pd.Index(columns_to_drop), axis=1, inplace=True)

# Strongest absolute correlation first.
results.sort(key=lambda x: abs(x[1]), reverse=True)
results_df = pd.DataFrame(results, columns=['feature', 'corr', 'p_value', 'count'])
results_df

same_drugs
charge_mean
charge_diff
isotope_atom_count_mean
isotope_atom_count_diff
undefined_bond_stereo_count_mean
undefined_bond_stereo_count_diff


Unnamed: 0,feature,corr,p_value,count
0,volume3_d_mean,0.467309,0.000113,63
1,feature_count3_d_mean,0.392868,0.001448,63
2,feature_ring_count3_d_mean,0.39106,0.00153,63
3,cid_diff,-0.295847,0.00488,89
4,conformer_model_rmsd3_d_mean,0.295689,0.018631,63
5,y_steric_quadrupole3_d_mean,0.286105,0.023017,63
6,feature_count3_d_diff,-0.273193,0.030283,63
7,x_steric_quadrupole3_d_mean,0.266503,0.034748,63
8,RDKFingerprint_Russel,0.238696,0.024279,89
9,feature_cation_count3_d_diff,-0.238167,0.060151,63


## Save to database

In [379]:
# The feature steps above must not have added or removed any drug pair.
assert len(drug_pair) == drug_pair_len_start[0]

In [380]:
import csv2sql
# Handle used below to load a DataFrame into the 'az_dream_2015_features' database.
# NOTE(review): csv2sql is not visible from this notebook; the positional arguments
# look like (connection string, a name, server address) - confirm against
# csv2sql.DataFrameToMySQL before changing them.
db = csv2sql.DataFrameToMySQL(
    '{}/az_dream_2015_features'.format(os.environ['BIODB_CONNECTION_STR']), 
    'drug_pair_info', 
    os.environ['STG_SERVER_IP'],
    echo=False
)

In [381]:
# Export the pair identifiers plus the numeric features retained by the stats cell
# (the `features` list) under the name 'f_dd_drug_pair_info'.
# NOTE(review): the last argument presumably lists index definitions as
# [columns, is_unique] - TODO confirm against csv2sql's import_table.
db.import_table(
    drug_pair[['d_1', 'd_2'] + features], 'f_dd_drug_pair_info', [
        [('d_1', 'd_2'), True],
        [('d_2', 'd_1'), False],        
    ],
)