## Functions used in notebook 1

#### Notebook meant to be exported as .py

In [1]:
import pandas as pd
import numpy as np
import collections
from rdkit import Chem
from rdkit.Chem import PandasTools

In [4]:
# df_before: activity dataset before any selection
# df_after: activity dataset after selection based on number of actives identified via IDG threshold and per target
# df_after2: activity dataset after selection based on number of documents and per target
# df_final: activity dataset after addition of extra inactives
# df_final2: activity dataset after addition of targets identified using a 6.5 threshold for active identification
# df_idg: IDG threshold for each target class
def selected_prot_each_step(df_before, df_after, df_after2, df_final, df_final2, df_idg):
    """Summarise, per target class, how many targets remain after each selection step.

    One output row is produced for every entry of ``df_idg.target_class``
    other than ``'Others'``, followed by one row that pools every target
    class *not* listed in ``df_idg.target_class``, followed by a ``'Sum'`` row.

    NOTE(review): the row labels are ``df_idg.target_class`` + ``['Sum']``, so
    labels and counts only line up when ``df_idg`` contains exactly one
    ``'Others'`` row and it is the last one (the pooled row then carries the
    ``'Others'`` label and threshold). Confirm this holds for every caller.

    Parameters
    ----------
    df_before, df_after, df_after2 : pandas.DataFrame
        Activity datasets at successive selection steps. Must provide the
        columns ``target_class`` and ``target_chemblid``.
    df_final, df_final2 : pandas.DataFrame
        Activity datasets that additionally provide ``activity_class`` and
        ``usmiles``; ``df_final2`` must also provide ``target_type``.
    df_idg : pandas.DataFrame
        Must provide ``target_class`` and ``activity_threshold``.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``target_class``, ``before_selection``,
        ``activity_threshold``, ``40_actives``, ``2_documents``,
        ``40_actives_30_inactives``, ``final_selection``,
        ``fraction_of_proteins_not_selected``, ``protein_complexes``.
        Count columns hold the number of distinct ``target_chemblid`` values.
        ``fraction_of_proteins_not_selected`` is
        ``(before_selection - final_selection) / before_selection`` and is NaN
        when ``before_selection`` is 0.
    """
    idg_classes = df_idg.target_class

    # Number of non-null usmiles per (target, class, activity class).
    group_cols = ['target_chemblid', 'target_class', 'activity_class']
    df_final_count = df_final.groupby(group_cols)['usmiles'].count().reset_index()
    df_final2_count = df_final2.groupby(group_cols)['usmiles'].count().reset_index()
    # Number of distinct targets per (class, target type).
    df_type_count = df_final2.groupby(['target_class', 'target_type'])['target_chemblid'].nunique().reset_index()

    # One row selector per output row: each named class, then the pooled remainder.
    row_selectors = [_class_equals(tc) for tc in idg_classes if tc != 'Others']
    row_selectors.append(lambda df: ~df.target_class.isin(idg_classes))

    before, after, after2, after3, final, diff, cplx = [], [], [], [], [], [], []
    for in_class in row_selectors:
        n_before = _count_targets(df_before, in_class(df_before))
        n_final = _count_targets(
            df_final2_count, _enough_inactives(df_final2_count) & in_class(df_final2_count))

        before.append(n_before)
        after.append(_count_targets(df_after, in_class(df_after)))
        after2.append(_count_targets(df_after2, in_class(df_after2)))
        after3.append(_count_targets(
            df_final_count, _enough_inactives(df_final_count) & in_class(df_final_count)))
        final.append(n_final)
        diff.append(_safe_fraction(n_before - n_final, n_before))

        # Summing covers both the single-class case (at most one matching row)
        # and the pooled case (several rows); an empty selection sums to 0.
        is_complex = (df_type_count.target_type == 'PROTEIN COMPLEX') & in_class(df_type_count)
        cplx.append(int(df_type_count.loc[is_complex, 'target_chemblid'].sum()))

    # Append the 'Sum' row to every column.
    before_selection = before + [sum(before)]
    final_selection = final + [sum(final)]
    diff.append(_safe_fraction(before_selection[-1] - final_selection[-1], before_selection[-1]))

    dic = collections.OrderedDict((
        ('target_class', idg_classes.tolist() + ['Sum']),
        ('before_selection', before_selection),
        # np.nan rather than np.NaN: the latter alias was removed in NumPy 2.0.
        ('activity_threshold', df_idg.activity_threshold.tolist() + [np.nan]),
        ('40_actives', after + [sum(after)]),
        ('2_documents', after2 + [sum(after2)]),
        ('40_actives_30_inactives', after3 + [sum(after3)]),
        ('final_selection', final_selection),
        ('fraction_of_proteins_not_selected', diff),
        ('protein_complexes', cplx + [sum(cplx)]),
    ))

    return pd.DataFrame.from_dict(dic)


def _class_equals(target_class):
    """Return a function mapping a frame to the mask ``frame.target_class == target_class``."""
    return lambda df: df.target_class == target_class


def _count_targets(df, mask):
    """Return the number of distinct ``target_chemblid`` values in the rows of ``df`` selected by ``mask``."""
    return df.loc[mask, 'target_chemblid'].nunique()


def _enough_inactives(df_count, minimum=30):
    """Return the mask of rows of ``df_count`` that are 'inactive' with at least ``minimum`` usmiles."""
    return (df_count.activity_class == 'inactive') & (df_count.usmiles >= minimum)


def _safe_fraction(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when ``denominator`` is zero."""
    return numerator / denominator if denominator else np.nan

In [3]:
def usmiles(df, input_smiles_column):
    """Build RDKit molecules from a SMILES column and derive a 'usmiles' column.

    A 'mol' column is added to the passed-in ``df`` in place by
    ``PandasTools.AddMoleculeColumnToFrame``. Rows whose 'mol' is null are
    split off, and 'usmiles' is computed on the remaining rows with
    ``Chem.MolToSmiles(mol, isomericSmiles=False)``. Three progress counts
    are printed along the way.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the SMILES strings; gains a 'mol' column as a side effect.
    input_smiles_column : str
        Name of the column of ``df`` that holds the input SMILES.

    Returns
    -------
    tuple
        ``(df, failed_conversion, failed_usmiles)``: the re-indexed frame of
        rows with a molecule (now carrying 'usmiles'), the rows whose 'mol'
        is null, and the rows whose 'usmiles' is null.
    """
    # Build molecule objects from the SMILES column.
    PandasTools.AddMoleculeColumnToFrame(df, smilesCol=input_smiles_column, molCol='mol')

    # Split rows according to whether a molecule could be built.
    no_mol = df['mol'].isnull()
    failed_conversion = df.loc[no_mol]
    print('Molecules RDKIT cannot proceed: ' + str(len(failed_conversion)))

    df = df.loc[~no_mol].reset_index(drop=True)
    print('Molcules to generate usmiles: ' + str(len(df)))

    # Generate the non-isomeric SMILES for every remaining molecule.
    def _to_usmiles(mol):
        if mol:
            return Chem.MolToSmiles(mol, isomericSmiles=False)
        return None

    df['usmiles'] = df['mol'].apply(_to_usmiles)

    failed_usmiles = df.loc[df['usmiles'].isnull()]
    print('Fail to generate usmiles: ' + str(len(failed_usmiles)))

    return df, failed_conversion, failed_usmiles