In [144]:
import pandas as pd
import numpy as np

In [145]:
def remove_wrong_values(df):
    """Return a cleaned copy of ``df`` with a numeric 'IC50' column.

    Values in 'IC50' that cannot be parsed as numbers are coerced to NaN,
    and every row containing a NaN in *any* column is then dropped.

    The input frame is left untouched: the conversion is applied to a new
    frame rather than written back into ``df``, so re-running the cell (or
    reusing the raw frame elsewhere) gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an 'IC50' column.

    Returns
    -------
    pandas.DataFrame
        New frame with numeric 'IC50' and no rows holding missing values.

    Raises
    ------
    KeyError
        If ``df`` has no 'IC50' column.
    """
    # assign() builds a new frame, so the caller's 'IC50' column keeps its
    # original (possibly string) values.
    converted = df.assign(IC50=pd.to_numeric(df['IC50'], errors='coerce'))
    return converted.dropna()

In [146]:
def remove_least_used(df, min_perc_used=0):
    """Drop feature columns whose column sum falls below a row-count fraction.

    Every column except 'IC50' is summed. A column is removed when its sum is
    strictly less than ``int(number_of_rows * min_perc_used)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'IC50' column plus feature columns.
    min_perc_used : float, default 0
        Fraction of the row count used to derive the minimum column sum.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the columns that did not reach the threshold.
    """
    feature_totals = df.drop(columns='IC50').sum()
    # int() truncates, so the threshold is rounded toward zero.
    threshold = int(len(df) * min_perc_used)
    rare_features = feature_totals.index[feature_totals < threshold]
    return df.drop(columns=rare_features)

In [147]:
def remove_target_outliers(df):
    """Keep only rows whose 'IC50' lies in the interval (1, 100000].

    Rows with 'IC50' <= 1 or 'IC50' > 100000 are discarded; the lower bound
    is exclusive and the upper bound inclusive.
    """
    target = df['IC50']
    above_floor = target > 1
    within_cap = target <= 100_000
    return df.loc[above_floor & within_cap]

In [148]:
def make_log_scale(df):
    """Return a copy of ``df`` whose 'IC50' column is replaced by its log10.

    The transform is applied to a new frame instead of being written back
    into ``df``. Writing into the argument would both mutate the caller's
    data and, when ``df`` is a filtered slice of another frame, trigger
    pandas' SettingWithCopyWarning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric 'IC50' column.

    Returns
    -------
    pandas.DataFrame
        New frame, same columns and order, with 'IC50' on a base-10 log scale.

    Raises
    ------
    KeyError
        If ``df`` has no 'IC50' column.
    """
    return df.assign(IC50=np.log10(df['IC50']))

In [149]:
def prepare_df(file, min_perc_used=0, remove_outliers=True, log_scale=True):
    """Read a CSV file and run the preprocessing steps, reporting shapes.

    Steps, in order:
      1. read ``file`` with ``pd.read_csv(..., low_memory=False)``;
      2. ``remove_wrong_values`` (always);
      3. ``remove_least_used`` (only when ``min_perc_used != 0``);
      4. ``remove_target_outliers`` (only when ``remove_outliers`` is truthy);
      5. ``make_log_scale`` (only when ``log_scale`` is truthy).

    The frame's shape is printed after loading and after steps 2-4.

    Parameters
    ----------
    file : str or path-like
        CSV file to load.
    min_perc_used : float, default 0
        Forwarded to ``remove_least_used``; 0 skips that step.
    remove_outliers : bool, default True
        Whether to filter rows with ``remove_target_outliers``.
    log_scale : bool, default True
        Whether to apply ``make_log_scale``.

    Returns
    -------
    pandas.DataFrame
        The preprocessed frame.
    """
    print(f'Preparing ({file}) file.')
    frame = pd.read_csv(file, low_memory=False)
    print(f'DataFrame base shape: {frame.shape}')

    # (enabled, transformation, message prefix or None when nothing is printed)
    stages = (
        (True, remove_wrong_values, 'Shape after removing wrong values: '),
        (
            min_perc_used != 0,
            lambda data: remove_least_used(data, min_perc_used=min_perc_used),
            'Shape after removing least used features: ',
        ),
        (remove_outliers, remove_target_outliers, 'Shape after removing outliers: '),
        (log_scale, make_log_scale, None),
    )
    for enabled, transform, message in stages:
        if not enabled:
            continue
        frame = transform(frame)
        if message is not None:
            print(f'{message}{frame.shape}')

    print()
    return frame

In [150]:
def get_MACCSFP_fingerprints(min_perc_used=0, remove_outliers=True, log_scale=True):
    """Load and preprocess 'ready_sets/cardiotoxicity_hERG_MACCSFP.csv'.

    All arguments are forwarded unchanged to ``prepare_df``, whose result is
    returned.
    """
    return prepare_df(
        'ready_sets/cardiotoxicity_hERG_MACCSFP.csv',
        min_perc_used=min_perc_used,
        remove_outliers=remove_outliers,
        log_scale=log_scale,
    )

In [151]:
def get_KlekotaRoth_fingerprints(min_perc_used=0, remove_outliers=True, log_scale=True):
    """Load and preprocess 'ready_sets/cardiotoxicity_hERG_KlekFP.csv'.

    All arguments are forwarded unchanged to ``prepare_df``, whose result is
    returned.
    """
    return prepare_df(
        'ready_sets/cardiotoxicity_hERG_KlekFP.csv',
        min_perc_used=min_perc_used,
        remove_outliers=remove_outliers,
        log_scale=log_scale,
    )

In [152]:
def get_hashed_fingerprints(min_perc_used=0, remove_outliers=True, log_scale=True):
    """Load and preprocess 'ready_sets/cardiotoxicity_hERG_ExtFP.csv'.

    All arguments are forwarded unchanged to ``prepare_df``, whose result is
    returned.
    """
    return prepare_df(
        'ready_sets/cardiotoxicity_hERG_ExtFP.csv',
        min_perc_used=min_perc_used,
        remove_outliers=remove_outliers,
        log_scale=log_scale,
    )

In [155]:
def get_mixed_fingerprints(min_perc_used=0, remove_outliers=True, log_scale=True):
    """Combine the three preprocessed fingerprint tables into one frame.

    The MACCSFP table is loaded first and keeps its 'IC50' column. The
    KlekotaRoth and hashed tables are loaded next, have 'IC50' dropped, and
    are joined onto the first table by index using ``DataFrame.join``
    defaults.

    Parameters
    ----------
    min_perc_used, remove_outliers, log_scale
        Passed unchanged to each of the three loader functions.

    Returns
    -------
    pandas.DataFrame
        The joined frame.
    """
    print('Preparing files for mixed fingerprints.\n')
    options = dict(
        min_perc_used=min_perc_used,
        remove_outliers=remove_outliers,
        log_scale=log_scale,
    )

    # Load order is kept so the progress messages appear in the same sequence.
    maccs = get_MACCSFP_fingerprints(**options)
    klekota_roth = get_KlekotaRoth_fingerprints(**options).drop(columns='IC50')
    hashed = get_hashed_fingerprints(**options).drop(columns='IC50')

    combined = maccs.join(klekota_roth)
    return combined.join(hashed)

In [157]:
# Build the combined table from all three fingerprint files. With
# min_perc_used=0.01, each file drops the feature columns whose column sum is
# below int(0.01 * row count); outlier removal and log scaling use defaults.
df = get_mixed_fingerprints(min_perc_used=0.01)

Preparing files for mixed fingerprints.

Preparing (ready_sets/cardiotoxicity_hERG_MACCSFP.csv) file.
DataFrame base shape: (11504, 167)
Shape after removing wrong values: (10635, 167)
Shape after removing least used features: (10635, 138)
Shape after removing outliers: (10396, 138)

Preparing (ready_sets/cardiotoxicity_hERG_KlekFP.csv) file.
DataFrame base shape: (11504, 4861)
Shape after removing wrong values: (10635, 4861)
Shape after removing least used features: (10635, 619)
Shape after removing outliers: (10396, 619)

Preparing (ready_sets/cardiotoxicity_hERG_ExtFP.csv) file.
DataFrame base shape: (11504, 1025)
Shape after removing wrong values: (10635, 1025)
Shape after removing least used features: (10635, 1008)
Shape after removing outliers: (10396, 1008)

