Plik z pre-processingiem danych do modelu. Zawiera sparametryzowane metody do usuwania niepoprawnych wartości, usuwania outlierów, przekształcania wartości do skali logarytmicznej, usuwania najrzadziej występujących cech (kolumn) itd. Kod jest skopiowany także do pliku utils.py, dla łatwiejszego importowania w innych notebookach.

In [1]:
import pandas as pd
import numpy as np

In [2]:
def remove_wrong_values(df):
    """Coerce the 'IC50' column to numbers and drop every incomplete row.

    Values in 'IC50' that cannot be parsed as numbers become NaN
    (``errors='coerce'``); afterwards every row that contains a NaN in
    *any* column is removed.

    The input frame is not modified: the conversion is done on a new frame,
    so re-running a notebook cell that calls this function always starts
    from the same data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'IC50' column.

    Returns
    -------
    pandas.DataFrame
        New frame with a numeric 'IC50' column and no rows containing NaN.
        Surviving rows keep their original index labels.
    """
    # assign() builds a new frame, so the caller's 'IC50' column is left untouched.
    converted = df.assign(IC50=pd.to_numeric(df['IC50'], errors='coerce'))
    return converted.dropna()

In [3]:
def remove_least_used(df, min_perc_used=0):
    """Drop feature columns whose column sum is below a row-count fraction.

    Every column except 'IC50' is summed. A column is removed when its sum
    is strictly smaller than ``int(number_of_rows * min_perc_used)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'IC50' column plus feature columns.
    min_perc_used : float, default 0
        Fraction of the row count (e.g. 0.01 for 1%) used as the minimum
        column sum. With the default of 0 nothing is removed unless a
        column sum is negative.

    Returns
    -------
    pandas.DataFrame
        Frame without the columns that fell below the threshold.
    """
    feature_totals = df.drop(columns='IC50').sum()
    # The threshold is truncated towards zero, exactly like int() does.
    threshold = int(len(df) * min_perc_used)
    rare_features = feature_totals.index[feature_totals < threshold]
    return df.drop(columns=rare_features)

In [4]:
def remove_target_outliers(df, min_ic50=1, max_ic50=100_000):
    """Keep only the rows whose 'IC50' lies in ``(min_ic50, max_ic50]``.

    The lower bound is exclusive and the upper bound is inclusive. The
    defaults reproduce the previously hard-coded range (1, 100_000]; the
    bounds are now parameters so other cut-offs can be used without
    editing the function.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric 'IC50' column.
    min_ic50 : float, default 1
        Exclusive lower bound, in the same units as the 'IC50' column.
    max_ic50 : float, default 100_000
        Inclusive upper bound, in the same units as the 'IC50' column.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` inside the range. Rows with NaN in 'IC50' are
        dropped as well, because both comparisons are False for NaN.
    """
    ic50 = df['IC50']
    in_range = (ic50 > min_ic50) & (ic50 <= max_ic50)
    return df[in_range]

In [5]:
def make_log_scale(df):
    """Return a copy of ``df`` with 'IC50' replaced by its base-10 logarithm.

    The input frame is left unchanged. Assigning into the caller's frame
    would both raise ``SettingWithCopyWarning`` when ``df`` is a filtered
    slice of another frame and make the operation non-repeatable (running
    the same cell twice would apply the logarithm twice).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric 'IC50' column.

    Returns
    -------
    pandas.DataFrame
        New frame, same columns and index, with ``log10`` applied to 'IC50'.
    """
    # assign() creates a new frame; the 'IC50' column keeps its position.
    return df.assign(IC50=np.log10(df['IC50']))

In [6]:
def prepare_df(file, min_perc_used=0, remove_outliers=True, log_scale=True):
    """Read a CSV file and run the configured pre-processing steps on it.

    Steps, in order: read the file, ``remove_wrong_values``, optionally
    ``remove_least_used``, optionally ``remove_target_outliers``, optionally
    ``make_log_scale``. The frame shape is printed after each filtering
    step, followed by an empty line at the end.

    Parameters
    ----------
    file : str
        Path of the CSV file to read.
    min_perc_used : float, default 0
        Passed to ``remove_least_used``; the step is skipped when it is 0.
    remove_outliers : bool, default True
        Whether to apply ``remove_target_outliers``.
    log_scale : bool, default True
        Whether to apply ``make_log_scale``.

    Returns
    -------
    pandas.DataFrame
        The pre-processed frame.
    """
    print(f'Preparing ({file}) file.')
    frame = pd.read_csv(file, low_memory=False)
    print(f'DataFrame base shape: {frame.shape}')

    frame = remove_wrong_values(frame)
    print(f'Shape after removing wrong values: {frame.shape}')

    # Feature filtering only runs for a non-zero threshold.
    filter_features = min_perc_used != 0
    if filter_features:
        frame = remove_least_used(frame, min_perc_used=min_perc_used)
        print(f'Shape after removing least used features: {frame.shape}')

    if remove_outliers:
        frame = remove_target_outliers(frame)
        print(f'Shape after removing outliers: {frame.shape}')

    if log_scale:
        frame = make_log_scale(frame)

    # Blank line separates the logs of consecutive files.
    print()
    return frame

In [21]:
def classify_on_IC50(df, IC50_threshold, log_scale=True):
    """Return a copy of ``df`` with 'IC50' turned into a binary label.

    Rows whose 'IC50' is strictly below the threshold get label 1, all
    other rows (including NaN) get label 0.

    The input frame is left unchanged. Overwriting the caller's 'IC50'
    column would make a second call (e.g. re-running the notebook cell or
    trying another threshold) classify the 0/1 labels instead of the
    original values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric 'IC50' column.
    IC50_threshold : float
        Threshold expressed on the raw (non-logarithmic) scale.
    log_scale : bool, default True
        When True the threshold is converted with ``log10`` before the
        comparison, i.e. the 'IC50' column is expected to already hold
        log10 values.

    Returns
    -------
    pandas.DataFrame
        New frame, same columns and index, with 'IC50' replaced by 0/1.
    """
    threshold = np.log10(IC50_threshold) if log_scale else IC50_threshold
    labels = np.where(df['IC50'] < threshold, 1, 0)
    # assign() creates a new frame; the caller's values stay available.
    return df.assign(IC50=labels)

In [7]:
def get_MACCSFP_fingerprints(min_perc_used=0, remove_outliers=True, log_scale=True):
    """Load and pre-process 'ready_sets/cardiotoxicity_hERG_MACCSFP.csv'.

    All arguments are forwarded unchanged to ``prepare_df``; see that
    function for their meaning.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``prepare_df`` for this file.
    """
    return prepare_df(
        'ready_sets/cardiotoxicity_hERG_MACCSFP.csv',
        min_perc_used=min_perc_used,
        remove_outliers=remove_outliers,
        log_scale=log_scale,
    )

In [8]:
def get_KlekotaRoth_fingerprints(min_perc_used=0, remove_outliers=True, log_scale=True):
    """Load and pre-process 'ready_sets/cardiotoxicity_hERG_KlekFP.csv'.

    All arguments are forwarded unchanged to ``prepare_df``; see that
    function for their meaning.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``prepare_df`` for this file.
    """
    return prepare_df(
        'ready_sets/cardiotoxicity_hERG_KlekFP.csv',
        min_perc_used=min_perc_used,
        remove_outliers=remove_outliers,
        log_scale=log_scale,
    )

In [9]:
def get_hashed_fingerprints(min_perc_used=0, remove_outliers=True, log_scale=True):
    """Load and pre-process 'ready_sets/cardiotoxicity_hERG_ExtFP.csv'.

    All arguments are forwarded unchanged to ``prepare_df``; see that
    function for their meaning.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``prepare_df`` for this file.
    """
    return prepare_df(
        'ready_sets/cardiotoxicity_hERG_ExtFP.csv',
        min_perc_used=min_perc_used,
        remove_outliers=remove_outliers,
        log_scale=log_scale,
    )

In [10]:
def get_mixed_fingerprints(min_perc_used=0, remove_outliers=True, log_scale=True):
    """Build one frame that combines the three fingerprint sets.

    The three loaders are called with identical settings. The 'IC50'
    column is kept from the first frame only and dropped from the other
    two, which are then joined onto the first frame by index.

    Parameters
    ----------
    min_perc_used, remove_outliers, log_scale
        Forwarded unchanged to each of the three loader functions.

    Returns
    -------
    pandas.DataFrame
        'IC50' plus the feature columns of all three frames. The join is
        the default left join of ``DataFrame.join``, so the row set is the
        one of the first frame.
    """
    print('Preparing files for mixed fingerprints.\n')
    settings = dict(
        min_perc_used=min_perc_used,
        remove_outliers=remove_outliers,
        log_scale=log_scale,
    )

    # Load order is preserved so the printed log keeps the same sequence.
    target_and_maccs = get_MACCSFP_fingerprints(**settings)
    klekota_roth = get_KlekotaRoth_fingerprints(**settings).drop('IC50', axis=1)
    hashed = get_hashed_fingerprints(**settings).drop('IC50', axis=1)

    combined = target_and_maccs.join(klekota_roth)
    return combined.join(hashed)

In [22]:
# Build the combined fingerprint frame. min_perc_used=0.01 makes each file drop
# the feature columns whose sum is below 1% of that file's row count.
df = get_mixed_fingerprints(min_perc_used=0.01)

Preparing files for mixed fingerprints.

Preparing (ready_sets/cardiotoxicity_hERG_MACCSFP.csv) file.
DataFrame base shape: (11504, 167)
Shape after removing wrong values: (10635, 167)
Shape after removing least used features: (10635, 138)
Shape after removing outliers: (10396, 138)

Preparing (ready_sets/cardiotoxicity_hERG_KlekFP.csv) file.
DataFrame base shape: (11504, 4861)
Shape after removing wrong values: (10635, 4861)
Shape after removing least used features: (10635, 619)
Shape after removing outliers: (10396, 619)

Preparing (ready_sets/cardiotoxicity_hERG_ExtFP.csv) file.
DataFrame base shape: (11504, 1025)
Shape after removing wrong values: (10635, 1025)
Shape after removing least used features: (10635, 1008)
Shape after removing outliers: (10396, 1008)



In [23]:
# Show the combined frame as the cell output.
df

Unnamed: 0,IC50,MACCSFP8,MACCSFP11,MACCSFP16,MACCSFP17,MACCSFP19,MACCSFP21,MACCSFP22,MACCSFP23,MACCSFP24,...,ExtFP998,ExtFP999,ExtFP1010,ExtFP1011,ExtFP1012,ExtFP1013,ExtFP1014,ExtFP1015,ExtFP1016,ExtFP1017
0,2.809560,0,0,0,0,0,0,1,0,0,...,0,0,1,1,0,0,1,0,0,0
1,3.819544,0,0,0,0,1,0,1,0,0,...,0,0,1,1,1,1,1,1,0,0
2,3.204120,0,0,0,0,0,0,1,0,0,...,0,0,1,1,1,1,1,1,0,0
3,3.826075,0,0,0,0,0,0,1,0,0,...,0,0,1,1,1,1,1,1,0,0
4,3.819544,0,0,0,0,0,0,1,0,0,...,0,0,1,1,1,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11499,3.819544,0,0,0,0,0,0,1,0,0,...,0,0,1,1,1,1,1,1,0,0
11500,5.000000,0,0,1,0,0,0,1,0,0,...,0,0,1,1,1,1,1,0,0,0
11501,4.477121,0,0,0,0,0,0,1,0,0,...,0,1,1,1,1,1,1,0,0,0
11502,4.414973,0,0,0,0,0,0,1,0,0,...,1,0,1,1,1,1,1,1,0,0
