In [None]:
# Author: Markus Viljanen

In [1]:
import pandas as pd
import numpy as np

### Alternative formats

In [2]:
# Toxicity data: the Species and SMILES columns are stored as categoricals
df = pd.read_csv('df_tox.csv')
df['Species'] = df['Species'].astype('category')
df['SMILES'] = df['SMILES'].astype('category')

# Fingerprint: log(1 + x) transform, then rescale every column by its maximum
fingerprints = pd.read_csv('fingerprints.csv').set_index('SMILES') #'fingerprints2.csv'
fingerprints = np.log(1 + fingerprints)
# A column that is zero for every row has a maximum of 0; dividing by it would
# turn the whole column into NaN (0/0). Such columns are divided by 1 instead,
# so they simply stay at zero.
column_max = fingerprints.max()
fingerprints = fingerprints / column_max.where(column_max != 0, 1)
fingerprints = fingerprints.reset_index()

# Merge: attach the fingerprint columns to every toxicity row by SMILES
feature_columns = ["fp_%d" % i for i in range(1024)]
df = df.merge(fingerprints, on='SMILES', how='left', sort=False)

#### LibSVM and LibFFM versions of data

In [3]:
class ReadCV:
    """Cross-validation splitter driven by precomputed test-fold columns.

    Each name in ``columns`` selects one column of ``df`` that marks the rows
    belonging to the test part of a fold; ``split`` yields one
    ``(train, test)`` pair of positional row indices per column.
    """

    def __init__(self, df, columns):
        # Keep the fold indicator columns in the order they were requested.
        self.test_folds = []
        for name in columns:
            self.test_folds.append(df[name])

    def split(self, X, y=None, groups=None):
        """Yield ``(train, test)`` index arrays; ``y`` and ``groups`` are ignored."""
        n_rows = X.shape[0]
        positions = np.arange(n_rows)
        for is_test in self.test_folds:
            # Rows not flagged as test form the training part of the fold.
            yield positions[~is_test], positions[is_test]

In [4]:
import random 
import csv

from scipy.stats import zscore
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import OneHotEncoder, QuantileTransformer, MinMaxScaler

def save_libsvm(fn, X, y):
    """Write a sparse matrix and its labels to ``fn`` as space-separated text.

    One line is written per row of ``X``: the label ``y[i]`` formatted with
    four decimals, followed by an ``index:value`` token for every stored
    entry of that row (column index as stored in the sparse row, value with
    four decimals).
    """
    def _fields(row_no):
        sparse_row = X.getrow(row_no)
        tokens = ["%.4f" % y[row_no]]  # label comes first
        for col, val in zip(sparse_row.indices, sparse_row.data):
            tokens.append("%d:%.4f" % (col, val))  # one token per stored feature
        return tokens

    with open(fn, 'w') as handle:
        out = csv.writer(handle, delimiter=' ', lineterminator='\n')
        out.writerows(_fields(row_no) for row_no in range(X.shape[0]))

def save_libffm(fn, X, y, X_group):
    """Write a sparse matrix and its labels to ``fn`` with a field id per feature.

    One line is written per row of ``X``: the label ``y[i]`` formatted with
    four decimals, followed by a ``field:index:value`` token for every stored
    entry of that row. ``X_group`` is indexed by column index and supplies the
    field id of each column.
    """
    def _fields(row_no):
        sparse_row = X.getrow(row_no)
        cols = sparse_row.indices
        tokens = ["%.4f" % y[row_no]]  # label comes first
        for field, col, val in zip(X_group[cols], cols, sparse_row.data):
            tokens.append("%d:%d:%.4f" % (field, col, val))
        return tokens

    with open(fn, 'w') as handle:
        out = csv.writer(handle, delimiter=' ', lineterminator='\n')
        out.writerows(_fields(row_no) for row_no in range(X.shape[0]))

def create_data_sets(df, setting, libsvm=True,
                     out_dir="/mnt/scratch_dir/viljanem/xlearn_data", n_folds=5):
    """Write the hold-out split and the cross-validation folds of one setting.

    Parameters
    ----------
    df : DataFrame
        Must contain 'Value_Log10', 'Species', 'SMILES', 'Duration.Cat', the
        module-level ``feature_columns``, the boolean column
        ``"<setting>_test"`` and the columns ``"<setting>_test1"`` ...
        ``"<setting>_test<n_folds>"``.
    setting : str
        "setting1" writes a dummy-only design (file tag ``nofeat_setting1``)
        and a dummy + fingerprint design (tag ``setting1``); "setting2" writes
        a species + duration + fingerprint design (tag ``setting2``).
    libsvm : bool
        True writes ``.svm`` files via ``save_libsvm``; False writes ``.ffm``
        files via ``save_libffm``.
    out_dir : str
        Directory that receives the files. Defaults to the original location.
    n_folds : int
        Number of ``"<setting>_test<k>"`` fold columns to export.

    Raises
    ------
    ValueError
        If ``setting`` is neither "setting1" nor "setting2".
    """
    y = df['Value_Log10'].values

    enc = OneHotEncoder()
    Xi = csr_matrix(enc.fit_transform(df[['Species']])) # Species dummy
    Xj = csr_matrix(enc.fit_transform(df[['SMILES']])) # Drug dummy
    Xd = csr_matrix(enc.fit_transform(df[['Duration.Cat']])) # Duration dummy
    Xf = csr_matrix(df[feature_columns]) # Drug fingerprint (numeric)

    def stack(*blocks):
        # Design matrix plus, for every column, the position of the block it
        # came from (the field id used by field-aware factorization machines).
        X = hstack(list(blocks), format='csr')
        fields = np.concatenate([np.repeat(k, block.shape[1])
                                 for k, block in enumerate(blocks)])
        return X, fields

    test_fold = df["{setting}_test".format(setting=setting)]

    # (file-name tag, (design matrix, field ids)) for every design to export
    if setting == "setting1":
        variants = [("nofeat_setting1", stack(Xi, Xj, Xd)),     # dummies only
                    ("setting1", stack(Xi, Xj, Xd, Xf))]        # dummies + fingerprint
    elif setting == "setting2":
        variants = [("setting2", stack(Xi, Xd, Xf))]            # no drug dummy
    else:
        raise ValueError("Unsupported setting: {!r}".format(setting))

    extension = "svm" if libsvm else "ffm"

    def write_split(suffix, train, test):
        # Write the train and test file of every design for one split.
        for tag, (X, fields) in variants:
            for part, rows in (("train", train), ("test", test)):
                path = "{out_dir}/{part}_{tag}{suffix}.{ext}".format(
                    out_dir=out_dir, part=part, tag=tag, suffix=suffix, ext=extension)
                if libsvm:
                    save_libsvm(path, X[rows], y[rows])
                else:
                    save_libffm(path, X[rows], y[rows], fields)

    # Hold-out split defined by the "<setting>_test" column
    index = np.arange(len(y))
    write_split("", index[~test_fold], index[test_fold])

    # Cross-validation folds defined by the "<setting>_test<k>" columns
    cv = ReadCV(df, columns=["{setting}_test{fold}".format(setting=setting, fold=k + 1)
                             for k in range(n_folds)])
    for k, (train, test) in enumerate(cv.split(df)):
        write_split("_{fold}".format(fold=k + 1), train, test)

In [5]:
# Export both settings, first in LibSVM format and then in LibFFM format.
for use_libsvm in (True, False):
    for setting_name in ("setting1", "setting2"):
        create_data_sets(df, setting_name, libsvm=use_libsvm)

#### LibSVM and LibFFM versions of data (hyperparameter tuning)

In [6]:
from sklearn.model_selection import GroupKFold

In [11]:
class LeaveGroupsOut:
    """Splitter that applies ``GroupKFold`` with group labels fixed at construction.

    The ``groups`` argument of ``split`` is ignored; the labels passed to the
    constructor are always used instead.
    """

    def __init__(self, n_splits, groups):
        self.groups = groups
        self.cv = GroupKFold(n_splits=n_splits)

    def split(self, X, y=None, groups=None):
        """Yield the ``(train, test)`` pairs produced by the wrapped splitter."""
        folds = self.cv.split(X, y, groups=self.groups)
        yield from ((train_idx, test_idx) for train_idx, test_idx in folds)


def create_hyperparameter_sets(dfx, setting, libsvm=True,
                               out_dir="/mnt/scratch_dir/viljanem/xlearn_fold1",
                               n_outer=5, n_inner=5):
    """Write inner cross-validation folds for every outer training set.

    For each outer fold ``j`` the rows flagged in ``"<setting>_test<j>"`` are
    dropped, the remaining rows are split into ``n_inner`` group-wise folds
    (via ``LeaveGroupsOut``) and every inner train/test part is written to
    ``out_dir`` as ``<part>_<tag>_<j>_<inner>.<svm|ffm>``.

    Parameters
    ----------
    dfx : DataFrame
        Must contain 'Value_Log10', 'Species', 'SMILES', 'Duration.Cat', the
        module-level ``feature_columns`` and the boolean columns
        ``"<setting>_test1"`` ... ``"<setting>_test<n_outer>"``.
    setting : str
        "setting1" groups rows by (SMILES, Species) pair and writes the
        ``nofeat_setting1`` and ``setting1`` designs; "setting2" groups rows
        by SMILES and writes the ``setting2`` design.
    libsvm : bool
        True writes ``.svm`` files via ``save_libsvm``; False writes ``.ffm``
        files via ``save_libffm``.
    out_dir : str
        Directory that receives the files. Defaults to the original location.
    n_outer, n_inner : int
        Number of outer fold columns to process and of inner folds to create.

    Raises
    ------
    ValueError
        If ``setting`` is neither "setting1" nor "setting2".
    """
    extension = "svm" if libsvm else "ffm"

    def stack(*blocks):
        # Design matrix plus, for every column, the position of the block it
        # came from (the field id used by field-aware factorization machines).
        X = hstack(list(blocks), format='csr')
        fields = np.concatenate([np.repeat(k, block.shape[1])
                                 for k, block in enumerate(blocks)])
        return X, fields

    for j in range(n_outer):

        # Outer training set: everything outside outer test fold j+1
        df = dfx[~dfx["{setting}_test{fold}".format(setting=setting, fold=j+1)]]

        # Integer group label per row; rows sharing a label stay in the same inner fold
        if setting == "setting1":
            groups = (df['SMILES'].astype('str') + ' X ' +
                      df['Species'].astype('str')).astype('category').cat.codes.values
        elif setting == "setting2":
            groups = df['SMILES'].astype('category').cat.codes.values
        else:
            raise ValueError("Unsupported setting: {!r}".format(setting))
        cv = LeaveGroupsOut(n_splits=n_inner, groups=groups)

        y = df['Value_Log10'].values

        enc = OneHotEncoder()
        Xi = csr_matrix(enc.fit_transform(df[['Species']])) # Species dummy
        Xj = csr_matrix(enc.fit_transform(df[['SMILES']])) # Drug dummy
        Xd = csr_matrix(enc.fit_transform(df[['Duration.Cat']])) # Duration dummy
        Xf = csr_matrix(df[feature_columns]) # Drug fingerprint (numeric)

        # (file-name tag, (design matrix, field ids)) for every design to export
        if setting == "setting1":
            variants = [("nofeat_setting1", stack(Xi, Xj, Xd)),   # dummies only
                        ("setting1", stack(Xi, Xj, Xd, Xf))]      # dummies + fingerprint
        else:
            variants = [("setting2", stack(Xi, Xd, Xf))]          # no drug dummy

        for i, (train, test) in enumerate(cv.split(df)):
            for tag, (X, fields) in variants:
                for part, rows in (("train", train), ("test", test)):
                    path = "{out_dir}/{part}_{tag}_{fold}_{inner}.{ext}".format(
                        out_dir=out_dir, part=part, tag=tag,
                        fold=j+1, inner=i+1, ext=extension)
                    if libsvm:
                        save_libsvm(path, X[rows], y[rows])
                    else:
                        save_libffm(path, X[rows], y[rows], fields)


In [12]:
# Export the tuning folds of both settings, first as LibSVM and then as LibFFM.
for use_libsvm in (True, False):
    for setting_name in ("setting1", "setting2"):
        create_hyperparameter_sets(df, setting_name, libsvm=use_libsvm)

#### XGBoost versions of data

In [13]:
import xgboost as xgb

def create_binary_sets(data_dir="/mnt/scratch_dir/viljanem/xlearn_data", n_folds=5):
    """Convert the exported ``.svm`` files into XGBoost binary buffers.

    For "setting1" and "setting2" the hold-out train/test files and the
    train/test files of folds 1..``n_folds`` are loaded from ``data_dir`` as
    ``xgb.DMatrix`` and saved next to the source with an ``.xgb`` extension.
    The ``nofeat`` variants are not converted.

    Parameters
    ----------
    data_dir : str
        Directory holding the ``.svm`` files; the ``.xgb`` files are written
        to the same directory. Defaults to the original location.
    n_folds : int
        Number of cross-validation folds to convert.
    """
    settings = ("setting1", "setting2")
    parts = ("train", "test")

    # Hold-out files first, then the per-fold files, in the original order
    stems = ["{part}_{setting}".format(part=part, setting=setting)
             for setting in settings for part in parts]
    for fold in range(1, n_folds + 1):
        stems.extend("{part}_{setting}_{fold}".format(part=part, setting=setting, fold=fold)
                     for setting in settings for part in parts)

    for stem in stems:
        source = "{dir}/{stem}.svm".format(dir=data_dir, stem=stem)
        target = "{dir}/{stem}.xgb".format(dir=data_dir, stem=stem)
        # NOTE(review): recent XGBoost releases may require an explicit
        # "?format=libsvm" suffix on text-file URIs - confirm for the installed version.
        xgb.DMatrix(source).save_binary(target)
    

In [14]:
# Convert the hold-out and per-fold .svm files into .xgb binary buffers.
create_binary_sets()

#### XGBoost versions of data (hyperparameter tuning)

In [15]:
def create_binary_sets(data_dir="/mnt/scratch_dir/viljanem/xlearn_fold1", n_outer=5, n_inner=5):
    """Convert the hyperparameter-tuning ``.svm`` files into XGBoost binary buffers.

    Run ``create_hyperparameter_sets`` with ``libsvm=True`` before this, since
    it produces the ``.svm`` files read here. For every outer fold
    1..``n_outer`` and inner fold 1..``n_inner`` the train/test files of
    "setting1" and "setting2" are loaded from ``data_dir`` as ``xgb.DMatrix``
    and saved next to the source with an ``.xgb`` extension.

    Note: this definition reuses the name of the earlier ``create_binary_sets``
    in this notebook and replaces it once this cell has run.

    Parameters
    ----------
    data_dir : str
        Directory holding the ``.svm`` files; the ``.xgb`` files are written
        to the same directory. Defaults to the original location.
    n_outer, n_inner : int
        Number of outer and inner folds to convert.
    """
    settings = ("setting1", "setting2")
    parts = ("train", "test")

    for outer in range(1, n_outer + 1):
        for inner in range(1, n_inner + 1):
            for setting in settings:
                for part in parts:
                    stem = "{part}_{setting}_{fold}_{inner}".format(
                        part=part, setting=setting, fold=outer, inner=inner)
                    source = "{dir}/{stem}.svm".format(dir=data_dir, stem=stem)
                    target = "{dir}/{stem}.xgb".format(dir=data_dir, stem=stem)
                    # NOTE(review): recent XGBoost releases may require an explicit
                    # "?format=libsvm" suffix on text-file URIs - confirm for the installed version.
                    xgb.DMatrix(source).save_binary(target)

In [16]:
# Calls the definition from the cell directly above (the tuning-fold version),
# converting the per-fold .svm files under xlearn_fold1 into .xgb buffers.
create_binary_sets()