In [1]:
import numpy as np
import pandas as pd
import pickle
import time
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the training table. The preview below shows a SMILES column followed by
# one numeric column per pocket target.
train = pd.read_csv(filepath_or_buffer='train.csv')
train

Unnamed: 0,SMILES,3CLPro_pocket1,ADRP-ADPR_pocket1,ADRP-ADPR_pocket5,ADRP_pocket1,ADRP_pocket12,ADRP_pocket13,COV_pocket1,COV_pocket2,COV_pocket8,COV_pocket10,NSP9_pocket2,NSP9_pocket7,NSP15_pocket1,ORF7A_pocket2,PLPro_chainA_pocket3,PLPro_chainA_pocket23,PLPro_pocket6,PLPro_pocket50
0,COc1ccc(nn1)CN1CCOC2(C1)CCCCCC2,-5.61,-7.85,-7.83,-5.52,-5.67,-7.21,-3.76,-3.03,-6.24,-6.48,-5.85,-6.02,-8.10,-4.82,-5.22,-7.36,-7.54,-8.21
1,CCc1cccc(c1NC(=O)CN(S(=O)(=O)c1ccc(cc1)Cl)C)C,-8.03,-9.01,-8.74,-4.96,-4.77,-7.62,-2.80,-2.56,-6.49,-6.83,-6.89,-6.53,-7.69,-4.97,-5.07,-7.43,-7.87,-8.08
2,CCCOC1CCCN(CC1)c1nccc(n1)c1nccnc1,-4.91,-7.37,-7.55,-5.49,-5.57,-6.22,-2.25,-2.21,-5.56,-5.18,-5.88,-5.31,-7.55,-5.04,-5.37,-7.39,-7.46,-8.26
3,CCN(C(=O)Cn1c(C)cc2c1cccc2)Cc1ccc2c(c1)OCO2,-5.49,-8.71,-8.73,-5.45,-5.52,-7.43,-3.00,-2.59,-7.12,-6.93,-6.93,-6.94,-7.89,-5.25,-5.11,-9.13,-8.01,-8.80
4,COc1cc(Cl)c(cc1NC(=O)CCc1ccc(o1)c1ccccc1)OC,-8.03,-8.87,-9.26,-5.45,-5.52,-7.16,-2.54,-2.60,-6.64,-6.75,-7.33,-7.30,-6.94,-5.10,-5.17,-8.16,-7.95,-9.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
269995,CSc1cccc(c1)N1CC(CC1=O)c1nc2c(n1CCc1ccccc1)cccc2,-9.73,-10.17,-10.99,-6.21,-7.48,-8.49,-3.80,-3.72,-8.48,-8.28,-8.01,-7.92,-8.52,-6.05,-6.59,-9.19,-9.43,-10.87
269996,O=C(c1cccn1CCc1ccccc1)N1CCCC(C1)n1cccn1,-5.88,-8.99,-8.67,-4.90,-4.91,-7.36,-2.58,-2.29,-7.35,-7.27,-7.28,-6.01,-7.95,-5.21,-5.43,-8.00,-8.27,-9.35
269997,Fc1cccc(c1)NC(=O)c1c(F)cccc1Cl,-6.61,-6.66,-6.66,-4.58,-4.59,-5.95,-2.86,-2.86,-5.45,-6.02,-6.41,-5.69,-6.53,-4.71,-4.81,-6.92,-6.85,-7.84
269998,O=C(Nc1sc2c(c1Cl)cccc2)Nc1cscc1,-5.31,-8.10,-8.10,-5.20,-4.96,-6.96,-2.92,-2.94,-7.02,-6.52,-7.10,-6.71,-6.82,-5.04,-5.11,-7.41,-7.72,-8.41


In [None]:
# Per-split index arrays. Later cells read split_idx[i][0] as the training rows
# and split_idx[i][1] as the held-out rows of split i.
# NOTE(review): allow_pickle=True unpickles arbitrary objects, so this file must
# come from a trusted source.
split_idx = np.load('task_2_idx_split.npy', allow_pickle=True)
embs = np.load('fp_MACCS.npy') #load the fingerprints
# Keep only the target columns. `train.columns[1:]` skipped 'Unnamed: 0' but still
# contained 'SMILES', so y_names[0] was the SMILES strings and `range(18)` never
# reached the last target. errors='ignore' keeps this working if the CSV is later
# read without the 'Unnamed: 0' column.
y_names = train.columns.drop(['Unnamed: 0', 'SMILES'], errors='ignore')

# XGBoost hyperparameter tuning

In [None]:
# CV to find best n_estimator
def modelfit(alg, train_idx, y_idx, useTrainCV=True, cv_folds=5, early_stopping_rounds=10):
    """Optionally tune ``n_estimators`` by cross-validation, fit ``alg`` and print its training MAE.

    Reads the notebook-level globals ``train``, ``y_names`` and ``embs``.

    Parameters
    ----------
    alg : XGBoost scikit-learn estimator
        Fitted in place; its ``n_estimators`` is overwritten when
        ``useTrainCV`` is true.
    train_idx : index array
        Rows of ``embs`` / ``train`` to fit on.
    y_idx : int
        Position in ``y_names`` of the target column.
    useTrainCV : bool
        When true, run ``xgb.cv`` first and set ``n_estimators`` to the
        number of rounds it returns.
    cv_folds : int
        Number of folds passed to ``xgb.cv``.
    early_stopping_rounds : int
        Early-stopping patience passed to ``xgb.cv``.

    Returns
    -------
    None
        The MAE on the training rows is printed, not returned.
    """
    labels = np.array(train[y_names[y_idx]][train_idx])
    training_data = embs[train_idx, :]
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(training_data, label=labels)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='mae', early_stopping_rounds=early_stopping_rounds)
        # The CV result has one row per boosting round that was kept.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data.
    # `eval_metric` is set on the estimator instead of being passed to fit():
    # the fit() keyword is deprecated and removed in newer XGBoost releases.
    alg.set_params(eval_metric='mae')
    alg.fit(training_data, labels)

    # Predict training set:
    preds = alg.predict(training_data)

    # Print model report (this is the error on the training rows, not a held-out score):
    print("\nModel Report")
    print("MAE : %.4g" % metrics.mean_absolute_error(labels, preds))


In [None]:
# Time one CV-tuned fit for target column y_names[0] on the training rows of split 0.
tic = time.time()
train_idx = split_idx[0][0]
y_idx = 0
xgb1 = XGBRegressor(
    objective='reg:squarederror',
    learning_rate=0.3,
    n_estimators=30,
    max_depth=6,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
    nthread=4,
    seed=42,
)
modelfit(xgb1, train_idx, y_idx)
toc = time.time()
print((toc - tic) / 60)  # elapsed wall time in minutes

In [None]:
# MAE of xgb1 on the held-out rows of split 0 for target column y_names[0].
preds = xgb1.predict(embs[split_idx[0][1], :])
metrics.mean_absolute_error(train[y_names[0]][split_idx[0][1]].to_numpy(), preds)

In [None]:
# Grid-search max_depth / min_child_weight for every target, score the best
# estimator on the held-out rows, and save the best model per target.
mae_dict = {}    # y_idx -> list of held-out MAEs, one per split used
best_param = {}  # (y_idx, split) -> best parameter dict found by the grid search

# The grid does not depend on the target or the split, so build it once.
param_test1 = {
    'max_depth': range(20, 50, 10),
    'min_child_weight': range(5, 20, 5)
}

for y_idx in range(18):
    mae_test = []
    # Reset the running best for each target. Without this, a lower MAE from an
    # earlier target would keep that target's model in `xgb_best`, and it would
    # be saved under the current target's filename.
    min_error = float('inf')
    xgb_best = None
    for i in range(1): # to save time use the first data split
        train_idx = split_idx[i][0]
        labels = np.array(train[y_names[y_idx]][train_idx])
        training_data = embs[train_idx, :]

        tic = time.time()
        gsearch1 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.2, n_estimators=30, max_depth=5,
                                                       min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                       objective='reg:squarederror', nthread=4, scale_pos_weight=1, seed=27),
                                param_grid=param_test1, scoring='neg_mean_absolute_error', n_jobs=4, cv=5)
        gsearch1.fit(training_data, labels)
        print(gsearch1.best_params_)
        best_param[(y_idx, i)] = gsearch1.best_params_
        toc = time.time()

        print('grid search time for split {} column {}:'.format(i, y_idx), (toc-tic)/60)

        # With the default refit=True, GridSearchCV has already refit
        # best_estimator_ on all of training_data, so fitting it again is
        # redundant. Its refit duration is reported instead.
        # The estimator is not named `xgb`: that would overwrite the imported
        # xgboost module that modelfit() uses.
        split_model = gsearch1.best_estimator_
        print('fitting time:', gsearch1.refit_time_/60)

        # model testing
        test_idx = split_idx[i][1]
        test_data = embs[test_idx, :]
        preds = split_model.predict(test_data)

        test_label = np.array(train[y_names[y_idx]][test_idx])
        mae = metrics.mean_absolute_error(test_label, preds)
        mae_test.append(mae)

        # `xgb_best is None` guarantees a model is kept even if the MAE is NaN.
        if xgb_best is None or mae < min_error:
            xgb_best = split_model
            min_error = mae
    mae_dict[y_idx] = mae_test
    print(mae_test)
    xgb_best.save_model('bestXGB_maccs_col{}'.format(y_idx))
        

In [None]:
# Model testing
# Reload each saved per-target model, predict the held-out rows of the first
# split, collect the predictions and print the mean MAE per target.
preds_all = []
for i in range(18):
    model = XGBRegressor()
    model.load_model('bestXGB_maccs_col{}'.format(i))
    mae_test = []
    for j in range(1):
        test_idx = split_idx[j][1]
        test_data = embs[test_idx, :]
        test_label = train[y_names[i]][test_idx].to_numpy()

        preds = model.predict(test_data)
        preds_all.append(preds)
        mae_test.append(metrics.mean_absolute_error(test_label, preds))
    print(np.mean(mae_test))
    #print(mae_test)

In [None]:
# Persist the per-target test predictions. The context manager closes the file
# handle, which the bare open() call never did.
with open('MACCS_preds.pk', 'wb') as preds_file:
    pickle.dump(preds_all, preds_file)

# Point estimation

In [None]:
# Training / held-out rows of the first split and the matching fingerprint rows.
model_idx, test_idx = split_idx[0][0], split_idx[0][1]
data, test_data = embs[model_idx, :], embs[test_idx, :]

In [None]:
# Fit one fixed-hyperparameter model per target on the training rows, score it
# on the held-out rows and save it.
mae = []     # held-out MAE per target, in target order
scores = {}  # target index -> held-out predictions
for y in range(18):
    tic = time.time()
    # Named `point_model`, not `xgb`: assigning to `xgb` would overwrite the
    # imported xgboost module that modelfit() relies on (xgb.DMatrix / xgb.cv).
    point_model = XGBRegressor(learning_rate=0.2,
                               n_estimators=50,
                               max_depth=20,
                               min_child_weight=15,
                               gamma=0,
                               subsample=0.8,
                               colsample_bytree=0.8,
                               objective='reg:squarederror',
                               nthread=4,
                               scale_pos_weight=1, seed=42)
    point_model.fit(data, np.array(train[y_names[y]][model_idx]))
    preds = point_model.predict(test_data)
    toc = time.time()
    print((toc-tic)/60)  # minutes spent on fit + predict for this target
    scores[y] = preds
    mae.append(metrics.mean_absolute_error(np.array(train[y_names[y]][test_idx]), preds))
    print(y)
    point_model.save_model('XGB_maccs_col{}'.format(y))
print(mae)