In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("./data_train.csv")
test = pd.read_csv("./smiles_test.csv")

In [3]:
df = df.drop(labels = "Unnamed: 0", axis = 1)
df = df[(df == 0).sum(1) < 11]

In [4]:
smiles = df["smiles"].to_list()
y = df.drop(labels = "smiles", axis = 1)
print(y.shape)

(11531, 11)


In [5]:
#Morgan Fingerprints
def get_morgan_df(smiles):
    from rdkit.Chem import AllChem

    mutag = [Chem.rdmolfiles.MolFromSmiles(smile) for smile in smiles]

    # Initialize variables
    fp_length = 2048
    desc_mtx = np.zeros((len(mutag), fp_length)) * np.nan
    compounds = [''] * len(mutag)

    # Calculate Morgan fingerprints (equivalent to ECFP fingerprints)
    for i, mol in enumerate(mutag): 
        if mol is not None:
            desc_mtx[i] = AllChem.GetMorganFingerprintAsBitVect(mol, radius=3, nBits=fp_length)
            
    return pd.DataFrame(desc_mtx)

df = get_morgan_df(smiles)

In [6]:
X_train = df
y_train = y

X_train = X_train.reset_index(drop=True).fillna(0)
y_train = y_train.reset_index(drop=True).fillna(0)

print(X_train.shape, y_train.shape)

(11531, 2048) (11531, 11)


In [7]:
submission_x = get_morgan_df(test["smiles"].to_list())

print(submission_x.shape)

(5896, 2048)


In [12]:
from sklearn.svm import SVC, SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor, VotingClassifier, GradientBoostingClassifier, AdaBoostClassifier,BaggingRegressor
from sklearn.model_selection import GridSearchCV


models = {}
rocs = []

for i,col in enumerate(y_train.columns):
    
    y = y_train[col]
    idx = y.index[y != 0].tolist()
    y = y[idx]
    x = X_train.iloc[idx]
            
    print(" ")
    print("Model " + str(i) + " ------------------------")
    print("Data Shape: " + str(x.shape))
    print(" ")
    
    X_train_, X_test_, y_train_, y_test_ = train_test_split(x, y, test_size=0.2)
    
    param_grid = {
        'C': [0.1,1, 10, 100], 
        'gamma': [1,0.1,0.01,0.001],
        'kernel': ['rbf', 'poly', 'sigmoid']
    }
    
    svr_grid = GridSearchCV(SVR(), param_grid,refit=True,verbose=1, scoring="roc_auc")
    
    grid_fit = svr_grid.fit(X_train_, y_train_)  
    best_svr = grid_fit.best_estimator_
    best_params = grid_fit.best_params_
    svr = SVR(**best_params)
            
    bagg = BaggingRegressor(base_estimator=svr, n_estimators=20)
    bagg.fit(X_train_, y_train_)
    
    yhat = bagg.predict(X_test_)
    
    rocs.append(roc_auc_score(y_test_, yhat))
    
    models[i] = bagg
    
    print("Test ROC: " , roc_auc_score(y_test_, yhat))

print("")
print("Mean Test ROC: ",np.mean(np.array(rocs)))  

 
Model 0 ------------------------
Data Shape: (1010, 2048)
 
Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:  7.6min finished


Test ROC:  0.6296296296296295
 
Model 1 ------------------------
Data Shape: (1023, 2048)
 
Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:  8.2min finished


Test ROC:  0.8609022556390977
 
Model 2 ------------------------
Data Shape: (1314, 2048)
 
Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed: 11.3min finished


Test ROC:  0.8430773705642816
 
Model 3 ------------------------
Data Shape: (953, 2048)
 
Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:  6.5min finished


Test ROC:  0.7957650273224044
 
Model 4 ------------------------
Data Shape: (632, 2048)
 
Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:  3.7min finished


Test ROC:  0.5782108945527236
 
Model 5 ------------------------
Data Shape: (632, 2048)
 
Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:  3.4min finished


Test ROC:  0.7578065498857578
 
Model 6 ------------------------
Data Shape: (610, 2048)
 
Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:  2.6min finished


Test ROC:  0.7744252873563219
 
Model 7 ------------------------
Data Shape: (3804, 2048)
 
Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed: 61.4min finished


Test ROC:  0.7667878787878788
 
Model 8 ------------------------
Data Shape: (3626, 2048)
 
Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed: 70.4min finished


Test ROC:  0.8123752989726494
 
Model 9 ------------------------
Data Shape: (4311, 2048)
 
Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed: 85.4min finished


Test ROC:  0.7803190567019247
 
Model 10 ------------------------
Data Shape: (962, 2048)
 
Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:  7.1min finished


Test ROC:  0.5428808339256102

Mean Test ROC:  0.7401981893943891


In [13]:
predictions = []
print(models)

for i, model in enumerate(models):
    print(i)

    y_hat = models[i].predict(submission_x)
    predictions.append(y_hat)
    
print(np.array(predictions), np.array(predictions).shape)

{0: BaggingRegressor(base_estimator=SVR(C=10, gamma=0.01), n_estimators=20), 1: BaggingRegressor(base_estimator=SVR(C=1, gamma=0.01), n_estimators=20), 2: BaggingRegressor(base_estimator=SVR(C=0.1, gamma=1, kernel='poly'),
                 n_estimators=20), 3: BaggingRegressor(base_estimator=SVR(C=100, gamma=0.01), n_estimators=20), 4: BaggingRegressor(base_estimator=SVR(C=1, gamma=0.001, kernel='poly'),
                 n_estimators=20), 5: BaggingRegressor(base_estimator=SVR(C=1, gamma=0.01), n_estimators=20), 6: BaggingRegressor(base_estimator=SVR(C=0.1, gamma=1, kernel='poly'),
                 n_estimators=20), 7: BaggingRegressor(base_estimator=SVR(C=1, gamma=0.01, kernel='poly'),
                 n_estimators=20), 8: BaggingRegressor(base_estimator=SVR(C=100, gamma=0.01, kernel='poly'),
                 n_estimators=20), 9: BaggingRegressor(base_estimator=SVR(C=0.1, gamma=0.1, kernel='poly'),
                 n_estimators=20), 10: BaggingRegressor(base_estimator=SVR(C=10, gamma=

In [14]:
predictions = np.array(predictions)
predictions_ = np.array(predictions).reshape((predictions.shape[0], predictions.shape[1]))
predictions_ = pd.DataFrame(predictions_.T)

In [15]:
predictions_.columns = y_train.columns
predictions_.to_csv("subm_20_04_21_50.csv", float_format='%.6f')