In [103]:
import logging
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from dataset import create_dataset
from ml_model import train_model

from utils import read_yaml


In [61]:
seed=42

In [5]:

logging.basicConfig()
logging.root.setLevel(logging.INFO)
log = logging.getLogger('Hack4NF')


log.info('Reading config files')
# Read configuration files
config_path = "config/resources.yaml"
config = read_yaml(config_path)
genie = config["genie"]


# Read synapses credentials
secrets_path = ".secrets/synapses_credentials.yaml"
credentials = read_yaml(secrets_path)

INFO:Hack4NF:Reading config files


In [9]:
log.info('Creating dataset')
dataset = create_dataset(genie)

INFO:Hack4NF:Creating dataset
INFO:dataset:Reading dataset from: data//dataset.csv
Columns (4,5,9,10,12) have mixed types. Specify dtype option on import or set low_memory=False.


In [10]:
ds = dataset.dropna(subset=genie["targets"])

In [11]:
with open('data/genie_mutations_features.txt') as f:
    mutation_cols = f.read().splitlines()
features = set(mutation_cols) - set(genie["targets"])
features = list(features)

In [100]:
X_train, X_test, y_train, y_test = train_test_split(ds[features].astype(int).values, ds[genie["targets"]].astype(int).values,
                                                    test_size=0.20, random_state=seed)


In [104]:
def set_seed(seed):
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)

In [105]:
set_seed(seed)

In [106]:
x_train = X_train.reshape(-1, X_train.shape[1]).astype('float32')
y_train = y_train.astype('float32')
x_test = X_test.reshape(-1, X_test.shape[1]).astype('float32')
y_test = y_test.astype('float32')

In [107]:
x_train.shape

(105034, 5428)

In [108]:
x_test.shape

(26259, 5428)

In [109]:
y_test.shape

(26259, 10)

In [110]:
# x_val = torch.from_numpy(x_val)
# y_val = torch.from_numpy(y_val)

In [111]:
# https://www.kaggle.com/code/nicohrubec/pytorch-multilabel-neural-network/notebook

In [113]:
from torch.utils.data import Dataset, DataLoader
class Data(Dataset):
    def __init__(self, X_data, y_data):
        self.x=torch.from_numpy(X_data)
        self.y=torch.from_numpy(y_data)
        self.len=self.x.shape[0]
    def __getitem__(self,index):      
        return self.x[index], self.y[index]
    def __len__(self):
        return self.len

In [115]:
train_set=Data(x_train, y_train)
test_set=Data(x_test, y_test)

In [116]:
trainloader=DataLoader(dataset=train_set,batch_size=64)
testloader=DataLoader(dataset=test_set)

In [117]:


class Net(nn.Module):
    def __init__(self,D_in,H_1,H_2,D_out):
        super(Net,self).__init__()
        self.linear1=nn.Linear(D_in,H_1)
        self.linear2=nn.Linear(H_1,H_2)
        self.linear3=nn.Linear(H_2,D_out)

        
    def forward(self,x):
        x=torch.sigmoid(self.linear1(x))  
        x=torch.sigmoid(self.linear2(x))  
        x=self.linear3(x)
        return x



input_dim=len(features)     # how many Variables are in the dataset
hidden_dim_1 = 2000 # hidden layers
hidden_dim_2 = 1000 # hidden layers
output_dim=len(genie["targets"])    # number of classes
input_dim



5428

In [118]:
# Instantiate model
model=Net(input_dim,hidden_dim_1,hidden_dim_2,output_dim)

In [119]:


print('W:',list(model.parameters())[0].size())
print('b',list(model.parameters())[1].size())

print('W:',list(model.parameters())[2].size())
print('b',list(model.parameters())[3].size())

W: torch.Size([2000, 5428])
b torch.Size([2000])
W: torch.Size([1000, 2000])
b torch.Size([1000])


In [120]:

learning_rate=0.1
# criterion=nn.CrossEntropyLoss()
# optimizer=torch.optim.SGD(model.parameters(), lr=learning_rate)
# https://discuss.pytorch.org/t/multi-label-classification-in-pytorch/905/15
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)



In [121]:
@torch.no_grad()
def evaluate(model, val_loader):
    model.eval()
    outputs = [model.validation_step(batch) for batch in val_loader]
    return model.validation_epoch_end(outputs)

In [126]:
# https://www.kaggle.com/code/schmiddey/multiclass-classification-with-pytorch
n_epochs=1000#1000
loss_list=[]

#n_epochs
for epoch in range(n_epochs):
    for x, y in trainloader:
      

        #clear gradient 
        optimizer.zero_grad()
        #make a prediction 
        z=model(x)
        # calculate loss, da Cross Entropy benutzt wird muss ich in den loss Klassen vorhersagen, 
        # also Wahrscheinlichkeit pro Klasse. Das mach torch.max(y,1)[1])
        loss=criterion(z,y)
        # calculate gradients of parameters 
        loss.backward()
        # update parameters 
        optimizer.step()
        
        loss_list.append(loss.data)
        
        
        #print('epoch {}, loss {}'.format(epoch, loss.item()))


KeyboardInterrupt



In [None]:
x

In [None]:
z=model(x)

In [None]:
yhat=torch.max(z.data,1)
yhat

In [None]:
y

In [14]:
model = torch.nn.Linear(len(features), len(genie["targets"])) # predict logits for 5 classes

criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)

for epoch in range(20):
    optimizer.zero_grad()
    output = model(X_test)
    loss = criterion(output, y_test)
    loss.backward()
    optimizer.step()
    print('Loss: {:.3f}'.format(loss.item()))

TypeError: linear(): argument 'input' (position 1) must be Tensor, not numpy.ndarray

In [8]:
model = torch.nn.Linear(20, 5) # predict logits for 5 classes
x = torch.randn(1, 20)
y = torch.tensor([[1., 0., 1., 0., 0.]]) # get classA and classC as active

criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)

for epoch in range(20):
    optimizer.zero_grad()
    output = model(x)
    loss = criterion(output, y)
    loss.backward()
    optimizer.step()
    print('Loss: {:.3f}'.format(loss.item()))

Loss: 0.795
Loss: 0.687
Loss: 0.597
Loss: 0.521
Loss: 0.459
Loss: 0.407
Loss: 0.364
Loss: 0.328
Loss: 0.297
Loss: 0.272
Loss: 0.250
Loss: 0.231
Loss: 0.214
Loss: 0.200
Loss: 0.187
Loss: 0.176
Loss: 0.166
Loss: 0.157
Loss: 0.148
Loss: 0.141


In [8]:
len(dataset)

146565

In [9]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,PATIENT_ID,SEX,PRIMARY_RACE,SECONDARY_RACE,TERTIARY_RACE,ETHNICITY,BIRTH_YEAR,CENTER,INT_CONTACT,...,ZNRF3_DNP,ZNRF3_INS,ZNRF3_ONP,ZNRF3_SNP,ZRSR2_DEL,ZRSR2_DNP,ZRSR2_INS,ZRSR2_SNP,ZSWIM4_DEL,ZSWIM4_SNP
0,0,GENIE-VICC-101416,Female,White,Not collected,Not collected,Non-Spanish/non-Hispanic,1961.0,VICC,19225,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,GENIE-VICC-102225,Female,White,Not collected,Not collected,Non-Spanish/non-Hispanic,1982.0,VICC,12057,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,GENIE-VICC-102424,Female,White,Not collected,Not collected,Non-Spanish/non-Hispanic,1952.0,VICC,23505,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,GENIE-VICC-102966,Male,White,Not collected,Not collected,Non-Spanish/non-Hispanic,1951.0,VICC,23426,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,GENIE-VICC-103244,Female,Unknown,Not collected,Not collected,Unknown,1964.0,VICC,18267,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
dataset[genie["targets"]].isna().sum()

NF1_DEL    15272
NF1_DNP    15272
NF1_INS    15272
NF1_ONP    15272
NF1_SNP    15272
NF2_DEL    15272
NF2_DNP    15272
NF2_INS    15272
NF2_ONP    15272
NF2_SNP    15272
dtype: int64

In [11]:
dataset[dataset[genie["targets"]].isna()].head()

Unnamed: 0.1,Unnamed: 0,PATIENT_ID,SEX,PRIMARY_RACE,SECONDARY_RACE,TERTIARY_RACE,ETHNICITY,BIRTH_YEAR,CENTER,INT_CONTACT,...,ZNRF3_DNP,ZNRF3_INS,ZNRF3_ONP,ZNRF3_SNP,ZRSR2_DEL,ZRSR2_DNP,ZRSR2_INS,ZRSR2_SNP,ZSWIM4_DEL,ZSWIM4_SNP
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [13]:
len(ds)

131293

In [15]:
len(mutation_cols)

5438

In [16]:
len(features)

5428

In [25]:
from xgboost import XGBClassifier

In [32]:
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
param['nthread'] = 4
param['eval_metric'] = 'auc'
num_round = 100

In [38]:
clf = xgb.XGBClassifier(tree_method="hist")


In [39]:
y_test.shape

(26259, 10)

In [40]:
y.shape

(32, 5)

In [47]:
from sklearn.ensemble import RandomForestClassifier


clf = RandomForestClassifier(max_depth=16, random_state=0)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
predictions.sum(axis=0)

In [65]:
# https://medium.com/the-innovation/multi-label-classification-example-with-multioutputclassifier-and-xgboost-in-python-98c84c7d379f
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report

In [62]:
classifier = MultiOutputClassifier(XGBClassifier())

clf = Pipeline([('classify', classifier)])

print (clf)



Pipeline(steps=[('classify',
                 MultiOutputClassifier(estimator=XGBClassifier(base_score=None,
                                                               booster=None,
                                                               colsample_bylevel=None,
                                                               colsample_bynode=None,
                                                               colsample_bytree=None,
                                                               enable_categorical=False,
                                                               gamma=None,
                                                               gpu_id=None,
                                                               importance_type=None,
                                                               interaction_constraints=None,
                                                               learning_rate=None,
                                                      

In [64]:
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))



0.9235690620358734


In [67]:
yhat = clf.predict(X_test)

In [68]:
auc_y1 = roc_auc_score(y_test[:,0],yhat[:,0])
auc_y2 = roc_auc_score(y_test[:,1],yhat[:,1])
auc_y3 = roc_auc_score(y_test[:,2],yhat[:,2])
auc_y4 = roc_auc_score(y_test[:,3],yhat[:,3])
auc_y5 = roc_auc_score(y_test[:,4],yhat[:,4])

print("ROC AUC y1: %.4f, y2: %.4f, y3: %.4f, y4: %.4f, y5: %.4f" % (auc_y1, auc_y2, auc_y3, auc_y4, auc_y5))


ROC AUC y1: 0.4999, y2: 0.5000, y3: 0.5043, y4: 0.5000, y5: 0.5736


In [71]:
cm_y1 = confusion_matrix(y_test[:,0],yhat[:,0])
cm_y2 = confusion_matrix(y_test[:,1],yhat[:,1])
cm_y3 = confusion_matrix(y_test[:,2],yhat[:,2])
cm_y4 = confusion_matrix(y_test[:,3],yhat[:,3])
cm_y5 = confusion_matrix(y_test[:,4],yhat[:,4])

print (cm_y1)


[[25888     7]
 [  364     0]]


In [72]:
cr_y1 = classification_report(y_test[:,0],yhat[:,0])
cr_y2 = classification_report(y_test[:,1],yhat[:,1])
cr_y3 = classification_report(y_test[:,2],yhat[:,2])
cr_y4 = classification_report(y_test[:,3],yhat[:,3])
cr_y5 = classification_report(y_test[:,4],yhat[:,4])

print (cr_y1)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99     25895
           1       0.00      0.00      0.00       364

    accuracy                           0.99     26259
   macro avg       0.49      0.50      0.50     26259
weighted avg       0.97      0.99      0.98     26259



Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


In [50]:
# clf.fit(X_train, y_train,
#         eval_set=[(X_train, y_train), (X_test, y_test)],
#         eval_metric='mlogloss',
#         verbose=True)

In [44]:
from sklearn.datasets import make_multilabel_classification
import numpy as np

X, y = make_multilabel_classification(
    n_samples=32, n_classes=5, n_labels=3, random_state=0
)
clf = xgb.XGBClassifier(tree_method="hist")
clf.fit(X, y)
np.testing.assert_allclose(clf.predict(X), y)

ValueError: y should be a 1d array, got an array of shape (32, 5) instead.

In [25]:
y

array([[0, 0, 1, 1, 1],
       [1, 1, 0, 1, 0],
       [0, 1, 0, 1, 1],
       [0, 0, 0, 0, 1],
       [1, 1, 1, 1, 1],
       [0, 1, 0, 1, 0],
       [1, 1, 1, 1, 1],
       [0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0],
       [1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0],
       [1, 1, 0, 0, 0],
       [1, 0, 1, 1, 0],
       [1, 0, 0, 0, 1],
       [0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1],
       [1, 0, 1, 1, 1],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 1, 1, 1, 0],
       [1, 1, 1, 1, 0],
       [1, 0, 0, 1, 0],
       [1, 1, 1, 1, 1],
       [1, 0, 0, 0, 1],
       [1, 0, 1, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 1, 1, 1, 1],
       [1, 0, 0, 1, 1],
       [1, 1, 0, 0, 0],
       [0, 1, 1, 0, 1]])

In [13]:
# model_path = f"model_{target}"
# create_folder(model_path)
ds = ds.fillna(0)


X_train, X_test, y_train, y_test = train_test_split(ds[features], ds[target],
                                                    test_size=0.20, random_state=42)

train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test)

params_k = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'subsample': 0.8,
    'subsample_freq': 1,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'max_bin': 300,
    'n_estimators': 2000,
    'boost_from_average': False,
    "random_seed": 42}

model_gbm = lgb.train(params_k, train_data, valid_sets=[test_data],
                      num_boost_round=5000, early_stopping_rounds=25,
                      verbose_eval=50)
joblib.dump(model_gbm, f'{model_path}/model_lgb.pkl')

# evaluate_model(model_gbm, X_test, y_test, model_path)

# model_interpretability(model_gbm, X_test, model_path)

NameError: name 'target' is not defined