In [1]:
import warnings, random
#warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

#from Functions import prepro
from Models.DNN import DNN_model

from tensorflow.keras.callbacks import EarlyStopping


pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)
%matplotlib inline

def metric(y_true, y_pred):
    res = []
    for i in range(0, y_true.shape[1]):
        y = y_true[:,i]
        pred = y_pred[:,i]
        res.append(log_loss(y, pred))
    return np.mean(res)

In [2]:
train_df = pd.read_csv("../Data/Raw/train_features.csv")
#train_df = pd.read_csv("../input/lish-moa/train_features.csv")

test_df = pd.read_csv("../Data/Raw/test_features.csv")
#test_df = pd.read_csv("../input/lish-moa/test_features.csv")

y = pd.read_csv("../Data/Raw/train_targets_scored.csv")
#y = pd.read_csv("../input/lish-moa/train_targets_scored.csv")
tag_size = y.shape[1]-1
print(y.shape)

y_non = pd.read_csv("../Data/Raw/train_targets_nonscored.csv").drop("sig_id", axis=1)
print(y_non.shape)
#y = pd.read_csv("../input/lish-moa/train_targets_nonscored.csv")

y = pd.concat([y, y_non], axis=1)
y = y.drop("sig_id", axis=1).values
print(y.shape)

# g772, c100, 206クラス、402クラスの分類

(23814, 207)
(23814, 402)
(23814, 608)


In [3]:
train_df["cp_type"] = train_df["cp_type"].map(lambda x : 1 if x == "trt_cp" else 0)
test_df["cp_type"] = test_df["cp_type"].map(lambda x : 1 if x == "trt_cp" else 0)

di = {
    24 : 0, 
    48 : 1,
    72 : 2,
}
train_df["cp_time"] = train_df["cp_time"].map(lambda x : di[x])
test_df["cp_time"] = test_df["cp_time"].map(lambda x : di[x])

train_df["cp_dose"] = train_df["cp_dose"].map(lambda x : 1 if x == "D1" else 0)
test_df["cp_dose"] = test_df["cp_dose"].map(lambda x : 1 if x == "D1" else 0)

cols = [col for col in train_df.columns if "g-" in col or "c-" in col]
scaler = StandardScaler()
scaler.fit(pd.concat([train_df[cols], test_df[cols]]))
train_df[cols] = scaler.transform(train_df[cols])
test_df[cols] = scaler.transform(test_df[cols])

X = train_df.drop("sig_id", axis=1)
test_X = test_df.drop("sig_id", axis=1)

In [4]:
random.seed(random.randint(0, 10000))
K = 3
#kf = KFold(n_splits=K, random_state=random.randint(0, 10000))
kf = MultilabelStratifiedKFold(n_splits=K, random_state=random.randint(0, 10000), shuffle=True)


models = []
train_pred = []
folds = []

for itr, (train_index, valid_index) in enumerate(kf.split(X, y)):
    print("======================== fold {} ========================".format(itr+1))
    folds.append(valid_index)
    train_X = X.iloc[train_index].values
    train_y = y[train_index]
    valid_X = X.iloc[valid_index].values
    valid_y = y[valid_index]
    print(train_X.shape)
    print(train_y.shape)

    
    model = DNN_model(input_size=train_X.shape[1], output_size=train_y.shape[1])
    cb = EarlyStopping(monitor='val_loss', patience=1)
    model.fit(
        train_X, 
        train_y,
        batch_size=128,
        epochs=10,
        verbose=1,
        callbacks=[cb],
        validation_data=(valid_X, valid_y),
    )

    
    pred = model.predict(valid_X)
    train_pred.append(pred)
    models.append(model)
    
    

train_preds = np.empty(y.shape)
for i, fold in enumerate(folds):
    train_preds[fold] = train_pred[i]

print("cv score : {}".format(metric(y[:,:tag_size], train_preds[:,:tag_size])))



(15876, 875)
(15876, 608)
Train on 15876 samples, validate on 7938 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
(15876, 875)
(15876, 608)
Train on 15876 samples, validate on 7938 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
(15876, 875)
(15876, 608)
Train on 15876 samples, validate on 7938 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
cv score : 0.017647989626354943


In [5]:
preds = []

for i in range(K):
    model = models[i]
    pred = model.predict(test_X.values, batch_size=128)
    preds.append(pred)
    
preds = np.array(preds)
preds = np.mean(preds, axis=0)[:,:tag_size]

In [None]:
#sub_df = pd.read_csv("../Data/Raw/sample_submission.csv")
sub_df = pd.read_csv("../input/lish-moa/sample_submission.csv")
cols = [col for col in sub_df.columns if col != "sig_id"]
sub_df[cols] = preds
sub_df.to_csv("submission.csv", index=False)