cv score : 0.015197305813776704

In [1]:
import warnings, random
warnings.filterwarnings('ignore')
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
import tensorflow as tf

#from Functions import prepro
from Models.DNN import DNN_model

from tensorflow.keras.callbacks import EarlyStopping


# Let pandas render up to 1000 columns and 1000 rows before truncating output.
for _display_opt in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_opt, 1000)
%matplotlib inline

def metric(y_true, y_pred):
    """Mean of the per-column binary log loss.

    Parameters
    ----------
    y_true : 2-D array indexed as ``[row, column]``
        Ground-truth labels; expected to contain only 0 and 1.
    y_pred : 2-D array with the same layout as ``y_true``
        Predicted probability of the positive class for each cell.

    Returns
    -------
    float
        ``log_loss`` computed independently for every column and averaged.
    """
    # labels=[0, 1] keeps log_loss from raising when a column happens to hold
    # a single class (e.g. a target that is never active); for columns that
    # contain both classes the result is unchanged.
    res = [
        log_loss(y_true[:, i], y_pred[:, i], labels=[0, 1])
        for i in range(y_true.shape[1])
    ]
    return np.mean(res)

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and
    TensorFlow's global RNG with ``seed``, and exports ``PYTHONHASHSEED``.
    """
    # NOTE: hash randomisation of the running interpreter is fixed at startup,
    # so this variable only influences processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for set_seed in (random.seed, np.random.seed, tf.random.set_seed):
        set_seed(seed)
    
# One full cross-validation run is trained per seed; results are averaged.
seeds = list(range(5))

In [2]:
# Directory holding the raw competition CSVs.
# When running on Kaggle, point this at "../input/lish-moa" instead.
DATA_DIR = "../Data/Raw"

train_df = pd.read_csv(f"{DATA_DIR}/train_features.csv")
test_df = pd.read_csv(f"{DATA_DIR}/test_features.csv")

y = pd.read_csv(f"{DATA_DIR}/train_targets_scored.csv")
# Number of scored targets: every column except "sig_id".
tag_size = y.shape[1]-1
print(y.shape)

y_non = pd.read_csv(f"{DATA_DIR}/train_targets_nonscored.csv")
# Features and both target tables are combined purely by row position below,
# so fail loudly if their sig_id order ever differs instead of silently
# training on mismatched labels.
assert y["sig_id"].tolist() == train_df["sig_id"].tolist(), \
    "train_targets_scored.csv rows are not aligned with train_features.csv"
assert y_non["sig_id"].tolist() == y["sig_id"].tolist(), \
    "train_targets_nonscored.csv rows are not aligned with train_targets_scored.csv"
y_non = y_non.drop("sig_id", axis=1)
print(y_non.shape)

# Feature column groups, selected by name prefix.
GENES = [col for col in train_df.columns if col.startswith("g-")]
CELLS = [col for col in train_df.columns if col.startswith("c-")]
CELLS_50 = CELLS[:50]
BIOS = GENES + CELLS

# Target names: scored only, then scored followed by non-scored.
SCORED_MOAS = [col for col in y.columns if col != "sig_id"]
y = pd.concat([y, y_non], axis=1)
MOAS = [col for col in y.columns if col != "sig_id"]

# From here on `y` is a plain array whose first `tag_size` columns are the
# scored targets and whose remaining columns are the non-scored ones.
y = y.drop("sig_id", axis=1).values
print(y.shape)

# 772 gene (g-) features and 100 cell (c-) features;
# classification over 206 scored classes and 402 non-scored classes.

(23814, 207)
(23814, 402)
(23814, 608)


In [3]:
# Ordinal code for each cp_time value.
di = {24: 0, 48: 1, 72: 2}

# Encode the three categorical columns as integers, in place and identically
# for the train and test frames:
#   cp_type -> 1 for "trt_cp", 0 otherwise
#   cp_time -> code looked up in `di` (KeyError on an unexpected value)
#   cp_dose -> 1 for "D1", 0 otherwise
for frame in (train_df, test_df):
    frame["cp_type"] = frame["cp_type"].eq("trt_cp").astype("int64")
    frame["cp_time"] = frame["cp_time"].map(lambda hours: di[hours])
    frame["cp_dose"] = frame["cp_dose"].eq("D1").astype("int64")

# Standardise every column whose name contains "g-" or "c-".
# The scaler statistics are estimated on train and test rows together.
cols = [col for col in train_df.columns if any(tag in col for tag in ("g-", "c-"))]
scaler = StandardScaler().fit(pd.concat([train_df[cols], test_df[cols]]))
for frame in (train_df, test_df):
    frame[cols] = scaler.transform(frame[cols])

# Model inputs: every remaining column except the sample identifier.
X = train_df.drop(columns="sig_id")
test_X = test_df.drop(columns="sig_id")

In [4]:
# Repeated K-fold training.
# For every seed one model is trained per fold; the out-of-fold (OOF)
# predictions of that seed are scored, and both the OOF and the test
# predictions are averaged with equal weight over all seeds / models.
K = 7
n_models = K*len(seeds)

train_preds = np.zeros(y.shape)                    # OOF predictions, mean over seeds
preds = np.zeros((test_X.shape[0], y.shape[1]))    # test predictions, mean over all models

for seed in seeds:
    seed_everything(seed)
    kf = MultilabelStratifiedKFold(n_splits=K, shuffle=True, random_state=seed)
    train_pred = np.zeros(y.shape)                 # OOF predictions of this seed only

    for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y), start=1):
        print("======================== fold {} ========================".format(fold))
        train_X, train_y = X.iloc[trn_idx].values, y[trn_idx]
        valid_X, valid_y = X.iloc[val_idx].values, y[val_idx]

        model = DNN_model(input_size=train_X.shape[1], output_size=train_y.shape[1])
        # NOTE(review): the fold used for early stopping is the same fold that
        # is scored below, so the reported CV score may be slightly optimistic.
        early_stop = EarlyStopping(monitor='val_loss', patience=1)
        model.fit(
            train_X,
            train_y,
            batch_size=128,
            epochs=10,
            verbose=0,
            callbacks=[early_stop],
            validation_data=(valid_X, valid_y),
        )

        train_pred[val_idx] += model.predict(valid_X, batch_size=128)
        preds += model.predict(test_X.values, batch_size=128) / n_models

    # Score of this seed alone, on the scored targets only.
    print("cv score : {}".format(metric(y[:,:tag_size], train_pred[:,:tag_size])))
    train_preds += train_pred/len(seeds)

# Score of the seed-averaged OOF predictions, on the scored targets only.
print("cv score : {}".format(metric(y[:,:tag_size], train_preds[:,:tag_size])))

cv score : 0.01575215368851753
cv score : 0.0158014522934438
cv score : 0.016129858719962538
cv score : 0.015464318989524618
cv score : 0.01563467459803597
cv score : 0.015197305813776704


In [None]:
# Keep only the scored targets for the submission: they are the first
# `tag_size` columns of the combined scored + non-scored prediction matrix.
# (The original line `preds = [:,:tag_size]` was a SyntaxError.)
preds = preds[:, :tag_size]
sub_df = pd.read_csv("../Data/Raw/sample_submission.csv")
#sub_df = pd.read_csv("../input/lish-moa/sample_submission.csv")
# Every column except the identifier receives predictions.
# NOTE(review): assumes the sample submission rows are in the same order as
# test_features.csv - confirm before submitting.
cols = [col for col in sub_df.columns if col != "sig_id"]
sub_df[cols] = preds
# Uncomment to write the submission file.
#sub_df.to_csv("submission.csv", index=False)

In [None]:
# Compare the OOF score before and after a simple post-processing step:
# force all predictions to zero for rows whose encoded cp_type is 0
# (i.e. anything other than "trt_cp") and cap predictions at 1.
t = train_preds.copy()
print(metric(y[:,:tag_size], t[:,:tag_size]))      # before post-processing

t_ = train_df.loc[train_df["cp_type"].eq(0)]
# Index labels are used as row positions; this relies on train_df keeping
# the default 0..n-1 index it got from read_csv.
t[t_.index.to_numpy()] = 0.0
t = np.minimum(t, 1)
print(metric(y[:,:tag_size], t[:,:tag_size]))      # after post-processing

In [None]:
# A positive label counts as "well ranked" when its OOF prediction is among
# the TOP_K highest predictions of its row.
TOP_K = 20

li = []      # target indices ranked within TOP_K (one entry per positive label)
not_li = []  # target indices ranked below TOP_K (one entry per positive label)

positions = np.arange(1, y.shape[1] + 1)   # 1-based rank values
for i in range(y.shape[0]):
    positives = np.flatnonzero(y[i] == 1)
    if positives.size == 0:
        continue
    # Sort the row once (descending) and invert the permutation so that
    # rank[j] is the 1-based position of target j, instead of re-sorting the
    # whole row for every positive label.
    order = train_preds[i].argsort()[::-1]
    rank = np.empty_like(positions)
    rank[order] = positions
    for j in positives.tolist():
        if rank[j] <= TOP_K:
            li.append(j)
        else:
            not_li.append(j)

In [None]:
# Print the target indices that were ranked inside the cutoff every time they
# were positive, i.e. those that never appear in `not_li`.
li = list(set(li))
missed_at_least_once = set(not_li)
not_li = list(missed_at_least_once)
for target_idx in li:
    if target_idx in missed_at_least_once:
        continue
    print(target_idx)

In [None]:
# Inspect one non-scored target by its column index in the combined target
# matrix `y` (scored targets first, then non-scored ones).
# NOTE(review): 256 is presumably an index printed by the analysis above.
TARGET_IDX = 256

nonscored_df = pd.read_csv("../Data/Raw/train_targets_nonscored.csv")
# Exclude "sig_id" so list positions line up with the columns of `y`;
# indexing the raw column list (sig_id at position 0) would be off by one.
cols = [col for col in nonscored_df.columns if col != "sig_id"]
# Subtracting the number of scored targets converts the combined index into
# a position within the non-scored targets.
nonscored_df[cols[TARGET_IDX - tag_size]]