In [None]:
import warnings, random, os, sys, tqdm, time
sys.path.append("../")
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler, MinMaxScaler, QuantileTransformer, RobustScaler
from sklearn.cluster import KMeans
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold


import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import ReduceLROnPlateau

#from pytorch_tabnet.tab_model import TabNetRegressor
# CUDA_LAUNCH_BLOCKING=1 asks the CUDA runtime to launch kernels synchronously
# (a debugging aid that slows GPU work).
# NOTE(review): confirm this is still wanted for full training runs.
os.environ.update(CUDA_LAUNCH_BLOCKING="1")

# Let pandas render up to 1200 columns and 1200 rows before truncating output.
pd.set_option("display.max_columns", 1200, "display.max_rows", 1200)
%matplotlib inline

In [None]:
def metric(y_true, y_pred):
    """Return the mean of the per-column log losses.

    For every column index ``i`` of ``y_true`` the log loss between
    ``y_true[:, i]`` and ``y_pred[:, i]`` is computed, and the mean over all
    columns is returned.

    Parameters
    ----------
    y_true : 2-D array supporting ``a[:, i]`` indexing and ``.shape``
        Ground-truth targets. Assumed to be binary 0/1 indicators, which is
        what ``labels=[0, 1]`` below encodes -- TODO confirm for other uses.
    y_pred : 2-D array supporting ``a[:, i]`` indexing
        Predicted probabilities, one column per column of ``y_true``.

    Returns
    -------
    float
        ``np.mean`` of the column-wise log losses.
    """
    per_column = [
        # The label set is passed explicitly: without it, log_loss raises
        # ValueError when a column of y_true holds a single class only
        # (e.g. a target with no positives in the evaluated slice).
        log_loss(y_true[:, i], y_pred[:, i], labels=[0, 1])
        for i in range(y_true.shape[1])
    ]
    return np.mean(per_column)

def seed_everything(seed_value):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch, and
    stores the seed in the ``PYTHONHASHSEED`` environment variable. When CUDA
    is available the CUDA generators are seeded as well and cuDNN is put into
    deterministic mode.

    Parameters
    ----------
    seed_value : int
        Seed applied to all generators.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # Only processes started after this point see the variable; the hash
    # randomisation of the already-running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        # benchmark=True lets cuDNN auto-tune and possibly pick different
        # convolution algorithms on each run, which defeats the deterministic
        # setting above; it must be off for reproducible results.
        torch.backends.cudnn.benchmark = False


seed_everything(42)
        
    
def make_scaler(flag, seed):
    """Build a fresh, unfitted scaler selected by ``flag``.

    Parameters
    ----------
    flag : str
        One of ``"quantile"``, ``"gauss"``, ``"standard"``, ``"minmax"`` or
        ``"robust"``.
    seed : int
        Used as ``random_state`` by the quantile transformer only.

    Returns
    -------
    The scaler instance for ``flag``, or ``None`` when ``flag`` matches none
    of the known names.
    """
    # Ordered (name, factory) pairs; the factories are lazy so only the
    # selected scaler class is ever looked up and instantiated.
    # NOTE(review): GaussRankScaler is not defined or imported in the visible
    # part of this notebook -- confirm it exists before using "gauss".
    factories = (
        ("quantile", lambda: QuantileTransformer(n_quantiles=100, random_state=seed,
                                                 output_distribution="normal")),
        ("gauss", lambda: GaussRankScaler()),
        ("standard", lambda: StandardScaler()),
        ("minmax", lambda: MinMaxScaler()),
        ("robust", lambda: RobustScaler()),
    )
    for name, build in factories:
        if flag == name:
            return build()
    return None
    

# Seeds 0 through 6 (presumably one run per seed -- confirm against the
# training loop further down the notebook).
seeds = list(range(7))
# Matches the "quantile" branch of make_scaler.
SCALE = "quantile"

    

In [None]:
# Original note (translated): g: 772, c: 100; classification into 206 classes and 402 classes.

# Raw feature tables.
train_df = pd.read_csv("../../../Data/Raw/train_features.csv")
test_df = pd.read_csv("../../../Data/Raw/test_features.csv")
#pub_test_df = pd.read_csv("../input/moapublictest/test_features.csv")
# NOTE(review): pub_test_df is read from the same file as test_df here.
pub_test_df = pd.read_csv("../../../Data/Raw/test_features.csv")
drug_df = pd.read_csv("../../../Data/Raw/train_drug.csv")  # merged into y on sig_id below

# Target tables: scored, non-scored, and both side by side.
y = pd.read_csv("../../../Data/Raw/train_targets_scored.csv")
y_non = pd.read_csv("../../../Data/Raw/train_targets_nonscored.csv")
# Column-wise concat pairs rows by index label, not by sig_id.
# Assumes both target files list the rows in the same order -- TODO confirm.
y_all = pd.concat([y, y_non.drop("sig_id", axis=1)], axis=1)
y = y.merge(drug_df, on='sig_id', how='left')  # y now also carries drug_df's columns (drug_id)

# Feature column groups, selected by column-name prefix.
GENES = [col for col in train_df.columns if col.startswith("g-")]
CELLS = [col for col in train_df.columns if col.startswith("c-")]
BIOS = GENES + CELLS


# Computed after the merge above, hence drug_id has to be excluded explicitly.
SCORED_MOAS = [col for col in y.columns if col != "sig_id" and col != "drug_id"]
NONSCORED_MOAS = [col for col in y_non.columns if col != "sig_id"]
ALL_MOAS = SCORED_MOAS + NONSCORED_MOAS


# Row counts before the ctl_vehicle rows are removed.
TR_SIZE = train_df.shape[0]
TE_SIZE = test_df.shape[0]

# Index labels of the non-ctl_vehicle rows in the still-unfiltered frames.
train_nonvehicle_index = train_df[train_df["cp_type"] != "ctl_vehicle"].index
test_nonvehicle_index = test_df[test_df["cp_type"] != "ctl_vehicle"].index

# Combined string feature of the form "<cp_time> * <cp_dose>".
train_df["time_dose"] = train_df["cp_time"].astype(str) + " * " + train_df["cp_dose"]
test_df["time_dose"] = test_df["cp_time"].astype(str) + " * " + test_df["cp_dose"]
pub_test_df["time_dose"] = pub_test_df["cp_time"].astype(str) + " * " + pub_test_df["cp_dose"]

# remove cp_type = ctl_vehicle
# The mask is taken from the unfiltered train_df so it can be reused on y below.
mask = train_df["cp_type"] != "ctl_vehicle"
train_df = train_df[mask].drop("cp_type", axis=1).reset_index(drop=True)
test_df = test_df[test_df["cp_type"] != "ctl_vehicle"].drop("cp_type", axis=1).reset_index(drop=True)
pub_test_df = pub_test_df[pub_test_df["cp_type"] != "ctl_vehicle"].drop("cp_type", axis=1).reset_index(drop=True)
# Assumes y has the same row order and index as the unfiltered train_df
# (i.e. the merge above added no rows) -- TODO confirm.
y_nonv = y[mask].reset_index(drop=True)

# `scored` is copied before drug_id is dropped, so it keeps the drug_id column.
scored = y_nonv.copy()
y_nonv.drop("drug_id", axis=1, inplace=True)  # in-place: y_nonv loses drug_id
y.drop("drug_id", axis=1, inplace=True)  # in-place: y loses drug_id

# Row counts after the ctl_vehicle rows are removed.
TR_NONV_SIZE = train_df.shape[0]
# Despite the _SHAPE suffix this is a row count (shape[0]), like TR_NONV_SIZE.
TE_NONV_SHAPE = test_df.shape[0]