In [1]:
from source.iterate import Iterator
from source.train import run_training
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import numpy as np
import pandas as pd

from sklearn.metrics import log_loss
from sklearn.decomposition import PCA
from sklearn.preprocessing import QuantileTransformer
from sklearn.feature_selection import VarianceThreshold

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
train_features = pd.read_csv('data/raw/train_features.csv')
train_targets_scored = pd.read_csv('data/raw/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('data/raw/train_targets_nonscored.csv')

test_features = pd.read_csv('data/raw/test_features.csv')
sample_submission = pd.read_csv('data/raw/sample_submission.csv')

In [3]:
GENES = [col for col in train_features.columns if col.startswith('g-')]
CELLS = [col for col in train_features.columns if col.startswith('c-')]

In [4]:
train = train_features.merge(train_targets_scored, on='sig_id')
train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
test = test_features[test_features['cp_type']!='ctl_vehicle'].reset_index(drop=True)

target = train[train_targets_scored.columns]

train = train[[col for col in train if col not in target.columns or col == 'sig_id']]

print(train.shape)
print(test.shape)
print(target.shape)

(21948, 876)
(3624, 876)
(21948, 207)


In [5]:
target_cols = target.drop('sig_id', axis=1).columns.values.tolist()

# CV Folds

In [6]:
folds = train.copy()

mskf = MultilabelStratifiedKFold(n_splits=7)

for f, (t_idx, v_idx) in enumerate(mskf.split(X=train, y=target)):
    folds.loc[v_idx, 'kfold'] = int(f)

folds['kfold'] = folds['kfold'].astype(int)



# Model

In [7]:
# HyperParameters

DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 30
BATCH_SIZE = 256
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 1e-5
NFOLDS = 7
EARLY_STOPPING_STEPS = 0
EARLY_STOP = True

hidden_size=1024

train_args = {'train': folds, 'test': test, 'target_cols': target_cols, 'target': target,
              'g_comp': 28, 'c_comp': 5, 'g_feat': GENES, 'c_feat': CELLS, 'pca_add': True, 'thr': 0.9, 
              'batch_size': BATCH_SIZE, 'hidden_size': hidden_size, 'device': DEVICE, 
              'early_stopping_steps': EARLY_STOPPING_STEPS, 'learning_rate': LEARNING_RATE, 
              'epochs': EPOCHS, 'weight_decay': WEIGHT_DECAY}

In [8]:
iterator = Iterator(train=train, test=test, target_cols=target_cols, 
                    seeds=[1903, 1881], 
                    n_folds=NFOLDS, train_func=run_training, train_args=train_args)

In [9]:
oof, pred = iterator.it_seeds()

In [10]:
train[target_cols] = oof
test[target_cols] = pred

In [11]:
valid_results = train_targets_scored.drop(columns=target_cols).merge(train[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)

y_true = train_targets_scored[target_cols].values
y_pred = valid_results[target_cols].values

score = 0
for i in range(len(target_cols)):
    score_ = log_loss(y_true[:, i], y_pred[:, i])
    score += score_ / target.shape[1]
    
print("CV log_loss: ", score)

CV log_loss:  0.014628144289927201


In [12]:
train_features = pd.read_csv('data/raw/train_features.csv')
train_targets_scored = pd.read_csv('data/raw/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('data/raw/train_targets_nonscored.csv')

test_features = pd.read_csv('data/raw/test_features.csv')
sample_submission = pd.read_csv('data/raw/sample_submission.csv')

train = train_features.merge(train_targets_scored, on='sig_id')
train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
test = test_features[test_features['cp_type']!='ctl_vehicle'].reset_index(drop=True)

target = train[train_targets_scored.columns]

train = train[[col for col in train if col not in target.columns or col == 'sig_id']]

target_cols = target.drop('sig_id', axis=1).columns.values.tolist()

folds = train.copy()

mskf = MultilabelStratifiedKFold(n_splits=7)

for f, (t_idx, v_idx) in enumerate(mskf.split(X=train, y=target)):
    folds.loc[v_idx, 'kfold'] = int(f)

folds['kfold'] = folds['kfold'].astype(int)

# HyperParameters

DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 30
BATCH_SIZE = 256
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 1e-5
NFOLDS = 7
EARLY_STOPPING_STEPS = 0
EARLY_STOP = True

hidden_size=1024

train_args = {'train': folds, 'test': test, 'target_cols': target_cols, 'target': target,
              'g_comp': 28, 'c_comp': 5, 'g_feat': GENES, 'c_feat': CELLS, 'pca_add': True, 'thr': 0.4, 
              'batch_size': BATCH_SIZE, 'hidden_size': hidden_size, 'device': DEVICE, 
              'early_stopping_steps': EARLY_STOPPING_STEPS, 'learning_rate': LEARNING_RATE, 
              'epochs': EPOCHS, 'weight_decay': WEIGHT_DECAY}

iterator = Iterator(train=train, test=test, target_cols=target_cols, 
                    seeds=[1903, 1881], 
                    n_folds=NFOLDS, train_func=run_training, train_args=train_args)

oof, pred = iterator.it_seeds()

train[target_cols] = oof
test[target_cols] = pred

valid_results = train_targets_scored.drop(columns=target_cols).merge(train[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)

y_true = train_targets_scored[target_cols].values
y_pred = valid_results[target_cols].values

score = 0
for i in range(len(target_cols)):
    score_ = log_loss(y_true[:, i], y_pred[:, i])
    score += score_ / target.shape[1]
    
print("CV log_loss: ", score)



CV log_loss:  0.014620350834682704


In [13]:
train_features = pd.read_csv('data/raw/train_features.csv')
train_targets_scored = pd.read_csv('data/raw/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('data/raw/train_targets_nonscored.csv')

test_features = pd.read_csv('data/raw/test_features.csv')
sample_submission = pd.read_csv('data/raw/sample_submission.csv')

train = train_features.merge(train_targets_scored, on='sig_id')
train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
test = test_features[test_features['cp_type']!='ctl_vehicle'].reset_index(drop=True)

target = train[train_targets_scored.columns]

train = train[[col for col in train if col not in target.columns or col == 'sig_id']]

target_cols = target.drop('sig_id', axis=1).columns.values.tolist()

folds = train.copy()

mskf = MultilabelStratifiedKFold(n_splits=7)

for f, (t_idx, v_idx) in enumerate(mskf.split(X=train, y=target)):
    folds.loc[v_idx, 'kfold'] = int(f)

folds['kfold'] = folds['kfold'].astype(int)

# HyperParameters

DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 30
BATCH_SIZE = 256
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 1e-5
NFOLDS = 7
EARLY_STOPPING_STEPS = 0
EARLY_STOP = True

hidden_size=1024

train_args = {'train': folds, 'test': test, 'target_cols': target_cols, 'target': target,
              'g_comp': 28, 'c_comp': 5, 'g_feat': GENES, 'c_feat': CELLS, 'pca_add': True, 'thr': 0.1, 
              'batch_size': BATCH_SIZE, 'hidden_size': hidden_size, 'device': DEVICE, 
              'early_stopping_steps': EARLY_STOPPING_STEPS, 'learning_rate': LEARNING_RATE, 
              'epochs': EPOCHS, 'weight_decay': WEIGHT_DECAY}

iterator = Iterator(train=train, test=test, target_cols=target_cols, 
                    seeds=[1903, 1881], 
                    n_folds=NFOLDS, train_func=run_training, train_args=train_args)

oof, pred = iterator.it_seeds()

train[target_cols] = oof
test[target_cols] = pred

valid_results = train_targets_scored.drop(columns=target_cols).merge(train[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)

y_true = train_targets_scored[target_cols].values
y_pred = valid_results[target_cols].values

score = 0
for i in range(len(target_cols)):
    score_ = log_loss(y_true[:, i], y_pred[:, i])
    score += score_ / target.shape[1]
    
print("CV log_loss: ", score)



CV log_loss:  0.014611888216730002


In [14]:
train_features = pd.read_csv('data/raw/train_features.csv')
train_targets_scored = pd.read_csv('data/raw/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('data/raw/train_targets_nonscored.csv')

test_features = pd.read_csv('data/raw/test_features.csv')
sample_submission = pd.read_csv('data/raw/sample_submission.csv')

train = train_features.merge(train_targets_scored, on='sig_id')
train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
test = test_features[test_features['cp_type']!='ctl_vehicle'].reset_index(drop=True)

target = train[train_targets_scored.columns]

train = train[[col for col in train if col not in target.columns or col == 'sig_id']]

target_cols = target.drop('sig_id', axis=1).columns.values.tolist()

folds = train.copy()

mskf = MultilabelStratifiedKFold(n_splits=7)

for f, (t_idx, v_idx) in enumerate(mskf.split(X=train, y=target)):
    folds.loc[v_idx, 'kfold'] = int(f)

folds['kfold'] = folds['kfold'].astype(int)

# HyperParameters

DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 30
BATCH_SIZE = 256
LEARNING_RATE = 3e-3
WEIGHT_DECAY = 1e-5
NFOLDS = 7
EARLY_STOPPING_STEPS = 0
EARLY_STOP = True

hidden_size=1024

train_args = {'train': folds, 'test': test, 'target_cols': target_cols, 'target': target,
              'g_comp': 28, 'c_comp': 5, 'g_feat': GENES, 'c_feat': CELLS, 'pca_add': True, 'thr': 1.3, 
              'batch_size': BATCH_SIZE, 'hidden_size': hidden_size, 'device': DEVICE, 
              'early_stopping_steps': EARLY_STOPPING_STEPS, 'learning_rate': LEARNING_RATE, 
              'epochs': EPOCHS, 'weight_decay': WEIGHT_DECAY}

iterator = Iterator(train=train, test=test, target_cols=target_cols, 
                    seeds=[1903, 1881], 
                    n_folds=NFOLDS, train_func=run_training, train_args=train_args)

oof, pred = iterator.it_seeds()

train[target_cols] = oof
test[target_cols] = pred

valid_results = train_targets_scored.drop(columns=target_cols).merge(train[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)

y_true = train_targets_scored[target_cols].values
y_pred = valid_results[target_cols].values

score = 0
for i in range(len(target_cols)):
    score_ = log_loss(y_true[:, i], y_pred[:, i])
    score += score_ / target.shape[1]
    
print("CV log_loss: ", score)



CV log_loss:  0.01471371012115755
