In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from sklearn.preprocessing import scale
from sklearn.metrics import roc_auc_score
from torch.optim import Adam
from torch.optim.lr_scheduler import MultiStepLR
from torch.nn.modules.loss import BCEWithLogitsLoss, BCELoss
import torch
from torch.utils.data import TensorDataset, Dataset, DataLoader
import random

In [2]:
train_df = pd.read_csv('train.csv')

In [3]:
label = train_df.target

train = train_df.drop(['ID_code','target'],axis=1)

In [4]:
test = pd.read_csv('test.csv')

In [5]:
test_filtered = pd.read_pickle('test_filtred.pkl')

In [6]:
test_filtered = test_filtered.loc[:,train.columns]

In [7]:
test = test.drop(['ID_code'],axis=1)

In [8]:
train_test = pd.concat([train,test_filtered]).reset_index(drop=True)

In [9]:
vcs_train_test = {}


for col in tqdm(train.columns):

    vcs_train_test[col] = train_test.loc[:,col].value_counts()/300000

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [10]:
for col in tqdm(train.columns):

    vtraintest = vcs_train_test[col]
    
    t = vtraintest[train[col]].fillna(0).values
    train[col+'_train_test_sum_vcs'] = t
    
    train[col+'_train_test_sum_vcs_product'] = train[col]*t

    t = vtraintest[test[col]].fillna(0).values
    test[col+'_train_test_sum_vcs'] = t
    
    test[col+'_train_test_sum_vcs_product'] = test[col]*t


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [11]:
random_seed = 42

np.random.seed(random_seed)
random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import warnings


In [13]:
scaler = StandardScaler()

In [14]:
cols = train.columns

In [15]:
X_train = pd.DataFrame(scaler.fit_transform(train),columns=cols)
test = pd.DataFrame(scaler.transform(test),columns=cols)

In [16]:
device = torch.device('cuda:3')


In [17]:
class NN(torch.nn.Module):
    random_seed = 42


    def __init__(self, D_in=3, features = 200):
        np.random.seed(random_seed)
        random.seed(random_seed)
        torch.manual_seed(random_seed)
        torch.cuda.manual_seed(random_seed)
        
        super(NN, self).__init__()
        self.layer = []
        layer_size = D_in
        enc_out = 30
        for i in range(features):
            
            layer = torch.nn.Sequential(torch.nn.Linear(layer_size, enc_out//2),
                                       torch.nn.ReLU(),
                                       torch.nn.Linear(enc_out//2, enc_out),
                                       torch.nn.ReLU())
            setattr(self, 'layer_' + str(i), layer)
        

        self.linear3 = torch.nn.Linear(features*enc_out,1)        

    def forward(self, y):
        res = []
        for i in range(200):
            layer = getattr(self, 'layer_' + str(i))
            res.append(layer(y[:,i,:]) )
        y = torch.cat(res,1)
        y = self.linear3(y)
        return y
    



loss_f = BCEWithLogitsLoss()

batch_size = 2048

In [None]:
import warnings
warnings.filterwarnings('ignore')
random_seed = 42

np.random.seed(random_seed)
random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)

folds = StratifiedKFold(n_splits=5, shuffle=False, random_state=99999)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train.values, label.values)):
    print("Fold {}".format(fold_))
    
    
    train_tensors = []
    val_tensors = []
    test_tensors = []

    for fff in range(200):
        train_t = X_train.loc[trn_idx,[f'var_{fff}',f'var_{fff}_train_test_sum_vcs',
                                       f'var_{fff}_train_test_sum_vcs_product']].values
        val_t = X_train.loc[val_idx,[f'var_{fff}',f'var_{fff}_train_test_sum_vcs',
                                     f'var_{fff}_train_test_sum_vcs_product']].values
        test_t =  test[[f'var_{fff}',f'var_{fff}_train_test_sum_vcs',f'var_{fff}_train_test_sum_vcs_product']].values
        train_tensors.append(torch.tensor(train_t, requires_grad=False, device=device, dtype=torch.float32))
        val_tensors.append(torch.tensor(val_t, requires_grad=False, device=device, dtype=torch.float32))

        test_tensors.append(torch.tensor(test_t, requires_grad=False, device=device, dtype=torch.float32))
    
    train_tensors = torch.cat(train_tensors,1).view((-1,200,3))
    val_tensors = torch.cat(val_tensors,1).view((-1,200,3))
    test_tensors = torch.cat(test_tensors,1).view((-1,200,3))
    
    y_train_t = torch.tensor(label[trn_idx].values, requires_grad=False, device=device, dtype=torch.float32)
    y_val_t = torch.tensor(label[val_idx].values, requires_grad=False, device=device, dtype=torch.float32)
    
    dataset = TensorDataset(train_tensors,y_train_t)
    nn = NN().to(device)
    optimizer = Adam(params=nn.parameters(), lr = 0.005)
    scheduler = MultiStepLR(optimizer, milestones=[15, 25, 35, 55], gamma=0.5)
    best_AUC = 0
    early_stop = 0



    for epoch in tqdm(range(100)):
        dl = DataLoader(dataset, batch_size=batch_size, shuffle=True,num_workers=0)
        for data,label_t in dl:
            pred = nn(data)
            loss = loss_f(pred, torch.unsqueeze(label_t,-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        with torch.no_grad():
            val_pred = nn(val_tensors)
            AUC = roc_auc_score(label[val_idx].values,val_pred.detach().cpu().numpy())
            print('EPOCH {}'.format(epoch))
            print('LOSS: ',loss_f(val_pred, torch.unsqueeze(y_val_t,-1)).detach().cpu().numpy())
            print('AUC: ',AUC)
            print('='*50)
            scheduler.step()
            
            if AUC > best_AUC:
                early_stop = 0
                best_AUC = AUC
                torch.save(nn, 'best_auc_nn.pkl')
            else:
                print('SCORE IS NOT THE BEST. Early stop counter: {}'.format(early_stop))
                early_stop += 1
            
            if early_stop == 15:
                print('EARLY_STOPPING')
                best_model = torch.load('best_auc_nn.pkl')
                break
    
    with torch.no_grad():
        oof[val_idx] = best_model(val_tensors).data.cpu().numpy().flatten()
        
        batch_size = 20000
        blobs = []

        for batch in torch.split(test_tensors,batch_size):
            blob = best_model(batch).data.cpu().numpy().flatten()
            blobs.append(blob)
    predictions_test = np.concatenate(blobs)
    
    predictions += predictions_test / folds.n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(label, oof)))

Fold 0


HBox(children=(IntProgress(value=0), HTML(value='')))

EPOCH 0
LOSS:  0.24902473
AUC:  0.8360336045240018


In [19]:
print("CV score: {:<8.5f}".format(roc_auc_score(label, oof)))

CV score: 0.91843 


In [20]:
predictions

array([-2.20609248, -0.82340122, -1.79933187, ..., -5.8816942 ,
       -1.69825268, -4.62188035])

In [22]:
tst_sub = pd.read_csv('sample_submission.csv')

In [23]:
tst_sub['target'] = predictions

In [24]:
tst_sub.to_csv('200in_nn_kfold.csv',index=False)