In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from sklearn.preprocessing import scale
from sklearn.metrics import roc_auc_score
from torch.optim import Adam
from torch.optim.lr_scheduler import MultiStepLR,ReduceLROnPlateau
from torch.nn.modules.loss import BCEWithLogitsLoss, BCELoss
import torch
from torch.utils.data import TensorDataset, Dataset, DataLoader
import random

In [2]:
# Load the labelled training set from train.csv (path is relative to the working directory).
train_df = pd.read_csv('train.csv')

In [3]:
# Target column, kept as a pandas Series that shares train_df's index.
label = train_df.target

# Feature matrix: everything except the row identifier and the target.
train = train_df.drop(['ID_code','target'],axis=1)

In [4]:
# Load the unlabelled test set from test.csv (path is relative to the working directory).
test = pd.read_csv('test.csv')

In [5]:
# Load a pre-computed frame of test rows from a pickle produced outside this notebook.
# NOTE(review): presumably the subset of test rows kept by an earlier filtering step — confirm
# against the script that wrote the file.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
test_filtered = pd.read_pickle('test_filtred.pkl')

In [6]:
# Keep only the columns that exist in `train`, in the same order, so the two frames can be stacked.
test_filtered = test_filtered.loc[:,train.columns]

In [7]:
# Drop the row identifier so `test` has the same feature columns as `train`.
# NOTE: this rebinds `test`, so the cell is not idempotent — running it a second time on the
# same kernel raises KeyError because 'ID_code' is already gone.
test = test.drop(['ID_code'],axis=1)

In [8]:
# Stack the training features on top of the filtered test rows and renumber the index from 0.
# This combined frame is the population used for the value-frequency tables below.
train_test = pd.concat([train,test_filtered]).reset_index(drop=True)

In [9]:
# Per-feature value-frequency tables.
#   vcs_train_test[col] -> Series indexed by each distinct value of `col` in train_test,
#                          holding (occurrence count / 300000).
# The per-class tables are created but left empty; the code that used to fill them
# (value_counts over the label == 1 / label == 0 rows) is disabled.
vcs_target_1, vcs_target_0 = {}, {}
vcs_train_test = {}

# NOTE(review): 300000 is presumably len(train_test) — confirm. generate_features compares
# against 1/300000, so the two constants must be changed together.
for col in tqdm(train.columns):
    occurrences = train_test[col].value_counts()
    vcs_train_test[col] = occurrences / 300000

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [10]:
# Names of the raw feature columns. generate_features reads this module-level name, so it must
# be captured before any flag columns are added to a frame.
cols = train.columns

In [11]:
class UpsamplingPreprocessor():
    """Augment a binary-labelled feature matrix by shuffling columns within each class.

    Every augmented copy of a class is built by permuting each column of that
    class's rows independently, so per-column value distributions are kept while
    the pairing of values across columns is broken.

    Parameters
    ----------
    times : int, default 10
        Number of augmented copies of the positive class made by ``fit_transform``.
    neg_class_balancer : int, default 2
        The negative class receives ``times // neg_class_balancer`` copies.
    """

    def __init__(self, times=10, neg_class_balancer=2):
        self.times = times
        self.neg_class_balancer = neg_class_balancer

    # Data augmentation
    def augment_class(self, X):
        """Return a copy of the 2-D array ``X`` with every column independently permuted.

        Uses the global NumPy RNG; ``X`` itself is not modified.
        """
        X_new = X.copy()
        ids = np.arange(X.shape[0])

        for c in range(X.shape[1]):
            np.random.shuffle(ids)
            # Index the single column directly instead of copying the whole
            # matrix (X[ids]) once per column.
            X_new[:, c] = X[ids, c]

        return X_new

    def augment(self, X, y, t=2):
        """Append shuffled copies of each class to ``X``.

        Parameters
        ----------
        X : 2-D numpy array of features.
        y : array-like of 0/1 labels, one per row of ``X`` (a pandas Series is accepted).
        t : number of positive-class copies; the negative class gets
            ``t // self.neg_class_balancer`` copies.

        Returns
        -------
        (X_aug, y_aug) : the original rows followed by the positive copies and then
            the negative copies, with matching labels.

        Note: reseeds the global NumPy RNG with 42 on every call, so repeated calls
        on the same input produce the same augmentation.
        """
        np.random.seed(42)

        # Positional boolean masks are needed; a Series would otherwise carry its own index.
        y = np.asarray(y)

        t_pos = t
        t_neg = t // self.neg_class_balancer

        X_pos_orig = X[y == 1]
        X_neg_orig = X[y == 0]

        # Positive copies are generated before negative ones to keep the RNG order stable.
        pos_copies = [self.augment_class(X_pos_orig) for _ in tqdm(range(t_pos))]
        neg_copies = [self.augment_class(X_neg_orig) for _ in tqdm(range(t_neg))]

        # Stacking lists (rather than a pre-allocated 3-D array) also works when a
        # class receives zero copies, e.g. t=1 with neg_class_balancer=2.
        X_aug = np.vstack([X] + pos_copies + neg_copies)
        y_pos = np.ones(t_pos * X_pos_orig.shape[0])
        y_neg = np.zeros(t_neg * X_neg_orig.shape[0])
        y_aug = np.concatenate((y, y_pos, y_neg))

        return X_aug, y_aug

    def fit_transform(self, X, y=None):
        """Augment DataFrame ``X`` with labels ``y`` using ``self.times`` positive copies.

        Returns ``(DataFrame, labels)``; the DataFrame keeps ``X``'s column names and
        gets a fresh 0..n-1 index.

        Raises
        ------
        ValueError
            If ``y`` is None; labels are required to split the classes.
        """
        if y is None:
            raise ValueError("UpsamplingPreprocessor.fit_transform requires labels y")
        X_augmented, y = self.augment(X.values, y, t=self.times)
        # Reuse the input's own column names instead of assuming exactly 200 var_* columns.
        return pd.DataFrame(X_augmented, columns=X.columns), y

    def transform(self, X):
        """Return ``X`` unchanged; augmentation is only applied in fit_transform."""
        return X
ups = UpsamplingPreprocessor()

In [12]:
# train,label = ups.fit_transform(train,label)

In [13]:
train.shape

(200000, 200)

In [14]:
def generate_features(df):
    """Add a ``<col>_1_flag`` column to ``df`` (in place) for every column in ``cols``.

    The flag is 1 where the row's value has a frequency of exactly 1/300000 in
    ``vcs_train_test[col]`` (i.e. it was counted once when the table was built with
    the same 300000 divisor) and 0 otherwise, including values missing from the table.

    Reads the module-level names ``cols`` and ``vcs_train_test``. Returns None;
    ``df`` is mutated.
    """
    for col in tqdm(cols):
        vtraintest = vcs_train_test[col]
        # Series.map yields NaN for values absent from the frequency table. Label
        # indexing (vtraintest[df[col]]) raises KeyError for those on current
        # pandas, which made the fillna(0) below unreachable.
        t = df[col].map(vtraintest).fillna(0).values

#         df[col+'_train_test_sum_vcs'] = t
#         df[col+'_train_test_sum_vcs_product'] = df[col]*t
        df[col+'_1_flag'] = (t == 1/300000).astype(int)
        
# Add the 200 `_1_flag` columns to `test` in place. The training cell later passes `test`
# straight to scaler.transform, so this must run first.
# `train` is deliberately not processed here: flags for the training rows are generated
# per fold inside the training cell, after augmentation.
# generate_features(train)    
generate_features(test)    

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [15]:
# Seed every RNG used in the notebook (NumPy, Python's random, PyTorch CPU and CUDA)
# with the same value so that runs are repeatable.
random_seed = 42

for seed_rng in (np.random.seed, random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_rng(random_seed)

In [16]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
import warnings


In [17]:
# Device handles used by the model and the training cell.
# `gpu` keeps the originally configured 'cuda:3' whenever that device exists. On hosts
# with fewer than four GPUs it falls back to the current CUDA device, and to the CPU
# when CUDA is unavailable, so `.to(gpu)` never targets a device that is not present.
if torch.cuda.is_available():
    gpu = torch.device('cuda:3' if torch.cuda.device_count() > 3 else 'cuda')
else:
    gpu = torch.device('cpu')
cpu = torch.device('cpu')


In [26]:
class NN(torch.nn.Module):
    """Per-feature encoder network with a single linear output head.

    Each of the ``features`` input slots gets its own small encoder
    (Linear(D_in, 15) -> Linear(15, 30) -> ReLU; there is no non-linearity between
    the two Linear layers). The encoder outputs are concatenated and mapped to one
    logit by ``linear3``.

    Parameters
    ----------
    D_in : int, default 5
        Number of inputs per feature slot (size of the last input dimension).
    features : int, default 200
        Number of feature slots (size of the second input dimension).

    Encoders are registered as attributes ``layer_0`` .. ``layer_{features-1}``, so
    state-dict keys follow that naming.
    """

    # Seed applied to all RNGs at construction so that weight initialisation is repeatable.
    random_seed = 42

    def __init__(self, D_in=5, features = 200):
        super(NN, self).__init__()

        # Use the class attribute rather than relying on a module-level `random_seed`
        # happening to exist in the notebook namespace.
        np.random.seed(self.random_seed)
        random.seed(self.random_seed)
        torch.manual_seed(self.random_seed)
        torch.cuda.manual_seed(self.random_seed)

        self.features = features
        layer_size = D_in
        enc_out = 30
        for i in range(features):
            layer = torch.nn.Sequential(torch.nn.Linear(layer_size, enc_out//2),
                                        torch.nn.Linear(enc_out//2, enc_out),
                                        torch.nn.ReLU()
                                        )
            # setattr on a Module registers the sub-module and its parameters.
            setattr(self, 'layer_' + str(i), layer)

        self.linear3 = torch.nn.Linear(features*enc_out,1)

    def forward(self, y):
        """Map a batch of shape (batch, features, D_in) to logits of shape (batch, 1)."""
        # Iterate over the configured number of slots instead of a hard-coded 200.
        res = [getattr(self, 'layer_' + str(i))(y[:, i, :]) for i in range(self.features)]
        y = torch.cat(res, 1)
        y = self.linear3(y)
        return y
    

def batch_iter(X, y, batch_size=64):
    """
    Yield shuffled mini-batches covering every row of X exactly once.

    X: feature tensor (first dimension: num_instances)
    y: target tensor (first dimension: num_instances)
    batch_size: maximum rows per batch; the final batch may be smaller.

    Yields (X_batch, y_batch) pairs selected with the same row indices.
    """
    # The permutation is drawn on the CPU (so the CPU RNG stream is what determines
    # the order) and then moved to X's own device. `.cuda()` would target the default
    # CUDA device, which is wrong when X lives on another GPU such as cuda:3.
    idxs = torch.randperm(X.size(0)).to(X.device)
    for batch_idxs in idxs.split(batch_size):
        yield X[batch_idxs], y[batch_idxs]

# Binary cross-entropy that takes raw logits; the model's final layer is a plain Linear
# with no sigmoid, so this is the matching loss.
loss_f = BCEWithLogitsLoss()

# Rows per mini-batch; the training cell uses it for both training and batched inference.
batch_size = 16384

In [27]:
# torch.cuda.empty_cache()
# 5-fold stratified cross-validation of the per-feature network.
# For every fold: augment the training split, add the `_1_flag` features, standardise,
# train with early stopping on validation AUC, then score the validation split
# (out-of-fold) and the full test set with the best epoch's model.
# `oof` and `predictions` hold raw logits (no sigmoid), which is enough for ROC AUC.
import warnings
# NOTE(review): blanket suppression also hides pandas warnings raised while
# generate_features adds columns to the per-fold frames.
warnings.filterwarnings('ignore')
random_seed = 42
N_IN = 2                      # inputs per feature: scaled value and scaled `_1_flag`
N_FEATURES = 200              # number of var_* features fed to the network
N_EPOCHS = 200                # upper bound; early stopping normally ends training sooner
EARLY_STOP_PATIENCE = 10      # epochs without a new best AUC before stopping
CHECKPOINT_PATH = 'best_auc_nn.pkl'
np.random.seed(random_seed)
random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)

# Column order that places each value next to its flag, so that one row reshapes to
# (N_FEATURES, N_IN).
feature_cols = [name
                for fff in range(N_FEATURES)
                for name in (f'var_{fff}', f'var_{fff}_1_flag')]


def frame_to_tensor(df, device):
    """Return df's value/flag columns as a float32 tensor of shape (rows, N_FEATURES, N_IN)."""
    values = df.loc[:, feature_cols].values
    return torch.tensor(values, dtype=torch.float32, device=device).view((-1, N_FEATURES, N_IN))


def predict_logits(model, tensors):
    """Run `model` over `tensors` in chunks of `batch_size`; return flat logits as a NumPy array."""
    with torch.no_grad():
        blobs = [model(batch.to(gpu)).cpu().numpy().flatten()
                 for batch in torch.split(tensors, batch_size)]
    return np.concatenate(blobs)


# shuffle=False gives deterministic contiguous folds. random_state is omitted because it has
# no effect without shuffling and current scikit-learn raises ValueError for that combination.
folds = StratifiedKFold(n_splits=5, shuffle=False)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))
# label = label.values
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, label)):
    print("Fold {}".format(fold_))

    # Only the training split is augmented; validation rows stay untouched.
    X_train,Train_label = ups.fit_transform(train.loc[trn_idx],label.loc[trn_idx])
    X_val, Val_label = train.loc[val_idx],label.loc[val_idx]
    generate_features(X_train)
    generate_features(X_val)
    cols_new = X_train.columns
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train),columns=cols_new)
    X_val = pd.DataFrame(scaler.transform(X_val),columns=cols_new)
    # `test` already carries its flag columns from the earlier generate_features(test) call.
    test_new = pd.DataFrame(scaler.transform(test),columns=cols_new)

    # Train/validation tensors stay on the CPU and are moved batch by batch;
    # the test tensor is placed on the compute device up front.
    train_tensors = frame_to_tensor(X_train, cpu)
    val_tensors = frame_to_tensor(X_val, cpu)
    test_tensors = frame_to_tensor(test_new, gpu)

    # np.asarray accepts both the ndarray returned by the augmenter and a pandas Series,
    # replacing the previous try / bare-except fallback.
    y_train_t = torch.tensor(np.asarray(Train_label), dtype=torch.float32, device=cpu)
    y_val_t = torch.tensor(np.asarray(Val_label), dtype=torch.float32, device=cpu)

    nn = NN(D_in=N_IN, features=N_FEATURES).to(gpu)
    optimizer = Adam(params=nn.parameters(), lr = 0.005)
    # Halve the learning rate after 4 epochs without a validation-AUC improvement.
    scheduler = ReduceLROnPlateau(optimizer, 'max',factor=0.5, patience=4,min_lr=0.0001, verbose=True)
    best_AUC = 0
    early_stop = 0

    for epoch in tqdm(range(N_EPOCHS)):
        for data,label_t in batch_iter(train_tensors,y_train_t, batch_size=batch_size):
            pred = nn(data.to(gpu))
            loss = loss_f(pred, torch.unsqueeze(label_t.to(gpu),-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        val_pred = predict_logits(nn, val_tensors)
        AUC = roc_auc_score(label[val_idx],val_pred)
        print('EPOCH {}'.format(epoch))
        print('LOSS: ',loss_f(torch.tensor(val_pred), y_val_t))
        print('AUC: ',AUC)
        scheduler.step(AUC)

        if AUC > best_AUC:
            early_stop = 0
            best_AUC = AUC
            # Checkpoint the whole module each time validation AUC improves.
            torch.save(nn, CHECKPOINT_PATH)
        else:
            early_stop += 1
            print('SCORE IS NOT THE BEST. Early stop counter: {}'.format(early_stop))

        if early_stop == EARLY_STOP_PATIENCE:
            print('EARLY_STOPPING')
            break
        print('='*50)

    # Reload the best checkpoint once, after training, instead of after every epoch.
    # NOTE(review): the file is a fully pickled module written above; newer PyTorch
    # releases may require weights_only=False to load it — confirm for the installed version.
    best_model = torch.load(CHECKPOINT_PATH)

    oof[val_idx] = predict_logits(best_model, val_tensors)
    predictions_test = predict_logits(best_model, test_tensors)

    # Average the test logits over the folds.
    predictions += predictions_test / folds.n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(label, oof)))

Fold 0


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

EPOCH 0
LOSS:  tensor(0.2413)
AUC:  0.8777310191766494
EPOCH 1
LOSS:  tensor(0.2070)
AUC:  0.9041280839071921
EPOCH 2
LOSS:  tensor(0.1956)
AUC:  0.911540346542765
EPOCH 3
LOSS:  tensor(0.1984)
AUC:  0.9145464729104539
EPOCH 4
LOSS:  tensor(0.1926)
AUC:  0.9158600738836598
EPOCH 5
LOSS:  tensor(0.1880)
AUC:  0.9172077897386695
EPOCH 6
LOSS:  tensor(0.1931)
AUC:  0.9183467684229696
EPOCH 7
LOSS:  tensor(0.1899)
AUC:  0.9186104129584147
EPOCH 8
LOSS:  tensor(0.1863)
AUC:  0.9193599690052006
EPOCH 9
LOSS:  tensor(0.1879)
AUC:  0.9192077811658751
SCORE IS NOT THE BEST. Early stop counter: 1
EPOCH 10
LOSS:  tensor(0.2013)
AUC:  0.9198348084761706
EPOCH 11
LOSS:  tensor(0.1847)
AUC:  0.9200567228613332
EPOCH 12
LOSS:  tensor(0.2144)
AUC:  0.9198716853187164
SCORE IS NOT THE BEST. Early stop counter: 1
EPOCH 13
LOSS:  tensor(0.1917)
AUC:  0.9199267620652746
SCORE IS NOT THE BEST. Early stop counter: 2
EPOCH 14
LOSS:  tensor(0.1946)
AUC:  0.9198569663839995
SCORE IS NOT THE BEST. Early stop co

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

EPOCH 0
LOSS:  tensor(0.2375)
AUC:  0.8818309476767796
EPOCH 1
LOSS:  tensor(0.2047)
AUC:  0.9049750345020402
EPOCH 2
LOSS:  tensor(0.1961)
AUC:  0.9119136398826302
EPOCH 3
LOSS:  tensor(0.1975)
AUC:  0.9146374655169721
EPOCH 4
LOSS:  tensor(0.1888)
AUC:  0.9168686976999054
EPOCH 5
LOSS:  tensor(0.1903)
AUC:  0.9174202671365663
EPOCH 6
LOSS:  tensor(0.1920)
AUC:  0.9182949963503403
EPOCH 7
LOSS:  tensor(0.1872)
AUC:  0.919211569787869
EPOCH 8
LOSS:  tensor(0.1871)
AUC:  0.9188528985931076
SCORE IS NOT THE BEST. Early stop counter: 1
EPOCH 9
LOSS:  tensor(0.1873)
AUC:  0.9195667392727036
EPOCH 10
LOSS:  tensor(0.2096)
AUC:  0.9191397276976336
SCORE IS NOT THE BEST. Early stop counter: 1
EPOCH 11
LOSS:  tensor(0.1835)
AUC:  0.919035896640308
SCORE IS NOT THE BEST. Early stop counter: 2
EPOCH 12
LOSS:  tensor(0.1991)
AUC:  0.919989723708519
EPOCH 13
LOSS:  tensor(0.1930)
AUC:  0.919628667341152
SCORE IS NOT THE BEST. Early stop counter: 1
EPOCH 14
LOSS:  tensor(0.1901)
AUC:  0.91905847973

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

EPOCH 0
LOSS:  tensor(0.2422)
AUC:  0.8823627277730303
EPOCH 1
LOSS:  tensor(0.2011)
AUC:  0.9086556620731805
EPOCH 2
LOSS:  tensor(0.1967)
AUC:  0.9151336597999441
EPOCH 3
LOSS:  tensor(0.1935)
AUC:  0.918427038653315
EPOCH 4
LOSS:  tensor(0.1941)
AUC:  0.9206142646965285
EPOCH 5
LOSS:  tensor(0.1862)
AUC:  0.9211787401237281
EPOCH 6
LOSS:  tensor(0.2022)
AUC:  0.9221654235769458
EPOCH 7
LOSS:  tensor(0.1887)
AUC:  0.9227038480471461
EPOCH 8
LOSS:  tensor(0.1883)
AUC:  0.9226918665427726
SCORE IS NOT THE BEST. Early stop counter: 1
EPOCH 9
LOSS:  tensor(0.1941)
AUC:  0.9230018542639774
EPOCH 10
LOSS:  tensor(0.1907)
AUC:  0.9233573965912517
EPOCH 11
LOSS:  tensor(0.1845)
AUC:  0.9232981631586371
SCORE IS NOT THE BEST. Early stop counter: 1
EPOCH 12
LOSS:  tensor(0.1873)
AUC:  0.9232038217749498
SCORE IS NOT THE BEST. Early stop counter: 2
EPOCH 13
LOSS:  tensor(0.1839)
AUC:  0.9237406595427533
EPOCH 14
LOSS:  tensor(0.1867)
AUC:  0.9235199730917397
SCORE IS NOT THE BEST. Early stop co

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

EPOCH 0
LOSS:  tensor(0.2250)
AUC:  0.8875126293518792
EPOCH 1
LOSS:  tensor(0.2078)
AUC:  0.9093240438932303
EPOCH 2
LOSS:  tensor(0.1948)
AUC:  0.9151547727505023
EPOCH 3
LOSS:  tensor(0.1908)
AUC:  0.9181786043807202
EPOCH 4
LOSS:  tensor(0.1881)
AUC:  0.9193750785768711
EPOCH 5
LOSS:  tensor(0.1877)
AUC:  0.9201151050022124
EPOCH 6
LOSS:  tensor(0.1954)
AUC:  0.9205411005616596
EPOCH 7
LOSS:  tensor(0.1843)
AUC:  0.920761261025139
EPOCH 8
LOSS:  tensor(0.1902)
AUC:  0.9211559330257432
EPOCH 9
LOSS:  tensor(0.1841)
AUC:  0.921142188556552
SCORE IS NOT THE BEST. Early stop counter: 1
EPOCH 10
LOSS:  tensor(0.1956)
AUC:  0.9213437084078532
EPOCH 11
LOSS:  tensor(0.1892)
AUC:  0.9210556243336094
SCORE IS NOT THE BEST. Early stop counter: 1
EPOCH 12
LOSS:  tensor(0.1838)
AUC:  0.9210803920399779
SCORE IS NOT THE BEST. Early stop counter: 2
EPOCH 13
LOSS:  tensor(0.1915)
AUC:  0.9210856166671346
SCORE IS NOT THE BEST. Early stop counter: 3
EPOCH 14
LOSS:  tensor(0.1841)
AUC:  0.921401953

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

EPOCH 0
LOSS:  tensor(0.2281)
AUC:  0.8884005116884348
EPOCH 1
LOSS:  tensor(0.2143)
AUC:  0.9111755777621611
EPOCH 2
LOSS:  tensor(0.1992)
AUC:  0.9164467597699146
EPOCH 3
LOSS:  tensor(0.1900)
AUC:  0.9192030566039772
EPOCH 4
LOSS:  tensor(0.1848)
AUC:  0.9214078354331655
EPOCH 5
LOSS:  tensor(0.1855)
AUC:  0.9222722086763803
EPOCH 6
LOSS:  tensor(0.1900)
AUC:  0.9225520391536532
EPOCH 7
LOSS:  tensor(0.1817)
AUC:  0.9228059435856447
EPOCH 8
LOSS:  tensor(0.1929)
AUC:  0.9228546387704539
EPOCH 9
LOSS:  tensor(0.1809)
AUC:  0.9231431066525166
EPOCH 10
LOSS:  tensor(0.2003)
AUC:  0.9239198714389031
EPOCH 11
LOSS:  tensor(0.1924)
AUC:  0.923533878335826
SCORE IS NOT THE BEST. Early stop counter: 1
EPOCH 12
LOSS:  tensor(0.1824)
AUC:  0.9242362397289915
EPOCH 13
LOSS:  tensor(0.1884)
AUC:  0.9238683374593251
SCORE IS NOT THE BEST. Early stop counter: 1
EPOCH 14
LOSS:  tensor(0.1814)
AUC:  0.9235464471774635
SCORE IS NOT THE BEST. Early stop counter: 2
EPOCH 15
LOSS:  tensor(0.1887)
AUC: 

In [28]:
# 0 fold AUC:  0.9211992827613138

In [29]:
# ROC AUC of the out-of-fold predictions over all training rows. `oof` holds raw logits,
# which is sufficient because AUC depends only on the ranking.
print("CV score: {:<8.5f}".format(roc_auc_score(label, oof)))

CV score: 0.92259 


In [30]:
oof.shape

(200000,)

In [31]:
predictions.shape

(200000,)

In [32]:
# Load the submission template; only its 'target' column is overwritten below.
tst_sub = pd.read_csv('sample_submission.csv')

In [33]:
# Fill in the fold-averaged test predictions. These are raw logits, not probabilities:
# the model has no sigmoid on its output and none is applied afterwards.
# NOTE(review): assumes the template's rows are in the same order as test.csv — confirm.
tst_sub['target'] = predictions

In [34]:
# Write the submission file. The score in the file name is hard-coded and must be edited
# by hand if the CV score changes.
tst_sub.to_csv('nn_aug_92259.csv',index=False)

In [None]:
#with no first relu 0.91826 enc 30

In [25]:
label

0         0
1         0
2         0
3         0
4         0
5         0
6         0
7         0
8         0
9         0
10        0
11        0
12        0
13        1
14        0
15        0
16        0
17        0
18        0
19        0
20        0
21        0
22        0
23        0
24        0
25        0
26        0
27        0
28        0
29        1
         ..
199970    0
199971    0
199972    0
199973    0
199974    0
199975    0
199976    1
199977    0
199978    0
199979    0
199980    0
199981    1
199982    0
199983    0
199984    0
199985    0
199986    1
199987    0
199988    0
199989    0
199990    1
199991    0
199992    0
199993    0
199994    0
199995    0
199996    0
199997    0
199998    0
199999    0
Name: target, Length: 200000, dtype: int64

In [36]:
# Persist the out-of-fold logits; the file name embeds the full-precision out-of-fold AUC.
np.save(f"nn_{roc_auc_score(label, oof)}_oof.npy", oof)