In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from sklearn.preprocessing import scale
from sklearn.metrics import roc_auc_score
from torch.optim import Adam
from torch.optim.lr_scheduler import MultiStepLR
from torch.nn.modules.loss import BCEWithLogitsLoss, BCELoss
import torch
from torch.utils.data import TensorDataset, Dataset, DataLoader
import random

In [2]:
train_df = pd.read_csv('train.csv')

In [3]:
label = train_df.target
train = train_df.drop(['ID_code','target'],axis=1)

In [4]:
test = pd.read_csv('test.csv')

In [5]:
test_filtered = pd.read_pickle('test_filtred.pkl')

In [6]:
test_filtered = test_filtered.loc[:,train.columns]

In [7]:
test = test.drop(['ID_code'],axis=1)

In [8]:
train_test = pd.concat([train,test_filtered]).reset_index(drop=True)

In [9]:
vcs_train_test = {}


for col in tqdm(train.columns):
    vcs_train_test[col] = train_test.loc[:,col].value_counts()/300000

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [10]:
for col in tqdm(train.columns):
    vtraintest = vcs_train_test[col]
    
    t = vtraintest[train[col]].fillna(0).values
    train[col+'_train_test_sum_vcs'] = t
    
    train[col+'_train_test_sum_vcs_product'] = train[col]*t



    t = vtraintest[test[col]].fillna(0).values
    test[col+'_train_test_sum_vcs'] = t
    
    test[col+'_train_test_sum_vcs_product'] = test[col]*t


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [11]:
random_seed = 42

np.random.seed(random_seed)
random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)

In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
scaler = StandardScaler()

In [14]:
cols = train.columns

In [15]:
X_train = pd.DataFrame(scaler.fit_transform(train),columns=cols)

In [16]:
test = pd.DataFrame(scaler.transform(test),columns=cols)

In [17]:
device = torch.device('cuda:3')
# device = torch.device('cpu')

In [18]:
train_tensors = []
test_tensors = []

for fff in tqdm(range(200)):
    train_t = X_train[[f'var_{fff}',f'var_{fff}_train_test_sum_vcs',f'var_{fff}_train_test_sum_vcs_product']].values
    test_t =  test[[f'var_{fff}',f'var_{fff}_train_test_sum_vcs',f'var_{fff}_train_test_sum_vcs_product']].values
    train_tensors.append(torch.tensor(train_t, requires_grad=False, device=device, dtype=torch.float32))
    test_tensors.append(torch.tensor(test_t, requires_grad=False, device=device, dtype=torch.float32))


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [19]:
train_tensors = torch.cat(train_tensors,1).view((-1,200,3))

In [20]:
test_tensors = torch.cat(test_tensors,1).view((-1,200,3))

In [22]:
y_train_t = torch.tensor(label.values, requires_grad=False, device=device, dtype=torch.float32)

In [23]:
class NN(torch.nn.Module):
    random_seed = 42


    def __init__(self, D_in=3, features = 200):
        np.random.seed(random_seed)
        random.seed(random_seed)
        torch.manual_seed(random_seed)
        torch.cuda.manual_seed(random_seed)
        
        super(NN, self).__init__()
        self.layer = []
        layer_size = D_in
        enc_out = 30
        for i in range(features):
            
            layer = torch.nn.Sequential(torch.nn.Linear(layer_size, enc_out//2),
                                       torch.nn.ReLU(),
                                       torch.nn.Linear(enc_out//2, enc_out),
                                       torch.nn.ReLU())
            setattr(self, 'layer_' + str(i), layer)
        

        self.linear3 = torch.nn.Linear(features*enc_out,1)        

    def forward(self, y):
        res = []
        for i in range(200):
            layer = getattr(self, 'layer_' + str(i))
            res.append(layer(y[:,i,:]) )
        y = torch.cat(res,1)
        y = self.linear3(y)
        return y
    

dataset = TensorDataset(train_tensors,y_train_t)
nn = NN().to(device)
loss_f = BCEWithLogitsLoss()

optimizer = Adam(params=nn.parameters(), lr = 0.005)
scheduler = MultiStepLR(optimizer, milestones=[15, 25, 35, 55], gamma=0.5)
batch_size = 2048

In [24]:
random_seed = 42

np.random.seed(random_seed)
random.seed(random_seed)
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)

for epoch in tqdm(range(27)):
    dl = DataLoader(dataset, batch_size=batch_size, shuffle=True,num_workers=0)
    for data,label in dl:
        pred = nn(data)
        loss = loss_f(pred, torch.unsqueeze(label,-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

HBox(children=(IntProgress(value=0, max=27), HTML(value='')))




In [25]:
batch_size = 20000
blobs = []
with torch.no_grad():
    for batch in tqdm(torch.split(test_tensors,batch_size)):
        blob = nn(batch).data.cpu().numpy()
        blobs.append(blob)
predictions = np.concatenate(blobs)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




In [28]:
predictions

array([[-2.7483144],
       [-1.3068379],
       [-1.5198404],
       ...,
       [-6.073699 ],
       [-1.828666 ],
       [-4.4224715]], dtype=float32)

In [29]:
tst_sub = pd.read_csv('sample_submission.csv')

In [30]:
tst_sub['target'] = predictions

In [32]:
tst_sub.to_csv('200_in_nn_sub.csv',index=False)

In [None]:
pd.options.display.max_rows = 1000
pd.Series(model.feature_importances_,index=model.feature_names_).sort_values(ascending=False)