In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.init
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms


import gc

from tqdm.autonotebook import tqdm
import os
tqdm.pandas()



In [2]:
!nvidia-smi

Fri Jun 21 09:48:09 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 415.27       Driver Version: 415.27       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  Off  | 00000000:81:00.0 Off |                  N/A |
|  0%   45C    P3    50W / 270W |      1MiB / 10989MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
train_dir = './train-data'
train_tfidf = np.load(os.path.join(train_dir, 'train_tfidf.npy'))[()]
test_tfidf = np.load(os.path.join(train_dir, 'test_tfidf.npy'))[()]
# Class will start with 0
train_y = pd.read_csv(os.path.join(train_dir, 'train_y.csv')) - 1
test_uid = pd.read_csv(os.path.join(train_dir, 'test_x.csv'))['uid']

In [5]:
class TfidfDataset(Dataset):
    def __init__(self, x: np.ndarray, y=None, **kwargs):

      self.num_samples = x.shape[0]

      if y is not None:
          # Train
          self.y = y.values.reshape(-1, 1)
      else:
          # Test
          self.y = np.zeros((self.num_samples, 1))
        
      self.x = x
    
      self.init_kwargs = kwargs

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        return [self.x[idx].astype('float32').toarray(), self.y[idx].squeeze()]

In [6]:
class ConvNN(nn.Module):
    def __init__(self, input_length, num_classes, conv_layers, lin_layers, dropout):
        super().__init__()
        
        # Conv
        # conv over the whole vector
        conv1 = nn.Conv1d(1, 50, input_length)
        avg_pool1 = nn.AdaptiveAvgPool1d(1)
        max_pool1 = nn.AdaptiveMaxPool1d(1)
        
        self.conv_list = nn.ModuleList([conv1] + [nn.Conv1d(1, out_channel, kernel, stride=1) 
                                              for out_channel, kernel in conv_layers])
        self.avg_pool_list = nn.ModuleList([avg_pool1] + [nn.AdaptiveAvgPool1d(1) 
                                              for out_channel, kernel in conv_layers])
        self.max_pool_list = nn.ModuleList([max_pool1] + [nn.AdaptiveMaxPool1d(1) 
                                              for out_channel, kernel in conv_layers])
        
        for layer in self.conv_list:
            nn.init.kaiming_normal_(layer.weight.data)
        
        conv_out_num = (50 + sum([out_channels for out_channels, _ in conv_layers])) * 2
        
        # FC
        fc1 = nn.Linear(conv_out_num, lin_layers[0])
        self.fc_list = nn.ModuleList([fc1] + [nn.Linear(lin_layers[i], lin_layers[i+1]) 
                                              for i in range(len(lin_layers) - 1)])
        
        for layer in self.fc_list:
            nn.init.kaiming_normal_(layer.weight.data)
            
        self.fc_out = nn.Linear(lin_layers[-1], num_classes)
        nn.init.kaiming_normal_(self.fc_out.weight.data)
        
        # BN
        self.conv_bn_layers = nn.ModuleList([nn.BatchNorm1d(50)] + [nn.BatchNorm1d(out_channel)
                                        for out_channel, _ in conv_layers])
        self.lin_bn_layers = nn.ModuleList([nn.BatchNorm1d(layer)
                                        for layer in lin_layers])

        # Dropout
        self.embed_dropout = nn.Dropout(dropout)
        self.droput_layers = nn.ModuleList([nn.Dropout(dropout) for layer in lin_layers])
        
    def forward(self, input):
        conv_out = torch.Tensor()
        for conv, avg_pool, max_pool, bn in zip(self.conv_list, self.avg_pool_list, 
                                                self.max_pool_list, self.conv_bn_layers):
            o = conv(input)
            o = bn(o)
            avg_o = avg_pool(o)
            max_o = max_pool(o)
            conv_out = torch.cat([conv_out.cpu(), avg_o.cpu(), max_o.cpu()], 1).to(device)
        
        x = F.relu(conv_out).flatten(1)
        for fc, bn, dropout in zip(self.fc_list, self.lin_bn_layers, self.droput_layers):
            x = F.relu(fc(x))
            x = bn(x)
            x = dropout(x)
            
        return F.softmax(self.fc_out(x), dim=1)
            

In [7]:
def train(dataset, loader, net, optimizer):
    net.train()
    total_loss = 0
    for x, y in tqdm(loader, leave=False):
        x = x.float().to(device)
        y = y.long().to(device)
        optimizer.zero_grad()

        pred = net(x)
#         print(y.shape)
#         print(pred.shape)
        loss = criterion(pred, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss/float(len(loader))

def predict(dataset, loader, net):
    with torch.no_grad():
        net.eval()
        corrects = eval_loss = 0
        result = []
        for x, y in tqdm(loader):
            x = x.float().to(device)
            pred = net(x)

            result.append(torch.max(pred, 1)[1].view(y.size()).data)
        return result

def evaluate(dataset, loader, net):
    with torch.no_grad():
        net.eval()
        corrects = eval_loss = 0

        for x, y in tqdm(loader):
            x = x.float().to(device)
            y = y.long().to(device)
            pred = net(x)
            loss = criterion(pred, y)

            eval_loss += loss.item()
            corrects += (torch.max(pred, 1)[1].view(y.size()).data == y.data).sum()
        #loss, correct count, accuracy
        return eval_loss/float(len(loader)), corrects, corrects*100/len(dataset)

In [8]:
batch_size = 200
lr = 0.01
epoch = 1
criterion = torch.nn.CrossEntropyLoss()

In [9]:
import sklearn
import sklearn.model_selection

In [10]:

# splits = 3
# kfold = sklearn.model_selection.StratifiedKFold(splits, shuffle=True)
# for train_index, test_index in kfold.split(train_tfidf, train_y):
#     net = ConvNN(train_tfidf.shape[1], len(train_y['age_group'].unique()), 
#                  [(128, 1), (64, 3), (32, 5), (20, 1000)], [900, 300], 0.5)
#     net = net.to(device)
#     train_dataset = TfidfDataset(train_tfidf[train_index], train_y.iloc[train_index])
#     train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
#     validation_dataset = TfidfDataset(train_tfidf[test_index], train_y.iloc[test_index])
#     validation_loader = DataLoader(validation_dataset, batch_size, shuffle=False)
#     optimizer = torch.optim.Adam(net.parameters(), lr=lr)
#     # learning rate decay
#     for i in tqdm(range(epoch)):
#         print(train(train_dataset, train_loader, net, optimizer))
#         print(evaluate(validation_dataset, validation_loader, net))
#     print("train acc:", evaluate(train_dataset, train_loader, net))
#     print("validation acc:", evaluate(validation_dataset, validation_loader, net))

In [11]:
batch_size = 200
epoch = 1
criterion = torch.nn.CrossEntropyLoss()

splits = 20
kfold = sklearn.model_selection.StratifiedKFold(splits, shuffle=True)
train_index, valid_index = next(iter(kfold.split(train_tfidf, train_y)))

train_dataset = TfidfDataset(train_tfidf[train_index], train_y.iloc[train_index])
train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
validation_dataset = TfidfDataset(train_tfidf[valid_index], train_y.iloc[valid_index])
validation_loader = DataLoader(validation_dataset, batch_size, shuffle=False)
test_dataset = TfidfDataset(test_tfidf)
test_loader = DataLoader(test_dataset, batch_size, shuffle=False)



In [12]:
net = ConvNN(train_tfidf.shape[1], len(train_y['age_group'].unique()), 
                 [(128, 1), (64, 2), (48, 3), (32, 5), (20, 1000)], [800, 360], 0.3)
# net.load_state_dict(torch.load('./nn-model.pkl')['state_dict'])
net = net.to(device)
# print(net)

In [13]:
# lr = 0.005
# optimizer = torch.optim.Adam(net.parameters(), lr=lr)


In [14]:
# for i in tqdm(range(epoch)):
#     print(train(train_dataset, train_loader, net, optimizer))
# #     print("train acc:", evaluate(train_dataset, train_loader, net))
#     print("validation acc:", evaluate(validation_dataset, validation_loader, net))


### Switch to fastai

In [15]:
import fastai
from fastai.basic_train import *
from fastai.basic_data import *
from fastai.train import *
import fastai.metrics as metrics

In [16]:
data = DataBunch(train_loader, validation_loader)

In [17]:
learn = Learner(data, net, loss_func=criterion, metrics=[metrics.accuracy])

In [39]:
learn.fit_one_cycle(1, 1e-6)

epoch,train_loss,valid_loss,accuracy,time
0,1.403561,1.432169,0.605652,42:50


In [40]:
print("validation acc:", evaluate(validation_dataset, validation_loader, net))

HBox(children=(IntProgress(value=0, max=503), HTML(value='')))


validation acc: (1.4321704553088421, tensor(60868, device='cuda:0'), tensor(60, device='cuda:0'))


In [41]:
torch.save({'state_dict': net.state_dict()}, './model/nn-model.pkl')

In [42]:
results = predict(test_dataset, test_loader, net)

HBox(children=(IntProgress(value=0, max=2513), HTML(value='')))




In [43]:
results_list = []
for res in results:
    for item in res:
        results_list.append(item.item())

In [44]:
len(results_list)

502500

In [45]:
result = pd.DataFrame(test_uid)
result.columns = ['id']
result['label'] = results_list
result['label'] = result['label'] + 1

In [46]:
result.to_csv('submission.csv', index=False)

In [47]:
!nvidia-smi

Fri Jun 21 16:05:12 2019       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 415.27       Driver Version: 415.27       CUDA Version: 10.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  Off  | 00000000:81:00.0 Off |                  N/A |
|  0%   52C    P2    63W / 270W |   9190MiB / 10989MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    