In [1]:
import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data
import spacy
import re
import numpy as np
import pandas as pd

In [2]:
SEED = 12345
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
def text_clean(text):
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text) # remove non alphanumeric character
    text = re.sub(r'https?:/\/\S+', ' ', text) # remove links
    return text.strip()

In [None]:
#!python -m spacy download en #if needed, please uncomment and download english model for spacy

In [4]:
# tokenizer function using spacy
nlp = spacy.load('en',disable=['parser', 'tagger', 'ner'])
min_len = 5
def tokenizer(s): 
    tokenized = [w.text.lower() for w in nlp(text_clean(s))]
    if len(tokenized) < min_len: #make sure that length is at least 6 
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    return tokenized

In [5]:
txt_field = data.Field(sequential=True, 
                       tokenize=tokenizer, 
                       use_vocab=True)

label_field = data.Field(sequential=False, 
                         use_vocab=False, 
                         pad_token=None, 
                         unk_token=None)

In [6]:
train_val_fields = [
    ('Unique ID', None), #this feature is not processed 
    ('Type', None), #this feature is not processed 
    ('Text', txt_field), #process it as a text
    ('l_3.1.1', label_field), #process it as a label
    ('l_3.1.2', label_field),
    ('l_3.2.1', label_field),
    ('l_3.2.2', label_field),
    ('l_3.3.1', label_field),
    ('l_3.3.2', label_field),
    ('l_3.3.3', label_field),
    ('l_3.3.4', label_field),
    ('l_3.3.5', label_field),
    ('l_3.4.1', label_field),
    ('l_3.4.2', label_field),
    ('l_3.5.1', label_field),
    ('l_3.5.2', label_field),
    ('l_3.6.1', label_field),
    ('l_3.7.1', label_field),
    ('l_3.7.2', label_field),
    ('l_3.8.1', label_field),
    ('l_3.8.2', label_field),
    ('l_3.9.1', label_field),
    ('l_3.9.2', label_field),
    ('l_3.9.3', label_field),
    ('l_3.a.1', label_field),
    ('l_3.b.1', label_field),
    ('l_3.b.2', label_field),
    ('l_3.b.3', label_field),
    ('l_3.c.1', label_field),
    ('l_3.d.1', label_field)    
]

In [7]:
train_data, val_data = data.TabularDataset.splits(path='./',
                                                  format='csv', 
                                                  train='train_clean.csv', 
                                                  validation='my_val_set.csv', 
                                                  fields=train_val_fields, 
                                                  skip_header=True)

Let's build the vocabulary and load the pre-trained word embeddings.

In [8]:
#txt_field.build_vocab(train_data, val_data, max_size=25000, vectors="glove.6B.300d")
txt_field.build_vocab(train_data, val_data, max_size=25000, vectors="fasttext.en.300d")
label_field.build_vocab(train_data)

In [9]:
BATCH_SIZE = 64

train_iterator, valid_iterator = data.BucketIterator.splits(
                                (train_data, val_data), 
                                sort_key=lambda x: len(x.Text),
                                sort_within_batch=True,
                                batch_size=BATCH_SIZE, 
                                device=DEVICE)

In [10]:
def get_target_labels(batch, device, train_val_fields):
    target_labels = np.zeros((len(batch),0))
    for i in range(27): #27 labels 
        label_name = train_val_fields[i+3][0] #get a name of label, label starts from index=3 
        single_label = getattr(batch,label_name).cpu().numpy()
        single_label.shape = (len(batch),1)
        target_labels = np.hstack((target_labels,single_label))
    return torch.from_numpy(target_labels).to(device)    

# CNN model

In [72]:
class CNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.conv_0 = nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(filter_sizes[0],embedding_dim))
        self.conv_1 = nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(filter_sizes[1],embedding_dim))
        self.conv_2 = nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(filter_sizes[2],embedding_dim))
        self.conv_3 = nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(filter_sizes[3],embedding_dim))
        self.conv_4 = nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(filter_sizes[4],embedding_dim))
    
        self.fc = nn.Linear(len(filter_sizes)*n_filters, output_dim)    
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        
        #x = [sent len, batch size]
        
        x = x.permute(1, 0)
                
        #x = [batch size, sent len]
        
        embedded = self.embedding(x)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        #print (embedded.size())
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved_0 = F.relu(self.conv_0(embedded).squeeze(3))
        conved_1 = F.relu(self.conv_1(embedded).squeeze(3))
        conved_2 = F.relu(self.conv_2(embedded).squeeze(3))
        conved_3 = F.relu(self.conv_3(embedded).squeeze(3))
        conved_4 = F.relu(self.conv_4(embedded).squeeze(3))
        
            
        #conv_n = [batch size, n_filters, sent len - filter_sizes[n]]
        
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
        pooled_3 = F.max_pool1d(conved_3, conved_3.shape[2]).squeeze(2)
        pooled_4 = F.max_pool1d(conved_4, conved_4.shape[2]).squeeze(2)
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2, pooled_3, pooled_4), dim=1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [91]:
INPUT_DIM = len(txt_field.vocab)
EMBEDDING_DIM = 300
N_FILTERS = 300
FILTER_SIZES = [1,2,3,4,5]
OUTPUT_DIM = 27
DROPOUT = 0.5


model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)

Let's load pretrained embeddings

In [92]:
pretrained_embeddings = txt_field.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0653, -0.0930, -0.0176,  ...,  0.1664, -0.1308,  0.0354],
        ...,
        [ 0.1434,  0.1650, -0.3431,  ...,  0.1982,  0.3606,  0.0768],
        [-0.1984,  0.1341, -0.3664,  ...,  0.1997,  0.4324,  0.2593],
        [ 0.0289,  0.2313, -0.3855,  ...,  0.1142,  0.2038, -0.3233]])

# Training

In [93]:
optimizer = optim.Adam(model.parameters(), lr=1e-3)
#optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.BCEWithLogitsLoss()
model = model.to(DEVICE)
criterion = criterion.to(DEVICE)

In [94]:
def calculate_HW(predictions, target_labels):
    #rounded_preds = (torch.sigmoid(predictions)>0.25).float()
    rounded_preds = torch.round(torch.sigmoid(predictions))
    incorrects = (rounded_preds != target_labels.float())
    HW = incorrects.cpu().numpy().sum()
    return HW

In [95]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    metric = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        predictions = model(batch.Text)
        target_labels = get_target_labels(batch,DEVICE,train_val_fields)
        
        loss = criterion(predictions, target_labels.float())
        
        HW = calculate_HW(predictions, target_labels) #caculate Hamming weight in batch
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        metric = metric + HW 
        
    return epoch_loss/len(iterator), metric/(len(train_data)*27)

In [96]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    metric = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.Text)
            target_labels = get_target_labels(batch,DEVICE,train_val_fields)
        
            loss = criterion(predictions, target_labels.float())
            
            HW = calculate_HW(predictions, target_labels) #caculate Hamming Weight in batch

            epoch_loss += loss.item()
            metric = metric + HW
        
    #return epoch_loss/len(iterator), metric/(len(train_data)*27)
    return epoch_loss/len(iterator), metric/(len(val_data)*27)

In [108]:
N_EPOCHS = 15

for epoch in range(N_EPOCHS):

    train_loss, train_metric = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_metric = evaluate(model, valid_iterator, criterion)
    #valid_loss, valid_metric = evaluate(model, train_iterator, criterion)
    
    #print(f'| Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Val. Loss: {valid_loss:.3f} |')
    print (epoch)
    print (train_loss)
    print (train_metric)
    print (valid_loss)
    print (valid_metric)
    print ('--------------')

0
0.03119965690247556
0.009608606937488407
0.016027373261749744
0.0049382716049382715
--------------
1
0.030069927982193358
0.009695170963952266
0.01240212531411089
0.0036087369420702755
--------------


# Generating submission

In [98]:
df_train_clean = pd.read_csv('train_clean.csv', low_memory=False)
df_test_clean =  pd.read_csv('test_clean.csv', low_memory=False)
df_submission =  pd.read_csv('Devex_submission_format.csv', low_memory=False)

In [99]:
def predict(sentence, min_len=6):
    #tokenized = [tok.text for tok in nlp.tokenizer(text_clean(sentence))]
    tokenized = tokenizer(sentence)
    if len(tokenized) < min_len:
        tokenized += ['<pad>'] * (min_len - len(tokenized))
    indexed = [txt_field.vocab.stoi[t] for t in tokenized]
    tensor = torch.LongTensor(indexed).to(DEVICE)
    tensor = tensor.unsqueeze(1)
    predictions = torch.sigmoid(model(tensor))
    preds_rounded = torch.round(predictions)

    return preds_rounded.detach()

In [109]:
model.eval()

for i in range(len(df_submission)):
    id = df_submission.at[i,'ID']
    index = df_test_clean.index[df_test_clean['Unique ID']==id].tolist()
    text = df_test_clean.at[index[0],'Text']
    predictions = predict(text,5) #calculate predictions, padded to 5 if needed   
    for j in range (1,28): #labels starts from 1 in df_submission
        df_submission.iloc[i,j]=predictions.cpu().numpy()[0][j-1]

In [110]:
df_submission = df_submission.astype(int)

In [111]:
df_submission.to_csv('my_submission.csv', index=False)