In [1]:
import pandas as pd

In [2]:
train = pd.read_csv("ready_to_serve_train.csv")
dev = pd.read_csv("ready_to_serve_dev.csv")

In [3]:
train.head(3)

Unnamed: 0,id,original,edit,grades,meanGrade,grade_round,grades_0,grades_1,grades_2,grades_3,grades_4,edited_head_line,original_cleaned
0,14530,France is ‘ hunting down its citizens who join...,twins,10000,0.2,0,1,0,0,0,0,france is hunting down its citizens who joined...,france is hunting down its citizens who joined...
1,13034,"Pentagon claims 2,000 % increase in Russian tr...",bowling,33110,1.6,2,3,3,1,1,0,"pentagon claims 2,000 % increase in russian tr...","pentagon claims 2,000 % increase in russian tr..."
2,8731,Iceland PM Calls Snap Vote as Pedophile Furor ...,party,22100,1.0,1,2,2,1,0,0,iceland pm calls snap vote as pedophile furor ...,iceland pm calls snap vote as pedophile furor ...


In [4]:
max([len(i.split(" ")) for i in train["edited_head_line"].values])

27

In [5]:
max([len(i.split(" ")) for i in dev["edited_head_line"].values])

27

In [6]:
dev.head(3)

Unnamed: 0,id,original,edit,edited_head_line,original_cleaned
0,1723,Thousands of gay and bisexual <men/> convicted...,swans,thousands of gay and bisexual swans convicted ...,thousands of gay and bisexual men convicted of...
1,12736,Special <prosecutor/> appointed to Trump Russia,chef,special chef appointed to trump russia,special prosecutor appointed to trump russia
2,12274,Spanish police detain man and search Ripoll ad...,squad,spanish police detain man and search ripoll ad...,spanish police detain man and search ripoll ad...


In [7]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=2)

In [8]:
for train_index,test_index in skf.split(train,train["grades_0"]):
    train.iloc[train_index][["edited_head_line","grades_0"]].to_csv("train_grade0.csv",index=False,header=False)
    train.iloc[test_index][["edited_head_line","grades_0"]].to_csv("test_grade0.csv",index=False,header=False)

In [9]:
dev.edited_head_line.to_csv("dev_edited.csv",index=False,header=False)

In [10]:
import spacy
from torchtext import data
spacy_en = spacy.load("en")

def tokenizer(text): # create a tokenizer function
    return [tok.text for tok in spacy_en.tokenizer(text)]

TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True,fix_length=28)
LABEL = data.Field(sequential=False, use_vocab=False)

In [11]:
train, val = data.TabularDataset.splits(
        path='.', train='train_grade0.csv',
        validation='test_grade0.csv', format='csv',
        fields=[('Text', TEXT), ('Label', LABEL)])

In [12]:
TEXT.build_vocab(train,dev, vectors="fasttext.en.300d")


In [13]:
train.examples[0].Text

['france',
 'is',
 'hunting',
 'down',
 'its',
 'citizens',
 'who',
 'joined',
 'twins',
 '’',
 'without',
 'trial',
 'in',
 'iraq']

In [14]:
TEXT.vocab.vectors.shape

torch.Size([8600, 300])

In [15]:
import torch
train_iter, val_iter = data.Iterator.splits(
        (train, val), sort_key=lambda x: len(x.Text),
        batch_sizes=(32, 32, 32), device=torch.device(0))

In [16]:
# from torch import nn
# emb_dim = 300
# vocab = TEXT.vocab
# self.embed = nn.Embedding(len(vocab), emb_dim)
# self.embed.weight.data.copy_(vocab.vectors)

In [17]:
TEXT.vocab.vectors.shape

torch.Size([8600, 300])

In [18]:
TEXT.vocab.itos[55]

'how'

In [19]:
from torch import nn
class SimpleLinearModel(nn.Module):
    
    def __init__(self,vocab,max_seq_length):
        super(SimpleLinearModel, self).__init__()
        emb_dim = TEXT.vocab.vectors.shape[1]
        self.embed = nn.Embedding(len(vocab), emb_dim)
        self.embed.weight.data.copy_(vocab.vectors)
        
        self.linear1 = nn.Linear(emb_dim * max_seq_length,2000)
        self.dropout = nn.Dropout(0.7)
        self.linear2 = nn.Linear(2000,4)
        
    def forward(self,inputs):
        embeddings = self.embed(inputs)
        return self.linear2(self.dropout(self.linear1(embeddings.reshape(32,8400))))
        

In [20]:
from simple_models import *

In [21]:
import numpy as np
from sklearn.metrics import precision_score , recall_score , f1_score
def clip_gradient(model, clip_value):
    params = list(filter(lambda p: p.grad is not None, model.parameters()))
    for p in params:
        p.grad.data.clamp_(-clip_value, clip_value)

def train_model(model, train_iter, epoch):
    total_epoch_loss = 0
    total_epoch_acc = 0
    total_epoch_pre = 0
    total_epoch_recal = 0
    total_epoch_f1 = 0
    
    model.cuda()
    optim = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))
    steps = 0
    model.train()
    for idx, batch in enumerate(train_iter):
        text = batch.Text.T
        target = batch.Label
        target = torch.autograd.Variable(target).long()
        if torch.cuda.is_available():
            text = text.cuda()
            target = target.cuda()
        if (text.size()[0] is not 32):# One of the batch returned by BucketIterator has length different than 32.
            continue
        optim.zero_grad()
        prediction = model(text)
        loss = loss_fn(prediction, target)
        num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).float().sum()
        acc = 100.0 * num_corrects/len(batch)
        predictions_classes = np.argmax(prediction.clone().cpu().detach().numpy(),axis=1)
        target_classes = target.clone().cpu().detach().numpy()
        ps = precision_score(predictions_classes,target_classes,average='micro')
        rs = recall_score(predictions_classes,target_classes,average='micro')
        f1_s = f1_score(predictions_classes,target_classes,average='micro')
        loss.backward()
#         clip_gradient(model, 1e-1)
        optim.step()
        steps += 1
        
        if steps % 100 == 0:
            print (f'Epoch: {epoch+1}, Idx: {idx+1}, Training Loss: {loss.item():.4f}, Training Accuracy: {acc.item(): .2f}%')
        
        total_epoch_loss += loss.item()
        total_epoch_acc += acc.item()
        total_epoch_pre += ps
        total_epoch_recal += rs
        total_epoch_f1 += f1_s
        
    return total_epoch_loss/len(train_iter), total_epoch_acc/len(train_iter) , total_epoch_pre/len(train_iter) , total_epoch_recal/len(train_iter) ,total_epoch_f1/len(train_iter)

In [22]:
def eval_model(model, val_iter):
    total_epoch_loss = 0
    total_epoch_acc = 0
    total_epoch_pre = 0
    total_epoch_recal = 0
    total_epoch_f1 = 0

    model.eval()
    with torch.no_grad():
        for idx, batch in enumerate(val_iter):
            text = batch.Text.T
            if (text.size()[0] is not 32):
                continue    
            target = batch.Label
            target = torch.autograd.Variable(target).long()
            if torch.cuda.is_available():
                text = text.cuda()
                target = target.cuda()
            prediction = model(text)
            loss = loss_fn(prediction, target)
            predictions_classes = np.argmax(prediction.clone().cpu().detach().numpy(),axis=1)
            target_classes = target.clone().cpu().detach().numpy()

            ps = precision_score(predictions_classes,target_classes,average='micro')
            rs = recall_score(predictions_classes,target_classes,average='micro')
            f1_s = f1_score(predictions_classes,target_classes,average='micro')

            num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).sum()
            acc = 100.0 * num_corrects/len(batch)
            total_epoch_loss += loss.item()
            total_epoch_acc += acc.item()
            total_epoch_pre += ps
            total_epoch_recal += rs
            total_epoch_f1 += f1_s

            
    return total_epoch_loss/len(val_iter), total_epoch_acc/len(val_iter) ,total_epoch_pre/len(val_iter) , total_epoch_recal/len(val_iter) ,total_epoch_f1/len(val_iter)

## Try Various simple model archs

### Simple linear model

In [23]:
# import torch
# learning_rate = 1e-7
# batch_size = 32
# output_size = 2
# hidden_size = 256
# embedding_length = 300
# import torch.nn.functional as F

# model = SimpleLinearModel(TEXT.vocab,28)
# loss_fn = F.cross_entropy
# print(model)
# for epoch in range(10):
#     train_loss, train_acc , train_precision, train_recall , train_f1 = train_model(model, train_iter, epoch)
#     val_loss, val_acc , val_precision, val_recall , val_f1= eval_model(model, val_iter)
    
#     print(f'Epoch: {epoch+1:01}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Train Prec: {train_precision:.2f},Train Recal: {train_recall:.2f},Train F1: {train_f1:.2f},\
#            Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}, Val Prec: {val_precision:.2f}, Val Recal: {val_recall:.2f}, Val F1: {val_f1:.2f}')
 
# #     print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%')

# # test_loss, test_acc = eval_model(model, test_iter)
# # print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')


## Conv model

In [24]:
# import torch
# learning_rate = 1e-7
# batch_size = 32
# output_size = 4
# hidden_size = 256
# embedding_length = 300
# import torch.nn.functional as F

# model = CNN(batch_size,output_size,1,1,[5,3,1],1,1,0.7,TEXT.vocab)
# loss_fn = F.cross_entropy
# print(model)
# for epoch in range(10):
#     train_loss, train_acc , train_precision, train_recall , train_f1 = train_model(model, train_iter, epoch)
#     val_loss, val_acc , val_precision, val_recall , val_f1= eval_model(model, val_iter)
    
#     print(f'Epoch: {epoch+1:01}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Train Prec: {train_precision:.2f},Train Recal: {train_recall:.2f},Train F1: {train_f1:.2f},\
#            Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}, Val Prec: {val_precision:.2f}, Val Recal: {val_recall:.2f}, Val F1: {val_f1:.2f}')
 
# #     print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%')

# # test_loss, test_acc = eval_model(model, test_iter)
# # print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')


## LSTM classifier

In [29]:
import torch
learning_rate = 5e-7
batch_size = 32
output_size = 4
hidden_size = 256
embedding_length = 300
import torch.nn.functional as F

model = LSTMClassifier(batch_size,output_size,256,TEXT.vocab)
loss_fn = F.cross_entropy
print(model)
for epoch in range(10):
    train_loss, train_acc , train_precision, train_recall , train_f1 = train_model(model, train_iter, epoch)
    val_loss, val_acc , val_precision, val_recall , val_f1= eval_model(model, val_iter)
    
    print(f'Epoch: {epoch+1:01}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Train Prec: {train_precision:.2f},Train Recal: {train_recall:.2f},Train F1: {train_f1:.2f},\
           Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}, Val Prec: {val_precision:.2f}, Val Recal: {val_recall:.2f}, Val F1: {val_f1:.2f}')
 
#     print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%')

# test_loss, test_acc = eval_model(model, test_iter)
# print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}%')


LSTMClassifier(
  (word_embeddings): Embedding(8600, 300)
  (lstm): LSTM(300, 256)
  (label): Linear(in_features=256, out_features=4, bias=True)
)
Epoch: 1, Idx: 100, Training Loss: 1.1504, Training Accuracy:  53.12%
Epoch: 1, Train Loss: 1.236, Train Acc: 35.68%, Train Prec: 0.36,Train Recal: 0.36,Train F1: 0.36,           Val. Loss: 1.235837, Val. Acc: 37.62, Val Prec: 0.38, Val Recal: 0.38, Val F1: 0.38
Epoch: 2, Idx: 100, Training Loss: 1.1952, Training Accuracy:  53.12%
Epoch: 2, Train Loss: 1.228, Train Acc: 37.50%, Train Prec: 0.38,Train Recal: 0.38,Train F1: 0.38,           Val. Loss: 1.227356, Val. Acc: 37.62, Val Prec: 0.38, Val Recal: 0.38, Val F1: 0.38
Epoch: 3, Idx: 100, Training Loss: 1.1660, Training Accuracy:  46.88%
Epoch: 3, Train Loss: 1.224, Train Acc: 37.46%, Train Prec: 0.37,Train Recal: 0.37,Train F1: 0.37,           Val. Loss: 1.225438, Val. Acc: 37.62, Val Prec: 0.38, Val Recal: 0.38, Val F1: 0.38
Epoch: 4, Idx: 100, Training Loss: 1.2109, Training Accuracy:  4