In [1]:
import pandas as pd

In [2]:
from inc import stop_words as sw
from inc import extra_stopwords as esw

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [5]:
from sqlalchemy import create_engine
table_name = 'Training_set'
db_path = 'db/training.sqlite'
db_path = 'sqlite:///' + db_path
engine = create_engine(db_path, echo=True)

In [6]:
training_data_df = pd.read_sql_table(table_name, engine)

2020-12-16 18:47:42,290 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2020-12-16 18:47:42,291 INFO sqlalchemy.engine.base.Engine ()
2020-12-16 18:47:42,292 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2020-12-16 18:47:42,293 INFO sqlalchemy.engine.base.Engine ()
2020-12-16 18:47:42,295 INFO sqlalchemy.engine.base.Engine SELECT name FROM sqlite_master WHERE type='table' ORDER BY name
2020-12-16 18:47:42,295 INFO sqlalchemy.engine.base.Engine ()
2020-12-16 18:47:42,297 INFO sqlalchemy.engine.base.Engine SELECT name FROM sqlite_master WHERE type='view' ORDER BY name
2020-12-16 18:47:42,298 INFO sqlalchemy.engine.base.Engine ()
2020-12-16 18:47:42,300 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_xinfo("Training_set")
2020-12-16 18:47:42,301 INFO sqlalchemy.engine.base.Engine ()
2020-12-16 18:47:42,302 INFO sqlalchemy.engine.base.Engine SELECT sql FROM  (SELECT * FROM sqlite_master UNI

In [7]:
training_data_df = training_data_df.rename(columns={"class": "label"})
training_data_df = training_data_df.convert_dtypes()

In [8]:
training_data_df = training_data_df.drop(columns=['index'])
training_data_df.dtypes

abstract    string
label        Int64
dtype: object

In [18]:
class Sequences(Dataset):
    def __init__(self, df, max_seq_len):
        self.max_seq_len = max_seq_len
        sw.STOP_WORDS = sw.STOP_WORDS.union(esw.common_stopwords)
        sw.STOP_WORDS = sw.STOP_WORDS.union(esw.extra_stopwords)
        vectorizer = CountVectorizer(stop_words=sw.STOP_WORDS, max_df=0.99, min_df=0.005)
        vectorizer.fit(df.abstract.tolist())
        
        self.token2idx = vectorizer.vocabulary_
        self.token2idx['<PAD>'] = max(self.token2idx.values()) + 1

        tokenizer = vectorizer.build_analyzer()
        self.encode = lambda x: [self.token2idx[token] for token in tokenizer(x)
                                 if token in self.token2idx]
        self.pad = lambda x: x + (max_seq_len - len(x)) * [self.token2idx['<PAD>']]
        
        sequences = [self.encode(sequence)[:max_seq_len] for sequence in df.abstract.tolist()]
        sequences, self.labels = zip(*[(sequence, label) for sequence, label
                                    in zip(sequences, df.label.tolist()) if sequence])
        self.sequences = [self.pad(sequence) for sequence in sequences]

    def __getitem__(self, i):
        assert len(self.sequences[i]) == self.max_seq_len
        return self.sequences[i], self.labels[i]
    
    def __len__(self):
        return len(self.sequences)

In [20]:
dataset = Sequences(training_data_df, max_seq_len=128)

In [22]:
len(dataset.token2idx)

2327

In [23]:
def collate(batch):
    inputs = torch.LongTensor([item[0] for item in batch])
    target = torch.FloatTensor([item[1] for item in batch])
    return inputs, target

batch_size = 2048
train_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate)

In [26]:
class RNN(nn.Module):
    def __init__(
        self,
        vocab_size,
        batch_size,
        embedding_dimension=100,
        hidden_size=128, 
        n_layers=1,
        device='gpu',
    ):
        super(RNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.device = device
        self.batch_size = batch_size
        
        self.encoder = nn.Embedding(vocab_size, embedding_dimension)
        self.rnn = nn.GRU(
            embedding_dimension,
            hidden_size,
            num_layers=n_layers,
            batch_first=True,
        )
        self.decoder = nn.Linear(hidden_size, 15)
        
    def init_hidden(self):
        return torch.randn(self.n_layers, self.batch_size, self.hidden_size).to(self.device)
    
    def forward(self, inputs):
        # Avoid breaking if the last batch has a different size
        batch_size = inputs.size(0)
        if batch_size != self.batch_size:
            self.batch_size = batch_size
            
        encoded = self.encoder(inputs)
        output, hidden = self.rnn(encoded, self.init_hidden())
        output = self.decoder(output[:, :, -1]).squeeze()
        return output

In [27]:
model = RNN(
    hidden_size=128,
    vocab_size=len(dataset.token2idx),
    device=device,
    batch_size=batch_size,
)
model = model.to(device)
model

RNN(
  (encoder): Embedding(2327, 100)
  (rnn): GRU(100, 128, batch_first=True)
  (decoder): Linear(in_features=128, out_features=15, bias=True)
)

In [28]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad], lr=0.001)

In [29]:
import pprint as pp
from tqdm.notebook import tqdm

In [30]:
model.train()
train_losses = []
for epoch in range(10):
    progress_bar = tqdm(train_loader, leave=False)
    losses = []
    total = 0
    for inputs, target in progress_bar:
        inputs, target = inputs.to(device), target.to(device
                                                     )
        model.zero_grad()
        
        output = model(inputs)
    
        loss = criterion(output, target)
        
        loss.backward()
              
        nn.utils.clip_grad_norm_(model.parameters(), 3)

        optimizer.step()
        
        progress_bar.set_description(f'Loss: {loss.item():.3f}')
        
        losses.append(loss.item())
        total += 1
    
    epoch_loss = sum(losses) / total
    train_losses.append(epoch_loss)

    tqdm.write(f'Epoch #{epoch + 1}\tTrain Loss: {epoch_loss:.3f}')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))

ValueError: Target size (torch.Size([1533])) must be the same as input size (torch.Size([1533, 15]))

In [19]:
model.train()
train_losses = []
for epoch in range(10):
    progress_bar = tqdm(train_loader, leave=False)
    losses = []
    total = 0
    for inputs, target in progress_bar:
        model.zero_grad()

        output = model(inputs)
        pp.pprint('output')
        pp.pprint(output)
        pp.pprint('target')
        pp.pprint(target)
        loss = criterion(output, target)
        
        loss.backward()
              
        nn.utils.clip_grad_norm_(model.parameters(), 3)

        optimizer.step()
        
        progress_bar.set_description(f'Loss: {loss.item():.3f}')
        
        losses.append(loss.item())
        total += 1
    
    epoch_loss = sum(losses) / total
    train_losses.append(epoch_loss)
        
    tqdm.write(f'Epoch #{epoch + 1}\tTrain Loss: {epoch_loss:.3f}')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=103.0), HTML(value='')))

'output'
tensor([[ 2.5047e-01, -5.9947e-02,  1.0230e-02, -2.5555e-01, -2.0220e-01,
          4.4057e-03,  1.4518e-01,  1.2040e-01,  1.0143e-01, -3.0175e-02,
         -5.8386e-02, -1.2564e-02, -1.0053e-02,  7.2277e-02,  2.3042e-01],
        [ 2.6239e-01, -4.1282e-02,  8.7380e-02, -1.4541e-01, -1.2756e-01,
          7.7238e-02,  4.7882e-02,  1.0111e-01, -8.5092e-02, -1.8413e-02,
         -8.0269e-02,  7.3408e-02,  5.9483e-02,  6.3335e-02,  1.3883e-01],
        [ 2.9831e-01, -1.4500e-02,  1.3441e-01, -2.7705e-01, -5.7040e-02,
          4.7354e-02,  5.2511e-02,  1.1467e-01, -3.0563e-02, -8.4339e-02,
         -6.9332e-02,  1.4165e-01,  1.6759e-02,  1.6161e-01,  3.0702e-01],
        [ 1.2285e-01, -5.5215e-02,  6.8204e-02, -2.0136e-01, -4.9490e-02,
          4.8596e-02,  1.4752e-01,  2.0903e-01, -6.1400e-02,  3.2615e-02,
         -1.1471e-01,  2.1627e-02,  6.1680e-02,  4.5400e-02,  2.3308e-01],
        [ 2.0266e-01, -1.3471e-02,  9.6764e-02, -2.0628e-01, -1.2545e-01,
          2.7798e-02,  6.

ValueError: Target size (torch.Size([15])) must be the same as input size (torch.Size([15, 15]))