In [1]:
import pandas as pd

In [2]:
from inc import stop_words as sw
from inc import extra_stopwords as esw

In [3]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [5]:
from sqlalchemy import create_engine
table_name = 'Training_set'
db_path = 'db/training.sqlite'
db_path = 'sqlite:///' + db_path
engine = create_engine(db_path, echo=True)

In [6]:
training_data_df = pd.read_sql_table(table_name, engine)

2020-12-16 19:02:41,939 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2020-12-16 19:02:41,940 INFO sqlalchemy.engine.base.Engine ()
2020-12-16 19:02:41,942 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2020-12-16 19:02:41,943 INFO sqlalchemy.engine.base.Engine ()
2020-12-16 19:02:41,945 INFO sqlalchemy.engine.base.Engine SELECT name FROM sqlite_master WHERE type='table' ORDER BY name
2020-12-16 19:02:41,946 INFO sqlalchemy.engine.base.Engine ()
2020-12-16 19:02:41,948 INFO sqlalchemy.engine.base.Engine SELECT name FROM sqlite_master WHERE type='view' ORDER BY name
2020-12-16 19:02:41,948 INFO sqlalchemy.engine.base.Engine ()
2020-12-16 19:02:41,950 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_xinfo("Training_set")
2020-12-16 19:02:41,951 INFO sqlalchemy.engine.base.Engine ()
2020-12-16 19:02:41,952 INFO sqlalchemy.engine.base.Engine SELECT sql FROM  (SELECT * FROM sqlite_master UNI

In [7]:
training_data_df = training_data_df.rename(columns={"class": "label"})
training_data_df = training_data_df.convert_dtypes()

In [81]:
training_data_df = training_data_df.sample(frac=1)

In [82]:
training_data_df = training_data_df.drop(columns=['index'])
training_data_df.dtypes

KeyError: "['index'] not found in axis"

In [9]:
class Sequences(Dataset):
    def __init__(self, df):
        sw.STOP_WORDS = sw.STOP_WORDS.union(esw.common_stopwords)
        sw.STOP_WORDS = sw.STOP_WORDS.union(esw.extra_stopwords)
        self.vectorizer = CountVectorizer(stop_words=sw.STOP_WORDS, max_df=0.99, min_df=0.005)
        self.sequences = self.vectorizer.fit_transform(df.abstract.tolist())
        self.labels = df.label.tolist()
        self.token2idx = self.vectorizer.vocabulary_
        self.idx2token = {idx: token for token, idx in self.token2idx.items()}
        
    def __getitem__(self, i):
        return self.sequences[i, :].toarray(), self.labels[i]
    
    def __len__(self):
        return self.sequences.shape[0]

In [None]:
class Dataset(torch.utils.data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, list_IDs, labels):
        'Initialization'
        self.labels = labels
        self.list_IDs = list_IDs

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.list_IDs)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        ID = self.list_IDs[index]

        # Load data and get label
        X = torch.load('data/' + ID + '.pt')
        y = self.labels[ID]

        return X, y

In [83]:
batch_size = 10
dataset = Sequences(training_data_df)
train_loader = DataLoader(dataset, batch_size=batch_size)

print(dataset[5][0].shape)

(1, 2326)


In [57]:
# dataset.__getitem__(1000)

In [84]:
dim = len(dataset.token2idx)

In [85]:
class BagOfWordsClassifier(nn.Module):
    def __init__(self, vocab_size, hidden1, hidden2):
        super(BagOfWordsClassifier, self).__init__()
        self.fc1 = nn.Linear(vocab_size, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, 15)
    
    def forward(self, inputs):
        x = F.relu(self.fc1(inputs.squeeze(1).float()))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

In [86]:
model = BagOfWordsClassifier(len(dataset.token2idx), 128, 64)
model

BagOfWordsClassifier(
  (fc1): Linear(in_features=2326, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=15, bias=True)
)

In [87]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad], lr=0.001)

In [88]:
import pprint as pp
from tqdm.notebook import tqdm

In [89]:
import numpy as np
from math import ceil

In [130]:
training_data_df.label

543     14
1071     8
777      4
1231    11
1125     9
        ..
1385    16
1020     8
343     11
236      7
997      8
Name: label, Length: 1533, dtype: Int64

In [105]:
target_split = np.array_split(training_data_df.label.values, ceil(dim / 15.))

In [135]:
len(target_split)

156

In [127]:
stacked = np.vstack(target_split[120:135])
pp.pprint(torch.from_numpy(stacked.astype(float)))

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 10 and the array at index 9 has size 9

In [126]:
model.train()
train_losses = []
for epoch in range(10):
    progress_bar = tqdm(train_loader, leave=False)
    losses = []
    total = 0
    count = 0
    for inputs, target in progress_bar:
        model.zero_grad()

        output = model(inputs)
        pp.pprint('output')
        pp.pprint(output)
        
        stacked = np.stack(target_split[15*count:15*(count+1)])
        print(' target')
        stacked = stacked.T
        pp.pprint(target_batch)
        target_batch = torch.from_numpy(stacked.astype(float))
        count += 1
        
        loss = criterion(output, target_batch)
        
        loss.backward()
              
        nn.utils.clip_grad_norm_(model.parameters(), 3)

        optimizer.step()
        
        progress_bar.set_description(f'Loss: {loss.item():.3f}')
        
        losses.append(loss.item())
        total += 1
    
    epoch_loss = sum(losses) / total
    train_losses.append(epoch_loss)
        
    tqdm.write(f'Epoch #{epoch + 1}\tTrain Loss: {epoch_loss:.3f}')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=154.0), HTML(value='')))

'output'
tensor([[ 9.1280,  7.3236,  6.0056, 13.1826,  8.2901, 10.8844,  7.5277,  8.1969,
         10.7310,  6.0438,  7.5591, 12.6305,  8.3105, 14.0736,  8.9967],
        [ 6.1685,  4.8066,  4.0866,  8.9273,  5.6747,  6.9252,  5.0936,  5.7102,
          7.1163,  3.9768,  5.1489,  8.4046,  5.4390,  9.4187,  5.9548],
        [ 5.1794,  4.2525,  3.5261,  7.6678,  4.7623,  6.0590,  4.3582,  5.0220,
          6.2190,  3.5486,  4.2973,  7.3956,  4.6648,  8.1004,  5.1877],
        [ 4.7989,  3.6091,  3.0985,  6.7967,  4.3190,  5.3443,  3.8441,  4.3850,
          5.4253,  3.1233,  3.8732,  6.4106,  4.1051,  7.2005,  4.5874],
        [ 3.3698,  2.6143,  2.1912,  4.8741,  3.1303,  3.7373,  2.7667,  3.2799,
          3.8731,  2.2212,  2.7647,  4.6519,  2.8758,  5.0448,  3.3468],
        [ 2.9508,  2.2373,  1.9330,  4.1141,  2.5419,  3.2365,  2.3572,  2.7965,
          3.3477,  1.9336,  2.3421,  3.8981,  2.4515,  4.3540,  2.7688],
        [ 5.1577,  3.8503,  3.3644,  7.2520,  4.6435,  5.8587,  3.8

ValueError: all input arrays must have the same shape