In [None]:
# Scratch notes kept from the original cell: "add dense layer", "balance".

# --- data ---
RS = 51             # random_state for the negative-class down-sampling
SEQ_SIZE = 100      # passed to PainterDataset as text_size (tokens per sample)
V = 2683            # presumably the vocabulary size (vocab file is "voc_2683") - confirm
NBINS = 2           # passed to MyModel as output_size (number of classes)

# --- model ---
EMB_SIZE = 256      # embedding dimension
LSTM_SIZE = 128     # LSTM hidden size per direction
HIDDEN_SIZE = 4     # width of the dense layer after pooling

# --- training ---
# Not referenced by the visible cells of this notebook.
EPOCHS = 50
BATCH_SIZE = 32
LR = 0.001

In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive. The other cells read and write
# "gdrive/MyDrive/..." as *relative* paths, so they only resolve while the
# working directory is /content.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import pandas as pd
import json

# The JSON is transposed after loading so that "desc" and "nbmuseum" become columns.
df = pd.read_json("gdrive/MyDrive/data/models/painters/allpainters.json").T
# For a quick smoke test, subsample here: df = df.sample(500)

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the raw frame (avoids SettingWithCopyWarning and is
# safe under pandas copy-on-write).
df = df[["desc", "nbmuseum"]].copy()
df["nbmuseum"] = df["nbmuseum"].astype(int)
# Binary target stored as float: 1.0 when nbmuseum > 0, else 0.0.
df["inmuseum"] = (df["nbmuseum"] > 0).astype(float)
print(df.inmuseum.value_counts())

# Balance the classes by down-sampling the negatives to the number of positives.
# NOTE: sample() raises if there are fewer negatives than positives.
dfp = df[df.inmuseum == 1.0]
dfn = df[df.inmuseum == 0.0].sample(len(dfp), random_state=RS)
df = pd.concat([dfp, dfn])
print(df.inmuseum.value_counts())


0.0    21445
1.0     2416
Name: inmuseum, dtype: int64
1.0    2416
0.0    2416
Name: inmuseum, dtype: int64


In [None]:
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np

# Raw description strings, one per row of the balanced frame.
X = df["desc"].to_numpy()

# One-hot labels: column 0 <-> inmuseum == 0.0, column 1 <-> inmuseum == 1.0.
# The dtype is pinned because the default of get_dummies differs between
# pandas versions (uint8 on older releases, bool on pandas >= 2).
y = pd.get_dummies(df.inmuseum, dtype=np.uint8).to_numpy()

y

array([[0, 1],
       [0, 1],
       [0, 1],
       ...,
       [1, 0],
       [1, 0],
       [1, 0]], dtype=uint8)

In [None]:
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import vocab
import torch

tokenizer = get_tokenizer('basic_english')

# NOTE(review): torch.load unpickles the stored vocab object, which can execute
# arbitrary code - only load files from a trusted source.
voc = torch.load("gdrive/MyDrive/data/models/painters/voc_2683")

# Sanity checks on the loaded vocabulary. The printed label is derived from the
# token actually looked up, so message and lookup cannot drift apart.
probe_token = 'painting'
print("The length of the new vocab is", len(voc))
print(f"The index of '{probe_token}' is", voc[probe_token])
print("The token at index 123 is", voc.get_itos()[123])

The length of the new vocab is 2683
The index of 'film' is 127
The token at index 123 is artists


In [None]:
from torch.utils.data import Dataset
from torch.nn.functional import pad
import math

class PainterDataset(Dataset):
    """Map-style dataset yielding fixed-length token-id tensors with their labels.

    Args:
        X: indexable collection of raw text strings.
        y: indexable collection of labels, aligned with ``X``.
        vocab: callable mapping a list of tokens to a list of integer ids.
        tokenizer: callable mapping a string to a list of tokens.
        text_size: exact length of every returned id tensor.
    """

    def __init__(self, X, y, vocab, tokenizer, text_size):
        self.labels = y
        self.texts = X
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.text_size = text_size

    def __len__(self):
        """Number of samples (taken from the labels)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(ids, label)`` where ``ids`` is a long tensor of length ``text_size``.

        Longer texts are truncated; shorter ones are right-padded with id 0.
        """
        ids = self.vocab(self.tokenizer(self.texts[idx]))
        # Explicit dtype: for an empty token list torch.tensor([]) would default
        # to float32, and the padded result could not be fed to nn.Embedding.
        t = torch.tensor(ids, dtype=torch.long)[:self.text_size]
        missing = self.text_size - len(t)
        if missing > 0:
            # NOTE(review): pads with id 0, i.e. whatever token the vocab maps to
            # index 0 - confirm that this is the intended padding token.
            t = pad(t, (0, missing))
        return t, torch.tensor(self.labels[idx])

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class MyModel(nn.Module):
    """Embedding -> bidirectional LSTM -> mean/max pooling -> dense -> log-softmax.

    Args:
        voc_size: number of rows in the embedding table.
        emb_size: embedding dimension (LSTM input size).
        lstm_size: LSTM hidden size per direction.
        hidden_size: width of the dense layer after pooling.
        output_size: number of output classes.
    """

    def __init__(self,voc_size,emb_size,lstm_size,hidden_size,output_size):
        super().__init__()
        self.embedding = nn.Embedding(voc_size, emb_size)
        self.lstm = nn.LSTM(
            input_size=emb_size,
            hidden_size=lstm_size,
            batch_first=True,
            bidirectional=True,
        )
        # Two directions, each pooled two ways (mean and max) => 4 * lstm_size.
        pooled_width = 4 * lstm_size
        self.hidden = nn.Linear(pooled_width, hidden_size)
        self.relu = nn.ReLU()
        self.out = nn.Linear(hidden_size, output_size)
        self.sm = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return log-probabilities (log-softmax over dim 1) for a batch of id sequences."""
        embedded = self.embedding(x)
        seq_out, _ = self.lstm(embedded)
        # Pool over dim 1 (the sequence axis, since batch_first=True).
        seq_mean = seq_out.mean(dim=1)
        seq_max, _ = seq_out.max(dim=1)
        pooled = torch.cat((seq_mean, seq_max), dim=1)
        dense = self.relu(self.hidden(pooled))
        return self.sm(self.out(dense))

    def embs(self, x):
        """Return the embedding vectors for the ids in ``x``."""
        return self.embedding(x)

In [None]:
import torch

model = MyModel(len(voc), EMB_SIZE, LSTM_SIZE, HIDDEN_SIZE, NBINS)
# The model is only used for inference below, so put it in evaluation mode.
model.eval()
# map_location="cpu": the model lives on the CPU, and without it a checkpoint
# saved from a GPU run cannot be loaded on a CPU-only runtime.
model.load_state_dict(torch.load("gdrive/MyDrive/data/models/painters/model_sd_NB2_E5_RS51_BS64_LR0.001_V2683_100x256x128x4_VA0.7263321262286602", map_location="cpu"))

<All keys matched successfully>

In [None]:
from torch.utils.data import DataLoader

def get_activation(name):
    """Build a forward hook that records a module's output under ``name``.

    The hook writes into the module-level dict ``activation``, which must
    exist when the hook fires. Outputs whose type is exactly ``torch.Tensor``
    are stored detached; anything else (e.g. a tuple) is stored as-is.
    """
    def hook(model, input, output):
        is_plain_tensor = type(output) == torch.Tensor
        activation[name] = output.detach() if is_plain_tensor else output
    return hook

# Attach a recording hook to every direct child module, using the public
# named_children() API instead of the private _modules dict. The handles are
# kept so the hooks can be detached later with handle.remove().
hook_handles = [
    module.register_forward_hook(get_activation(name))
    for name, module in model.named_children()
]

dataset = PainterDataset(X, y, voc, tokenizer, SEQ_SIZE)

class IndexDataset(Dataset):
    """Wrap a dataset so each item is returned together with its index."""

    def __init__(self, originalDataset):
        # The wrapped dataset; it is only read through len() and [] below.
        self.dataset = originalDataset

    def __len__(self):
        """Same length as the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(wrapped_item, idx)``."""
        sample = self.dataset[idx]
        return sample, idx

# Wrap the dataset so each batch also carries the positions of its samples.
idataset = IndexDataset(dataset)

loader = DataLoader(idataset, batch_size=128)

# DOC: assumes the wrapped dataset returns an (input, target) tuple; an
# inputs-only variant may be worth supporting.
actdata = {"idx": [], "targets": [], "preds": [], "activations": {}}
with torch.no_grad():
    for batch, sample_ids in loader:
        # Fresh dict per batch: the forward hooks write into the global
        # `activation` while the model runs.
        activation = {}
        texts, labels = batch[0], batch[1]
        out = model(texts)
        for row, sample_id in enumerate(sample_ids):
            actdata["idx"].append(int(sample_id))
            actdata["targets"].append(labels[row])
            actdata["preds"].append(out[row])
            for layer, captured in activation.items():
                per_layer = actdata["activations"].setdefault(layer, [])
                if type(captured) is torch.Tensor:
                    per_layer.append(captured[row])
                else:
                    # Non-tensor output (e.g. the LSTM's tuple): keep element 0.
                    per_layer.append(captured[0][row])

# One line per hooked layer with the number of collected samples.
for layer, rows in actdata["activations"].items():
    print(layer, len(rows))

embedding 4832
lstm 4832
hidden 4832
relu 4832
out 4832
sm 4832


In [None]:
# Persist the collected indices, targets, predictions and per-layer activations.
torch.save(actdata,"gdrive/MyDrive/data/models/painters/actdata")