In [None]:
# Experiment notes carried over from the original cell: "add dense layer", "balance".
# (The model below has a dense hidden layer and the data cell undersamples to
# balance the two classes.)

# --- task / reproducibility ---
NBINS = 2          # number of target classes; passed to the model as its output size
RS = 51            # random_state used for the negative-class sampling and the train/test split

# --- input and model dimensions ---
SEQ_SIZE = 100     # token ids per description after padding / truncation
EMB_SIZE = 256     # embedding dimension
LSTM_SIZE = 128    # LSTM hidden size (per direction)
HIDDEN_SIZE = 4    # width of the dense layer between the pooled LSTM output and the classifier

# --- optimisation ---
EPOCHS = 50        # upper bound on training epochs
BATCH_SIZE = 64    # batch size of both data loaders
LR = 0.001         # Adam learning rate

In [None]:
from google.colab import drive
# Mount Google Drive under /content/gdrive; the data and model paths used in
# later cells are given relative to /content ("gdrive/MyDrive/...").
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import pandas as pd
import json

# Load the painters dump and transpose it so that "desc" and "nbmuseum" are columns.
# NOTE: the path is relative, so it only resolves when the working directory is
# the one that contains the mounted `gdrive` folder.
df = pd.read_json("gdrive/MyDrive/data/models/painters/allpainters.json").T
# df = df.sample(500)  # optional: small subset for a quick smoke run

# Work on an explicit copy: assigning columns into a column-sliced frame
# triggers pandas' SettingWithCopyWarning.
df = df[["desc", "nbmuseum"]].copy()
df["nbmuseum"] = df["nbmuseum"].astype(int)
# Binary target stored as float: 1.0 when the painter appears in at least one museum.
df["inmuseum"] = (df["nbmuseum"] > 0).astype(float)
print(df.inmuseum.value_counts())

# Balance the classes by undersampling. Both classes are capped at the size of
# the smaller one, so the cell also works if negatives happen to be the minority.
positives = df[df.inmuseum == 1.0]
negatives = df[df.inmuseum == 0.0]
n_per_class = min(len(positives), len(negatives))
dfp = positives if len(positives) == n_per_class else positives.sample(n_per_class, random_state=RS)
dfn = negatives.sample(n_per_class, random_state=RS)
df = pd.concat([dfp, dfn])
print(df.inmuseum.value_counts())

0.0    21445
1.0     2416
Name: inmuseum, dtype: int64
1.0    2416
0.0    2416
Name: inmuseum, dtype: int64


In [None]:
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np

# Model inputs: the raw description strings.
X = np.array(df.desc)

# One-hot targets, one column per distinct `inmuseum` value.
# The dtype is pinned explicitly because pandas >= 2.0 returns boolean dummies
# by default, which would silently change the label dtype fed to torch.
y = pd.get_dummies(df.inmuseum, dtype=np.uint8).values

# The model's output layer has NBINS units, so the targets must match.
assert y.shape == (len(X), NBINS), f"expected targets of shape ({len(X)}, {NBINS}), got {y.shape}"

y

array([[0, 1],
       [0, 1],
       [0, 1],
       ...,
       [1, 0],
       [1, 0],
       [1, 0]], dtype=uint8)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the samples for evaluation; RS makes the split repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=RS,
)

In [None]:
import torch
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import vocab

# torchtext's built-in 'basic_english' tokenizer is used for every description.
tokenizer = get_tokenizer('basic_english')

# Token frequencies are counted on the training split only, so the vocabulary
# contains nothing that was seen exclusively in the held-out data.
counter = Counter(token for text in X_train for token in tokenizer(text))

# Keep tokens occurring at least 10 times; everything else resolves to '<unk>'
# through the default index set below.
voc = vocab(
    counter,
    min_freq=10,
    specials=('<unk>', '<BOS>', '<EOS>', '<PAD>'),
)
voc.set_default_index(voc['<unk>'])

# Quick look at the result.
print("The length of the new vocab is", len(voc))
print("The index of 'painting' is", voc['painting'])
print("The token at index 123 is", voc.get_itos()[123])

# Persist the vocabulary next to the models; its size is part of the file name.
voc_path = f"gdrive/MyDrive/data/models/painters/voc_{len(voc)}"
torch.save(voc, voc_path)

The length of the new vocab is 2683
The index of 'painting' is 127
The token at index 123 is artists


In [None]:
from torch.utils.data import Dataset
from torch.nn.functional import pad
import math

class PainterDataset(Dataset):
    """Map-style dataset yielding fixed-length token-id tensors and their labels.

    Args:
        X: indexable collection of raw text strings.
        y: indexable collection of labels, aligned with ``X``.
        vocab: object that converts a list of tokens to a list of ids when
            called, and a single token to its id when indexed.
        tokenizer: callable turning a string into a list of tokens.
        text_size: length every returned id sequence is padded / truncated to.
        pad_token: token whose id is used for right-padding (default ``'<PAD>'``).

    Previously short texts were padded with id 0. With the vocabulary built in
    this notebook that is the ``'<unk>'`` id, so padding could not be told apart
    from unknown words. Padding now uses the id of ``pad_token``.
    """

    def __init__(self, X, y, vocab, tokenizer, text_size, pad_token='<PAD>'):
        self.labels = y
        self.texts = X
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.text_size = text_size
        # Resolved once here instead of on every __getitem__ call.
        self.pad_index = vocab[pad_token]

    def __len__(self):
        """Number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for sample ``idx``.

        ``token_ids`` is a 1-D int64 tensor of exactly ``text_size`` entries:
        longer texts are cut on the right, shorter ones right-padded.
        """
        ids = self.vocab(self.tokenizer(self.texts[idx]))[:self.text_size]
        # Explicit dtype: an empty id list would otherwise become a float
        # tensor, which an embedding layer cannot index with.
        t = torch.tensor(ids, dtype=torch.long)
        missing = self.text_size - len(t)
        if missing > 0:
            t = pad(t, (0, missing), value=self.pad_index)
        return t, torch.tensor(self.labels[idx])

In [None]:
from torch.utils.data import DataLoader

# Wrap both splits so that every description becomes SEQ_SIZE token ids.
traindata = PainterDataset(X_train, y_train, voc, tokenizer, SEQ_SIZE)
testdata = PainterDataset(X_test, y_test, voc, tokenizer, SEQ_SIZE)

# Training batches are reshuffled every epoch.
train_loader = DataLoader(traindata, batch_size=BATCH_SIZE, shuffle=True)
# The evaluation metrics do not depend on sample order, so the held-out loader
# is not shuffled: evaluation order stays fixed and it no longer draws from the
# random generator that also drives the training shuffles.
test_loader = DataLoader(testdata, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class MyModel(nn.Module):
    """Bidirectional-LSTM text classifier.

    Pipeline: embedding -> bidirectional LSTM -> mean and max pooling over
    dim 1 of the LSTM output, concatenated -> dense + ReLU -> output layer
    -> log-softmax over the classes.

    Args:
        voc_size: number of rows of the embedding table.
        emb_size: embedding dimension.
        lstm_size: LSTM hidden size per direction.
        hidden_size: width of the dense layer after pooling.
        output_size: number of classes.
    """

    def __init__(self,voc_size,emb_size,lstm_size,hidden_size,output_size):
        super().__init__()
        # Two directions x two pooling operations (mean, max).
        pooled_size = 4 * lstm_size
        self.embedding = nn.Embedding(voc_size, emb_size)
        self.lstm = nn.LSTM(
            input_size=emb_size,
            hidden_size=lstm_size,
            batch_first=True,
            bidirectional=True,
        )
        self.hidden = nn.Linear(pooled_size, hidden_size)
        self.relu = nn.ReLU()
        self.out = nn.Linear(hidden_size, output_size)
        self.sm = nn.LogSoftmax(dim=1)

    def forward(self,x):
        """Return log-probabilities for the token-id tensor ``x``."""
        states, _ = self.lstm(self.embedding(x))
        mean_pooled = states.mean(dim=1)
        max_pooled = states.max(dim=1).values
        features = torch.cat((mean_pooled, max_pooled), dim=1)
        return self.sm(self.out(self.relu(self.hidden(features))))

    def embs(self,x):
        """Return the embedding vectors looked up for ``x``."""
        return self.embedding(x)

In [None]:
import torch
import torch.nn as nn

def train(data_loader,model,optimizer):
    """Run one optimisation pass over ``data_loader``.

    The model is put in training mode, and for every batch the gradients are
    reset, the cross-entropy loss is back-propagated and the optimizer takes
    one step.

    Returns:
        ``(outputs, targets)``: two nested Python lists with one entry per
        sample, in iteration order. The outputs are those computed *before*
        the optimizer step of their batch.
    """
    loss_fn = nn.CrossEntropyLoss()
    epoch_outputs, epoch_targets = [], []
    model.train()
    for batch_inputs, batch_targets in data_loader:
        optimizer.zero_grad()
        batch_outputs = model(batch_inputs)
        # Targets are cast to float before being handed to the loss.
        batch_loss = loss_fn(batch_outputs, batch_targets.float())
        batch_loss.backward()
        optimizer.step()
        # Detach before converting so no autograd graph is kept alive.
        epoch_outputs += batch_outputs.detach().cpu().numpy().tolist()
        epoch_targets += batch_targets.detach().cpu().numpy().tolist()
    return epoch_outputs, epoch_targets

def evaluate(data_loader,model):
    """Run ``model`` over ``data_loader`` without tracking gradients.

    The model is switched to evaluation mode and left in that mode.

    Returns:
        ``(outputs, targets)``: two nested Python lists with one entry per
        sample, in iteration order.
    """
    collected_outputs, collected_targets = [], []
    with torch.no_grad():
        model.eval()
        for batch_inputs, batch_targets in data_loader:
            batch_outputs = model(batch_inputs)
            collected_outputs.extend(batch_outputs.detach().cpu().numpy().tolist())
            collected_targets.extend(batch_targets.detach().cpu().numpy().tolist())
    return collected_outputs, collected_targets


In [None]:
# Seed torch's global random generator before anything random happens on the
# torch side. RS already fixes the pandas sampling and the train/test split;
# without this line the weight initialisation and the batch shuffling still
# differed from run to run.
torch.manual_seed(RS)

model = MyModel(len(voc), EMB_SIZE, LSTM_SIZE, HIDDEN_SIZE, NBINS)
optimizer = torch.optim.Adam(model.parameters(),lr = LR)

# Sanity check: take the first description of one training batch and look at
# its token ids and the (still untrained) embedding vectors they map to.
test_text = next(iter(train_loader))[0][0]
print(test_text)
print(model.embedding(test_text))

tensor([   0,    0,    4,  139,  241,    0,   21,    0,    0,  916,  206,   21,
         728,   26,  181,    5,    6,   66,  260,    9,   61,    9,   19,   17,
         137,   19,    0,   10,    0,  704,   26,    0,    0,   26,    6,    7,
           9,   21,    0,   26,  156,   41,  459,   96,    7, 1683,   19,   17,
         247,   19,    0,    0,   10,    0,   49,    7, 1619, 1287, 2062,    0,
          21,  728,   26,   61, 1088,   47,   57,    7,  127,    0,   26,  620,
        1716,   47,   21,   17,  638,   57, 1687,    0,    0,   10,   21, 2348,
          26,   41,  582,    7,  119,   69,  161,   20,   54,   17,    0,   87,
          69,  557,    0,   19])
tensor([[ 1.3280,  1.3864,  1.4667,  ..., -0.3152,  2.5433, -1.7031],
        [ 1.3280,  1.3864,  1.4667,  ..., -0.3152,  2.5433, -1.7031],
        [ 0.8225, -0.9836,  0.9226,  ..., -0.2233,  2.3221,  0.5981],
        ...,
        [ 1.5412, -0.5212,  0.8850,  ..., -0.9201, -0.7455, -0.0340],
        [ 1.3280,  1.3864,  1.4667

In [None]:
import copy
# Keep a deep copy of the untrained weights; being a copy, it is not affected
# by the training that follows (presumably kept so the model can be reset).
init_model = copy.deepcopy(model.state_dict())

# Show the layer structure.
print(model)

MyModel(
  (embedding): Embedding(2683, 256)
  (lstm): LSTM(256, 128, batch_first=True, bidirectional=True)
  (hidden): Linear(in_features=512, out_features=4, bias=True)
  (relu): ReLU()
  (out): Linear(in_features=4, out_features=2, bias=True)
  (sm): LogSoftmax(dim=1)
)


In [None]:
from sklearn import metrics
import time

def acc(t,p):
  """Return ``(accuracy, balanced_accuracy)`` for targets ``t`` and model outputs ``p``.

  Each entry of ``t`` and ``p`` is one row of per-class values; the class of a
  row is the position of its largest value.
  """
  true_classes = [np.argmax(row) for row in t]
  # Indexed through `t` so exactly one prediction is read per target row.
  pred_classes = [np.argmax(p[i]) for i in range(len(true_classes))]
  plain = metrics.accuracy_score(true_classes, pred_classes)
  balanced = metrics.balanced_accuracy_score(true_classes, pred_classes)
  return plain, balanced

def _record_if_best(best, score, epoch, model):
    """Update ``best`` in place when ``score`` beats the accuracy stored in it.

    On improvement the score, the epoch number and a deep copy of the model's
    current state_dict are stored (a copy, so later training cannot alter it).
    """
    if score > best["accuracy"]:
        best["accuracy"] = score
        best["model"] = copy.deepcopy(model.state_dict())
        best["epoch"] = epoch


print("Training model")
# Best-so-far trackers: held-out accuracy, training accuracy and held-out
# balanced accuracy.
# NOTE(review): checkpoints are selected on the same held-out split that is
# reported, so the "best" held-out figures are optimistic estimates.
best_va = {"accuracy": 0, "model": None, "epoch": 0}
best_ta = {"accuracy": 0, "model": None, "epoch": 0}
best_ba = {"accuracy": 0, "model": None, "epoch": 0}
for epoch in range(1,EPOCHS+1):
    t = time.time()
    toutputs,ttargets = train(train_loader,model,optimizer)
    outputs,targets = evaluate(test_loader,model)
    accuracy,baccuracy = acc(targets,outputs)
    # Training accuracy is computed from the outputs gathered while the weights
    # were still being updated during the epoch, not from a separate pass.
    taccuracy,btaccuracy = acc(ttargets,toutputs)
    print(f"{epoch} ({round((time.time()-t)*1000)}ms): Accuracy Score: {accuracy}/{baccuracy} ({taccuracy}/{btaccuracy})")
    _record_if_best(best_va, accuracy, epoch, model)
    _record_if_best(best_ta, taccuracy, epoch, model)
    _record_if_best(best_ba, baccuracy, epoch, model)

Traning model
1 (18980ms): Accuracy Score: 0.6254526642524573/0.6232669557304213 (0.58364953432218/0.5834066325301835)
2 (18066ms): Accuracy Score: 0.6968442834971547/0.695619613511054 (0.7054156605726113/0.7052517922605568)
3 (21848ms): Accuracy Score: 0.7097775478530781/0.7096231465124994 (0.7640565712314591/0.764270972851691)
4 (26903ms): Accuracy Score: 0.7139161924469736/0.7141807183769606 (0.8220075888237324/0.8222158918370397)
5 (19658ms): Accuracy Score: 0.7263321262286602/0.7253872919008618 (0.8682304242842359/0.8683755850846606)
6 (20628ms): Accuracy Score: 0.7128815312984997/0.7130458754884642 (0.9220420834770611/0.9221284087696302)
7 (19704ms): Accuracy Score: 0.7159855147439214/0.7157951929768214 (0.9565367368057951/0.9565554708331072)
8 (20270ms): Accuracy Score: 0.7056389032591827/0.7053749799261282 (0.9782683684028975/0.9782372780676427)
9 (20878ms): Accuracy Score: 0.7046042421107087/0.7040035330014454 (0.985857192135219/0.9858142255654272)
10 (20456ms): Accuracy Score

KeyboardInterrupt: ignored

In [None]:
# Best score and the epoch it was reached in, one line each for:
# held-out accuracy, training accuracy, held-out balanced accuracy.
for best in (best_va, best_ta, best_ba):
    print(best["accuracy"], best["epoch"])

0.7263321262286602 5
0.9917212832011039 17
0.7253872919008618 5


In [None]:
import torch

def _checkpoint_path(best, tag):
    """Build the checkpoint file name for one best-so-far record.

    The name encodes the run configuration, the epoch of the checkpoint and,
    after ``tag`` ("VA" / "TA"), the accuracy that selected it.
    """
    return (
        "gdrive/MyDrive/data/models/painters/"
        f"model_sd_NB{NBINS}_E{best['epoch']}_RS{RS}_BS{BATCH_SIZE}_LR{LR}_V{len(voc)}"
        f"_{SEQ_SIZE}x{EMB_SIZE}x{LSTM_SIZE}x{HIDDEN_SIZE}_{tag}{best['accuracy']}"
    )


# Save the state_dicts with the best held-out accuracy (VA) and the best
# training accuracy (TA).
for best, tag in ((best_va, "VA"), (best_ta, "TA")):
    if best["model"] is None:
        # No epoch was recorded for this tracker; saving None would only
        # leave a useless file behind.
        print(f"No checkpoint recorded for {tag}; nothing saved.")
        continue
    torch.save(best["model"], _checkpoint_path(best, tag))
