In [8]:
from flair.data import Sentence
from flair.embeddings import FlairEmbeddings, DocumentPoolEmbeddings
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [9]:
df = pd.read_csv('../data/clean_book_data.csv')
classes = list(df['genre'].unique())

In [10]:
layers = []
layers.append(nn.Linear(2048, 256))
layers.append(nn.ReLU())
layers.append(nn.Linear(256, 256))
layers.append(nn.ReLU())
layers.append(nn.Linear(256, len(classes)))

In [11]:
model = nn.Sequential(*layers)

In [12]:
lm_tuned = FlairEmbeddings('../models/best-lm.pt')
doc_tuned = DocumentPoolEmbeddings([lm_tuned])

lm_embed = FlairEmbeddings('news-forward')
doc_embed = DocumentPoolEmbeddings([lm_embed])

In [13]:
def shorten_desc(desc):
    if len(desc) > 300:
        desc = desc[0:300]
    return desc

In [14]:
df['book_desc'] = df['book_desc'].astype(str)
df['book_desc'] = df['book_desc'].apply(shorten_desc)

In [15]:
def get_sentence_embedding(sent, embed):
    try:
        sent = Sentence(sent)
        embed.embed(sent)
        return sent.embedding.detach().cpu().numpy()
    except:
        return np.zeros(2048)

In [16]:
desc_feats = []
for i in tqdm(range(df.shape[0])):
    desc_feats.append(get_sentence_embedding(df['book_desc'][i], doc_embed))

HBox(children=(IntProgress(value=0, max=18837), HTML(value='')))




In [17]:
desc_feats = np.stack(desc_feats)

In [36]:
def load_data(x, y, i):
        x_tens = torch.FloatTensor(x[i, :]).view(1, -1)
        y_tens = torch.LongTensor([y[i]])
        if torch.cuda.is_available():
            x_tens = x_tens.cuda()
            y_tens = y_tens.cuda()
        return x_tens, y_tens

In [19]:
desc_feats = pd.DataFrame(desc_feats)
x_train, x_test, y_train, y_test = train_test_split(desc_feats, df.genre,
                                                    test_size=.4, stratify=df.genre, random_state=0)
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=.5,
                                                stratify=y_test, random_state=0)

In [20]:
x_train, y_train, x_val, y_val, x_test, y_test =x_train.values, y_train.values, x_val.values, y_val.values, x_test.values, y_test.values 

In [49]:
def train_model(model, criterion, optimizer, epochs, x_train, y_train, x_val, y_val, model_name):
    best_loss = np.inf
    if torch.cuda.is_available():
        model = model.cuda()
        
    for epoch in range(epochs):
        train_loss = 0
        val_loss = 0
        
        for i in range(x_train.shape[0]):
            x_tens, y_tens = load_data(x_train, y_train, i)
            
            optimizer.zero_grad()
            out = model(x_tens)
            loss = criterion(out, y_tens)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()

        model.eval()
        for i in range(x_val.shape[0]):
            x_tens, y_tens = load_data(x_val, y_val, i)
            
            out = model(x_tens)
            loss = criterion(out, y_tens)

            val_loss += loss.item()
            
        train_loss = train_loss/x_train.shape[0]
        val_loss = val_loss/x_val.shape[0]
            
        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(epoch+1, train_loss, val_loss))
        
        if val_loss < best_loss:
            no_improvement = 0
            best_loss = val_loss
            print('Improved Model Score - Updating Best Model Parameters...')
            torch.save(model.state_dict(), f'../models/{model_name}.pt')
        else:
            no_improvement +=1
            if no_improvement==10:
                print('No Improvement for 10 epochs, Early Stopping')
                break

In [50]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=.0003)
train_model(model, criterion, optimizer, 200, x_train, y_train, x_val, y_val, 'flair_best_classifier')

Epoch: 1 	Training Loss: 117.246827 	Validation Loss: 133.428352
Improved Model Score - Updating Best Model Parameters...
Epoch: 2 	Training Loss: 109.298677 	Validation Loss: 126.479640
Improved Model Score - Updating Best Model Parameters...
Epoch: 3 	Training Loss: 106.226356 	Validation Loss: 118.421920
Improved Model Score - Updating Best Model Parameters...
Epoch: 4 	Training Loss: 103.538589 	Validation Loss: 116.812415
Improved Model Score - Updating Best Model Parameters...
Epoch: 5 	Training Loss: 101.711574 	Validation Loss: 114.175899
Improved Model Score - Updating Best Model Parameters...
Epoch: 6 	Training Loss: 99.567965 	Validation Loss: 94.645014
Improved Model Score - Updating Best Model Parameters...
Epoch: 7 	Training Loss: 97.255853 	Validation Loss: 96.876549
Epoch: 8 	Training Loss: 94.979653 	Validation Loss: 101.389036
Epoch: 9 	Training Loss: 92.948724 	Validation Loss: 97.093445
Epoch: 10 	Training Loss: 89.608612 	Validation Loss: 94.411525
Improved Model S

Epoch: 89 	Training Loss: 5.126591 	Validation Loss: 13.440791
Improved Model Score - Updating Best Model Parameters...
Epoch: 90 	Training Loss: 4.899946 	Validation Loss: 13.505231
Epoch: 91 	Training Loss: 4.795920 	Validation Loss: 12.939801
Improved Model Score - Updating Best Model Parameters...
Epoch: 92 	Training Loss: 4.589914 	Validation Loss: 12.613907
Improved Model Score - Updating Best Model Parameters...
Epoch: 93 	Training Loss: 4.441484 	Validation Loss: 12.925271
Epoch: 94 	Training Loss: 4.244018 	Validation Loss: 12.493393
Improved Model Score - Updating Best Model Parameters...
Epoch: 95 	Training Loss: 4.110940 	Validation Loss: 11.858325
Improved Model Score - Updating Best Model Parameters...
Epoch: 96 	Training Loss: 4.002974 	Validation Loss: 11.583519
Improved Model Score - Updating Best Model Parameters...
Epoch: 97 	Training Loss: 3.822378 	Validation Loss: 11.313494
Improved Model Score - Updating Best Model Parameters...
Epoch: 98 	Training Loss: 3.682752

In [51]:
model.load_state_dict(torch.load('../models/flair_best_classifier.pt'))

model = model.cuda()
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for i in range(x_test.shape[0]):
        x_tens, y_tens = load_data(x_test, y_test, i)
        outputs = model(x_tens)
        _, predicted = torch.max(outputs.data, 1)
        total += y_tens.size(0)
        correct += (predicted == y_tens).sum().item()
          
    print('Test Accuracy: {} %'.format(100 * correct / total))

Test Accuracy: 21.629511677282377 %


In [52]:
desc_feats = []
for i in tqdm(range(df.shape[0])):
    desc_feats.append(get_sentence_embedding(df['book_desc'][i], doc_tuned))
    
desc_feats = np.stack(desc_feats)

HBox(children=(IntProgress(value=0, max=18837), HTML(value='')))




In [53]:
model = nn.Sequential(*layers)

In [55]:
desc_feats = pd.DataFrame(desc_feats)
x_train, x_test, y_train, y_test = train_test_split(desc_feats, df.genre,
                                                    test_size=.4, stratify=df.genre, random_state=0)
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=.5,
                                                stratify=y_test, random_state=0)

x_train, y_train, x_val, y_val, x_test, y_test =x_train.values, y_train.values, x_val.values, y_val.values, x_test.values, y_test.values

In [59]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=.0003)
train_model(model, criterion, optimizer, 200, x_train, y_train, x_val, y_val, 'tuned_flair_best_classifier')

Epoch: 1 	Training Loss: 7.857734 	Validation Loss: 9.922993
Improved Model Score - Updating Best Model Parameters...
Epoch: 2 	Training Loss: 4.986064 	Validation Loss: 7.485709
Improved Model Score - Updating Best Model Parameters...
Epoch: 3 	Training Loss: 4.053153 	Validation Loss: 6.611289
Improved Model Score - Updating Best Model Parameters...
Epoch: 4 	Training Loss: 3.536738 	Validation Loss: 6.105238
Improved Model Score - Updating Best Model Parameters...
Epoch: 5 	Training Loss: 3.226733 	Validation Loss: 5.701527
Improved Model Score - Updating Best Model Parameters...
Epoch: 6 	Training Loss: 2.981042 	Validation Loss: 5.628391
Improved Model Score - Updating Best Model Parameters...
Epoch: 7 	Training Loss: 2.798364 	Validation Loss: 5.339369
Improved Model Score - Updating Best Model Parameters...
Epoch: 8 	Training Loss: 2.656885 	Validation Loss: 5.092162
Improved Model Score - Updating Best Model Parameters...
Epoch: 9 	Training Loss: 2.547188 	Validation Loss: 5.07

In [60]:
model.load_state_dict(torch.load('../models/tuned_flair_best_classifier.pt'))

model = model.cuda()
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for i in range(x_test.shape[0]):
        x_tens, y_tens = load_data(x_test, y_test, i)
        outputs = model(x_tens)
        _, predicted = torch.max(outputs.data, 1)
        total += y_tens.size(0)
        correct += (predicted == y_tens).sum().item()
          
    print('Test Accuracy: {} %'.format(100 * correct / total))

Test Accuracy: 23.673036093418258 %
