In [51]:
import warnings

import cv2
import joblib
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from flair.data import Sentence
from flair.embeddings import FlairEmbeddings, DocumentPoolEmbeddings
from sklearn.decomposition import PCA
from torchvision import models, transforms
from torchvision import transforms as T
from tqdm import tqdm_notebook as tqdm

In [2]:
# Load the book metadata CSV (clean_book_data.csv) into a DataFrame.
df = pd.read_csv('../data/clean_book_data.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,index,book_desc,book_format,book_isbn,book_pages,book_rating,book_rating_count,book_review_count,book_title,image_url,genre,author
0,0.jpg,Winning will make you famous. Losing means cer...,Hardcover,9780440000000.0,374 pages,4.33,5519135,160706,The Hunger Games,https://images.gr-assets.com/books/1447303603l...,76,Suzanne Collins
1,1.jpg,There is a door at the end of a silent corrido...,Paperback,9780440000000.0,870 pages,4.48,2041594,33264,Harry Potter and the Order of the Phoenix,https://images.gr-assets.com/books/1255614970l...,25,J.K. Rowling
2,2.jpg,The unforgettable novel of a childhood in a sl...,Paperback,9780060000000.0,324 pages,4.27,3745197,79450,To Kill a Mockingbird,https://images.gr-assets.com/books/1361975680l...,13,Harper Lee
3,3.jpg,«È cosa ormai risaputa che a uno scapolo in po...,Paperback,9780680000000.0,279 pages,4.25,2453620,54322,Pride and Prejudice,https://images.gr-assets.com/books/1320399351l...,13,Jane Austen
4,4.jpg,About three things I was absolutely positive.F...,Paperback,9780320000000.0,498 pages,3.58,4281268,97991,Twilight,https://images.gr-assets.com/books/1361039443l...,76,Stephenie Meyer


In [4]:
def shorten_desc(desc, max_len=300):
    """Truncate a description to at most ``max_len`` characters.

    Args:
        desc: The description text (any sliceable sequence works).
        max_len: Maximum number of leading characters to keep. Defaults to
            300, the cut-off this notebook uses for book descriptions.

    Returns:
        ``desc`` unchanged when it is already short enough, otherwise its
        first ``max_len`` characters.
    """
    # Slicing past the end is safe, so no explicit length check is needed.
    return desc[:max_len]

In [5]:
# Cast every description to text, then cut it down with shorten_desc,
# in a single chained expression.
df['book_desc'] = df['book_desc'].astype(str).apply(shorten_desc)

In [6]:
# Load the language-model embeddings from the local checkpoint file
# (presumably a fine-tuned model, going by the variable name — confirm).
lm_tuned = FlairEmbeddings('../models/best-lm.pt')
# Wrap them in a document-level pooling embedder so that each description
# can be turned into a single vector (see get_sentence_embedding).
doc_tuned = DocumentPoolEmbeddings([lm_tuned])

In [7]:
def get_sentence_embedding(sent):
    """Embed one piece of text into a single document vector.

    Args:
        sent: Raw text; it is wrapped in a flair ``Sentence`` before being
            passed to the module-level ``doc_tuned`` embedder.

    Returns:
        A 1-D numpy array holding the pooled embedding. If embedding fails,
        a warning is emitted and an all-zero vector of the embedder's output
        length is returned instead, so the results can still be stacked.
    """
    try:
        sentence = Sentence(sent)
        doc_tuned.embed(sentence)
        return sentence.embedding.detach().cpu().numpy()
    except Exception as err:
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupts
        # working during the long embedding loop, and the warning makes the
        # zero-vector fallback visible instead of silent.
        warnings.warn(f"Embedding failed ({err!r}); using a zero vector instead.")
        # Ask the embedder for its size rather than hard-coding 2048, so the
        # fallback always matches the shape of the real embeddings.
        return np.zeros(doc_tuned.embedding_length)

In [8]:
# Embed every book description, one vector per row of df, with a progress bar.
desc_feats = [
    get_sentence_embedding(df['book_desc'][row])
    for row in tqdm(range(df.shape[0]))
]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(IntProgress(value=0, max=18837), HTML(value='')))




In [9]:
# Turn the list of per-description vectors into one 2-D array (one row per book).
desc_feats = np.stack(desc_feats)

In [10]:
# Image preprocessing pipeline applied to every cover before it goes into the CNN:
# array -> PIL image -> 224x224 resize -> tensor -> per-channel normalisation.
#
# The pipeline is built through the `T` module alias so this cell can be re-run:
# the result is deliberately still bound to the name `transforms` (get_img calls
# it), which shadows the torchvision module imported under that same name.
transforms = T.Compose([
    T.ToPILImage(),
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [11]:
def get_img(img_path):
    """Load an image file and turn it into a single-image batch tensor.

    Args:
        img_path: Path of the image file to read.

    Returns:
        A float tensor with a leading batch axis of size 1, produced by the
        module-level ``transforms`` pipeline and moved to the GPU when CUDA
        is available.

    Raises:
        OSError: If OpenCV cannot read the file (missing or undecodable).
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than deep inside the transform.
        raise OSError(
            f"cv2.imread could not read image (missing or undecodable file): {img_path!r}"
        )
    # NOTE(review): cv2.imread yields BGR channel order, while the pipeline's
    # Normalize statistics look like the usual RGB ImageNet values — confirm this
    # matches how the saved VGG weights were trained before changing it.
    batch = transforms(img).float().unsqueeze(0)
    if torch.cuda.is_available():
        batch = batch.cuda()
    return batch

In [39]:
# Rebuild the architecture the checkpoint was saved from: VGG-11 whose last
# classifier layer has 77 outputs (presumably one per genre label — confirm).
vgg = models.vgg11(pretrained=False)
vgg.classifier[6] = nn.Linear(4096, 77)

# map_location='cpu' lets the checkpoint load on machines without CUDA;
# the model is moved to the GPU further down when one is available.
vgg.load_state_dict(torch.load('../models/vgg_tuned.pt', map_location='cpu'))
# Keep only classifier layers 0-3 so the forward pass returns 4096-d features
# instead of class scores.
vgg.classifier = nn.Sequential(*[vgg.classifier[i] for i in range(4)])

# The network is used purely as a fixed feature extractor.
for p in vgg.parameters():
    p.requires_grad = False

# Inference mode: the truncated classifier still contains a Dropout layer, which
# would otherwise randomly zero activations and make the features non-deterministic.
vgg.eval()

if torch.cuda.is_available():
    vgg = vgg.cuda()

In [38]:
# Sanity check on a single cover: display only the output shape rather than
# dumping the whole feature tensor.
vgg(get_img('../data/goodreads-best-books/images/images/1.jpg')).shape

torch.Size([1, 4096])

In [42]:
# Folder with the cover images; file names are taken from the 'index' column.
path = '../data/goodreads-best-books/images/images/'
# Run every cover through the truncated VGG and collect the features as numpy arrays.
img_feats = [
    vgg(get_img(path + df['index'][row])).detach().cpu().numpy()
    for row in tqdm(range(df.shape[0]))
]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(IntProgress(value=0, max=18837), HTML(value='')))




In [43]:
# Combine the list of per-image feature arrays into a single array.
img_feats = np.stack(img_feats)

In [49]:
# Flatten to one 4096-wide row per image; this drops the size-1 batch axis
# that get_img adds to every input.
img_feats = img_feats.reshape((-1, 4096))

In [52]:
# A fractional n_components asks scikit-learn's PCA to keep as many components
# as are needed to explain that share (here 90%) of the variance.
img_pca = PCA(.9)

In [55]:
# Fit the PCA on the image features and replace them with the reduced projection.
# Note: this overwrites img_feats, so re-running the cell reduces the already-reduced data.
img_feats = img_pca.fit_transform(img_feats)

In [56]:
# Separate PCA for the text embeddings, again keeping 90% of the variance.
desc_pca = PCA(.9)

In [58]:
# Fit the PCA on the description embeddings and replace them with the reduced projection.
# Note: this overwrites desc_feats, so re-running the cell reduces the already-reduced data.
desc_feats = desc_pca.fit_transform(desc_feats)

In [59]:
# Keep only these five metadata columns (including 'genre'); every other column is dropped.
df = df[['book_pages', 'book_rating', 'book_rating_count', 'book_review_count', 'genre']]

In [None]:
# Wrap the PCA-reduced arrays in DataFrames.
img_feats = pd.DataFrame(img_feats)
desc_feats = pd.DataFrame(desc_feats)
# Give each image component its own name (img_0, img_1, ...). Assigning a
# one-element list such as ['img'] raises a length-mismatch ValueError whenever
# the frame has more than one column.
img_feats.columns = [f'img_{i}' for i in range(img_feats.shape[1])]